diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..6aad062a9ead2d88a859379d5932cd3dc78a776b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+llamafactory/extras/__pycache__/constants.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
index c4747101b23ead88704837d275618fe368198a4d..6cf72efb35b793371fac42a092678981ab5317a2 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,6 @@
---
-title: App
-emoji: 🏢
-colorFrom: blue
-colorTo: purple
+title: app
+app_file: webui.py
sdk: gradio
-sdk_version: 5.49.1
-app_file: app.py
-pinned: false
+sdk_version: 5.45.0
---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/api.py b/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..61215459ed91c6fa529a719cb9dac57223754d2e
--- /dev/null
+++ b/api.py
@@ -0,0 +1,33 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import uvicorn
+
+from llamafactory.api.app import create_app
+from llamafactory.chat import ChatModel
+
+
+def main():
+    chat_model = ChatModel()
+    app = create_app(chat_model)
+    api_host = os.getenv("API_HOST", "0.0.0.0")
+    api_port = int(os.getenv("API_PORT", "8000"))
+    print(f"Visit http://localhost:{api_port}/docs for the API documentation.")
+    uvicorn.run(app, host=api_host, port=api_port)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/llamafactory.egg-info/PKG-INFO b/llamafactory.egg-info/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..b0a26490c345a22dc3e831d6325c1d3de71902c4
--- /dev/null
+++ b/llamafactory.egg-info/PKG-INFO
@@ -0,0 +1,1124 @@
+Metadata-Version: 2.4
+Name: llamafactory
+Version: 0.9.4.dev0
+Summary: Unified Efficient Fine-Tuning of 100+ LLMs
+Home-page: https://github.com/hiyouga/LLaMA-Factory
+Author: hiyouga
+Author-email: hiyouga@buaa.edu.cn
+License: Apache 2.0 License
+Keywords: AI,LLM,GPT,ChatGPT,Llama,Transformer,DeepSeek,Pytorch
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9.0
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: transformers!=4.52.0,<=4.56.2,>=4.49.0; python_version < "3.10"
+Requires-Dist: transformers!=4.52.0,!=4.57.0,<=4.57.1,>=4.49.0; python_version >= "3.10"
+Requires-Dist: datasets<=4.0.0,>=2.16.0
+Requires-Dist: accelerate<=1.11.0,>=1.3.0
+Requires-Dist: peft<=0.17.1,>=0.14.0
+Requires-Dist: trl<=0.9.6,>=0.8.6
+Requires-Dist: gradio<=5.45.0,>=4.38.0
+Requires-Dist: matplotlib>=3.7.0
+Requires-Dist: tyro<0.9.0
+Requires-Dist: einops
+Requires-Dist: numpy<2.0.0
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: scipy
+Requires-Dist: sentencepiece
+Requires-Dist: tiktoken
+Requires-Dist: modelscope>=1.14.0
+Requires-Dist: hf-transfer
+Requires-Dist: safetensors<=0.5.3
+Requires-Dist: fire
+Requires-Dist: omegaconf
+Requires-Dist: packaging
+Requires-Dist: protobuf
+Requires-Dist: pyyaml
+Requires-Dist: pydantic<=2.10.6
+Requires-Dist: uvicorn
+Requires-Dist: fastapi
+Requires-Dist: sse-starlette
+Requires-Dist: av
+Requires-Dist: librosa
+Requires-Dist: propcache!=0.4.0
+Provides-Extra: torch
+Requires-Dist: torch>=2.0.0; extra == "torch"
+Requires-Dist: torchvision>=0.15.0; extra == "torch"
+Provides-Extra: torch-npu
+Requires-Dist: torch==2.7.1; extra == "torch-npu"
+Requires-Dist: torch-npu==2.7.1; extra == "torch-npu"
+Requires-Dist: torchvision==0.22.1; extra == "torch-npu"
+Requires-Dist: decorator; extra == "torch-npu"
+Provides-Extra: metrics
+Requires-Dist: nltk; extra == "metrics"
+Requires-Dist: jieba; extra == "metrics"
+Requires-Dist: rouge-chinese; extra == "metrics"
+Provides-Extra: deepspeed
+Requires-Dist: deepspeed<=0.16.9,>=0.10.0; extra == "deepspeed"
+Provides-Extra: liger-kernel
+Requires-Dist: liger-kernel>=0.5.5; extra == "liger-kernel"
+Provides-Extra: bitsandbytes
+Requires-Dist: bitsandbytes>=0.39.0; extra == "bitsandbytes"
+Provides-Extra: hqq
+Requires-Dist: hqq; extra == "hqq"
+Provides-Extra: eetq
+Requires-Dist: eetq; extra == "eetq"
+Provides-Extra: gptq
+Requires-Dist: optimum>=1.24.0; extra == "gptq"
+Requires-Dist: gptqmodel>=2.0.0; extra == "gptq"
+Provides-Extra: aqlm
+Requires-Dist: aqlm[gpu]>=1.1.0; extra == "aqlm"
+Provides-Extra: vllm
+Requires-Dist: vllm<=0.11.0,>=0.4.3; extra == "vllm"
+Provides-Extra: sglang
+Requires-Dist: sglang[srt]>=0.4.5; extra == "sglang"
+Requires-Dist: transformers==4.51.1; extra == "sglang"
+Provides-Extra: galore
+Requires-Dist: galore-torch; extra == "galore"
+Provides-Extra: apollo
+Requires-Dist: apollo-torch; extra == "apollo"
+Provides-Extra: badam
+Requires-Dist: badam>=1.2.1; extra == "badam"
+Provides-Extra: adam-mini
+Requires-Dist: adam-mini; extra == "adam-mini"
+Provides-Extra: minicpm-v
+Requires-Dist: soundfile; extra == "minicpm-v"
+Requires-Dist: torchvision; extra == "minicpm-v"
+Requires-Dist: torchaudio; extra == "minicpm-v"
+Requires-Dist: vector_quantize_pytorch; extra == "minicpm-v"
+Requires-Dist: vocos; extra == "minicpm-v"
+Requires-Dist: msgpack; extra == "minicpm-v"
+Requires-Dist: referencing; extra == "minicpm-v"
+Requires-Dist: jsonschema_specifications; extra == "minicpm-v"
+Provides-Extra: openmind
+Requires-Dist: openmind; extra == "openmind"
+Provides-Extra: swanlab
+Requires-Dist: swanlab; extra == "swanlab"
+Provides-Extra: fp8
+Requires-Dist: torchao>=0.8.0; extra == "fp8"
+Requires-Dist: accelerate>=1.10.0; extra == "fp8"
+Provides-Extra: fp8-te
+Requires-Dist: transformer_engine[pytorch]>=2.0.0; extra == "fp8-te"
+Requires-Dist: accelerate>=1.10.0; extra == "fp8-te"
+Provides-Extra: fp8-all
+Requires-Dist: torchao>=0.8.0; extra == "fp8-all"
+Requires-Dist: transformer_engine[pytorch]>=2.0.0; extra == "fp8-all"
+Requires-Dist: accelerate>=1.10.0; extra == "fp8-all"
+Provides-Extra: dev
+Requires-Dist: pre-commit; extra == "dev"
+Requires-Dist: ruff; extra == "dev"
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: build; extra == "dev"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+
+
+[GitHub Repo stars](https://github.com/hiyouga/LLaMA-Factory/stargazers)
+[Last commit](https://github.com/hiyouga/LLaMA-Factory/commits/main)
+[Contributors](https://github.com/hiyouga/LLaMA-Factory/graphs/contributors)
+[Tests](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml)
+[PyPI](https://pypi.org/project/llamafactory/)
+[Citation](https://scholar.google.com/scholar?cites=12620864006390196564)
+[Docker Hub](https://hub.docker.com/r/hiyouga/llamafactory/tags)
+
+[Twitter](https://twitter.com/llamafactory_ai)
+[Discord](https://discord.gg/rKfvV9r9FK)
+[Community](https://github.com/hiyouga/llamafactory-community)
+[Blog](https://blog.llamafactory.net/en/)
+
+[Open in Colab](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)
+[Open in PAI-DSW](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory)
+[Open in Lab4AI](https://www.lab4ai.cn/course/detail?id=7c13e60f6137474eb40f6fd3983c0f46&utm_source=LLaMA-Factory)
+[LLaMA Factory Online](https://www.llamafactory.com.cn/?utm_source=LLaMA-Factory)
+[Open in Spaces](https://huggingface.co/spaces/hiyouga/LLaMA-Board)
+[Open in Studios](https://modelscope.cn/studios/hiyouga/LLaMA-Board)
+[Open on Novita](https://novita.ai/templates-library/105981?sharer=88115474-394e-4bda-968e-b88e123d0c47)
+
+### Used by [Amazon](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/), [NVIDIA](https://developer.nvidia.com/rtx/ai-toolkit), [Aliyun](https://help.aliyun.com/zh/pai/use-cases/fine-tune-a-llama-3-model-with-llama-factory), etc.
+
+
+
+### Supporters ❤️
+
+| Warp, the agentic terminal for developers. Available for MacOS, Linux, & Windows. | |
+| ---- | ---- |
+
+----
+
+### Easily fine-tune 100+ large language models with zero-code [CLI](#quickstart) and [Web UI](#fine-tuning-with-llama-board-gui-powered-by-gradio)
+
+
+
+
+
+👋 Join our [WeChat](https://github.com/hiyouga/llamafactory-community/blob/main/wechat/main.jpg), [NPU](https://github.com/hiyouga/llamafactory-community/blob/main/wechat/npu.jpg), [Lab4AI](https://github.com/hiyouga/llamafactory-community/blob/main/wechat/lab4ai.jpg), [LLaMA Factory Online](https://github.com/hiyouga/llamafactory-community/blob/main/wechat/online.jpg) user group.
+
+\[ English | [中文](README_zh.md) \]
+
+**Fine-tuning a large language model can be as easy as...**
+
+https://github.com/user-attachments/assets/3991a3a8-4276-4d30-9cab-4cb0c4b9b99e
+
+Start local training:
+- Please refer to [usage](#getting-started)
+
+Start cloud training:
+- **Colab (free)**: https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing
+- **PAI-DSW (free trial)**: https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory
+- **LLaMA Factory Online**: https://www.llamafactory.com.cn/?utm_source=LLaMA-Factory
+- **Alaya NeW (cloud GPU deal)**: https://docs.alayanew.com/docs/documents/useGuide/LLaMAFactory/mutiple/?utm_source=LLaMA-Factory
+
+Read technical notes:
+- **Documentation (WIP)**: https://llamafactory.readthedocs.io/en/latest/
+- **Documentation (AMD GPU)**: https://rocm.docs.amd.com/projects/ai-developer-hub/en/latest/notebooks/fine_tune/llama_factory_llama3.html
+- **Official Blog**: https://blog.llamafactory.net/en/
+- **Official Course**: https://www.lab4ai.cn/course/detail?id=7c13e60f6137474eb40f6fd3983c0f46&utm_source=LLaMA-Factory
+
+> [!NOTE]
+> Except for the above links, all other websites are unauthorized third-party websites. Please use them with caution.
+
+## Table of Contents
+
+- [Features](#features)
+- [Blogs](#blogs)
+- [Changelog](#changelog)
+- [Supported Models](#supported-models)
+- [Supported Training Approaches](#supported-training-approaches)
+- [Provided Datasets](#provided-datasets)
+- [Requirement](#requirement)
+- [Getting Started](#getting-started)
+ - [Installation](#installation)
+ - [Data Preparation](#data-preparation)
+ - [Quickstart](#quickstart)
+ - [Fine-Tuning with LLaMA Board GUI](#fine-tuning-with-llama-board-gui-powered-by-gradio)
+ - [LLaMA Factory Online](#llama-factory-online)
+ - [Build Docker](#build-docker)
+ - [Deploy with OpenAI-style API and vLLM](#deploy-with-openai-style-api-and-vllm)
+ - [Download from ModelScope Hub](#download-from-modelscope-hub)
+ - [Download from Modelers Hub](#download-from-modelers-hub)
+ - [Use W&B Logger](#use-wb-logger)
+ - [Use SwanLab Logger](#use-swanlab-logger)
+- [Projects using LLaMA Factory](#projects-using-llama-factory)
+- [License](#license)
+- [Citation](#citation)
+- [Acknowledgement](#acknowledgement)
+
+## Features
+
+- **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Qwen2-VL, DeepSeek, Yi, Gemma, ChatGLM, Phi, etc.
+- **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO, KTO, ORPO, etc.
+- **Scalable resources**: 16-bit full-tuning, freeze-tuning, LoRA and 2/3/4/5/6/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ.
+- **Advanced algorithms**: [GaLore](https://github.com/jiaweizzhao/GaLore), [BAdam](https://github.com/Ledzy/BAdam), [APOLLO](https://github.com/zhuhanqing/APOLLO), [Adam-mini](https://github.com/zyushun/Adam-mini), [Muon](https://github.com/KellerJordan/Muon), [OFT](https://github.com/huggingface/peft/tree/main/src/peft/tuners/oft), DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and PiSSA.
+- **Practical tricks**: [FlashAttention-2](https://github.com/Dao-AILab/flash-attention), [Unsloth](https://github.com/unslothai/unsloth), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), RoPE scaling, NEFTune and rsLoRA.
+- **Wide tasks**: Multi-turn dialogue, tool use, image understanding, visual grounding, video recognition, audio understanding, etc.
+- **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, [SwanLab](https://github.com/SwanHubX/SwanLab), etc.
+- **Faster inference**: OpenAI-style API, Gradio UI and CLI with [vLLM worker](https://github.com/vllm-project/vllm) or [SGLang worker](https://github.com/sgl-project/sglang).
+
+### Day-N Support for Fine-Tuning Cutting-Edge Models
+
+| Support Date | Model Name |
+| ------------ | -------------------------------------------------------------------- |
+| Day 0 | Qwen3 / Qwen2.5-VL / Gemma 3 / GLM-4.1V / InternLM 3 / MiniCPM-o-2.6 |
+| Day 1 | Llama 3 / GLM-4 / Mistral Small / PaliGemma2 / Llama 4 |
+
+## Blogs
+
+> [!TIP]
+> Now we have a dedicated blog for LLaMA Factory!
+>
+> Website: https://blog.llamafactory.net/en/
+
+- 💡 [Easy Dataset × LLaMA Factory: Enabling LLMs to Efficiently Learn Domain Knowledge](https://buaa-act.feishu.cn/wiki/GVzlwYcRFiR8OLkHbL6cQpYin7g) (English)
+- [Fine-tune a mental health LLM using LLaMA-Factory](https://www.lab4ai.cn/project/detail?id=25cce32ec131497b9e06a93336a0817f&type=project&utm_source=LLaMA-Factory) (Chinese)
+- [Fine-tune GPT-OSS for Role-Playing using LLaMA-Factory](https://docs.llamafactory.com.cn/docs/documents/best-practice/gptroleplay/?utm_source=LLaMA-Factory) (Chinese)
+- [A One-Stop Code-Free Model Reinforcement Learning and Deployment Platform based on LLaMA-Factory and EasyR1](https://aws.amazon.com/cn/blogs/china/building-llm-model-hub-based-on-llamafactory-and-easyr1/) (Chinese)
+- [How Apoidea Group enhances visual information extraction from banking documents with multimodal models using LLaMA-Factory on Amazon SageMaker HyperPod](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/) (English)
+
+**All Blogs**
+
+- [Fine-tune Llama3.1-70B for Medical Diagnosis using LLaMA-Factory](https://docs.alayanew.com/docs/documents/bestPractice/bigModel/llama70B/?utm_source=LLaMA-Factory) (Chinese)
+- [Fine-tune Qwen2.5-VL for Autonomous Driving using LLaMA-Factory](https://docs.alayanew.com/docs/documents/useGuide/LLaMAFactory/mutiple/?utm_source=LLaMA-Factory) (Chinese)
+- [LLaMA Factory: Fine-tuning the DeepSeek-R1-Distill-Qwen-7B Model for News Classifier](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory_deepseek_r1_distill_7b) (Chinese)
+- [A One-Stop Code-Free Model Fine-Tuning \& Deployment Platform based on SageMaker and LLaMA-Factory](https://aws.amazon.com/cn/blogs/china/a-one-stop-code-free-model-fine-tuning-deployment-platform-based-on-sagemaker-and-llama-factory/) (Chinese)
+- [LLaMA Factory Multi-Modal Fine-Tuning Practice: Fine-Tuning Qwen2-VL for Personal Tourist Guide](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory_qwen2vl) (Chinese)
+- [LLaMA Factory: Fine-tuning Llama3 for Role-Playing](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory) (Chinese)
+
+
+
+## Changelog
+
+[25/10/26] We support Megatron-core training backend with [**mcore_adapter**](https://github.com/alibaba/ROLL/tree/main/mcore_adapter). See [PR #9237](https://github.com/hiyouga/LLaMA-Factory/pull/9237) to get started.
+
+[25/08/22] We supported **[OFT](https://arxiv.org/abs/2306.07280)** and **[OFTv2](https://arxiv.org/abs/2506.19847)**. See [examples](examples/README.md) for usage.
+
+[25/08/20] We supported fine-tuning the **[Intern-S1-mini](https://huggingface.co/internlm/Intern-S1-mini)** models. See [PR #8976](https://github.com/hiyouga/LLaMA-Factory/pull/8976) to get started.
+
+[25/08/06] We supported fine-tuning the **[GPT-OSS](https://github.com/openai/gpt-oss)** models. See [PR #8826](https://github.com/hiyouga/LLaMA-Factory/pull/8826) to get started.
+
+**Full Changelog**
+
+[25/07/02] We supported fine-tuning the **[GLM-4.1V-9B-Thinking](https://github.com/THUDM/GLM-4.1V-Thinking)** model.
+
+[25/04/28] We supported fine-tuning the **[Qwen3](https://qwenlm.github.io/blog/qwen3/)** model family.
+
+[25/04/21] We supported the **[Muon](https://github.com/KellerJordan/Muon)** optimizer. See [examples](examples/README.md) for usage. Thank [@tianshijing](https://github.com/tianshijing)'s PR.
+
+[25/04/16] We supported fine-tuning the **[InternVL3](https://huggingface.co/OpenGVLab/InternVL3-8B)** model. See [PR #7258](https://github.com/hiyouga/LLaMA-Factory/pull/7258) to get started.
+
+[25/04/14] We supported fine-tuning the **[GLM-Z1](https://huggingface.co/THUDM/GLM-Z1-9B-0414)** and **[Kimi-VL](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct)** models.
+
+[25/04/06] We supported fine-tuning the **[Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/)** model. See [PR #7611](https://github.com/hiyouga/LLaMA-Factory/pull/7611) to get started.
+
+[25/03/31] We supported fine-tuning the **[Qwen2.5 Omni](https://qwenlm.github.io/blog/qwen2.5-omni/)** model. See [PR #7537](https://github.com/hiyouga/LLaMA-Factory/pull/7537) to get started.
+
+[25/03/15] We supported **[SGLang](https://github.com/sgl-project/sglang)** as inference backend. Try `infer_backend: sglang` to accelerate inference.
+
+[25/03/12] We supported fine-tuning the **[Gemma 3](https://huggingface.co/blog/gemma3)** model.
+
+[25/02/24] Announcing **[EasyR1](https://github.com/hiyouga/EasyR1)**, an efficient, scalable and multi-modality RL training framework for efficient GRPO training.
+
+[25/02/11] We supported saving the **[Ollama](https://github.com/ollama/ollama)** modelfile when exporting the model checkpoints. See [examples](examples/README.md) for usage.
+
+[25/02/05] We supported fine-tuning the **[Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)** and **[MiniCPM-o-2.6](https://huggingface.co/openbmb/MiniCPM-o-2_6)** models on audio understanding tasks.
+
+[25/01/31] We supported fine-tuning the **[DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)** and **[Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)** models.
+
+[25/01/15] We supported **[APOLLO](https://arxiv.org/abs/2412.05270)** optimizer. See [examples](examples/README.md) for usage.
+
+[25/01/14] We supported fine-tuning the **[MiniCPM-o-2.6](https://huggingface.co/openbmb/MiniCPM-o-2_6)** and **[MiniCPM-V-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6)** models. Thank [@BUAADreamer](https://github.com/BUAADreamer)'s PR.
+
+[25/01/14] We supported fine-tuning the **[InternLM 3](https://huggingface.co/collections/internlm/)** models. Thank [@hhaAndroid](https://github.com/hhaAndroid)'s PR.
+
+[25/01/10] We supported fine-tuning the **[Phi-4](https://huggingface.co/microsoft/phi-4)** model.
+
+[24/12/21] We supported using **[SwanLab](https://github.com/SwanHubX/SwanLab)** for experiment tracking and visualization. See [this section](#use-swanlab-logger) for details.
+
+[24/11/27] We supported fine-tuning the **[Skywork-o1](https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B)** model and the **[OpenO1](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT)** dataset.
+
+[24/10/09] We supported downloading pre-trained models and datasets from the **[Modelers Hub](https://modelers.cn/models)**. See [this tutorial](#download-from-modelers-hub) for usage.
+
+[24/09/19] We supported fine-tuning the **[Qwen2.5](https://qwenlm.github.io/blog/qwen2.5/)** models.
+
+[24/08/30] We supported fine-tuning the **[Qwen2-VL](https://qwenlm.github.io/blog/qwen2-vl/)** models. Thank [@simonJJJ](https://github.com/simonJJJ)'s PR.
+
+[24/08/27] We supported **[Liger Kernel](https://github.com/linkedin/Liger-Kernel)**. Try `enable_liger_kernel: true` for efficient training.
+
+[24/08/09] We supported **[Adam-mini](https://github.com/zyushun/Adam-mini)** optimizer. See [examples](examples/README.md) for usage. Thank [@relic-yuexi](https://github.com/relic-yuexi)'s PR.
+
+[24/07/04] We supported [contamination-free packed training](https://github.com/MeetKai/functionary/tree/main/functionary/train/packing). Use `neat_packing: true` to activate it. Thank [@chuan298](https://github.com/chuan298)'s PR.
+
+[24/06/16] We supported **[PiSSA](https://arxiv.org/abs/2404.02948)** algorithm. See [examples](examples/README.md) for usage.
+
+[24/06/07] We supported fine-tuning the **[Qwen2](https://qwenlm.github.io/blog/qwen2/)** and **[GLM-4](https://github.com/THUDM/GLM-4)** models.
+
+[24/05/26] We supported **[SimPO](https://arxiv.org/abs/2405.14734)** algorithm for preference learning. See [examples](examples/README.md) for usage.
+
+[24/05/20] We supported fine-tuning the **PaliGemma** series models. Note that the PaliGemma models are pre-trained models; you need to fine-tune them with the `paligemma` template for chat completion.
+
+[24/05/18] We supported **[KTO](https://arxiv.org/abs/2402.01306)** algorithm for preference learning. See [examples](examples/README.md) for usage.
+
+[24/05/14] We supported training and inference on the Ascend NPU devices. Check [installation](#installation) section for details.
+
+[24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See [examples](examples/README.md) for usage.
+
+[24/04/22] We provided a **[Colab notebook](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)** for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available at Hugging Face, check [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) and [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese) for details.
+
+[24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See [examples](examples/README.md) for usage.
+
+[24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)** optimizer. See [examples](examples/README.md) for usage.
+
+[24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison).
+
+[24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See [examples](examples/README.md) for usage.
+
+[24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv!
+
+[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See [examples](examples/README.md) for usage.
+
+[24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See [examples](examples/README.md) for usage.
+
+[24/03/07] We supported **[GaLore](https://arxiv.org/abs/2403.03507)** optimizer. See [examples](examples/README.md) for usage.
+
+[24/03/07] We integrated **[vLLM](https://github.com/vllm-project/vllm)** for faster and concurrent inference. Try `infer_backend: vllm` to enjoy **270%** inference speed.
+
+[24/02/28] We supported weight-decomposed LoRA (**[DoRA](https://arxiv.org/abs/2402.09353)**). Try `use_dora: true` to activate DoRA training.
+
+[24/02/15] We supported **block expansion** proposed by [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro). See [examples](examples/README.md) for usage.
+
+[24/02/05] Qwen1.5 (Qwen2 beta version) series models are supported in LLaMA-Factory. Check this [blog post](https://qwenlm.github.io/blog/qwen1.5/) for details.
+
+[24/01/18] We supported **agent tuning** for most models, equipping the model with tool-using abilities by fine-tuning with `dataset: glaive_toolcall_en`.
+
+[23/12/23] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s implementation to boost LoRA tuning for the LLaMA, Mistral and Yi models. Try `use_unsloth: true` argument to activate unsloth patch. It achieves **170%** speed in our benchmark, check [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison) for details.
+
+[23/12/12] We supported fine-tuning the latest MoE model **[Mixtral 8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)** in our framework. See hardware requirement [here](#hardware-requirement).
+
+[23/12/01] We supported downloading pre-trained models and datasets from the **[ModelScope Hub](https://modelscope.cn/models)**. See [this tutorial](#download-from-modelscope-hub) for usage.
+
+[23/10/21] We supported **[NEFTune](https://arxiv.org/abs/2310.05914)** trick for fine-tuning. Try `neftune_noise_alpha: 5` argument to activate NEFTune.
+
+[23/09/27] We supported **$S^2$-Attn** proposed by [LongLoRA](https://github.com/dvlab-research/LongLoRA) for the LLaMA models. Try `shift_attn: true` argument to enable shift short attention.
+
+[23/09/23] We integrated MMLU, C-Eval and CMMLU benchmarks in this repo. See [examples](examples/README.md) for usage.
+
+[23/09/10] We supported **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**. Try `flash_attn: fa2` argument to enable FlashAttention-2 if you are using RTX4090, A100 or H100 GPUs.
+
+[23/08/12] We supported **RoPE scaling** to extend the context length of the LLaMA models. Try `rope_scaling: linear` argument in training and `rope_scaling: dynamic` argument at inference to extrapolate the position embeddings.
+
+[23/08/11] We supported **[DPO training](https://arxiv.org/abs/2305.18290)** for instruction-tuned models. See [examples](examples/README.md) for usage.
+
+[23/07/31] We supported **dataset streaming**. Try `streaming: true` and `max_steps: 10000` arguments to load your dataset in streaming mode.
+
+[23/07/29] We released two instruction-tuned 13B models at Hugging Face. See these Hugging Face Repos ([LLaMA-2](https://huggingface.co/hiyouga/Llama-2-Chinese-13b-chat) / [Baichuan](https://huggingface.co/hiyouga/Baichuan-13B-sft)) for details.
+
+[23/07/18] We developed an **all-in-one Web UI** for training, evaluation and inference. Try `train_web.py` to fine-tune models in your Web browser. Thank [@KanadeSiina](https://github.com/KanadeSiina) and [@codemayq](https://github.com/codemayq) for their efforts in the development.
+
+[23/07/09] We released **[FastEdit](https://github.com/hiyouga/FastEdit)** ⚡🩹, an easy-to-use package for editing the factual knowledge of large language models efficiently. Please follow [FastEdit](https://github.com/hiyouga/FastEdit) if you are interested.
+
+[23/06/29] We provided a **reproducible example** of training a chat model using instruction-following datasets, see [Baichuan-7B-sft](https://huggingface.co/hiyouga/Baichuan-7B-sft) for details.
+
+[23/06/22] We aligned the [demo API](src/api_demo.py) with the [OpenAI](https://platform.openai.com/docs/api-reference/chat) format, so you can use the fine-tuned model in **arbitrary ChatGPT-based applications**.
+
+[23/06/03] We supported quantized training and inference (aka **[QLoRA](https://github.com/artidoro/qlora)**). See [examples](examples/README.md) for usage.
+
+
+
+> [!TIP]
+> If you cannot use the latest feature, please pull the latest code and install LLaMA-Factory again.
+
+## Supported Models
+
+| Model | Model size | Template |
+| ----------------------------------------------------------------- | -------------------------------- | -------------------- |
+| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
+| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - |
+| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
+| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
+| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
+| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
+| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
+| [ERNIE-4.5](https://huggingface.co/baidu) | 0.3B/21B/300B | ernie/ernie_nothink |
+| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
+| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 |
+| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma/gemma2 |
+| [Gemma 3/Gemma 3n](https://huggingface.co/google) | 270M/1B/4B/6B/8B/12B/27B | gemma3/gemma3n |
+| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/zai-org) | 9B/32B | glm4/glmz1 |
+| [GLM-4.1V](https://huggingface.co/zai-org) | 9B | glm4v |
+| [GLM-4.5/GLM-4.5V](https://huggingface.co/zai-org) | 106B/355B | glm4_moe/glm4v_moe |
+| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
+| [GPT-OSS](https://huggingface.co/openai) | 20B/120B | gpt |
+| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
+| [Granite 4](https://huggingface.co/ibm-granite) | 7B | granite4 |
+| [Hunyuan (MT)](https://huggingface.co/tencent/) | 7B | hunyuan |
+| [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
+| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
+| [InternVL 2.5-3.5](https://huggingface.co/OpenGVLab) | 1B/2B/4B/8B/14B/30B/38B/78B/241B | intern_vl |
+| [InternLM/Intern-S1-mini](https://huggingface.co/internlm/) | 8B | intern_s1 |
+| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
+| [Ling 2.0 (mini/flash)](https://huggingface.co/inclusionAI) | 16B/100B | bailing_v2 |
+| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
+| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
+| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
+| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
+| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama |
+| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
+| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
+| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
+| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
+| [MiniCPM 1-4.1](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 |
+| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
+| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
+| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral |
+| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small |
+| [OLMo](https://huggingface.co/allenai) | 1B/7B | - |
+| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma |
+| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - |
+| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi |
+| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small |
+| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 |
+| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral |
+| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
+| [Qwen3 (MoE/Instruct/Thinking/Next)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/80B/235B | qwen3/qwen3_nothink |
+| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
+| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
+| [Qwen3-Omni](https://huggingface.co/Qwen) | 30B | qwen3_omni |
+| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
+| [Qwen3-VL](https://huggingface.co/Qwen) | 2B/4B/8B/30B/32B/235B | qwen3_vl |
+| [Seed (OSS/Coder)](https://huggingface.co/ByteDance-Seed) | 8B/36B | seed_oss/seed_coder |
+| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
+| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
+| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
+| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse |
+| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi |
+| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl |
+| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan |
+
+> [!NOTE]
+> For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models.
+>
+> If the model has both reasoning and non-reasoning versions, please use the `_nothink` suffix to distinguish between them. For example, `qwen3` and `qwen3_nothink`.
+>
+> Remember to use the **SAME** template in training and inference.
+>
+> \*: You should install the `transformers` from main branch and use `DISABLE_VERSION_CHECK=1` to skip version check.
+>
+> \*\*: You need to install a specific version of `transformers` to use the corresponding model.
+
+Please refer to [constants.py](src/llamafactory/extras/constants.py) for a full list of the models we support.
+
+You can also add a custom chat template to [template.py](src/llamafactory/data/template.py).
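+
+As a minimal sketch of the "same template" rule (the keys below are the standard config arguments; the model and adapter paths are illustrative):
+
+```yaml
+# training yaml
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
+
+# inference yaml: reuse exactly the same template
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+template: llama3
+```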
+
+## Supported Training Approaches
+
+| Approach | Full-tuning | Freeze-tuning | LoRA | QLoRA | OFT | QOFT |
+| ---------------------- | ------------------ | ------------------ | ------------------ | ------------------ | ------------------ | ------------------ |
+| Pre-Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| Supervised Fine-Tuning | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| Reward Modeling | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| PPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| DPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| KTO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| ORPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+| SimPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
+
+> [!TIP]
+> The implementation details of PPO can be found in [this blog](https://newfacade.github.io/notes-on-reinforcement-learning/17-ppo-trl.html).
+
+## Provided Datasets
+
+**Pre-training datasets**
+
+- [Wiki Demo (en)](data/wiki_demo.txt)
+- [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)
+- [RedPajama V2 (en)](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2)
+- [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220)
+- [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered)
+- [Pile (en)](https://huggingface.co/datasets/EleutherAI/pile)
+- [SkyPile (zh)](https://huggingface.co/datasets/Skywork/SkyPile-150B)
+- [FineWeb (en)](https://huggingface.co/datasets/HuggingFaceFW/fineweb)
+- [FineWeb-Edu (en)](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu)
+- [CCI3-HQ (zh)](https://huggingface.co/datasets/BAAI/CCI3-HQ)
+- [CCI3-Data (zh)](https://huggingface.co/datasets/BAAI/CCI3-Data)
+- [CCI4.0-M2-Base-v1 (en&zh)](https://huggingface.co/datasets/BAAI/CCI4.0-M2-Base-v1)
+- [CCI4.0-M2-CoT-v1 (en&zh)](https://huggingface.co/datasets/BAAI/CCI4.0-M2-CoT-v1)
+- [CCI4.0-M2-Extra-v1 (en&zh)](https://huggingface.co/datasets/BAAI/CCI4.0-M2-Extra-v1)
+- [The Stack (en)](https://huggingface.co/datasets/bigcode/the-stack)
+- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata)
+
+
+
+**Supervised fine-tuning datasets**
+
+- [Identity (en&zh)](data/identity.json)
+- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
+- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3)
+- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Glaive Function Calling V2 (en&zh)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
+- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima)
+- [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
+- [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN)
+- [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN)
+- [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN)
+- [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)
+- [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M)
+- [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M)
+- [UltraChat (en)](https://github.com/thunlp/UltraChat)
+- [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus)
+- [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k)
+- [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT)
+- [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca)
+- [SlimOrca (en)](https://huggingface.co/datasets/Open-Orca/SlimOrca)
+- [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
+- [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
+- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa)
+- [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
+- [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
+- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
+- [deepctrl (en&zh)](https://www.modelscope.cn/datasets/deepctrl/deepctrl-sft-data)
+- [Advertise Generating (zh)](https://huggingface.co/datasets/HasturOfficial/adgen)
+- [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k)
+- [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4)
+- [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
+- [Infinity Instruct (zh)](https://huggingface.co/datasets/BAAI/Infinity-Instruct)
+- [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct)
+- [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
+- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
+- [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)
+- [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction)
+- [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo)
+- [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2)
+- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
+- [Magpie-ultra-v0.1 (en)](https://huggingface.co/datasets/argilla/magpie-ultra-v0.1)
+- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub)
+- [OpenO1-SFT (en&zh)](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT)
+- [Open-Thoughts (en)](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k)
+- [Open-R1-Math (en)](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k)
+- [Chinese-DeepSeek-R1-Distill (zh)](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT)
+- [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k)
+- [Pokemon-gpt4o-captions (en&zh)](https://huggingface.co/datasets/jugg1024/pokemon-gpt4o-captions)
+- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
+- [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
+- [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
+- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
+- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
+- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
+- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
+- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
+- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
+
+
+
+**Preference datasets**
+
+- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k)
+- [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)
+- [COIG-P (zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
+- [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
+- [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
+- [RLAIF-V (en)](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)
+- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
+- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
+- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
+- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
+- [KTO mixed (en)](https://huggingface.co/datasets/argilla/kto-mix-15k)
+
+
+
+Some datasets require confirmation before using them, so we recommend logging in with your Hugging Face account using these commands.
+
+```bash
+pip install "huggingface_hub<1.0.0"
+huggingface-cli login
+```
+
+## Requirement
+
+| Mandatory | Minimum | Recommend |
+| ------------ | ------- | --------- |
+| python | 3.9 | 3.10 |
+| torch | 2.0.0 | 2.6.0 |
+| torchvision | 0.15.0 | 0.21.0 |
+| transformers | 4.49.0 | 4.50.0 |
+| datasets | 2.16.0 | 3.2.0 |
+| accelerate | 0.34.0 | 1.2.1 |
+| peft | 0.14.0 | 0.15.1 |
+| trl | 0.8.6 | 0.9.6 |
+
+| Optional | Minimum | Recommend |
+| ------------ | ------- | --------- |
+| CUDA | 11.6 | 12.2 |
+| deepspeed | 0.10.0 | 0.16.4 |
+| bitsandbytes | 0.39.0 | 0.43.1 |
+| vllm | 0.4.3 | 0.8.2 |
+| flash-attn | 2.5.6 | 2.7.2 |
+
+### Hardware Requirement
+
+\* *estimated*
+
+| Method | Bits | 7B | 14B | 30B | 70B | `x`B |
+| ----------------------------------- | ---- | ----- | ----- | ----- | ------ | ------- |
+| Full (`bf16` or `fp16`) | 32 | 120GB | 240GB | 600GB | 1200GB | `18x`GB |
+| Full (`pure_bf16`) | 16 | 60GB | 120GB | 300GB | 600GB | `8x`GB |
+| Freeze/LoRA/GaLore/APOLLO/BAdam/OFT | 16 | 16GB | 32GB | 64GB | 160GB | `2x`GB |
+| QLoRA / QOFT | 8 | 10GB | 20GB | 40GB | 80GB | `x`GB |
+| QLoRA / QOFT | 4 | 6GB | 12GB | 24GB | 48GB | `x/2`GB |
+| QLoRA / QOFT | 2 | 4GB | 8GB | 16GB | 24GB | `x/4`GB |
+
+## Getting Started
+
+### Installation
+
+> [!IMPORTANT]
+> Installation is mandatory.
+
+#### Install from Source
+
+```bash
+git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
+cd LLaMA-Factory
+pip install -e ".[torch,metrics]" --no-build-isolation
+```
+
+Extra dependencies available: torch, torch-npu, metrics, deepspeed, liger-kernel, bitsandbytes, hqq, eetq, gptq, aqlm, vllm, sglang, galore, apollo, badam, adam-mini, qwen, minicpm_v, openmind, swanlab, dev
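+
+For example, several extras can be combined in one install (a sketch; pick the extras that match your hardware and workflow):
+
+```bash
+pip install -e ".[torch,metrics,deepspeed,vllm]" --no-build-isolation
+```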
+
+#### Install from Docker Image
+
+```bash
+docker run -it --rm --gpus=all --ipc=host hiyouga/llamafactory:latest
+```
+
+This image is built on Ubuntu 22.04 (x86\_64), CUDA 12.4, Python 3.11, PyTorch 2.6.0, and Flash-attn 2.7.4.
+
+Find the pre-built images: https://hub.docker.com/r/hiyouga/llamafactory/tags
+
+Please refer to [build docker](#build-docker) to build the image yourself.
+
+**Setting up a virtual environment with uv**
+
+Create an isolated Python environment with [uv](https://github.com/astral-sh/uv):
+
+```bash
+uv sync --extra torch --extra metrics --prerelease=allow
+```
+
+Run LLaMA-Factory in the isolated environment:
+
+```bash
+uv run --prerelease=allow llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml
+```
+
+
+
+**For Windows users**
+
+#### Install PyTorch
+
+You need to manually install the GPU version of PyTorch on the Windows platform. Please refer to the [official website](https://pytorch.org/get-started/locally/) and the following command to install PyTorch with CUDA support:
+
+```bash
+pip uninstall torch torchvision torchaudio
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
+python -c "import torch; print(torch.cuda.is_available())"
+```
+
+If you see `True` then you have successfully installed PyTorch with CUDA support.
+
+Try `dataloader_num_workers: 0` if you encounter `Can't pickle local object` error.
+
+#### Install BitsAndBytes
+
+If you want to enable quantized LoRA (QLoRA) on the Windows platform, you need to install a pre-built version of the `bitsandbytes` library, which supports CUDA 11.1 to 12.2. Please select the appropriate [release version](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels) based on your CUDA version.
+
+```bash
+pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl
+```
+
+#### Install Flash Attention-2
+
+To enable FlashAttention-2 on the Windows platform, please use the script from [flash-attention-windows-wheel](https://huggingface.co/lldacing/flash-attention-windows-wheel) to compile and install it by yourself.
+
+
+
+**For Ascend NPU users**
+
+To install LLaMA Factory on Ascend NPU devices, please upgrade Python to version 3.10 or higher and specify extra dependencies: `pip install -e ".[torch-npu,metrics]"`. Additionally, you need to install the **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**. Please follow the [installation tutorial](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/softwareinstall/instg/atlasdeploy_03_0031.html) or use the following commands:
+
+```bash
+# replace the url according to your CANN version and devices
+# install CANN Toolkit
+wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C20SPC702/Ascend-cann-toolkit_8.0.0.alpha002_linux-"$(uname -i)".run
+bash Ascend-cann-toolkit_8.0.0.alpha002_linux-"$(uname -i)".run --install
+
+# install CANN Kernels
+wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C20SPC702/Ascend-cann-kernels-910b_8.0.0.alpha002_linux-"$(uname -i)".run
+bash Ascend-cann-kernels-910b_8.0.0.alpha002_linux-"$(uname -i)".run --install
+
+# set env variables
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+```
+
+| Requirement | Minimum | Recommend |
+| ------------ | ------- | -------------- |
+| CANN | 8.0.RC1 | 8.0.0.alpha002 |
+| torch | 2.1.0 | 2.4.0 |
+| torch-npu | 2.1.0 | 2.4.0.post2 |
+| deepspeed | 0.13.2 | 0.13.2 |
+| vllm-ascend | - | 0.7.3 |
+
+Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use.
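+
+For example (a sketch that reuses the quickstart config; adjust the device index to your setup):
+
+```bash
+ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+```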
+
+If you cannot run inference on NPU devices, try setting `do_sample: false` in the configurations.
+
+Download the pre-built Docker images: [32GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) | [64GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html)
+
+#### Install BitsAndBytes
+
+To use QLoRA based on bitsandbytes on Ascend NPU, please follow these 3 steps:
+
+1. Manually compile bitsandbytes: Refer to [the installation documentation](https://huggingface.co/docs/bitsandbytes/installation?backend=Ascend+NPU&platform=Ascend+NPU) for the NPU version of bitsandbytes to complete the compilation and installation. The compilation requires a cmake version of at least 3.22.1 and a g++ version of at least 12.x.
+
+```bash
+# Install bitsandbytes from source
+# Clone bitsandbytes repo, Ascend NPU backend is currently enabled on multi-backend-refactor branch
+git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git
+cd bitsandbytes/
+
+# Install dependencies
+pip install -r requirements-dev.txt
+
+# Install the dependencies for the compilation tools. Note that the commands for this step may vary depending on the operating system. The following are provided for reference
+apt-get install -y build-essential cmake
+
+# Compile & install
+cmake -DCOMPUTE_BACKEND=npu -S .
+make
+pip install .
+```
+
+2. Install transformers from the main branch.
+
+```bash
+git clone -b main https://github.com/huggingface/transformers.git
+cd transformers
+pip install .
+```
+
+3. Set `double_quantization: false` in the configuration. You can refer to the [example](examples/train_qlora/llama3_lora_sft_bnb_npu.yaml).
+
+
+
+### Data Preparation
+
+Please refer to [data/README.md](data/README.md) for details about the format of dataset files. You can use datasets on the HuggingFace / ModelScope / Modelers hub, load a dataset from local disk, or specify a path to s3/gcs cloud storage.
+
+> [!NOTE]
+> Please update `data/dataset_info.json` to use your custom dataset.
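+
+A minimal sketch of a `dataset_info.json` entry for an alpaca-style file (the dataset and file names are illustrative; see [data/README.md](data/README.md) for the authoritative schema):
+
+```json
+{
+  "my_dataset": {
+    "file_name": "my_data.json",
+    "formatting": "alpaca",
+    "columns": {
+      "prompt": "instruction",
+      "query": "input",
+      "response": "output"
+    }
+  }
+}
+```
+
+After registering the entry, reference it with `dataset: my_dataset` in the training yaml.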
+
+You can also use **[Easy Dataset](https://github.com/ConardLi/easy-dataset)**, **[DataFlow](https://github.com/OpenDCAI/DataFlow)** and **[GraphGen](https://github.com/open-sciencelab/GraphGen)** to create synthetic data for fine-tuning.
+
+### Quickstart
+
+Use the following 3 commands to run LoRA **fine-tuning**, **inference** and **merging** of the Llama3-8B-Instruct model, respectively.
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
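+
+A heavily abridged sketch of what such a LoRA SFT yaml contains (the values here are illustrative, not the maintained defaults in [examples](examples/README.md)):
+
+```yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: all
+
+### dataset
+dataset: identity,alpaca_en_demo
+cutoff_len: 2048
+
+### train
+output_dir: saves/llama3-8b/lora/sft
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+bf16: true
+```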
+
+See [examples/README.md](examples/README.md) for advanced usage (including distributed training).
+
+> [!TIP]
+> Use `llamafactory-cli help` to show help information.
+>
+> Read [FAQs](https://github.com/hiyouga/LLaMA-Factory/issues/4614) first if you encounter any problems.
+
+### Fine-Tuning with LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio))
+
+```bash
+llamafactory-cli webui
+```
+
+### LLaMA Factory Online
+
+Read our [documentation](https://docs.llamafactory.com.cn/docs/documents/quickstart/getstarted/?utm_source=LLaMA-Factory).
+
+### Build Docker
+
+For CUDA users:
+
+```bash
+cd docker/docker-cuda/
+docker compose up -d
+docker compose exec llamafactory bash
+```
+
+For Ascend NPU users:
+
+```bash
+cd docker/docker-npu/
+docker compose up -d
+docker compose exec llamafactory bash
+```
+
+For AMD ROCm users:
+
+```bash
+cd docker/docker-rocm/
+docker compose up -d
+docker compose exec llamafactory bash
+```
+
+**Build without Docker Compose**
+
+For CUDA users:
+
+```bash
+docker build -f ./docker/docker-cuda/Dockerfile \
+ --build-arg PIP_INDEX=https://pypi.org/simple \
+ --build-arg EXTRAS=metrics \
+ -t llamafactory:latest .
+
+docker run -dit --ipc=host --gpus=all \
+ -p 7860:7860 \
+ -p 8000:8000 \
+ --name llamafactory \
+ llamafactory:latest
+
+docker exec -it llamafactory bash
+```
+
+For Ascend NPU users:
+
+```bash
+docker build -f ./docker/docker-npu/Dockerfile \
+ --build-arg PIP_INDEX=https://pypi.org/simple \
+ --build-arg EXTRAS=torch-npu,metrics \
+ -t llamafactory:latest .
+
+docker run -dit --ipc=host \
+ -v /usr/local/dcmi:/usr/local/dcmi \
+ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+ -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+ -v /etc/ascend_install.info:/etc/ascend_install.info \
+ -p 7860:7860 \
+ -p 8000:8000 \
+ --device /dev/davinci0 \
+ --device /dev/davinci_manager \
+ --device /dev/devmm_svm \
+ --device /dev/hisi_hdc \
+ --name llamafactory \
+ llamafactory:latest
+
+docker exec -it llamafactory bash
+```
+
+For AMD ROCm users:
+
+```bash
+docker build -f ./docker/docker-rocm/Dockerfile \
+ --build-arg PIP_INDEX=https://pypi.org/simple \
+ --build-arg EXTRAS=metrics \
+ -t llamafactory:latest .
+
+docker run -dit --ipc=host \
+ -p 7860:7860 \
+ -p 8000:8000 \
+ --device /dev/kfd \
+ --device /dev/dri \
+ --name llamafactory \
+ llamafactory:latest
+
+docker exec -it llamafactory bash
+```
+
+
+
+**Use Docker volumes**
+
+You can uncomment `VOLUME [ "/root/.cache/huggingface", "/app/shared_data", "/app/output" ]` in the Dockerfile to use data volumes.
+
+When starting the container, use the `-v ./hf_cache:/root/.cache/huggingface` argument to mount a local directory into the container (see the example after the list). The following data volumes are available.
+
+- `hf_cache`: Utilize Hugging Face cache on the host machine.
+- `shared_data`: The directory to store datasets on the host machine.
+- `output`: Set the export directory to this location so that the merged result can be accessed directly on the host machine.
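+
+A sketch of a CUDA `docker run` command with these volumes mounted (host paths follow the `VOLUME` declaration above; adjust them to your layout):
+
+```bash
+docker run -dit --ipc=host --gpus=all \
+    -v ./hf_cache:/root/.cache/huggingface \
+    -v ./shared_data:/app/shared_data \
+    -v ./output:/app/output \
+    -p 7860:7860 \
+    -p 8000:8000 \
+    --name llamafactory \
+    llamafactory:latest
+```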
+
+
+
+### Deploy with OpenAI-style API and vLLM
+
+```bash
+API_PORT=8000 llamafactory-cli api examples/inference/llama3.yaml infer_backend=vllm vllm_enforce_eager=true
+```
+
+> [!TIP]
+> Visit [this page](https://platform.openai.com/docs/api-reference/chat/create) for the API documentation.
+>
+> Examples: [Image understanding](scripts/api_example/test_image.py) | [Function calling](scripts/api_example/test_toolcall.py)
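+
+A minimal client-side sketch using the `openai` Python package against the local endpoint (the API key and model name are placeholders; the server listens on `API_PORT`, 8000 in the command above):
+
+```python
+from openai import OpenAI
+
+# point the OpenAI client at the local LLaMA-Factory server
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="0")
+
+response = client.chat.completions.create(
+    model="llama3",  # placeholder model name
+    messages=[{"role": "user", "content": "Hello, who are you?"}],
+)
+print(response.choices[0].message.content)
+```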
+
+### Download from ModelScope Hub
+
+If you have trouble downloading models and datasets from Hugging Face, you can use ModelScope.
+
+```bash
+export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows
+```
+
+Train the model by specifying a model ID of the ModelScope Hub as the `model_name_or_path`. You can find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models), e.g., `LLM-Research/Meta-Llama-3-8B-Instruct`.
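+
+For example (a sketch that reuses the `key=value` override syntax from the API example above; the model ID is the example one):
+
+```bash
+USE_MODELSCOPE_HUB=1 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml \
+    model_name_or_path=LLM-Research/Meta-Llama-3-8B-Instruct
+```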
+
+### Download from Modelers Hub
+
+You can also use Modelers Hub to download models and datasets.
+
+```bash
+export USE_OPENMIND_HUB=1 # `set USE_OPENMIND_HUB=1` for Windows
+```
+
+Train the model by specifying a model ID of the Modelers Hub as the `model_name_or_path`. You can find a full list of model IDs at [Modelers Hub](https://modelers.cn/models), e.g., `TeleAI/TeleChat-7B-pt`.
+
+### Use W&B Logger
+
+To use [Weights & Biases](https://wandb.ai) for logging experimental results, you need to add the following arguments to yaml files.
+
+```yaml
+report_to: wandb
+run_name: test_run # optional
+```
+
+Set `WANDB_API_KEY` to [your key](https://wandb.ai/authorize) when launching training tasks to log in with your W&B account.
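+
+For example (the key value is a placeholder):
+
+```bash
+WANDB_API_KEY=xxxxxxxx llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+```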
+
+### Use SwanLab Logger
+
+To use [SwanLab](https://github.com/SwanHubX/SwanLab) for logging experimental results, you need to add the following arguments to yaml files.
+
+```yaml
+use_swanlab: true
+swanlab_run_name: test_run # optional
+```
+
+When launching training tasks, you can log in to SwanLab in three ways:
+
+1. Add `swanlab_api_key=` to the yaml file, and set it to your [API key](https://swanlab.cn/settings).
+2. Set the environment variable `SWANLAB_API_KEY` to your [API key](https://swanlab.cn/settings).
+3. Use the `swanlab login` command to complete the login.
+
+## Projects using LLaMA Factory
+
+If you have a project that should be incorporated, please contact us via email or create a pull request.
+
+
+1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223)
+1. Yu et al. Open, Closed, or Small Language Models for Text Classification? 2023. [[arxiv]](https://arxiv.org/abs/2308.10092)
+1. Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. [[arxiv]](https://arxiv.org/abs/2308.10526)
+1. Luceri et al. Leveraging Large Language Models to Detect Influence Campaigns in Social Media. 2023. [[arxiv]](https://arxiv.org/abs/2311.07816)
+1. Zhang et al. Alleviating Hallucinations of Large Language Models through Induced Hallucinations. 2023. [[arxiv]](https://arxiv.org/abs/2312.15710)
+1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. KDD 2024. [[arxiv]](https://arxiv.org/abs/2401.04319)
+1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2401.07286)
+1. Choi et al. FACT-GPT: Fact-Checking Augmentation via Claim Matching with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2402.05904)
+1. Zhang et al. AutoMathText: Autonomous Data Selection with Language Models for Mathematical Texts. 2024. [[arxiv]](https://arxiv.org/abs/2402.07625)
+1. Lyu et al. KnowTuning: Knowledge-aware Fine-tuning for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11176)
+1. Yang et al. LaCo: Large Language Model Pruning via Layer Collapse. 2024. [[arxiv]](https://arxiv.org/abs/2402.11187)
+1. Bhardwaj et al. Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic. 2024. [[arxiv]](https://arxiv.org/abs/2402.11746)
+1. Yang et al. Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11801)
+1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2402.11809)
+1. Cao et al. Head-wise Shareable Attention for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11819)
+1. Zhang et al. Enhancing Multilingual Capabilities of Large Language Models through Self-Distillation from Resource-Rich Languages. 2024. [[arxiv]](https://arxiv.org/abs/2402.12204)
+1. Kim et al. Efficient and Effective Vocabulary Expansion Towards Multilingual Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.14714)
+1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. ACL 2024. [[arxiv]](https://arxiv.org/abs/2402.15043)
+1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333)
+1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419)
+1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228)
+1. Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2403.09073)
+1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541)
+1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246)
+1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. COLING 2024. [[arxiv]](https://arxiv.org/abs/2403.16008)
+1. Zan et al. CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. [[arxiv]](https://arxiv.org/abs/2403.16443)
+1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604)
+1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827)
+1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167)
+1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. ICML 2024. [[arxiv]](https://arxiv.org/abs/2404.04316)
+1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084)
+1. Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.09836)
+1. Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.11581)
+1. Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. [[arxiv]](https://arxiv.org/abs/2404.14215)
+1. Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2404.16621)
+1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2404.17140)
+1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. NAACL 2024. [[arxiv]](https://arxiv.org/abs/2404.18585)
+1. Xu et al. Large Language Models for Cyber Security: A Systematic Literature Review. 2024. [[arxiv]](https://arxiv.org/abs/2405.04760)
+1. Dammu et al. "They are uncultured": Unveiling Covert Harms and Social Threats in LLM Generated Conversations. 2024. [[arxiv]](https://arxiv.org/abs/2405.05378)
+1. Yi et al. A safety realignment framework via subspace-oriented model fusion for large language models. 2024. [[arxiv]](https://arxiv.org/abs/2405.09055)
+1. Lou et al. SPO: Multi-Dimensional Preference Sequential Alignment With Implicit Reward Modeling. 2024. [[arxiv]](https://arxiv.org/abs/2405.12739)
+1. Zhang et al. Getting More from Less: Large Language Models are Good Spontaneous Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2405.13816)
+1. Zhang et al. TS-Align: A Teacher-Student Collaborative Framework for Scalable Iterative Finetuning of Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2405.20215)
+1. Zihong Chen. Sentence Segmentation and Sentence Punctuation Based on XunziALLM. 2024. [[paper]](https://aclanthology.org/2024.lt4hala-1.30)
+1. Gao et al. The Best of Both Worlds: Toward an Honest and Helpful Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2406.00380)
+1. Wang and Song. MARS: Benchmarking the Metaphysical Reasoning Abilities of Language Models with a Multi-task Evaluation Dataset. 2024. [[arxiv]](https://arxiv.org/abs/2406.02106)
+1. Hu et al. Computational Limits of Low-Rank Adaptation (LoRA) for Transformer-Based Models. 2024. [[arxiv]](https://arxiv.org/abs/2406.03136)
+1. Ge et al. Time Sensitive Knowledge Editing through Efficient Finetuning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2406.04496)
+1. Tan et al. Peer Review as A Multi-Turn and Long-Context Dialogue with Role-Based Interactions. 2024. [[arxiv]](https://arxiv.org/abs/2406.05688)
+1. Song et al. Turbo Sparse: Achieving LLM SOTA Performance with Minimal Activated Parameters. 2024. [[arxiv]](https://arxiv.org/abs/2406.05955)
+1. Gu et al. RWKV-CLIP: A Robust Vision-Language Representation Learner. 2024. [[arxiv]](https://arxiv.org/abs/2406.06973)
+1. Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. [[arxiv]](https://arxiv.org/abs/2406.07115)
+1. Zhu et al. Are Large Language Models Good Statisticians?. 2024. [[arxiv]](https://arxiv.org/abs/2406.07815)
+1. Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2406.10099)
+1. Ding et al. IntentionQA: A Benchmark for Evaluating Purchase Intention Comprehension Abilities of Language Models in E-commerce. 2024. [[arxiv]](https://arxiv.org/abs/2406.10173)
+1. He et al. COMMUNITY-CROSS-INSTRUCT: Unsupervised Instruction Generation for Aligning Large Language Models to Online Communities. 2024. [[arxiv]](https://arxiv.org/abs/2406.12074)
+1. Lin et al. FVEL: Interactive Formal Verification Environment with Large Language Models via Theorem Proving. 2024. [[arxiv]](https://arxiv.org/abs/2406.14408)
+1. Treutlein et al. Connecting the Dots: LLMs can Infer and Verbalize Latent Structure from Disparate Training Data. 2024. [[arxiv]](https://arxiv.org/abs/2406.14546)
+1. Feng et al. SS-Bench: A Benchmark for Social Story Generation and Evaluation. 2024. [[arxiv]](https://arxiv.org/abs/2406.15695)
+1. Feng et al. Self-Constructed Context Decompilation with Fined-grained Alignment Enhancement. 2024. [[arxiv]](https://arxiv.org/abs/2406.17233)
+1. Liu et al. Large Language Models for Cuffless Blood Pressure Measurement From Wearable Biosignals. 2024. [[arxiv]](https://arxiv.org/abs/2406.18069)
+1. Iyer et al. Exploring Very Low-Resource Translation with LLMs: The University of Edinburgh's Submission to AmericasNLP 2024 Translation Task. AmericasNLP 2024. [[paper]](https://aclanthology.org/2024.americasnlp-1.25)
+1. Li et al. Calibrating LLMs with Preference Optimization on Thought Trees for Generating Rationale in Science Question Scoring. 2024. [[arxiv]](https://arxiv.org/abs/2406.19949)
+1. Yang et al. Financial Knowledge Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2407.00365)
+1. Lin et al. DogeRM: Equipping Reward Models with Domain Knowledge through Model Merging. 2024. [[arxiv]](https://arxiv.org/abs/2407.01470)
+1. Bako et al. Evaluating the Semantic Profiling Abilities of LLMs for Natural Language Utterances in Data Visualization. 2024. [[arxiv]](https://arxiv.org/abs/2407.06129)
+1. Huang et al. RoLoRA: Fine-tuning Rotated Outlier-free LLMs for Effective Weight-Activation Quantization. 2024. [[arxiv]](https://arxiv.org/abs/2407.08044)
+1. Jiang et al. LLM-Collaboration on Automatic Science Journalism for the General Audience. 2024. [[arxiv]](https://arxiv.org/abs/2407.09756)
+1. Inouye et al. Applied Auto-tuning on LoRA Hyperparameters. 2024. [[paper]](https://scholarcommons.scu.edu/cseng_senior/272/)
+1. Qi et al. Research on Tibetan Tourism Viewpoints information generation system based on LLM. 2024. [[arxiv]](https://arxiv.org/abs/2407.13561)
+1. Xu et al. Course-Correction: Safety Alignment Using Synthetic Preferences. 2024. [[arxiv]](https://arxiv.org/abs/2407.16637)
+1. Sun et al. LAMBDA: A Large Model Based Data Agent. 2024. [[arxiv]](https://arxiv.org/abs/2407.17535)
+1. Zhu et al. CollectiveSFT: Scaling Large Language Models for Chinese Medical Benchmark with Collective Instructions in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2407.19705)
+1. Yu et al. Correcting Negative Bias in Large Language Models through Negative Attention Score Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2408.00137)
+1. Xie et al. The Power of Personalized Datasets: Advancing Chinese Composition Writing for Elementary School through Targeted Model Fine-Tuning. IALP 2024. [[paper]](https://www.asianlp.sg/conferences/ialp2024/proceedings/papers/IALP2024_P055.pdf)
+1. Liu et al. Instruct-Code-Llama: Improving Capabilities of Language Model in Competition Level Code Generation by Online Judge Feedback. ICIC 2024. [[paper]](https://link.springer.com/chapter/10.1007/978-981-97-5669-8_11)
+1. Wang et al. Cybernetic Sentinels: Unveiling the Impact of Safety Data Selection on Model Security in Supervised Fine-Tuning. ICIC 2024. [[paper]](https://link.springer.com/chapter/10.1007/978-981-97-5669-8_23)
+1. Xia et al. Understanding the Performance and Estimating the Cost of LLM Fine-Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2408.04693)
+1. Zeng et al. Perceive, Reflect, and Plan: Designing LLM Agent for Goal-Directed City Navigation without Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2408.04168)
+1. Xia et al. Using Pre-trained Language Model for Accurate ESG Prediction. FinNLP 2024. [[paper]](https://aclanthology.org/2024.finnlp-2.1/)
+1. Liang et al. I-SHEEP: Self-Alignment of LLM from Scratch through an Iterative Self-Enhancement Paradigm. 2024. [[arxiv]](https://arxiv.org/abs/2408.08072)
+1. Bai et al. Aligning Large Language Model with Direct Multi-Preference Optimization for Recommendation. CIKM 2024. [[paper]](https://dl.acm.org/doi/10.1145/3627673.3679611)
+1. Zhang et al. CPsyCoun: A Report-based Multi-turn Dialogue Reconstruction and Evaluation Framework for Chinese Psychological Counseling. ACL 2024. [[paper]](https://aclanthology.org/2024.findings-acl.830.pdf)
+1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B.
+1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge.
+1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B.
+1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: A series of large language models for Chinese medical domain, based on LLaMA2-7B and Baichuan-13B.
+1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**: A series of MBTI Personality large language models, capable of giving any LLM 16 different personality types based on different datasets and training methods.
+1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: A large language model specialized in generating metadata for Stable Diffusion. [[demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt)
+1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**: A multimodal large language model specialized in Chinese medical domain, based on LLaVA-1.5-7B.
+1. **[AutoRE](https://github.com/THUDM/AutoRE)**: A document-level relation extraction system based on large language models.
+1. **[NVIDIA RTX AI Toolkit](https://github.com/NVIDIA/RTX-AI-Toolkit)**: SDKs for fine-tuning LLMs on Windows PC for NVIDIA RTX.
+1. **[LazyLLM](https://github.com/LazyAGI/LazyLLM)**: An easy and lazy way for building multi-agent LLMs applications and supports model fine-tuning via LLaMA Factory.
+1. **[RAG-Retrieval](https://github.com/NLPJCL/RAG-Retrieval)**: A full pipeline for RAG retrieval model fine-tuning, inference, and distillation. [[blog]](https://zhuanlan.zhihu.com/p/987727357)
+1. **[360-LLaMA-Factory](https://github.com/Qihoo360/360-LLaMA-Factory)**: A modified library that supports long sequence SFT & DPO using ring attention.
+1. **[Sky-T1](https://novasky-ai.github.io/posts/sky-t1/)**: An o1-like model fine-tuned by NovaSky AI with very small cost.
+1. **[WeClone](https://github.com/xming521/WeClone)**: One-stop solution for creating your digital avatar from chat logs.
+1. **[EmoLLM](https://github.com/SmartFlowAI/EmoLLM)**: A project about large language models (LLMs) and mental health.
+
+
+## License
+
+This repository is licensed under the [Apache-2.0 License](LICENSE).
+
+Please follow the model licenses to use the corresponding model weights: [Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [GPT-2](https://github.com/openai/gpt-2/blob/master/LICENSE) / [Granite](LICENSE) / [Index](https://huggingface.co/IndexTeam/Index-1.9B/blob/main/LICENSE) / [InternLM](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/LICENSE) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral/Mixtral/Pixtral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3/Phi-4](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [Skywork](https://huggingface.co/Skywork/Skywork-13B-base/blob/main/Skywork%20Community%20License.pdf) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [TeleChat2](https://huggingface.co/Tele-AI/telechat-7B/blob/main/TeleChat%E6%A8%A1%E5%9E%8B%E7%A4%BE%E5%8C%BA%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE.pdf) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan)
+
+## Citation
+
+If this work is helpful, please kindly cite as:
+
+```bibtex
+@inproceedings{zheng2024llamafactory,
+ title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models},
+ author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Zhangchi Feng and Yongqiang Ma},
+ booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)},
+ address={Bangkok, Thailand},
+ publisher={Association for Computational Linguistics},
+ year={2024},
+ url={http://arxiv.org/abs/2403.13372}
+}
+```
+
+## Acknowledgement
+
+This repo benefits from [PEFT](https://github.com/huggingface/peft), [TRL](https://github.com/huggingface/trl), [QLoRA](https://github.com/artidoro/qlora) and [FastChat](https://github.com/lm-sys/FastChat). Thanks for their wonderful work.
+
+## Star History
+
+
diff --git a/llamafactory.egg-info/SOURCES.txt b/llamafactory.egg-info/SOURCES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fb4a46244940cfc2d52cfdfa74064bcb07bf2aa8
--- /dev/null
+++ b/llamafactory.egg-info/SOURCES.txt
@@ -0,0 +1,178 @@
+LICENSE
+MANIFEST.in
+README.md
+pyproject.toml
+requirements.txt
+setup.py
+src/llamafactory/__init__.py
+src/llamafactory/cli.py
+src/llamafactory/launcher.py
+src/llamafactory.egg-info/PKG-INFO
+src/llamafactory.egg-info/SOURCES.txt
+src/llamafactory.egg-info/dependency_links.txt
+src/llamafactory.egg-info/entry_points.txt
+src/llamafactory.egg-info/requires.txt
+src/llamafactory.egg-info/top_level.txt
+src/llamafactory/api/__init__.py
+src/llamafactory/api/app.py
+src/llamafactory/api/chat.py
+src/llamafactory/api/common.py
+src/llamafactory/api/protocol.py
+src/llamafactory/chat/__init__.py
+src/llamafactory/chat/base_engine.py
+src/llamafactory/chat/chat_model.py
+src/llamafactory/chat/hf_engine.py
+src/llamafactory/chat/kt_engine.py
+src/llamafactory/chat/sglang_engine.py
+src/llamafactory/chat/vllm_engine.py
+src/llamafactory/data/__init__.py
+src/llamafactory/data/collator.py
+src/llamafactory/data/converter.py
+src/llamafactory/data/data_utils.py
+src/llamafactory/data/formatter.py
+src/llamafactory/data/loader.py
+src/llamafactory/data/mm_plugin.py
+src/llamafactory/data/parser.py
+src/llamafactory/data/template.py
+src/llamafactory/data/tool_utils.py
+src/llamafactory/data/processor/__init__.py
+src/llamafactory/data/processor/feedback.py
+src/llamafactory/data/processor/pairwise.py
+src/llamafactory/data/processor/pretrain.py
+src/llamafactory/data/processor/processor_utils.py
+src/llamafactory/data/processor/supervised.py
+src/llamafactory/data/processor/unsupervised.py
+src/llamafactory/eval/__init__.py
+src/llamafactory/eval/evaluator.py
+src/llamafactory/eval/template.py
+src/llamafactory/extras/__init__.py
+src/llamafactory/extras/constants.py
+src/llamafactory/extras/env.py
+src/llamafactory/extras/logging.py
+src/llamafactory/extras/misc.py
+src/llamafactory/extras/packages.py
+src/llamafactory/extras/ploting.py
+src/llamafactory/hparams/__init__.py
+src/llamafactory/hparams/data_args.py
+src/llamafactory/hparams/evaluation_args.py
+src/llamafactory/hparams/finetuning_args.py
+src/llamafactory/hparams/generating_args.py
+src/llamafactory/hparams/model_args.py
+src/llamafactory/hparams/parser.py
+src/llamafactory/hparams/training_args.py
+src/llamafactory/model/__init__.py
+src/llamafactory/model/adapter.py
+src/llamafactory/model/loader.py
+src/llamafactory/model/patcher.py
+src/llamafactory/model/model_utils/__init__.py
+src/llamafactory/model/model_utils/attention.py
+src/llamafactory/model/model_utils/checkpointing.py
+src/llamafactory/model/model_utils/embedding.py
+src/llamafactory/model/model_utils/ktransformers.py
+src/llamafactory/model/model_utils/kv_cache.py
+src/llamafactory/model/model_utils/liger_kernel.py
+src/llamafactory/model/model_utils/longlora.py
+src/llamafactory/model/model_utils/misc.py
+src/llamafactory/model/model_utils/mod.py
+src/llamafactory/model/model_utils/moe.py
+src/llamafactory/model/model_utils/packing.py
+src/llamafactory/model/model_utils/quantization.py
+src/llamafactory/model/model_utils/rope.py
+src/llamafactory/model/model_utils/unsloth.py
+src/llamafactory/model/model_utils/valuehead.py
+src/llamafactory/model/model_utils/visual.py
+src/llamafactory/third_party/__init__.py
+src/llamafactory/third_party/muon/__init__.py
+src/llamafactory/third_party/muon/muon.py
+src/llamafactory/train/__init__.py
+src/llamafactory/train/callbacks.py
+src/llamafactory/train/fp8_utils.py
+src/llamafactory/train/test_utils.py
+src/llamafactory/train/trainer_utils.py
+src/llamafactory/train/tuner.py
+src/llamafactory/train/dpo/__init__.py
+src/llamafactory/train/dpo/trainer.py
+src/llamafactory/train/dpo/workflow.py
+src/llamafactory/train/ksft/__init__.py
+src/llamafactory/train/ksft/workflow.py
+src/llamafactory/train/kto/__init__.py
+src/llamafactory/train/kto/trainer.py
+src/llamafactory/train/kto/workflow.py
+src/llamafactory/train/mca/__init__.py
+src/llamafactory/train/mca/trainer.py
+src/llamafactory/train/mca/workflow.py
+src/llamafactory/train/ppo/__init__.py
+src/llamafactory/train/ppo/ppo_utils.py
+src/llamafactory/train/ppo/trainer.py
+src/llamafactory/train/ppo/workflow.py
+src/llamafactory/train/pt/__init__.py
+src/llamafactory/train/pt/trainer.py
+src/llamafactory/train/pt/workflow.py
+src/llamafactory/train/rm/__init__.py
+src/llamafactory/train/rm/metric.py
+src/llamafactory/train/rm/trainer.py
+src/llamafactory/train/rm/workflow.py
+src/llamafactory/train/sft/__init__.py
+src/llamafactory/train/sft/metric.py
+src/llamafactory/train/sft/trainer.py
+src/llamafactory/train/sft/workflow.py
+src/llamafactory/v1/__init__.py
+src/llamafactory/v1/launcher.py
+src/llamafactory/v1/config/__init__.py
+src/llamafactory/v1/config/data_args.py
+src/llamafactory/v1/config/model_args.py
+src/llamafactory/v1/config/parser.py
+src/llamafactory/v1/config/sample_args.py
+src/llamafactory/v1/config/training_args.py
+src/llamafactory/v1/core/__init__.py
+src/llamafactory/v1/core/base_trainer.py
+src/llamafactory/v1/core/chat_sampler.py
+src/llamafactory/v1/core/data_engine.py
+src/llamafactory/v1/core/model_engine.py
+src/llamafactory/v1/plugins/__init__.py
+src/llamafactory/v1/plugins/data_plugins/__init__.py
+src/llamafactory/v1/plugins/data_plugins/converter.py
+src/llamafactory/v1/plugins/data_plugins/loader.py
+src/llamafactory/v1/plugins/data_plugins/template.py
+src/llamafactory/v1/plugins/model_plugins/__init__.py
+src/llamafactory/v1/plugins/model_plugins/added_token.py
+src/llamafactory/v1/plugins/model_plugins/peft.py
+src/llamafactory/v1/plugins/model_plugins/kernels/__init__.py
+src/llamafactory/v1/plugins/model_plugins/kernels/constants.py
+src/llamafactory/v1/plugins/model_plugins/kernels/registry.py
+src/llamafactory/v1/plugins/model_plugins/kernels/fa/__init__.py
+src/llamafactory/v1/plugins/model_plugins/kernels/mlp/__init__.py
+src/llamafactory/v1/plugins/model_plugins/kernels/mlp/npu_fused_moe.py
+src/llamafactory/v1/plugins/model_plugins/kernels/mlp/npu_swiglu.py
+src/llamafactory/v1/plugins/model_plugins/kernels/rms_norm/__init__.py
+src/llamafactory/v1/plugins/model_plugins/kernels/rms_norm/npu_rms_norm.py
+src/llamafactory/v1/plugins/model_plugins/kernels/rope/__init__.py
+src/llamafactory/v1/plugins/model_plugins/kernels/rope/npu_rope.py
+src/llamafactory/v1/plugins/sampler_plugins/__init__.py
+src/llamafactory/v1/plugins/sampler_plugins/vllm.py
+src/llamafactory/v1/plugins/trainer_plugins/__init__.py
+src/llamafactory/v1/plugins/trainer_plugins/distributed/__init__.py
+src/llamafactory/v1/plugins/trainer_plugins/distributed/accelerate.py
+src/llamafactory/v1/trainers/__init__.py
+src/llamafactory/v1/trainers/dpo_trainer.py
+src/llamafactory/v1/trainers/rm_trainer.py
+src/llamafactory/v1/trainers/sft_trainer.py
+src/llamafactory/webui/__init__.py
+src/llamafactory/webui/chatter.py
+src/llamafactory/webui/common.py
+src/llamafactory/webui/control.py
+src/llamafactory/webui/css.py
+src/llamafactory/webui/engine.py
+src/llamafactory/webui/interface.py
+src/llamafactory/webui/locales.py
+src/llamafactory/webui/manager.py
+src/llamafactory/webui/runner.py
+src/llamafactory/webui/components/__init__.py
+src/llamafactory/webui/components/chatbot.py
+src/llamafactory/webui/components/data.py
+src/llamafactory/webui/components/eval.py
+src/llamafactory/webui/components/export.py
+src/llamafactory/webui/components/footer.py
+src/llamafactory/webui/components/infer.py
+src/llamafactory/webui/components/top.py
+src/llamafactory/webui/components/train.py
\ No newline at end of file
diff --git a/llamafactory.egg-info/dependency_links.txt b/llamafactory.egg-info/dependency_links.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/llamafactory.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/llamafactory.egg-info/entry_points.txt b/llamafactory.egg-info/entry_points.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bcafd0771096e3e65f55808b2ead2909c6f6691b
--- /dev/null
+++ b/llamafactory.egg-info/entry_points.txt
@@ -0,0 +1,3 @@
+[console_scripts]
+llamafactory-cli = llamafactory.cli:main
+lmf = llamafactory.cli:main
diff --git a/llamafactory.egg-info/requires.txt b/llamafactory.egg-info/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2fb93071d9a6df49004f3c31adc7c4e77bee4cd8
--- /dev/null
+++ b/llamafactory.egg-info/requires.txt
@@ -0,0 +1,125 @@
+datasets<=4.0.0,>=2.16.0
+accelerate<=1.11.0,>=1.3.0
+peft<=0.17.1,>=0.14.0
+trl<=0.9.6,>=0.8.6
+gradio<=5.45.0,>=4.38.0
+matplotlib>=3.7.0
+tyro<0.9.0
+einops
+numpy<2.0.0
+pandas>=2.0.0
+scipy
+sentencepiece
+tiktoken
+modelscope>=1.14.0
+hf-transfer
+safetensors<=0.5.3
+fire
+omegaconf
+packaging
+protobuf
+pyyaml
+pydantic<=2.10.6
+uvicorn
+fastapi
+sse-starlette
+av
+librosa
+propcache!=0.4.0
+
+[:python_version < "3.10"]
+transformers!=4.52.0,<=4.56.2,>=4.49.0
+
+[:python_version >= "3.10"]
+transformers!=4.52.0,!=4.57.0,<=4.57.1,>=4.49.0
+
+[adam-mini]
+adam-mini
+
+[apollo]
+apollo-torch
+
+[aqlm]
+aqlm[gpu]>=1.1.0
+
+[badam]
+badam>=1.2.1
+
+[bitsandbytes]
+bitsandbytes>=0.39.0
+
+[deepspeed]
+deepspeed<=0.16.9,>=0.10.0
+
+[dev]
+pre-commit
+ruff
+pytest
+build
+
+[eetq]
+eetq
+
+[fp8]
+torchao>=0.8.0
+accelerate>=1.10.0
+
+[fp8-all]
+torchao>=0.8.0
+transformer_engine[pytorch]>=2.0.0
+accelerate>=1.10.0
+
+[fp8-te]
+transformer_engine[pytorch]>=2.0.0
+accelerate>=1.10.0
+
+[galore]
+galore-torch
+
+[gptq]
+optimum>=1.24.0
+gptqmodel>=2.0.0
+
+[hqq]
+hqq
+
+[liger-kernel]
+liger-kernel>=0.5.5
+
+[metrics]
+nltk
+jieba
+rouge-chinese
+
+[minicpm_v]
+soundfile
+torchvision
+torchaudio
+vector_quantize_pytorch
+vocos
+msgpack
+referencing
+jsonschema_specifications
+
+[openmind]
+openmind
+
+[sglang]
+sglang[srt]>=0.4.5
+transformers==4.51.1
+
+[swanlab]
+swanlab
+
+[torch]
+torch>=2.0.0
+torchvision>=0.15.0
+
+[torch-npu]
+torch==2.7.1
+torch-npu==2.7.1
+torchvision==0.22.1
+decorator
+
+[vllm]
+vllm<=0.11.0,>=0.4.3
diff --git a/llamafactory.egg-info/top_level.txt b/llamafactory.egg-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d6670a28d28bf8b6497d5cca0e63d1e13c1aee55
--- /dev/null
+++ b/llamafactory.egg-info/top_level.txt
@@ -0,0 +1 @@
+llamafactory
diff --git a/llamafactory/__init__.py b/llamafactory/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1567ef572714881cc464db25d3da3d08a460963
--- /dev/null
+++ b/llamafactory/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+r"""Efficient fine-tuning of large language models.
+
+Level:
+ api, webui > chat, eval, train > data, model > hparams > extras
+
+Disable version checking: DISABLE_VERSION_CHECK=1
+Enable VRAM recording: RECORD_VRAM=1
+Force using torchrun: FORCE_TORCHRUN=1
+Set logging verbosity: LLAMAFACTORY_VERBOSITY=WARN
+Use modelscope: USE_MODELSCOPE_HUB=1
+Use openmind: USE_OPENMIND_HUB=1
+"""
+
+from .extras.env import VERSION
+
+
+__version__ = VERSION
diff --git a/llamafactory/__pycache__/__init__.cpython-312.pyc b/llamafactory/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae97776ebf1e10b36d5f470d1c0de9c4f5b17797
Binary files /dev/null and b/llamafactory/__pycache__/__init__.cpython-312.pyc differ
diff --git a/llamafactory/__pycache__/cli.cpython-312.pyc b/llamafactory/__pycache__/cli.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95db2f247b7b2fe6595a5f7cc774ce0f6f472c11
Binary files /dev/null and b/llamafactory/__pycache__/cli.cpython-312.pyc differ
diff --git a/llamafactory/__pycache__/launcher.cpython-312.pyc b/llamafactory/__pycache__/launcher.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28aab819e043e9a2500c50c92f460fa7d87533cd
Binary files /dev/null and b/llamafactory/__pycache__/launcher.cpython-312.pyc differ
diff --git a/llamafactory/api/__init__.py b/llamafactory/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/llamafactory/api/app.py b/llamafactory/api/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0621d80b064f00970e8fb58909ec8656ba0fb6b
--- /dev/null
+++ b/llamafactory/api/app.py
@@ -0,0 +1,133 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+from contextlib import asynccontextmanager
+from functools import partial
+from typing import Annotated, Optional
+
+from ..chat import ChatModel
+from ..extras.constants import EngineName
+from ..extras.misc import torch_gc
+from ..extras.packages import is_fastapi_available, is_starlette_available, is_uvicorn_available
+from .chat import (
+ create_chat_completion_response,
+ create_score_evaluation_response,
+ create_stream_chat_completion_response,
+)
+from .protocol import (
+ ChatCompletionRequest,
+ ChatCompletionResponse,
+ ModelCard,
+ ModelList,
+ ScoreEvaluationRequest,
+ ScoreEvaluationResponse,
+)
+
+
+if is_fastapi_available():
+ from fastapi import Depends, FastAPI, HTTPException, status
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer
+
+
+if is_starlette_available():
+ from sse_starlette import EventSourceResponse
+
+
+if is_uvicorn_available():
+ import uvicorn
+
+
+async def sweeper() -> None:
+ while True:
+ torch_gc()
+ await asyncio.sleep(300)
+
+
+@asynccontextmanager
+async def lifespan(app: "FastAPI", chat_model: "ChatModel"): # collects GPU memory
+ if chat_model.engine.name == EngineName.HF:
+ asyncio.create_task(sweeper())
+
+ yield
+ torch_gc()
+
+
+def create_app(chat_model: "ChatModel") -> "FastAPI":
+ root_path = os.getenv("FASTAPI_ROOT_PATH", "")
+ app = FastAPI(lifespan=partial(lifespan, chat_model=chat_model), root_path=root_path)
+ app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+ )
+ api_key = os.getenv("API_KEY")
+ security = HTTPBearer(auto_error=False)
+
+ async def verify_api_key(auth: Annotated[Optional[HTTPAuthorizationCredentials], Depends(security)]):
+ if api_key and (auth is None or auth.credentials != api_key):
+ raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.")
+
+ @app.get(
+ "/v1/models",
+ response_model=ModelList,
+ status_code=status.HTTP_200_OK,
+ dependencies=[Depends(verify_api_key)],
+ )
+ async def list_models():
+ model_card = ModelCard(id=os.getenv("API_MODEL_NAME", "gpt-3.5-turbo"))
+ return ModelList(data=[model_card])
+
+ @app.post(
+ "/v1/chat/completions",
+ response_model=ChatCompletionResponse,
+ status_code=status.HTTP_200_OK,
+ dependencies=[Depends(verify_api_key)],
+ )
+ async def create_chat_completion(request: ChatCompletionRequest):
+ if not chat_model.engine.can_generate:
+ raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
+
+ if request.stream:
+ generate = create_stream_chat_completion_response(request, chat_model)
+ return EventSourceResponse(generate, media_type="text/event-stream", sep="\n")
+ else:
+ return await create_chat_completion_response(request, chat_model)
+
+ @app.post(
+ "/v1/score/evaluation",
+ response_model=ScoreEvaluationResponse,
+ status_code=status.HTTP_200_OK,
+ dependencies=[Depends(verify_api_key)],
+ )
+ async def create_score_evaluation(request: ScoreEvaluationRequest):
+ if chat_model.engine.can_generate:
+ raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")
+
+ return await create_score_evaluation_response(request, chat_model)
+
+ return app
+
+
+def run_api() -> None:
+ chat_model = ChatModel()
+ app = create_app(chat_model)
+ api_host = os.getenv("API_HOST", "0.0.0.0")
+ api_port = int(os.getenv("API_PORT", "8000"))
+ print(f"Visit http://localhost:{api_port}/docs for API document.")
+ uvicorn.run(app, host=api_host, port=api_port)
diff --git a/llamafactory/api/chat.py b/llamafactory/api/chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..93236c5ca865492f0c45e1f5ab56a389875350ea
--- /dev/null
+++ b/llamafactory/api/chat.py
@@ -0,0 +1,291 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import io
+import json
+import os
+import re
+import uuid
+from collections.abc import AsyncGenerator
+from typing import TYPE_CHECKING, Optional
+
+from ..data import Role as DataRole
+from ..extras import logging
+from ..extras.constants import AUDIO_PLACEHOLDER, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER
+from ..extras.misc import is_env_enabled
+from ..extras.packages import is_fastapi_available, is_pillow_available, is_requests_available
+from .common import check_lfi_path, check_ssrf_url, dictify, jsonify
+from .protocol import (
+ ChatCompletionMessage,
+ ChatCompletionResponse,
+ ChatCompletionResponseChoice,
+ ChatCompletionResponseUsage,
+ ChatCompletionStreamResponse,
+ ChatCompletionStreamResponseChoice,
+ Finish,
+ Function,
+ FunctionCall,
+ Role,
+ ScoreEvaluationResponse,
+)
+
+
+if is_fastapi_available():
+ from fastapi import HTTPException, status
+
+
+if is_pillow_available():
+ from PIL import Image
+
+
+if is_requests_available():
+ import requests
+
+
+if TYPE_CHECKING:
+ from ..chat import ChatModel
+ from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+ from .protocol import ChatCompletionRequest, ScoreEvaluationRequest
+
+
+logger = logging.get_logger(__name__)
+ROLE_MAPPING = {
+ Role.USER: DataRole.USER.value,
+ Role.ASSISTANT: DataRole.ASSISTANT.value,
+ Role.SYSTEM: DataRole.SYSTEM.value,
+ Role.FUNCTION: DataRole.FUNCTION.value,
+ Role.TOOL: DataRole.OBSERVATION.value,
+}
+
+
+def _process_request(
+ request: "ChatCompletionRequest",
+) -> tuple[
+ list[dict[str, str]],
+ Optional[str],
+ Optional[str],
+ Optional[list["ImageInput"]],
+ Optional[list["VideoInput"]],
+ Optional[list["AudioInput"]],
+]:
+ if is_env_enabled("API_VERBOSE", "1"):
+ logger.info_rank0(f"==== request ====\n{json.dumps(dictify(request), indent=2, ensure_ascii=False)}")
+
+ if len(request.messages) == 0:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length")
+
+ if request.messages[0].role == Role.SYSTEM:
+ content = request.messages.pop(0).content
+ system = content[0].text if isinstance(content, list) else content
+ else:
+ system = None
+
+ if len(request.messages) % 2 == 0:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...")
+
+ input_messages = []
+ images, videos, audios = [], [], []
+ for i, message in enumerate(request.messages):
+ if i % 2 == 0 and message.role not in [Role.USER, Role.TOOL]:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
+ elif i % 2 == 1 and message.role not in [Role.ASSISTANT, Role.FUNCTION]:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role")
+
+ if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls):
+ tool_calls = [
+ {"name": tool_call.function.name, "arguments": tool_call.function.arguments}
+ for tool_call in message.tool_calls
+ ]
+ content = json.dumps(tool_calls, ensure_ascii=False)
+ input_messages.append({"role": ROLE_MAPPING[Role.FUNCTION], "content": content})
+ elif isinstance(message.content, list):
+ text_content = ""
+ for input_item in message.content:
+ if input_item.type == "text":
+ text_content += input_item.text
+ elif input_item.type == "image_url":
+ text_content += IMAGE_PLACEHOLDER
+ image_url = input_item.image_url.url
+ if re.match(r"^data:image\/(png|jpg|jpeg|gif|bmp);base64,(.+)$", image_url): # base64 image
+ image_stream = io.BytesIO(base64.b64decode(image_url.split(",", maxsplit=1)[1]))
+ elif os.path.isfile(image_url): # local file
+ check_lfi_path(image_url)
+ image_stream = open(image_url, "rb")
+ else: # web uri
+ check_ssrf_url(image_url)
+ image_stream = requests.get(image_url, stream=True).raw
+
+ images.append(Image.open(image_stream).convert("RGB"))
+ elif input_item.type == "video_url":
+ text_content += VIDEO_PLACEHOLDER
+ video_url = input_item.video_url.url
+ if re.match(r"^data:video\/(mp4|mkv|avi|mov);base64,(.+)$", video_url): # base64 video
+ video_stream = io.BytesIO(base64.b64decode(video_url.split(",", maxsplit=1)[1]))
+ elif os.path.isfile(video_url): # local file
+ check_lfi_path(video_url)
+ video_stream = video_url
+ else: # web uri
+ check_ssrf_url(video_url)
+ video_stream = requests.get(video_url, stream=True).raw
+
+ videos.append(video_stream)
+ elif input_item.type == "audio_url":
+ text_content += AUDIO_PLACEHOLDER
+ audio_url = input_item.audio_url.url
+ if re.match(r"^data:audio\/(mpeg|mp3|wav|ogg);base64,(.+)$", audio_url): # base64 audio
+ audio_stream = io.BytesIO(base64.b64decode(audio_url.split(",", maxsplit=1)[1]))
+ elif os.path.isfile(audio_url): # local file
+ check_lfi_path(audio_url)
+ audio_stream = audio_url
+ else: # web uri
+ check_ssrf_url(audio_url)
+ audio_stream = requests.get(audio_url, stream=True).raw
+
+ audios.append(audio_stream)
+ else:
+ raise HTTPException(
+ status_code=status.HTTP_400_BAD_REQUEST, detail=f"Invalid input type {input_item.type}."
+ )
+
+ input_messages.append({"role": ROLE_MAPPING[message.role], "content": text_content})
+ else:
+ input_messages.append({"role": ROLE_MAPPING[message.role], "content": message.content})
+
+ tool_list = request.tools
+ if isinstance(tool_list, list) and len(tool_list):
+ try:
+ tools = json.dumps([dictify(tool.function) for tool in tool_list], ensure_ascii=False)
+ except json.JSONDecodeError:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools")
+ else:
+ tools = None
+
+ return input_messages, system, tools, images or None, videos or None, audios or None
+
+
+def _create_stream_chat_completion_chunk(
+ completion_id: str,
+ model: str,
+ delta: "ChatCompletionMessage",
+ index: Optional[int] = 0,
+ finish_reason: Optional["Finish"] = None,
+) -> str:
+ choice_data = ChatCompletionStreamResponseChoice(index=index, delta=delta, finish_reason=finish_reason)
+ chunk = ChatCompletionStreamResponse(id=completion_id, model=model, choices=[choice_data])
+ return jsonify(chunk)
+
+
+async def create_chat_completion_response(
+ request: "ChatCompletionRequest", chat_model: "ChatModel"
+) -> "ChatCompletionResponse":
+ completion_id = f"chatcmpl-{uuid.uuid4().hex}"
+ input_messages, system, tools, images, videos, audios = _process_request(request)
+ responses = await chat_model.achat(
+ input_messages,
+ system,
+ tools,
+ images,
+ videos,
+ audios,
+ do_sample=request.do_sample,
+ temperature=request.temperature,
+ top_p=request.top_p,
+ max_new_tokens=request.max_tokens,
+ num_return_sequences=request.n,
+ repetition_penalty=request.presence_penalty,
+ stop=request.stop,
+ )
+
+ prompt_length, response_length = 0, 0
+ choices = []
+ for i, response in enumerate(responses):
+ if tools:
+ result = chat_model.engine.template.extract_tool(response.response_text)
+ else:
+ result = response.response_text
+
+ if isinstance(result, list):
+ tool_calls = []
+ for tool in result:
+ function = Function(name=tool.name, arguments=tool.arguments)
+ tool_calls.append(FunctionCall(id=f"call_{uuid.uuid4().hex}", function=function))
+
+ response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=tool_calls)
+ finish_reason = Finish.TOOL
+ else:
+ response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result)
+ finish_reason = Finish.STOP if response.finish_reason == "stop" else Finish.LENGTH
+
+ choices.append(ChatCompletionResponseChoice(index=i, message=response_message, finish_reason=finish_reason))
+ prompt_length = response.prompt_length
+ response_length += response.response_length
+
+ usage = ChatCompletionResponseUsage(
+ prompt_tokens=prompt_length,
+ completion_tokens=response_length,
+ total_tokens=prompt_length + response_length,
+ )
+
+ return ChatCompletionResponse(id=completion_id, model=request.model, choices=choices, usage=usage)
+
+
+async def create_stream_chat_completion_response(
+ request: "ChatCompletionRequest", chat_model: "ChatModel"
+) -> AsyncGenerator[str, None]:
+ completion_id = f"chatcmpl-{uuid.uuid4().hex}"
+ input_messages, system, tools, images, videos, audios = _process_request(request)
+ if tools:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.")
+
+ if request.n > 1:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream multiple responses.")
+
+ yield _create_stream_chat_completion_chunk(
+ completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(role=Role.ASSISTANT, content="")
+ )
+ async for new_token in chat_model.astream_chat(
+ input_messages,
+ system,
+ tools,
+ images,
+ videos,
+ audios,
+ do_sample=request.do_sample,
+ temperature=request.temperature,
+ top_p=request.top_p,
+ max_new_tokens=request.max_tokens,
+ repetition_penalty=request.presence_penalty,
+ stop=request.stop,
+ ):
+ if len(new_token) != 0:
+ yield _create_stream_chat_completion_chunk(
+ completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(content=new_token)
+ )
+
+ yield _create_stream_chat_completion_chunk(
+ completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(), finish_reason=Finish.STOP
+ )
+ yield "[DONE]"
+
+
+async def create_score_evaluation_response(
+ request: "ScoreEvaluationRequest", chat_model: "ChatModel"
+) -> "ScoreEvaluationResponse":
+ score_id = f"scoreval-{uuid.uuid4().hex}"
+ if len(request.messages) == 0:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request")
+
+ scores = await chat_model.aget_scores(request.messages, max_length=request.max_length)
+ return ScoreEvaluationResponse(id=score_id, model=request.model, scores=scores)
diff --git a/llamafactory/api/common.py b/llamafactory/api/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b4e9602de7ebc10b4f15c68ad9167cb9d80d8ef
--- /dev/null
+++ b/llamafactory/api/common.py
@@ -0,0 +1,96 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ipaddress
+import json
+import os
+import socket
+from typing import TYPE_CHECKING, Any
+from urllib.parse import urlparse
+
+from ..extras.misc import is_env_enabled
+from ..extras.packages import is_fastapi_available
+
+
+if is_fastapi_available():
+ from fastapi import HTTPException, status
+
+
+if TYPE_CHECKING:
+ from pydantic import BaseModel
+
+
+SAFE_MEDIA_PATH = os.environ.get("SAFE_MEDIA_PATH", os.path.join(os.path.dirname(__file__), "safe_media"))
+ALLOW_LOCAL_FILES = is_env_enabled("ALLOW_LOCAL_FILES", "1")
+
+
+def dictify(data: "BaseModel") -> dict[str, Any]:
+ try: # pydantic v2
+ return data.model_dump(exclude_unset=True)
+ except AttributeError: # pydantic v1
+ return data.dict(exclude_unset=True)
+
+
+def jsonify(data: "BaseModel") -> str:
+ try: # pydantic v2
+ return json.dumps(data.model_dump(exclude_unset=True), ensure_ascii=False)
+ except AttributeError: # pydantic v1
+ return data.json(exclude_unset=True, ensure_ascii=False)
+
+
+def check_lfi_path(path: str) -> None:
+ """Checks if a given path is vulnerable to LFI. Raises HTTPException if unsafe."""
+ if not ALLOW_LOCAL_FILES:
+ raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Local file access is disabled.")
+
+ try:
+ os.makedirs(SAFE_MEDIA_PATH, exist_ok=True)
+ real_path = os.path.realpath(path)
+ safe_path = os.path.realpath(SAFE_MEDIA_PATH)
+
+ if not real_path.startswith(safe_path):
+ raise HTTPException(
+ status_code=status.HTTP_403_FORBIDDEN, detail="File access is restricted to the safe media directory."
+ )
+ except Exception:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid or inaccessible file path.")
+
+
+def check_ssrf_url(url: str) -> None:
+ """Checks if a given URL is vulnerable to SSRF. Raises HTTPException if unsafe."""
+ try:
+ parsed_url = urlparse(url)
+ if parsed_url.scheme not in ["http", "https"]:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only HTTP/HTTPS URLs are allowed.")
+
+ hostname = parsed_url.hostname
+ if not hostname:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid URL hostname.")
+
+ ip_info = socket.getaddrinfo(hostname, parsed_url.port)
+ ip_address_str = ip_info[0][4][0]
+ ip = ipaddress.ip_address(ip_address_str)
+
+ if not ip.is_global:
+ raise HTTPException(
+ status_code=status.HTTP_403_FORBIDDEN,
+ detail="Access to private or reserved IP addresses is not allowed.",
+ )
+
+ except socket.gaierror:
+ raise HTTPException(
+ status_code=status.HTTP_400_BAD_REQUEST, detail=f"Could not resolve hostname: {parsed_url.hostname}"
+ )
+ except Exception as e:
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Invalid URL: {e}")
diff --git a/llamafactory/api/protocol.py b/llamafactory/api/protocol.py
new file mode 100644
index 0000000000000000000000000000000000000000..889d938e0b727ef1fca63d95fa20926c31830c52
--- /dev/null
+++ b/llamafactory/api/protocol.py
@@ -0,0 +1,157 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from enum import Enum, unique
+from typing import Any, Optional, Union
+
+from pydantic import BaseModel, Field
+from typing_extensions import Literal
+
+
+@unique
+class Role(str, Enum):
+ USER = "user"
+ ASSISTANT = "assistant"
+ SYSTEM = "system"
+ FUNCTION = "function"
+ TOOL = "tool"
+
+
+@unique
+class Finish(str, Enum):
+ STOP = "stop"
+ LENGTH = "length"
+ TOOL = "tool_calls"
+
+
+class ModelCard(BaseModel):
+ id: str
+ object: Literal["model"] = "model"
+ created: int = Field(default_factory=lambda: int(time.time()))
+ owned_by: Literal["owner"] = "owner"
+
+
+class ModelList(BaseModel):
+ object: Literal["list"] = "list"
+ data: list[ModelCard] = []
+
+
+class Function(BaseModel):
+ name: str
+ arguments: str
+
+
+class FunctionDefinition(BaseModel):
+ name: str
+ description: str
+ parameters: dict[str, Any]
+
+
+class FunctionAvailable(BaseModel):
+ type: Literal["function", "code_interpreter"] = "function"
+ function: Optional[FunctionDefinition] = None
+
+
+class FunctionCall(BaseModel):
+ id: str
+ type: Literal["function"] = "function"
+ function: Function
+
+
+class URL(BaseModel):
+ url: str
+ detail: Literal["auto", "low", "high"] = "auto"
+
+
+class MultimodalInputItem(BaseModel):
+ type: Literal["text", "image_url", "video_url", "audio_url"]
+ text: Optional[str] = None
+ image_url: Optional[URL] = None
+ video_url: Optional[URL] = None
+ audio_url: Optional[URL] = None
+
+
+class ChatMessage(BaseModel):
+ role: Role
+ content: Optional[Union[str, list[MultimodalInputItem]]] = None
+ tool_calls: Optional[list[FunctionCall]] = None
+
+
+class ChatCompletionMessage(BaseModel):
+ role: Optional[Role] = None
+ content: Optional[str] = None
+ tool_calls: Optional[list[FunctionCall]] = None
+
+
+class ChatCompletionRequest(BaseModel):
+ model: str
+ messages: list[ChatMessage]
+ tools: Optional[list[FunctionAvailable]] = None
+ do_sample: Optional[bool] = None
+ temperature: Optional[float] = None
+ top_p: Optional[float] = None
+ n: int = 1
+ presence_penalty: Optional[float] = None
+ max_tokens: Optional[int] = None
+ stop: Optional[Union[str, list[str]]] = None
+ stream: bool = False
+
+
+class ChatCompletionResponseChoice(BaseModel):
+ index: int
+ message: ChatCompletionMessage
+ finish_reason: Finish
+
+
+class ChatCompletionStreamResponseChoice(BaseModel):
+ index: int
+ delta: ChatCompletionMessage
+ finish_reason: Optional[Finish] = None
+
+
+class ChatCompletionResponseUsage(BaseModel):
+ prompt_tokens: int
+ completion_tokens: int
+ total_tokens: int
+
+
+class ChatCompletionResponse(BaseModel):
+ id: str
+ object: Literal["chat.completion"] = "chat.completion"
+ created: int = Field(default_factory=lambda: int(time.time()))
+ model: str
+ choices: list[ChatCompletionResponseChoice]
+ usage: ChatCompletionResponseUsage
+
+
+class ChatCompletionStreamResponse(BaseModel):
+ id: str
+ object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+ created: int = Field(default_factory=lambda: int(time.time()))
+ model: str
+ choices: list[ChatCompletionStreamResponseChoice]
+
+
+class ScoreEvaluationRequest(BaseModel):
+ model: str
+ messages: list[str]
+ max_length: Optional[int] = None
+
+
+class ScoreEvaluationResponse(BaseModel):
+ id: str
+ object: Literal["score.evaluation"] = "score.evaluation"
+ model: str
+ scores: list[float]
diff --git a/llamafactory/chat/__init__.py b/llamafactory/chat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..15d8b9ba2d77d6f300d59300da5a49abd3ed4e57
--- /dev/null
+++ b/llamafactory/chat/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base_engine import BaseEngine
+from .chat_model import ChatModel
+
+
+__all__ = ["BaseEngine", "ChatModel"]
diff --git a/llamafactory/chat/__pycache__/__init__.cpython-312.pyc b/llamafactory/chat/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1435170474306d36918011ef6af44e36b8fb8e44
Binary files /dev/null and b/llamafactory/chat/__pycache__/__init__.cpython-312.pyc differ
diff --git a/llamafactory/chat/__pycache__/base_engine.cpython-312.pyc b/llamafactory/chat/__pycache__/base_engine.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..358bdcffd32512c3649dea4159ee05f027a37181
Binary files /dev/null and b/llamafactory/chat/__pycache__/base_engine.cpython-312.pyc differ
diff --git a/llamafactory/chat/__pycache__/chat_model.cpython-312.pyc b/llamafactory/chat/__pycache__/chat_model.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d5abba6446be7ea5a10650a5e22a3dd0f1534446
Binary files /dev/null and b/llamafactory/chat/__pycache__/chat_model.cpython-312.pyc differ
diff --git a/llamafactory/chat/base_engine.py b/llamafactory/chat/base_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d497c1ae927f94f396c18833b18cdb894cbd59d
--- /dev/null
+++ b/llamafactory/chat/base_engine.py
@@ -0,0 +1,98 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from collections.abc import AsyncGenerator
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Literal, Optional, Union
+
+
+if TYPE_CHECKING:
+ from transformers import PreTrainedModel, PreTrainedTokenizer
+ from vllm import AsyncLLMEngine
+
+ from ..data import Template
+ from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+ from ..extras.constants import EngineName
+ from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+
+@dataclass
+class Response:
+ response_text: str
+ response_length: int
+ prompt_length: int
+ finish_reason: Literal["stop", "length"]
+
+
+class BaseEngine(ABC):
+ r"""Base class for inference engine of chat models.
+
+ Must implements async methods: chat(), stream_chat() and get_scores().
+ """
+
+ name: "EngineName"
+ model: Union["PreTrainedModel", "AsyncLLMEngine"]
+ tokenizer: "PreTrainedTokenizer"
+ can_generate: bool
+ template: "Template"
+ generating_args: dict[str, Any]
+
+ @abstractmethod
+ def __init__(
+ self,
+ model_args: "ModelArguments",
+ data_args: "DataArguments",
+ finetuning_args: "FinetuningArguments",
+ generating_args: "GeneratingArguments",
+ ) -> None:
+ r"""Initialize an inference engine."""
+ ...
+
+ @abstractmethod
+ async def chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> list["Response"]:
+ r"""Get a list of responses of the chat model."""
+ ...
+
+ @abstractmethod
+ async def stream_chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> AsyncGenerator[str, None]:
+ r"""Get the response token-by-token of the chat model."""
+ ...
+
+ @abstractmethod
+ async def get_scores(
+ self,
+ batch_input: list[str],
+ **input_kwargs,
+ ) -> list[float]:
+ r"""Get a list of scores of the reward model."""
+ ...
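
To make the contract concrete, here is a toy engine that satisfies the interface above. It is not part of this patch: it assumes it lives in the same module as `BaseEngine` and `Response`, and a real engine would additionally receive the four `*Arguments` objects and set the class attributes listed above.

```python
# Toy illustration only: a trivial engine satisfying the BaseEngine contract.
from collections.abc import AsyncGenerator


class EchoEngine(BaseEngine):
    def __init__(self, *args, **kwargs) -> None:  # real engines take the four *Arguments objects
        self.can_generate = True
        self.generating_args = {}

    async def chat(self, messages, system=None, tools=None, images=None,
                   videos=None, audios=None, **input_kwargs) -> list["Response"]:
        text = messages[-1]["content"]  # echo the last user message back
        return [Response(response_text=text, response_length=len(text),
                         prompt_length=len(text), finish_reason="stop")]

    async def stream_chat(self, messages, system=None, tools=None, images=None,
                          videos=None, audios=None, **input_kwargs) -> AsyncGenerator[str, None]:
        for char in messages[-1]["content"]:  # stream the echo character by character
            yield char

    async def get_scores(self, batch_input, **input_kwargs) -> list[float]:
        return [0.0] * len(batch_input)  # a real reward model would score each text
```
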
diff --git a/llamafactory/chat/chat_model.py b/llamafactory/chat/chat_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb612f88d468d76f06eefa45b96c1bfa0351fa7c
--- /dev/null
+++ b/llamafactory/chat/chat_model.py
@@ -0,0 +1,210 @@
+# Copyright 2025 THUDM and the LlamaFactory team.
+#
+# This code is inspired by the THUDM's ChatGLM implementation.
+# https://github.com/THUDM/ChatGLM-6B/blob/main/cli_demo.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+from collections.abc import AsyncGenerator, Generator
+from threading import Thread
+from typing import TYPE_CHECKING, Any, Optional
+
+from ..extras.constants import EngineName
+from ..extras.misc import torch_gc
+from ..hparams import get_infer_args
+
+
+if TYPE_CHECKING:
+ from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+ from .base_engine import BaseEngine, Response
+
+
+def _start_background_loop(loop: "asyncio.AbstractEventLoop") -> None:
+ asyncio.set_event_loop(loop)
+ loop.run_forever()
+
+
+class ChatModel:
+ r"""General class for chat models. Backed by huggingface or vllm engines.
+
+ Supports both sync and async methods.
+ Sync methods: chat(), stream_chat() and get_scores().
+ Async methods: achat(), astream_chat() and aget_scores().
+ """
+
+ def __init__(self, args: Optional[dict[str, Any]] = None) -> None:
+ model_args, data_args, finetuning_args, generating_args = get_infer_args(args)
+
+ if model_args.infer_backend == EngineName.HF:
+ from .hf_engine import HuggingfaceEngine
+
+ self.engine: BaseEngine = HuggingfaceEngine(model_args, data_args, finetuning_args, generating_args)
+ elif model_args.infer_backend == EngineName.VLLM:
+ try:
+ from .vllm_engine import VllmEngine
+
+ self.engine: BaseEngine = VllmEngine(model_args, data_args, finetuning_args, generating_args)
+ except ImportError as e:
+ raise ImportError(
+ "vLLM not install, you may need to run `pip install vllm`\n"
+ "or try to use HuggingFace backend: --infer_backend huggingface"
+ ) from e
+ elif model_args.infer_backend == EngineName.SGLANG:
+ try:
+ from .sglang_engine import SGLangEngine
+
+ self.engine: BaseEngine = SGLangEngine(model_args, data_args, finetuning_args, generating_args)
+ except ImportError as e:
+ raise ImportError(
+ "SGLang not install, you may need to run `pip install sglang[all]`\n"
+ "or try to use HuggingFace backend: --infer_backend huggingface"
+ ) from e
+ elif model_args.infer_backend == EngineName.KT:
+ try:
+ from .kt_engine import KTransformersEngine
+
+ self.engine: BaseEngine = KTransformersEngine(model_args, data_args, finetuning_args, generating_args)
+ except ImportError as e:
+ raise ImportError(
+ "KTransformers not install, you may need to run `pip install ktransformers`\n"
+ "or try to use HuggingFace backend: --infer_backend huggingface"
+ ) from e
+ else:
+ raise NotImplementedError(f"Unknown backend: {model_args.infer_backend}")
+
+ self._loop = asyncio.new_event_loop()
+ self._thread = Thread(target=_start_background_loop, args=(self._loop,), daemon=True)
+ self._thread.start()
+
+ def chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> list["Response"]:
+ r"""Get a list of responses of the chat model."""
+ task = asyncio.run_coroutine_threadsafe(
+ self.achat(messages, system, tools, images, videos, audios, **input_kwargs), self._loop
+ )
+ return task.result()
+
+ async def achat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> list["Response"]:
+ r"""Asynchronously get a list of responses of the chat model."""
+ return await self.engine.chat(messages, system, tools, images, videos, audios, **input_kwargs)
+
+ def stream_chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> Generator[str, None, None]:
+ r"""Get the response token-by-token of the chat model."""
+ generator = self.astream_chat(messages, system, tools, images, videos, audios, **input_kwargs)
+ while True:
+ try:
+ task = asyncio.run_coroutine_threadsafe(generator.__anext__(), self._loop)
+ yield task.result()
+ except StopAsyncIteration:
+ break
+
+ async def astream_chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> AsyncGenerator[str, None]:
+ r"""Asynchronously get the response token-by-token of the chat model."""
+ async for new_token in self.engine.stream_chat(
+ messages, system, tools, images, videos, audios, **input_kwargs
+ ):
+ yield new_token
+
+ def get_scores(
+ self,
+ batch_input: list[str],
+ **input_kwargs,
+ ) -> list[float]:
+ r"""Get a list of scores of the reward model."""
+ task = asyncio.run_coroutine_threadsafe(self.aget_scores(batch_input, **input_kwargs), self._loop)
+ return task.result()
+
+ async def aget_scores(
+ self,
+ batch_input: list[str],
+ **input_kwargs,
+ ) -> list[float]:
+ r"""Asynchronously get a list of scores of the reward model."""
+ return await self.engine.get_scores(batch_input, **input_kwargs)
+
+
+def run_chat() -> None:
+ if os.name != "nt":
+ try:
+ import readline # noqa: F401
+ except ImportError:
+ print("Install `readline` for a better experience.")
+
+ chat_model = ChatModel()
+ messages = []
+ print("Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application.")
+
+ while True:
+ try:
+ query = input("\nUser: ")
+ except UnicodeDecodeError:
+ print("Detected decoding error at the inputs, please set the terminal encoding to utf-8.")
+ continue
+ except Exception:
+ raise
+
+ if query.strip() == "exit":
+ break
+
+ if query.strip() == "clear":
+ messages = []
+ torch_gc()
+ print("History has been removed.")
+ continue
+
+ messages.append({"role": "user", "content": query})
+ print("Assistant: ", end="", flush=True)
+
+ response = ""
+ for new_text in chat_model.stream_chat(messages):
+ print(new_text, end="", flush=True)
+ response += new_text
+ print()
+ messages.append({"role": "assistant", "content": response})
diff --git a/llamafactory/chat/hf_engine.py b/llamafactory/chat/hf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..adaaaa872786446fde2eba4a2a8f32f7ec4cc462
--- /dev/null
+++ b/llamafactory/chat/hf_engine.py
@@ -0,0 +1,412 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+from collections.abc import AsyncGenerator
+from threading import Thread
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+
+import torch
+from transformers import GenerationConfig, TextIteratorStreamer
+from typing_extensions import override
+
+from ..data import get_template_and_fix_tokenizer
+from ..extras import logging
+from ..extras.constants import AUDIO_PLACEHOLDER, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER, EngineName
+from ..model import load_model, load_tokenizer
+from .base_engine import BaseEngine, Response
+
+
+if TYPE_CHECKING:
+ from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
+ from trl import PreTrainedModelWrapper
+
+ from ..data import Template
+ from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+ from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+class HuggingfaceEngine(BaseEngine):
+ def __init__(
+ self,
+ model_args: "ModelArguments",
+ data_args: "DataArguments",
+ finetuning_args: "FinetuningArguments",
+ generating_args: "GeneratingArguments",
+ ) -> None:
+ self.name = EngineName.HF
+ self.can_generate = finetuning_args.stage == "sft"
+ tokenizer_module = load_tokenizer(model_args)
+ self.tokenizer = tokenizer_module["tokenizer"]
+ self.processor = tokenizer_module["processor"]
+ self.tokenizer.padding_side = "left" if self.can_generate else "right"
+ self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args)
+ self.model = load_model(
+ self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate)
+ ) # must be called after fixing the tokenizer so that the vocabulary is resized
+ self.generating_args = generating_args.to_dict()
+ try:
+ asyncio.get_event_loop()
+ except RuntimeError:
+ logger.warning_rank0_once("There is no current event loop, creating a new one.")
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
+
+ self.semaphore = asyncio.Semaphore(int(os.getenv("MAX_CONCURRENT", "1")))
+
+ @staticmethod
+ def _process_args(
+ model: "PreTrainedModel",
+ tokenizer: "PreTrainedTokenizer",
+ processor: Optional["ProcessorMixin"],
+ template: "Template",
+ generating_args: dict[str, Any],
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ input_kwargs: Optional[dict[str, Any]] = {},
+ ) -> tuple[dict[str, Any], int]:
+ mm_input_dict = {"images": [], "videos": [], "audios": [], "imglens": [0], "vidlens": [0], "audlens": [0]}
+ if images is not None:
+ mm_input_dict.update({"images": images, "imglens": [len(images)]})
+ if not any(IMAGE_PLACEHOLDER in message["content"] for message in messages):
+ messages[0]["content"] = IMAGE_PLACEHOLDER * len(images) + messages[0]["content"]
+
+ if videos is not None:
+ mm_input_dict.update({"videos": videos, "vidlens": [len(videos)]})
+ if not any(VIDEO_PLACEHOLDER in message["content"] for message in messages):
+ messages[0]["content"] = VIDEO_PLACEHOLDER * len(videos) + messages[0]["content"]
+
+ if audios is not None:
+ mm_input_dict.update({"audios": audios, "audlens": [len(audios)]})
+ if not any(AUDIO_PLACEHOLDER in message["content"] for message in messages):
+ messages[0]["content"] = AUDIO_PLACEHOLDER * len(audios) + messages[0]["content"]
+
+ messages = template.mm_plugin.process_messages(
+ messages, mm_input_dict["images"], mm_input_dict["videos"], mm_input_dict["audios"], processor
+ )
+ paired_messages = messages + [{"role": "assistant", "content": ""}]
+ prompt_ids, _ = template.encode_oneturn(tokenizer, paired_messages, system, tools)
+ prompt_ids, _ = template.mm_plugin.process_token_ids(
+ prompt_ids,
+ None,
+ mm_input_dict["images"],
+ mm_input_dict["videos"],
+ mm_input_dict["audios"],
+ tokenizer,
+ processor,
+ )
+ prompt_length = len(prompt_ids)
+ inputs = torch.tensor([prompt_ids], device=model.device)
+ attention_mask = torch.ones_like(inputs, dtype=torch.long)
+
+ do_sample: Optional[bool] = input_kwargs.pop("do_sample", None)
+ temperature: Optional[float] = input_kwargs.pop("temperature", None)
+ top_p: Optional[float] = input_kwargs.pop("top_p", None)
+ top_k: Optional[float] = input_kwargs.pop("top_k", None)
+ num_return_sequences: int = input_kwargs.pop("num_return_sequences", 1)
+ repetition_penalty: Optional[float] = input_kwargs.pop("repetition_penalty", None)
+ length_penalty: Optional[float] = input_kwargs.pop("length_penalty", None)
+ skip_special_tokens: Optional[bool] = input_kwargs.pop("skip_special_tokens", None)
+ max_length: Optional[int] = input_kwargs.pop("max_length", None)
+ max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)
+ stop: Optional[Union[str, list[str]]] = input_kwargs.pop("stop", None)
+
+ if stop is not None:
+ logger.warning_rank0("Stop parameter is not supported by the huggingface engine yet.")
+
+ generating_args = generating_args.copy()
+ generating_args.update(
+ dict(
+ do_sample=do_sample if do_sample is not None else generating_args["do_sample"],
+ temperature=temperature if temperature is not None else generating_args["temperature"],
+ top_p=top_p if top_p is not None else generating_args["top_p"],
+ top_k=top_k if top_k is not None else generating_args["top_k"],
+ num_return_sequences=num_return_sequences,
+ repetition_penalty=repetition_penalty
+ if repetition_penalty is not None
+ else generating_args["repetition_penalty"],
+ length_penalty=length_penalty if length_penalty is not None else generating_args["length_penalty"],
+ skip_special_tokens=skip_special_tokens
+ if skip_special_tokens is not None
+ else generating_args["skip_special_tokens"],
+ eos_token_id=template.get_stop_token_ids(tokenizer),
+ pad_token_id=tokenizer.pad_token_id,
+ )
+ )
+
+ if isinstance(num_return_sequences, int) and num_return_sequences > 1: # do_sample needs temperature > 0
+ generating_args["do_sample"] = True
+ generating_args["temperature"] = generating_args["temperature"] or 1.0
+
+ if not generating_args["temperature"]:
+ generating_args["do_sample"] = False
+
+ if not generating_args["do_sample"]:
+ generating_args.pop("temperature", None)
+ generating_args.pop("top_p", None)
+
+ if max_length:
+ generating_args.pop("max_new_tokens", None)
+ generating_args["max_length"] = max_length
+
+ if max_new_tokens:
+ generating_args.pop("max_length", None)
+ generating_args["max_new_tokens"] = max_new_tokens
+
+ gen_kwargs = dict(
+ inputs=inputs,
+ attention_mask=attention_mask,
+ generation_config=GenerationConfig(**generating_args),
+ )
+
+ mm_inputs = template.mm_plugin.get_mm_inputs(**mm_input_dict, batch_ids=[prompt_ids], processor=processor)
+ for key, value in mm_inputs.items():
+ if isinstance(value, list) and isinstance(value[0], torch.Tensor): # for pixtral inputs
+ value = torch.stack(value) # assume they have the same sizes
+ elif (
+ isinstance(value, list) and isinstance(value[0], list) and isinstance(value[0][0], torch.Tensor)
+ ): # for minicpmv inputs
+ value = torch.stack([torch.stack(v) for v in value])
+ elif not isinstance(value, torch.Tensor):
+ value = torch.tensor(value)
+
+ if torch.is_floating_point(value): # cast data dtype for paligemma
+ value = value.to(model.dtype)
+
+ if key == "second_per_grid_ts": # qwen2.5vl special case
+ gen_kwargs[key] = value.tolist()
+ else:
+ gen_kwargs[key] = value.to(model.device)
+
+ if getattr(model.config, "model_type", None) in ["minicpmv", "minicpmo"]:
+ gen_kwargs["input_ids"] = inputs
+ gen_kwargs["tokenizer"] = tokenizer
+ if "audio_feature_lens" in mm_inputs:
+ gen_kwargs["audio_feature_lens"] = mm_inputs["audio_feature_lens"]
+
+ gen_kwargs.pop("image_sizes", None)
+
+ return gen_kwargs, prompt_length
+
+ @staticmethod
+ @torch.inference_mode()
+ def _chat(
+ model: "PreTrainedModel",
+ tokenizer: "PreTrainedTokenizer",
+ processor: Optional["ProcessorMixin"],
+ template: "Template",
+ generating_args: dict[str, Any],
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ input_kwargs: Optional[dict[str, Any]] = {},
+ ) -> list["Response"]:
+ gen_kwargs, prompt_length = HuggingfaceEngine._process_args(
+ model,
+ tokenizer,
+ processor,
+ template,
+ generating_args,
+ messages,
+ system,
+ tools,
+ images,
+ videos,
+ audios,
+ input_kwargs,
+ )
+ generate_output = model.generate(**gen_kwargs)
+ if isinstance(generate_output, tuple):
+ generate_output = generate_output[1][0] # post-process the minicpm_o output
+
+ response_ids = generate_output[:, prompt_length:]
+ response = tokenizer.batch_decode(
+ response_ids,
+ skip_special_tokens=getattr(gen_kwargs["generation_config"], "skip_special_tokens", True),
+ clean_up_tokenization_spaces=True,
+ )
+ results = []
+ for i in range(len(response)):
+ eos_index = (response_ids[i] == tokenizer.eos_token_id).nonzero()
+ response_length = (eos_index[0].item() + 1) if len(eos_index) else len(response_ids[i])
+ results.append(
+ Response(
+ response_text=response[i],
+ response_length=response_length,
+ prompt_length=prompt_length,
+ finish_reason="stop" if len(eos_index) else "length",
+ )
+ )
+
+ return results
+
+ @staticmethod
+ @torch.inference_mode()
+ def _stream_chat(
+ model: "PreTrainedModel",
+ tokenizer: "PreTrainedTokenizer",
+ processor: Optional["ProcessorMixin"],
+ template: "Template",
+ generating_args: dict[str, Any],
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ input_kwargs: Optional[dict[str, Any]] = {},
+ ) -> Callable[[], str]:
+ gen_kwargs, _ = HuggingfaceEngine._process_args(
+ model,
+ tokenizer,
+ processor,
+ template,
+ generating_args,
+ messages,
+ system,
+ tools,
+ images,
+ videos,
+ audios,
+ input_kwargs,
+ )
+ streamer = TextIteratorStreamer(
+ tokenizer,
+ skip_prompt=True,
+ skip_special_tokens=getattr(gen_kwargs["generation_config"], "skip_special_tokens", True),
+ )
+ gen_kwargs["streamer"] = streamer
+ thread = Thread(target=model.generate, kwargs=gen_kwargs, daemon=True)
+ thread.start()
+
+ def stream():
+ try:
+ return streamer.__next__()
+ except StopIteration:
+ raise StopAsyncIteration()
+
+ return stream
+
+ @staticmethod
+ @torch.inference_mode()
+ def _get_scores(
+ model: "PreTrainedModelWrapper",
+ tokenizer: "PreTrainedTokenizer",
+ batch_input: list[str],
+ input_kwargs: Optional[dict[str, Any]] = {},
+ ) -> list[float]:
+ max_length: Optional[int] = input_kwargs.pop("max_length", None)
+ device = getattr(model.pretrained_model, "device", "cuda")
+ inputs: dict[str, torch.Tensor] = tokenizer(
+ batch_input,
+ padding=True,
+ truncation=True,
+ max_length=max_length or getattr(model.config, "max_position_embeddings", 1024),
+ return_tensors="pt",
+ add_special_tokens=False,
+ ).to(device)
+ values: torch.Tensor = model(**inputs, return_dict=True, use_cache=False)[-1]
+ scores = values.gather(dim=-1, index=(inputs["attention_mask"].sum(dim=-1, keepdim=True) - 1))
+ return scores
+
+ @override
+ async def chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> list["Response"]:
+ if not self.can_generate:
+ raise ValueError("The current model does not support `chat`.")
+
+ input_args = (
+ self.model,
+ self.tokenizer,
+ self.processor,
+ self.template,
+ self.generating_args,
+ messages,
+ system,
+ tools,
+ images,
+ videos,
+ audios,
+ input_kwargs,
+ )
+ async with self.semaphore:
+ return await asyncio.to_thread(self._chat, *input_args)
+
+ @override
+ async def stream_chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> AsyncGenerator[str, None]:
+ if not self.can_generate:
+ raise ValueError("The current model does not support `stream_chat`.")
+
+ input_args = (
+ self.model,
+ self.tokenizer,
+ self.processor,
+ self.template,
+ self.generating_args,
+ messages,
+ system,
+ tools,
+ images,
+ videos,
+ audios,
+ input_kwargs,
+ )
+ async with self.semaphore:
+ stream = self._stream_chat(*input_args)
+ while True:
+ try:
+ yield await asyncio.to_thread(stream)
+ except StopAsyncIteration:
+ break
+
+ @override
+ async def get_scores(
+ self,
+ batch_input: list[str],
+ **input_kwargs,
+ ) -> list[float]:
+ if self.can_generate:
+ raise ValueError("Cannot get scores using an auto-regressive model.")
+
+ input_args = (self.model, self.tokenizer, batch_input, input_kwargs)
+ async with self.semaphore:
+ return await asyncio.to_thread(self._get_scores, *input_args)
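
The score extraction in `_get_scores` hinges on a single gather along the sequence dimension: the value head emits one score per position, and each sequence's score is read at its last non-padded token (index `attention_mask.sum() - 1`, with right-side padding). A self-contained toy of that indexing, using made-up numbers:

```python
# Toy illustration of the gather in _get_scores (numbers are made up).
import torch

values = torch.tensor([[1.0, 2.0, 3.0],    # sequence 1: three real tokens
                       [4.0, 5.0, 0.0]])   # sequence 2: two real tokens, one right-side pad
attention_mask = torch.tensor([[1, 1, 1],
                               [1, 1, 0]])

last_token_index = attention_mask.sum(dim=-1, keepdim=True) - 1   # tensor([[2], [1]])
scores = values.gather(dim=-1, index=last_token_index)
print(scores.squeeze(-1).tolist())  # [3.0, 5.0] -> score at the last real token of each sequence
```
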
diff --git a/llamafactory/chat/kt_engine.py b/llamafactory/chat/kt_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bf3f4bb2b685ee971d538d29f0b6afa16956f2c
--- /dev/null
+++ b/llamafactory/chat/kt_engine.py
@@ -0,0 +1,284 @@
+# Copyright 2025 the KVCache.AI team, Approaching AI, and the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import os
+import platform
+from collections.abc import AsyncGenerator
+from threading import Thread
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+from typing_extensions import override
+
+from ..data import get_template_and_fix_tokenizer
+from ..extras import logging
+from ..extras.constants import EngineName
+from ..model import load_model, load_tokenizer
+from .base_engine import BaseEngine, Response
+
+
+if TYPE_CHECKING:
+ from transformers import PreTrainedTokenizer
+ from trl import PreTrainedModelWrapper
+
+ from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+ from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
+from ktransformers.server.config.config import Config
+from ktransformers.util.utils import (
+ get_compute_capability,
+ prefill_and_generate_capture,
+)
+from ktransformers.util.vendors import GPUVendor, device_manager
+
+
+logger = logging.get_logger(__name__)
+
+
+class KTransformersEngine(BaseEngine):
+ def __init__(
+ self,
+ model_args: "ModelArguments",
+ data_args: "DataArguments",
+ finetuning_args: "FinetuningArguments",
+ generating_args: "GeneratingArguments",
+ ) -> None:
+ self.name = EngineName.KT
+ self.can_generate = finetuning_args.stage == "sft"
+
+ tok_mod = load_tokenizer(model_args)
+ self.tokenizer = tok_mod["tokenizer"]
+ self.tokenizer.padding_side = "left" if self.can_generate else "right"
+ self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args)
+
+ self.model = load_model(
+ self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate)
+ )
+
+ self.generating_args = generating_args.to_dict()
+ self.max_new_tokens = model_args.kt_maxlen
+ self.use_cuda_graph = model_args.kt_use_cuda_graph
+ self.mode = model_args.kt_mode
+ self.force_think = model_args.kt_force_think
+ self.chunk_size = model_args.chunk_size
+
+ try:
+ asyncio.get_event_loop()
+ except RuntimeError:
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
+
+ self.semaphore = asyncio.Semaphore(int(os.getenv("MAX_CONCURRENT", "1")))
+
+ @staticmethod
+ @torch.inference_mode()
+ def _get_scores(
+ model: "PreTrainedModelWrapper",
+ tokenizer: "PreTrainedTokenizer",
+ batch_input: list[str],
+ input_kwargs: Optional[dict[str, Any]] = {},
+ ) -> list[float]:
+ max_length: Optional[int] = input_kwargs.pop("max_length", None)
+ device = getattr(model.pretrained_model, "device", "cuda")
+ inputs = tokenizer(
+ batch_input,
+ padding=True,
+ truncation=True,
+ max_length=max_length or getattr(model.config, "max_position_embeddings", 1024),
+ return_tensors="pt",
+ add_special_tokens=False,
+ ).to(device)
+ values: torch.Tensor = model(**inputs, return_dict=True, use_cache=False)[-1]
+ scores = values.gather(dim=-1, index=(inputs["attention_mask"].sum(dim=-1, keepdim=True) - 1))
+ return scores
+
+ async def _generate(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ **input_kwargs,
+ ) -> AsyncGenerator[str, None]:
+ paired = messages + [{"role": "assistant", "content": ""}]
+ prompt_ids, _ = self.template.encode_oneturn(self.tokenizer, paired, system, tools)
+ prompt_len = len(prompt_ids)
+
+ max_length: Optional[int] = input_kwargs.pop("max_length", None)
+ max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)
+
+ if "max_new_tokens" in self.generating_args:
+ max_tokens = int(self.generating_args["max_new_tokens"])
+ elif "max_length" in self.generating_args:
+ gl = int(self.generating_args["max_length"])
+ max_tokens = gl - prompt_len if gl > prompt_len else 1
+ else:
+ max_tokens = self.max_new_tokens or 256
+
+ if max_length is not None:
+ max_tokens = max(max_length - prompt_len, 1)
+ if max_new_tokens is not None:
+ max_tokens = int(max_new_tokens)
+ max_tokens = max(1, int(max_tokens))
+
+ if self.mode == "long_context":
+ max_len_cfg = Config().long_context_config["max_seq_len"]
+ need = prompt_len + max_tokens
+ assert max_len_cfg > need, f"please set max_seq_len > {need} in ~/.ktransformers/config.yaml"
+
+ device = next(self.model.parameters()).device
+ input_tensor = torch.tensor([prompt_ids], dtype=torch.long, device=device)
+ if self.force_think:
+ think = torch.tensor(
+ [self.tokenizer.encode("\n", add_special_tokens=False)], dtype=torch.long, device=device
+ )
+ input_tensor = torch.cat([input_tensor, think], dim=1)
+
+ use_flashinfer = (
+ platform.system() != "Windows"
+ and getattr(self.model.config, "architectures", [""])[0]
+ in {"DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"}
+ and flashinfer_enabled
+ and get_compute_capability() >= 8
+ and device_manager.gpu_vendor == GPUVendor.NVIDIA
+ )
+
+ def make_gen():
+ if use_flashinfer:
+ return prefill_and_generate_capture(
+ self.model,
+ self.tokenizer,
+ input_tensor,
+ max_tokens,
+ self.use_cuda_graph,
+ mode=self.mode,
+ force_think=self.force_think,
+ chunk_size=self.chunk_size,
+ use_flashinfer_mla=True,
+ num_heads=self.model.config.num_attention_heads,
+ head_dim_ckv=getattr(self.model.config, "kv_lora_rank", 0),
+ head_dim_kpe=getattr(self.model.config, "qk_rope_head_dim", 0),
+ q_head_dim=getattr(self.model.config, "qk_rope_head_dim", 0)
+ + getattr(self.model.config, "qk_nope_head_dim", 0),
+ echo_stream=False,
+ )
+ else:
+ return prefill_and_generate_capture(
+ self.model,
+ self.tokenizer,
+ input_tensor,
+ max_tokens,
+ self.use_cuda_graph,
+ mode=self.mode,
+ force_think=self.force_think,
+ chunk_size=self.chunk_size,
+ echo_stream=False,
+ )
+
+ loop = asyncio.get_running_loop()
+ q: asyncio.Queue[Optional[str]] = asyncio.Queue()
+
+ def producer():
+ try:
+ gen = make_gen()
+ if hasattr(gen, "__aiter__"):
+
+ async def drain_async():
+ async for t in gen:
+ loop.call_soon_threadsafe(q.put_nowait, t if isinstance(t, str) else str(t))
+
+ asyncio.run(drain_async())
+ elif hasattr(gen, "__iter__"):
+ for t in gen:
+ loop.call_soon_threadsafe(q.put_nowait, t if isinstance(t, str) else str(t))
+ else:
+ loop.call_soon_threadsafe(q.put_nowait, gen if isinstance(gen, str) else str(gen))
+ finally:
+ loop.call_soon_threadsafe(q.put_nowait, None)
+
+ Thread(target=producer, daemon=True).start()
+
+ while True:
+ item = await q.get()
+ if item is None:
+ break
+ yield item
+
+ @override
+ async def chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> list["Response"]:
+ if not self.can_generate:
+ raise ValueError("The current model does not support `chat`.")
+ async with self.semaphore:
+ produced = ""
+ final_text = ""
+ async for t in self._generate(messages, system, tools, **input_kwargs):
+ delta = t
+ produced = produced + delta
+ if delta:
+ final_text += delta
+
+ prompt_ids, _ = self.template.encode_oneturn(
+ self.tokenizer, messages + [{"role": "assistant", "content": ""}], system, tools
+ )
+ return [
+ Response(
+ response_text=final_text,
+ response_length=len(self.tokenizer.encode(final_text, add_special_tokens=False)),
+ prompt_length=len(prompt_ids),
+ finish_reason="stop",
+ )
+ ]
+
+ @override
+ async def stream_chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> AsyncGenerator[str, None]:
+ if not self.can_generate:
+ raise ValueError("The current model does not support `stream_chat`.")
+ async with self.semaphore:
+ produced = ""
+ async for t in self._generate(messages, system, tools, **input_kwargs):
+ delta = t[len(produced) :] if t.startswith(produced) else t
+ produced = t
+ if delta:
+ yield delta
+
+ @override
+ async def get_scores(
+ self,
+ batch_input: list[str],
+ **input_kwargs,
+ ) -> list[float]:
+ if self.can_generate:
+ raise ValueError("Cannot get scores using an auto-regressive model.")
+ args = (self.model, self.tokenizer, batch_input, input_kwargs)
+ async with self.semaphore:
+ return await asyncio.to_thread(self._get_scores, *args)
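
The streaming path in `_generate` above bridges a blocking token generator into the async world: a daemon thread drains the generator and hands each token to the event loop through an `asyncio.Queue`, with `None` as the end-of-stream sentinel. The same pattern in isolation, where the token source stands in for the real generation call:

```python
# Standalone sketch of the producer/queue pattern used in KTransformersEngine._generate.
import asyncio
import time
from threading import Thread


def blocking_token_source():
    for token in ["Hello", ", ", "world", "!"]:
        time.sleep(0.1)  # stands in for the blocking generation step
        yield token


async def astream():
    loop = asyncio.get_running_loop()
    queue: asyncio.Queue = asyncio.Queue()

    def producer():
        try:
            for token in blocking_token_source():
                loop.call_soon_threadsafe(queue.put_nowait, token)
        finally:
            loop.call_soon_threadsafe(queue.put_nowait, None)  # sentinel: end of stream

    Thread(target=producer, daemon=True).start()
    while True:
        item = await queue.get()
        if item is None:
            break
        yield item


async def main():
    async for token in astream():
        print(token, end="", flush=True)
    print()


asyncio.run(main())
```
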
diff --git a/llamafactory/chat/sglang_engine.py b/llamafactory/chat/sglang_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1d2ead33823bc70d51cda59750d25580f972083
--- /dev/null
+++ b/llamafactory/chat/sglang_engine.py
@@ -0,0 +1,289 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import atexit
+import json
+from collections.abc import AsyncGenerator, AsyncIterator, Sequence
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import requests
+from typing_extensions import override
+
+from ..data import get_template_and_fix_tokenizer
+from ..extras import logging
+from ..extras.constants import AUDIO_PLACEHOLDER, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER, EngineName
+from ..extras.misc import get_device_count, torch_gc
+from ..extras.packages import is_sglang_available
+from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+from ..model import load_config, load_tokenizer
+from ..model.model_utils.quantization import QuantizationMethod
+from .base_engine import BaseEngine, Response
+
+
+if is_sglang_available():
+ from sglang.utils import launch_server_cmd, terminate_process, wait_for_server # type: ignore
+
+
+if TYPE_CHECKING:
+ from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+class SGLangEngine(BaseEngine):
+ """Inference engine for SGLang models.
+
+ This class wraps the SGLang engine to provide a consistent interface for text generation
+ that matches LLaMA Factory's requirements. It uses the SGLang HTTP server approach for
+ better interaction and performance. The engine launches a server process and communicates
+ with it via HTTP requests.
+
+ For more details on the SGLang HTTP server approach, see:
+ https://docs.sglang.ai/backend/send_request.html
+ """
+
+ def __init__(
+ self,
+ model_args: "ModelArguments",
+ data_args: "DataArguments",
+ finetuning_args: "FinetuningArguments",
+ generating_args: "GeneratingArguments",
+ ) -> None:
+ self.name = EngineName.SGLANG
+ self.model_args = model_args
+ config = load_config(model_args) # may download model from ms hub
+ if getattr(config, "quantization_config", None): # gptq models should use float16
+ quantization_config: dict[str, Any] = getattr(config, "quantization_config", None)
+ quant_method = quantization_config.get("quant_method", "")
+ if quant_method == QuantizationMethod.GPTQ and model_args.infer_dtype == "auto":
+ model_args.infer_dtype = "float16"
+
+ self.can_generate = finetuning_args.stage == "sft"
+ tokenizer_module = load_tokenizer(model_args)
+ self.tokenizer = tokenizer_module["tokenizer"]
+ self.processor = tokenizer_module["processor"]
+ self.tokenizer.padding_side = "left"
+ self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args)
+ self.template.mm_plugin.expand_mm_tokens = False # for sglang generate
+ self.generating_args = generating_args.to_dict()
+ if model_args.adapter_name_or_path is not None:
+ self.lora_request = True
+ else:
+ self.lora_request = False
+
+ launch_cmd = [
+ "python3 -m sglang.launch_server",
+ f"--model-path {model_args.model_name_or_path}",
+ f"--dtype {model_args.infer_dtype}",
+ f"--context-length {model_args.sglang_maxlen}",
+ f"--mem-fraction-static {model_args.sglang_mem_fraction}",
+ f"--tp-size {model_args.sglang_tp_size if model_args.sglang_tp_size != -1 else get_device_count() or 1}",
+ f"--download-dir {model_args.cache_dir}",
+ "--log-level error",
+ ]
+ if self.lora_request:
+ launch_cmd.extend(
+ [
+ "--max-loras-per-batch 1",
+ f"--lora-backend {model_args.sglang_lora_backend}",
+ f"--lora-paths lora0={model_args.adapter_name_or_path[0]}",
+ "--disable-radix-cache",
+ ]
+ )
+ launch_cmd = " ".join(launch_cmd)
+ logger.info_rank0(f"Starting SGLang server with command: {launch_cmd}")
+ try:
+ torch_gc()
+ self.server_process, port = launch_server_cmd(launch_cmd)
+ self.base_url = f"http://localhost:{port}"
+ atexit.register(self._cleanup_server)
+
+ logger.info_rank0(f"Waiting for SGLang server to be ready at {self.base_url}")
+ wait_for_server(self.base_url, timeout=300)
+ logger.info_rank0(f"SGLang server initialized successfully at {self.base_url}")
+ try:
+ response = requests.get(f"{self.base_url}/get_model_info", timeout=5)
+ if response.status_code == 200:
+ model_info = response.json()
+ logger.info(f"SGLang server model info: {model_info}")
+ except Exception as e:
+ logger.debug(f"Note: could not get model info: {str(e)}")
+
+ except Exception as e:
+ logger.error(f"Failed to start SGLang server: {str(e)}")
+ self._cleanup_server() # make sure to clean up any started process
+ raise RuntimeError(f"SGLang server initialization failed: {str(e)}.")
+
+ def _cleanup_server(self):
+ r"""Clean up the server process when the engine is destroyed."""
+ if hasattr(self, "server_process") and self.server_process:
+ try:
+ logger.info("Terminating SGLang server process")
+ terminate_process(self.server_process)
+ logger.info("SGLang server process terminated")
+ except Exception as e:
+ logger.warning(f"Error terminating SGLang server: {str(e)}")
+
+ async def _generate(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> AsyncIterator[dict[str, Any]]:
+ if images is not None and not any(IMAGE_PLACEHOLDER in message["content"] for message in messages):
+ messages[0]["content"] = IMAGE_PLACEHOLDER * len(images) + messages[0]["content"]
+
+ if videos is not None and not any(VIDEO_PLACEHOLDER in message["content"] for message in messages):
+ messages[0]["content"] = VIDEO_PLACEHOLDER * len(videos) + messages[0]["content"]
+
+ if audios is not None and not any(AUDIO_PLACEHOLDER in message["content"] for message in messages):
+ messages[0]["content"] = AUDIO_PLACEHOLDER * len(audios) + messages[0]["content"]
+
+ messages = self.template.mm_plugin.process_messages(
+ messages, images or [], videos or [], audios or [], self.processor
+ )
+ paired_messages = messages + [{"role": "assistant", "content": ""}]
+ prompt_ids, _ = self.template.encode_oneturn(self.tokenizer, paired_messages, system, tools)
+ prompt_length = len(prompt_ids)
+
+ temperature: Optional[float] = input_kwargs.pop("temperature", None)
+ top_p: Optional[float] = input_kwargs.pop("top_p", None)
+ top_k: Optional[float] = input_kwargs.pop("top_k", None)
+ num_return_sequences: int = input_kwargs.pop("num_return_sequences", 1)
+ repetition_penalty: Optional[float] = input_kwargs.pop("repetition_penalty", None)
+ skip_special_tokens: Optional[bool] = input_kwargs.pop("skip_special_tokens", None)
+ max_length: Optional[int] = input_kwargs.pop("max_length", None)
+ max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)
+ stop: Optional[Union[str, list[str]]] = input_kwargs.pop("stop", None)
+
+ if num_return_sequences != 1:
+ raise NotImplementedError("SGLang only supports n=1.")
+
+ if "max_new_tokens" in self.generating_args:
+ max_tokens = self.generating_args["max_new_tokens"]
+ elif "max_length" in self.generating_args:
+ if self.generating_args["max_length"] > prompt_length:
+ max_tokens = self.generating_args["max_length"] - prompt_length
+ else:
+ max_tokens = 1
+
+ if max_length:
+ max_tokens = max_length - prompt_length if max_length > prompt_length else 1
+
+ if max_new_tokens:
+ max_tokens = max_new_tokens
+
+ sampling_params = {
+ "temperature": temperature if temperature is not None else self.generating_args["temperature"],
+ "top_p": (top_p if top_p is not None else self.generating_args["top_p"]) or 1.0, # top_p must > 0
+ "top_k": (top_k if top_k is not None else self.generating_args["top_k"]) or -1, # top_k must > 0
+ "stop": stop,
+ "stop_token_ids": self.template.get_stop_token_ids(self.tokenizer),
+ "max_new_tokens": max_tokens,
+ "repetition_penalty": (
+ repetition_penalty if repetition_penalty is not None else self.generating_args["repetition_penalty"]
+ )
+ or 1.0, # repetition_penalty must be > 0
+ "skip_special_tokens": skip_special_tokens
+ if skip_special_tokens is not None
+ else self.generating_args["skip_special_tokens"],
+ }
+
+ def stream_request():
+ json_data = {
+ "input_ids": prompt_ids,
+ "sampling_params": sampling_params,
+ "stream": True,
+ }
+ if self.lora_request:
+ json_data["lora_request"] = ["lora0"]
+ response = requests.post(f"{self.base_url}/generate", json=json_data, stream=True)
+ if response.status_code != 200:
+ raise RuntimeError(f"SGLang server error: {response.status_code}, {response.text}")
+
+ for chunk in response.iter_lines(decode_unicode=False):
+ chunk = str(chunk.decode("utf-8"))
+ if chunk == "data: [DONE]":
+ break
+
+ if chunk and chunk.startswith("data:"):
+ yield json.loads(chunk[5:].strip("\n"))
+
+ return await asyncio.to_thread(stream_request)
+
+ @override
+ async def chat(
+ self,
+ messages: Sequence[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[Sequence["ImageInput"]] = None,
+ videos: Optional[Sequence["VideoInput"]] = None,
+ audios: Optional[Sequence["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> list["Response"]:
+ final_output = None
+ generator = await self._generate(messages, system, tools, images, videos, audios, **input_kwargs)
+ for request_output in generator:
+ final_output = request_output
+
+ results = [
+ Response(
+ response_text=final_output["text"],
+ response_length=final_output["meta_info"]["completion_tokens"],
+ prompt_length=final_output["meta_info"]["prompt_tokens"],
+ finish_reason="stop" if final_output["meta_info"]["finish_reason"] == "stop" else "length",
+ )
+ ]
+ return results
+
+ @override
+ async def stream_chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> AsyncGenerator[str, None]:
+ generated_text = ""
+ generator = await self._generate(messages, system, tools, images, videos, audios, **input_kwargs)
+ for result in generator:
+ delta_text = result["text"][len(generated_text) :]
+ generated_text = result["text"]
+ yield delta_text
+
+ @override
+ async def get_scores(
+ self,
+ batch_input: list[str],
+ **input_kwargs,
+ ) -> list[float]:
+ raise NotImplementedError("SGLang engine does not support `get_scores`.")
+
+ def __del__(self):
+ r"""Ensure server is cleaned up when object is deleted."""
+ self._cleanup_server()
+ try:
+ atexit.unregister(self._cleanup_server)
+ except Exception:
+ pass
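
For completeness, a hedged sketch of reaching this engine through `ChatModel`. The argument keys mirror the model arguments referenced above (`infer_backend`, `sglang_maxlen`); the model id and template name are placeholders:

```python
# Usage sketch; assumes `pip install "sglang[all]"` and a GPU-capable environment.
from llamafactory.chat import ChatModel

chat_model = ChatModel({
    "model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",  # placeholder model id
    "template": "qwen",                                # placeholder template name
    "infer_backend": "sglang",
    "sglang_maxlen": 4096,
})
for delta in chat_model.stream_chat([{"role": "user", "content": "Hi"}]):
    print(delta, end="", flush=True)
```
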
diff --git a/llamafactory/chat/vllm_engine.py b/llamafactory/chat/vllm_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..b33705279da13d398f8089b94c72b22d742f2c6f
--- /dev/null
+++ b/llamafactory/chat/vllm_engine.py
@@ -0,0 +1,263 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import uuid
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+from typing_extensions import override
+
+from ..data import get_template_and_fix_tokenizer
+from ..extras import logging
+from ..extras.constants import AUDIO_PLACEHOLDER, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER, EngineName
+from ..extras.misc import get_device_count
+from ..extras.packages import is_vllm_available
+from ..model import load_config, load_tokenizer
+from ..model.model_utils.quantization import QuantizationMethod
+from ..model.model_utils.visual import LlavaMultiModalProjectorForYiVLForVLLM
+from .base_engine import BaseEngine, Response
+
+
+if is_vllm_available():
+ from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams
+ from vllm.lora.request import LoRARequest
+
+
+if TYPE_CHECKING:
+ from ..data.mm_plugin import AudioInput, ImageInput, VideoInput
+ from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+class VllmEngine(BaseEngine):
+ def __init__(
+ self,
+ model_args: "ModelArguments",
+ data_args: "DataArguments",
+ finetuning_args: "FinetuningArguments",
+ generating_args: "GeneratingArguments",
+ ) -> None:
+ self.name = EngineName.VLLM
+ self.model_args = model_args
+ config = load_config(model_args) # may download model from ms hub
+ if getattr(config, "quantization_config", None): # gptq models should use float16
+ quantization_config: dict[str, Any] = getattr(config, "quantization_config", None)
+ quant_method = quantization_config.get("quant_method", "")
+ if quant_method == QuantizationMethod.GPTQ and model_args.infer_dtype == "auto":
+ model_args.infer_dtype = "float16"
+
+ self.can_generate = finetuning_args.stage == "sft"
+ tokenizer_module = load_tokenizer(model_args)
+ self.tokenizer = tokenizer_module["tokenizer"]
+ self.processor = tokenizer_module["processor"]
+ self.tokenizer.padding_side = "left"
+ self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args)
+ self.template.mm_plugin.expand_mm_tokens = False # for vllm generate
+ self.generating_args = generating_args.to_dict()
+
+ engine_args = {
+ "model": model_args.model_name_or_path,
+ "trust_remote_code": model_args.trust_remote_code,
+ "download_dir": model_args.cache_dir,
+ "dtype": model_args.infer_dtype,
+ "max_model_len": model_args.vllm_maxlen,
+ "tensor_parallel_size": get_device_count() or 1,
+ "gpu_memory_utilization": model_args.vllm_gpu_util,
+ "disable_log_stats": True,
+ "disable_log_requests": True,
+ "enforce_eager": model_args.vllm_enforce_eager,
+ "enable_lora": model_args.adapter_name_or_path is not None,
+ "max_lora_rank": model_args.vllm_max_lora_rank,
+ }
+ if self.template.mm_plugin.__class__.__name__ != "BasePlugin":
+ engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}
+
+ if isinstance(model_args.vllm_config, dict):
+ engine_args.update(model_args.vllm_config)
+
+ if getattr(config, "is_yi_vl_derived_model", None):
+ import vllm.model_executor.models.llava
+
+ logger.info_rank0("Detected Yi-VL model, applying projector patch.")
+ vllm.model_executor.models.llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVLForVLLM
+
+ self.model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_args))
+ if model_args.adapter_name_or_path is not None:
+ self.lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0])
+ else:
+ self.lora_request = None
+
+ async def _generate(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> AsyncIterator["RequestOutput"]:
+ request_id = f"chatcmpl-{uuid.uuid4().hex}"
+ if images is not None and not any(IMAGE_PLACEHOLDER in message["content"] for message in messages):
+ messages[0]["content"] = IMAGE_PLACEHOLDER * len(images) + messages[0]["content"]
+
+ if videos is not None and not any(VIDEO_PLACEHOLDER in message["content"] for message in messages):
+ messages[0]["content"] = VIDEO_PLACEHOLDER * len(videos) + messages[0]["content"]
+
+ if audios is not None and not any(AUDIO_PLACEHOLDER in message["content"] for message in messages):
+ messages[0]["content"] = AUDIO_PLACEHOLDER * len(audios) + messages[0]["content"]
+
+ messages = self.template.mm_plugin.process_messages(
+ messages, images or [], videos or [], audios or [], self.processor
+ )
+ paired_messages = messages + [{"role": "assistant", "content": ""}]
+ prompt_ids, _ = self.template.encode_oneturn(self.tokenizer, paired_messages, system, tools)
+ prompt_length = len(prompt_ids)
+
+ temperature: Optional[float] = input_kwargs.pop("temperature", None)
+ top_p: Optional[float] = input_kwargs.pop("top_p", None)
+ top_k: Optional[float] = input_kwargs.pop("top_k", None)
+ num_return_sequences: int = input_kwargs.pop("num_return_sequences", 1)
+ repetition_penalty: Optional[float] = input_kwargs.pop("repetition_penalty", None)
+ length_penalty: Optional[float] = input_kwargs.pop("length_penalty", None)
+ skip_special_tokens: Optional[bool] = input_kwargs.pop("skip_special_tokens", None)
+ max_length: Optional[int] = input_kwargs.pop("max_length", None)
+ max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None)
+ stop: Optional[Union[str, list[str]]] = input_kwargs.pop("stop", None)
+
+ if length_penalty is not None:
+ logger.warning_rank0("Length penalty is not supported by the vllm engine yet.")
+
+ if "max_new_tokens" in self.generating_args:
+ max_tokens = self.generating_args["max_new_tokens"]
+ elif "max_length" in self.generating_args:
+ if self.generating_args["max_length"] > prompt_length:
+ max_tokens = self.generating_args["max_length"] - prompt_length
+ else:
+ max_tokens = 1
+
+ if max_length:
+ max_tokens = max_length - prompt_length if max_length > prompt_length else 1
+
+ if max_new_tokens:
+ max_tokens = max_new_tokens
+
+ sampling_params = SamplingParams(
+ n=num_return_sequences,
+ repetition_penalty=(
+ repetition_penalty if repetition_penalty is not None else self.generating_args["repetition_penalty"]
+ )
+ or 1.0, # repetition_penalty must be > 0
+ temperature=temperature if temperature is not None else self.generating_args["temperature"],
+ top_p=(top_p if top_p is not None else self.generating_args["top_p"]) or 1.0, # top_p must be > 0
+ top_k=(top_k if top_k is not None else self.generating_args["top_k"]) or -1, # top_k must be > 0
+ stop=stop,
+ stop_token_ids=self.template.get_stop_token_ids(self.tokenizer),
+ max_tokens=max_tokens,
+ skip_special_tokens=skip_special_tokens
+ if skip_special_tokens is not None
+ else self.generating_args["skip_special_tokens"],
+ )
+
+ if images is not None: # add image features
+ multi_modal_data = {
+ "image": self.template.mm_plugin._regularize_images(
+ images,
+ image_max_pixels=self.model_args.image_max_pixels,
+ image_min_pixels=self.model_args.image_min_pixels,
+ )["images"]
+ }
+ elif videos is not None:
+ multi_modal_data = {
+ "video": self.template.mm_plugin._regularize_videos(
+ videos,
+ image_max_pixels=self.model_args.video_max_pixels,
+ image_min_pixels=self.model_args.video_min_pixels,
+ video_fps=self.model_args.video_fps,
+ video_maxlen=self.model_args.video_maxlen,
+ )["videos"]
+ }
+ elif audios is not None:
+ audio_data = self.template.mm_plugin._regularize_audios(
+ audios,
+ sampling_rate=self.model_args.audio_sampling_rate,
+ )
+ multi_modal_data = {"audio": zip(audio_data["audios"], audio_data["sampling_rates"])}
+ else:
+ multi_modal_data = None
+
+ result_generator = self.model.generate(
+ {"prompt_token_ids": prompt_ids, "multi_modal_data": multi_modal_data},
+ sampling_params=sampling_params,
+ request_id=request_id,
+ lora_request=self.lora_request,
+ )
+ return result_generator
+
+ @override
+ async def chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> list["Response"]:
+ final_output = None
+ generator = await self._generate(messages, system, tools, images, videos, audios, **input_kwargs)
+ async for request_output in generator:
+ final_output = request_output
+
+ results = []
+ for output in final_output.outputs:
+ results.append(
+ Response(
+ response_text=output.text,
+ response_length=len(output.token_ids),
+ prompt_length=len(final_output.prompt_token_ids),
+ finish_reason=output.finish_reason,
+ )
+ )
+
+ return results
+
+ @override
+ async def stream_chat(
+ self,
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ images: Optional[list["ImageInput"]] = None,
+ videos: Optional[list["VideoInput"]] = None,
+ audios: Optional[list["AudioInput"]] = None,
+ **input_kwargs,
+ ) -> AsyncGenerator[str, None]:
+ generated_text = ""
+ generator = await self._generate(messages, system, tools, images, videos, audios, **input_kwargs)
+ async for result in generator:
+ delta_text = result.outputs[0].text[len(generated_text) :]
+ generated_text = result.outputs[0].text
+ yield delta_text
+
+ @override
+ async def get_scores(
+ self,
+ batch_input: list[str],
+ **input_kwargs,
+ ) -> list[float]:
+ raise NotImplementedError("vLLM engine does not support `get_scores`.")
diff --git a/llamafactory/cli.py b/llamafactory/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..d574bf1db543f5379f074e276898826234708037
--- /dev/null
+++ b/llamafactory/cli.py
@@ -0,0 +1,31 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def main():
+ from .extras.misc import is_env_enabled
+
+ if is_env_enabled("USE_V1"):
+ from .v1 import launcher
+ else:
+ from . import launcher
+
+ launcher.launch()
+
+
+if __name__ == "__main__":
+ from multiprocessing import freeze_support
+
+ freeze_support()
+ main()
diff --git a/llamafactory/data/__init__.py b/llamafactory/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..11c8c9fcecd10e736e240196fde98f833c9df3dc
--- /dev/null
+++ b/llamafactory/data/__init__.py
@@ -0,0 +1,37 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .collator import (
+ KTODataCollatorWithPadding,
+ MultiModalDataCollatorForSeq2Seq,
+ PairwiseDataCollatorWithPadding,
+ SFTDataCollatorWith4DAttentionMask,
+)
+from .data_utils import Role, split_dataset
+from .loader import get_dataset
+from .template import TEMPLATES, Template, get_template_and_fix_tokenizer
+
+
+__all__ = [
+ "TEMPLATES",
+ "KTODataCollatorWithPadding",
+ "MultiModalDataCollatorForSeq2Seq",
+ "PairwiseDataCollatorWithPadding",
+ "Role",
+ "SFTDataCollatorWith4DAttentionMask",
+ "Template",
+ "get_dataset",
+ "get_template_and_fix_tokenizer",
+ "split_dataset",
+]
diff --git a/llamafactory/data/__pycache__/__init__.cpython-312.pyc b/llamafactory/data/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8aeb4c34a1136f093a1b21a6f280408dd1286477
Binary files /dev/null and b/llamafactory/data/__pycache__/__init__.cpython-312.pyc differ
diff --git a/llamafactory/data/__pycache__/collator.cpython-312.pyc b/llamafactory/data/__pycache__/collator.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..912142ec8dd6bc455b401e5935f10e8190a6bdf3
Binary files /dev/null and b/llamafactory/data/__pycache__/collator.cpython-312.pyc differ
diff --git a/llamafactory/data/__pycache__/converter.cpython-312.pyc b/llamafactory/data/__pycache__/converter.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b29a1575867106edc4f03a5b02406ad72871b1c5
Binary files /dev/null and b/llamafactory/data/__pycache__/converter.cpython-312.pyc differ
diff --git a/llamafactory/data/__pycache__/data_utils.cpython-312.pyc b/llamafactory/data/__pycache__/data_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4276f26997c27740fbc9ec5faaf3e99b3442d7eb
Binary files /dev/null and b/llamafactory/data/__pycache__/data_utils.cpython-312.pyc differ
diff --git a/llamafactory/data/__pycache__/formatter.cpython-312.pyc b/llamafactory/data/__pycache__/formatter.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8f78d7f723f0fb3ea2ec501c517ee4dbdad4e49
Binary files /dev/null and b/llamafactory/data/__pycache__/formatter.cpython-312.pyc differ
diff --git a/llamafactory/data/__pycache__/loader.cpython-312.pyc b/llamafactory/data/__pycache__/loader.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8232883361bdf807fa8523fdbca05e1b992c535
Binary files /dev/null and b/llamafactory/data/__pycache__/loader.cpython-312.pyc differ
diff --git a/llamafactory/data/__pycache__/mm_plugin.cpython-312.pyc b/llamafactory/data/__pycache__/mm_plugin.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eff12f0ba2ac7b9f274742c73de5198970487bd5
Binary files /dev/null and b/llamafactory/data/__pycache__/mm_plugin.cpython-312.pyc differ
diff --git a/llamafactory/data/__pycache__/parser.cpython-312.pyc b/llamafactory/data/__pycache__/parser.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d60110a599edbc6e1a3ceac15438706301ab9a77
Binary files /dev/null and b/llamafactory/data/__pycache__/parser.cpython-312.pyc differ
diff --git a/llamafactory/data/__pycache__/template.cpython-312.pyc b/llamafactory/data/__pycache__/template.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df03dd6517d2b90e1d8c61583f02f7da39977c9d
Binary files /dev/null and b/llamafactory/data/__pycache__/template.cpython-312.pyc differ
diff --git a/llamafactory/data/__pycache__/tool_utils.cpython-312.pyc b/llamafactory/data/__pycache__/tool_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7989ecbc820bda2775d2bc6f82b90fcd446ea4c2
Binary files /dev/null and b/llamafactory/data/__pycache__/tool_utils.cpython-312.pyc differ
diff --git a/llamafactory/data/collator.py b/llamafactory/data/collator.py
new file mode 100644
index 0000000000000000000000000000000000000000..162f432c9e5bf195ed4c6a821eb36d279bf3bac4
--- /dev/null
+++ b/llamafactory/data/collator.py
@@ -0,0 +1,331 @@
+# Copyright 2025 OpenAccess AI Collective and the LlamaFactory team.
+#
+# This code is inspired by the OpenAccess AI Collective's axolotl library.
+# https://github.com/OpenAccess-AI-Collective/axolotl/blob/main/src/axolotl/monkeypatch/utils.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Literal, Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from peft import PeftModel
+from transformers import DataCollatorForSeq2Seq
+
+from ..extras.constants import AUDIO_PLACEHOLDER, IGNORE_INDEX, IMAGE_PLACEHOLDER
+from ..extras.packages import is_pillow_available
+
+
+if is_pillow_available():
+ from PIL import Image
+
+
+if TYPE_CHECKING:
+ from transformers import ProcessorMixin
+
+ from .template import Template
+
+
+def prepare_4d_attention_mask(attention_mask_with_indices: "torch.Tensor", dtype: "torch.dtype") -> "torch.Tensor":
+ r"""Expand 2d attention mask to 4d attention mask.
+
+ Expand the attention mask with indices from (batch_size, seq_len) to (batch_size, 1, seq_len, seq_len),
+    handle packed sequences, and transform the mask to lower triangular form to prevent future peeking.
+
+ e.g.
+ ```python
+ # input
+ [[1, 1, 2, 2, 2, 0]]
+ # output
+ [
+ [
+ [
+ [o, x, x, x, x, x],
+ [o, o, x, x, x, x],
+ [x, x, o, x, x, x],
+ [x, x, o, o, x, x],
+ [x, x, o, o, o, x],
+ [x, x, x, x, x, x],
+ ]
+ ]
+ ]
+ ```
+    where `o` equals `0.0` and `x` equals `min_dtype`.
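+
+    A minimal usage sketch (illustrative; values follow the example above):
+    ```python
+    mask = torch.tensor([[1, 1, 2, 2, 2, 0]])  # two packed sequences plus one pad token
+    attn = prepare_4d_attention_mask(mask, torch.float32)
+    print(attn.shape)  # torch.Size([1, 1, 6, 6])
+    ```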
+ """
+ _, seq_len = attention_mask_with_indices.size()
+ min_dtype = torch.finfo(dtype).min
+ zero_tensor = torch.tensor(0, dtype=dtype)
+
+ # Create a non-padding mask.
+ non_padding_mask = (attention_mask_with_indices != 0).unsqueeze(1).unsqueeze(2)
+ # Create indices for comparison.
+ indices = attention_mask_with_indices.unsqueeze(1).unsqueeze(2) # [bsz, 1, 1, seq_len]
+ indices_t = attention_mask_with_indices.unsqueeze(1).unsqueeze(3) # [bsz, 1, seq_len, 1]
+ # Create a lower triangular mask.
+ tril_mask = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool))
+ attention_mask_4d = (indices == indices_t) & non_padding_mask & tril_mask
+ # Invert the attention mask.
+ attention_mask_4d = torch.where(attention_mask_4d, zero_tensor, min_dtype)
+ return attention_mask_4d
+
+
+@dataclass
+class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
+ r"""Data collator that supports VLMs.
+
+ Features should contain input_ids, attention_mask, labels, and optionally contain images, videos and audios.
+ """
+
+ template: Optional["Template"] = None
+ processor: Optional["ProcessorMixin"] = None
+
+ def __post_init__(self):
+ if self.template is None:
+ raise ValueError("Template is required for MultiModalDataCollator.")
+
+ if isinstance(self.model, PeftModel):
+ self.model = self.model.base_model.model
+
+ if self.model is not None and hasattr(self.model, "get_rope_index"): # for qwen2vl mrope
+ self.get_rope_func = self.model.get_rope_index # transformers < 4.52.0 or qwen2.5 omni
+ elif self.model is not None and hasattr(self.model, "model") and hasattr(self.model.model, "get_rope_index"):
+ self.get_rope_func = self.model.model.get_rope_index # transformers >= 4.52.0
+ else:
+ self.get_rope_func = None
+
+ def __call__(self, features: list[dict[str, Any]]) -> dict[str, "torch.Tensor"]:
+ batch_images, batch_videos, batch_audios = [], [], []
+ batch_imglens, batch_vidlens, batch_audlens, batch_input_ids = [], [], [], []
+ for feature in features:
+ images = feature.pop("images", None) or []
+ videos = feature.pop("videos", None) or []
+ audios = feature.pop("audios", None) or []
+ batch_images.extend(images)
+ batch_videos.extend(videos)
+ batch_audios.extend(audios)
+ batch_imglens.append(len(images))
+ batch_vidlens.append(len(videos))
+ batch_audlens.append(len(audios))
+ batch_input_ids.append(feature["input_ids"])
+
+ fake_input_ids = []
+ if (
+ self.template.mm_plugin.image_token is not None and sum(batch_imglens) == 0 and sum(batch_vidlens) == 0
+ ): # avoid process hanging in zero3/fsdp case
+ fake_messages = [{"role": "user", "content": IMAGE_PLACEHOLDER}]
+ fake_images = [Image.new("RGB", (64, 64), (255, 255, 255))]
+ fake_messages = self.template.mm_plugin.process_messages(
+ fake_messages, fake_images, [], [], self.processor
+ )
+ _fake_input_ids = self.tokenizer.encode(fake_messages[0]["content"], add_special_tokens=False)
+ _fake_input_ids, _ = self.template.mm_plugin.process_token_ids(
+ _fake_input_ids, None, fake_images, [], [], self.tokenizer, self.processor
+ )
+ fake_input_ids.extend(_fake_input_ids)
+ batch_images = fake_images
+ batch_imglens[0] = 1
+
+ if (
+ self.template.mm_plugin.audio_token is not None and sum(batch_audlens) == 0
+ ): # avoid process hanging in zero3/fsdp case
+ fake_messages = [{"role": "user", "content": AUDIO_PLACEHOLDER}]
+ fake_audios = [np.zeros(1600)]
+ fake_messages = self.template.mm_plugin.process_messages(
+ fake_messages, [], [], fake_audios, self.processor
+ )
+ _fake_input_ids = self.tokenizer.encode(fake_messages[0]["content"], add_special_tokens=False)
+ _fake_input_ids, _ = self.template.mm_plugin.process_token_ids(
+ _fake_input_ids, None, [], [], fake_audios, self.tokenizer, self.processor
+ )
+ fake_input_ids.extend(_fake_input_ids)
+ batch_audios = fake_audios
+ batch_audlens[0] = 1
+
+ if len(fake_input_ids) != 0:
+ if self.tokenizer.padding_side == "right":
+ features[0]["input_ids"] = features[0]["input_ids"] + fake_input_ids
+ features[0]["attention_mask"] = features[0]["attention_mask"] + [0] * len(fake_input_ids)
+ features[0]["labels"] = features[0]["labels"] + [IGNORE_INDEX] * len(fake_input_ids)
+ else:
+ features[0]["input_ids"] = fake_input_ids + features[0]["input_ids"]
+ features[0]["attention_mask"] = [0] * len(fake_input_ids) + features[0]["attention_mask"]
+ features[0]["labels"] = [IGNORE_INDEX] * len(fake_input_ids) + features[0]["labels"]
+
+ batch_input_ids[0] = features[0]["input_ids"]
+
+ mm_inputs = self.template.mm_plugin.get_mm_inputs(
+ batch_images,
+ batch_videos,
+ batch_audios,
+ batch_imglens,
+ batch_vidlens,
+ batch_audlens,
+ batch_input_ids,
+ self.processor,
+ )
+ if "token_type_ids" in mm_inputs:
+ token_type_ids = mm_inputs.pop("token_type_ids")
+ for i, feature in enumerate(features):
+ feature["token_type_ids"] = token_type_ids[i]
+
+ features: dict[str, torch.Tensor] = super().__call__(features)
+
+ if self.get_rope_func is not None:
+ rope_index_kwargs = {
+ "input_ids": features["input_ids"],
+ "image_grid_thw": mm_inputs.get("image_grid_thw"),
+ "video_grid_thw": mm_inputs.get("video_grid_thw"),
+ "attention_mask": (features["attention_mask"] >= 1).float(),
+ }
+ if "second_per_grid_ts" in mm_inputs: # for qwen2vl
+ rope_index_kwargs["second_per_grid_ts"] = mm_inputs.get("second_per_grid_ts")
+ elif "video_second_per_grid" in mm_inputs: # for qwen2.5 omni
+ rope_index_kwargs["second_per_grids"] = mm_inputs.get("video_second_per_grid")
+
+ if getattr(self.model.config, "model_type", None) in ["qwen2_5_omni_thinker", "qwen3_omni_moe_thinker"]:
+ rope_index_kwargs["use_audio_in_video"] = getattr(self.processor, "use_audio_in_video", False)
+ feature_attention_mask = mm_inputs.get("feature_attention_mask", None)
+ if feature_attention_mask is not None: # FIXME: need to get video image lengths
+ audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
+ rope_index_kwargs["audio_seqlens"] = audio_feature_lengths # prepare for input
+
+ features["position_ids"], rope_deltas = self.get_rope_func(**rope_index_kwargs)
+ features["rope_deltas"] = rope_deltas - (1 - rope_index_kwargs["attention_mask"]).sum(
+ dim=-1
+ ).unsqueeze(-1)
+ else: # for qwen vl
+ features["position_ids"], features["rope_deltas"] = self.get_rope_func(**rope_index_kwargs)
+
+ if (
+ self.model is not None
+ and getattr(self.model.config, "model_type", None)
+ in [
+ "glm4v",
+ "Keye",
+ "qwen2_vl",
+ "qwen2_5_vl",
+ "qwen2_5_omni_thinker",
+ "qwen3_omni_moe_thinker",
+ "qwen3_vl",
+ "qwen3_vl_moe",
+ ]
+ and ("position_ids" not in features or features["position_ids"].dim() != 3)
+ ):
+ raise ValueError(f"{self.model.config.model_type} requires 3D position ids for mrope.")
+
+ if "cross_attention_mask" in mm_inputs: # for mllama inputs when pad_to_multiple_of is enabled
+ cross_attention_mask = mm_inputs.pop("cross_attention_mask")
+ seq_len = features["input_ids"].size(1)
+ orig_len = cross_attention_mask.size(1)
+ mm_inputs["cross_attention_mask"] = F.pad(cross_attention_mask, (0, 0, 0, 0, 0, seq_len - orig_len))
+
+ features.update(mm_inputs)
+
+ if "image_bound" in features: # for minicpmv inputs
+ bsz, seq_length = features["input_ids"].shape
+ features["position_ids"] = torch.arange(seq_length).long().repeat(bsz, 1)
+ return {"data": features, "input_ids": features["input_ids"], "labels": features["labels"]}
+
+ return features
+
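+# A rough construction sketch (objects are illustrative; `template` usually comes from
+# `get_template_and_fix_tokenizer` and `tokenizer`/`processor` from the model loader):
+#
+#   collator = MultiModalDataCollatorForSeq2Seq(
+#       template=template, processor=processor, tokenizer=tokenizer, model=model
+#   )
+#   batch = collator(features)  # each feature holds input_ids, attention_mask and labels
+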
+
+@dataclass
+class SFTDataCollatorWith4DAttentionMask(MultiModalDataCollatorForSeq2Seq):
+ r"""Data collator for 4d attention mask."""
+
+ block_diag_attn: bool = False
+ attn_implementation: Literal["eager", "sdpa", "flash_attention_2"] = "eager"
+ compute_dtype: "torch.dtype" = torch.float32
+
+ def __call__(self, features: list[dict[str, Any]]) -> dict[str, "torch.Tensor"]:
+ features = super().__call__(features)
+ if self.block_diag_attn and self.attn_implementation != "flash_attention_2":
+ features["attention_mask"] = prepare_4d_attention_mask(features["attention_mask"], self.compute_dtype)
+
+ for key, value in features.items(): # cast data dtype for paligemma
+ if torch.is_tensor(value) and torch.is_floating_point(value):
+ features[key] = value.to(self.compute_dtype)
+
+ return features
+
+
+@dataclass
+class PairwiseDataCollatorWithPadding(MultiModalDataCollatorForSeq2Seq):
+ r"""Data collator for pairwise data."""
+
+ def __call__(self, features: list[dict[str, Any]]) -> dict[str, "torch.Tensor"]:
+ r"""Pad batched data to the longest sequence in the batch.
+
+ We generate 2 * n examples where the first n examples represent chosen examples and
+ the last n examples represent rejected examples.
+ """
+ concatenated_features = []
+ for key in ("chosen", "rejected"):
+ for feature in features:
+ target_feature = {
+ "input_ids": feature[f"{key}_input_ids"],
+ "attention_mask": feature[f"{key}_attention_mask"],
+ "labels": feature[f"{key}_labels"],
+ "images": feature["images"],
+ "videos": feature["videos"],
+ "audios": feature["audios"],
+ }
+ concatenated_features.append(target_feature)
+
+ return super().__call__(concatenated_features)
+
+
+@dataclass
+class KTODataCollatorWithPadding(MultiModalDataCollatorForSeq2Seq):
+ r"""Data collator for KTO data."""
+
+ def __call__(self, features: list[dict[str, Any]]) -> dict[str, "torch.Tensor"]:
+ target_features = []
+ kl_features = []
+ kto_tags = []
+ for feature in features:
+ target_feature = {
+ "input_ids": feature["input_ids"],
+ "attention_mask": feature["attention_mask"],
+ "labels": feature["labels"],
+ "images": feature["images"],
+ "videos": feature["videos"],
+ "audios": feature["audios"],
+ }
+ kl_feature = {
+ "input_ids": feature["kl_input_ids"],
+ "attention_mask": feature["kl_attention_mask"],
+ "labels": feature["kl_labels"],
+ "images": feature["images"],
+ "videos": feature["videos"],
+ "audios": feature["audios"],
+ }
+ target_features.append(target_feature)
+ kl_features.append(kl_feature)
+ kto_tags.append(feature["kto_tags"])
+
+ batch = super().__call__(target_features)
+ kl_batch = super().__call__(kl_features)
+ batch["kl_input_ids"] = kl_batch["input_ids"]
+ batch["kl_attention_mask"] = kl_batch["attention_mask"]
+ batch["kl_labels"] = kl_batch["labels"]
+ if "cross_attention_mask" in kl_batch: # for mllama inputs
+ batch["kl_cross_attention_mask"] = kl_batch["cross_attention_mask"]
+
+ if "token_type_ids" in kl_batch:
+ batch["kl_token_type_ids"] = kl_batch["token_type_ids"]
+
+ batch["kto_tags"] = torch.tensor(kto_tags)
+ return batch
diff --git a/llamafactory/data/converter.py b/llamafactory/data/converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac3735e648eee7117154c5301e8adae746a566d7
--- /dev/null
+++ b/llamafactory/data/converter.py
@@ -0,0 +1,425 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+from abc import abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+from ..extras import logging
+from .data_utils import Role
+
+
+if TYPE_CHECKING:
+ from datasets import Dataset, IterableDataset
+ from transformers import Seq2SeqTrainingArguments
+
+ from ..hparams import DataArguments
+ from .mm_plugin import AudioInput, ImageInput, VideoInput
+ from .parser import DatasetAttr
+
+ MediaType = Union[ImageInput, VideoInput, AudioInput]
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class DatasetConverter:
+ dataset_attr: "DatasetAttr"
+ data_args: "DataArguments"
+
+ def _find_medias(self, medias: Union["MediaType", list["MediaType"], None]) -> Optional[list["MediaType"]]:
+ r"""Optionally concatenate media path to media dir when loading from local disk."""
+ if medias is None:
+ return None
+ elif not isinstance(medias, list):
+ medias = [medias]
+ elif len(medias) == 0:
+ return None
+ else:
+ medias = medias[:]
+
+ if self.dataset_attr.load_from in ["script", "file"]:
+ if isinstance(medias[0], str):
+ for i in range(len(medias)):
+ media_path = os.path.join(self.data_args.media_dir, medias[i])
+ if os.path.isfile(media_path):
+ medias[i] = media_path
+ else:
+ logger.warning_rank0_once(
+ f"Media {medias[i]} does not exist in `media_dir`. Use original path."
+ )
+ elif isinstance(medias[0], list): # for processed video frames
+ # medias is a list of lists, e.g., [[frame1.jpg, frame2.jpg], [frame3.jpg, frame4.jpg]]
+ for i in range(len(medias)):
+ for j in range(len(medias[i])):
+ media_path = os.path.join(self.data_args.media_dir, medias[i][j])
+ if os.path.isfile(media_path):
+ medias[i][j] = media_path
+ else:
+ logger.warning_rank0_once(
+ f"Media {medias[i][j]} does not exist in `media_dir`. Use original path."
+ )
+
+ return medias
+
+ @abstractmethod
+ def __call__(self, example: dict[str, Any]) -> dict[str, Any]:
+ r"""Convert a single example in the dataset to the standard format."""
+ ...
+
+
+@dataclass
+class AlpacaDatasetConverter(DatasetConverter):
+ def __call__(self, example: dict[str, Any]) -> dict[str, Any]:
+ prompt = []
+ if self.dataset_attr.history and isinstance(example[self.dataset_attr.history], list):
+ for old_prompt, old_response in example[self.dataset_attr.history]:
+ prompt.append({"role": Role.USER.value, "content": old_prompt})
+ prompt.append({"role": Role.ASSISTANT.value, "content": old_response})
+
+ query = []
+ if self.dataset_attr.prompt and example[self.dataset_attr.prompt]:
+ query.append(example[self.dataset_attr.prompt])
+
+ if self.dataset_attr.query and example[self.dataset_attr.query]:
+ query.append(example[self.dataset_attr.query])
+
+ prompt.append({"role": Role.USER.value, "content": "\n".join(query)}) # "prompt\nquery"
+
+ if self.dataset_attr.kto_tag and isinstance(example[self.dataset_attr.kto_tag], bool): # kto example
+ response = [{"role": Role.ASSISTANT.value, "content": example[self.dataset_attr.response]}]
+ if example[self.dataset_attr.kto_tag]:
+ response = response + [{"role": Role.ASSISTANT.value, "content": ""}]
+ else:
+ response = [{"role": Role.ASSISTANT.value, "content": ""}] + response
+ elif (
+ self.dataset_attr.ranking
+ and isinstance(example[self.dataset_attr.chosen], str)
+ and isinstance(example[self.dataset_attr.rejected], str)
+ ): # pairwise example
+ response = [
+ {"role": Role.ASSISTANT.value, "content": example[self.dataset_attr.chosen]},
+ {"role": Role.ASSISTANT.value, "content": example[self.dataset_attr.rejected]},
+ ]
+ elif self.dataset_attr.response and isinstance(example[self.dataset_attr.response], str): # normal example
+ response = [{"role": Role.ASSISTANT.value, "content": example[self.dataset_attr.response]}]
+ else: # unsupervised
+ response = []
+
+ output = {
+ "_prompt": prompt,
+ "_response": response,
+ "_system": example[self.dataset_attr.system] if self.dataset_attr.system else "",
+ "_tools": example[self.dataset_attr.tools] if self.dataset_attr.tools else "",
+ "_images": self._find_medias(example[self.dataset_attr.images]) if self.dataset_attr.images else None,
+ "_videos": self._find_medias(example[self.dataset_attr.videos]) if self.dataset_attr.videos else None,
+ "_audios": self._find_medias(example[self.dataset_attr.audios]) if self.dataset_attr.audios else None,
+ }
+ return output
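+
+# With the default alpaca column names (an assumption; the actual names come from the
+# dataset attributes), a record such as
+#   {"instruction": "Translate to French.", "input": "Hello", "output": "Bonjour"}
+# becomes one user turn "Translate to French.\nHello" and one assistant response "Bonjour".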
+
+
+@dataclass
+class SharegptDatasetConverter(DatasetConverter):
+ def __call__(self, example: dict[str, Any]) -> dict[str, Any]:
+ tag_mapping = {
+ self.dataset_attr.user_tag: Role.USER.value,
+ self.dataset_attr.assistant_tag: Role.ASSISTANT.value,
+ self.dataset_attr.observation_tag: Role.OBSERVATION.value,
+ self.dataset_attr.function_tag: Role.FUNCTION.value,
+ self.dataset_attr.system_tag: Role.SYSTEM.value,
+ }
+ odd_tags = (self.dataset_attr.user_tag, self.dataset_attr.observation_tag)
+ even_tags = (self.dataset_attr.assistant_tag, self.dataset_attr.function_tag)
+ accept_tags = (odd_tags, even_tags)
+ messages = example[self.dataset_attr.messages]
+ if (
+ self.dataset_attr.system_tag
+ and len(messages) != 0
+ and messages[0][self.dataset_attr.role_tag] == self.dataset_attr.system_tag
+ ):
+ system = messages[0][self.dataset_attr.content_tag]
+ messages = messages[1:]
+ else:
+ system = example[self.dataset_attr.system] if self.dataset_attr.system else ""
+
+ aligned_messages = []
+ broken_data = False
+ for turn_idx, message in enumerate(messages):
+ if message[self.dataset_attr.role_tag] not in accept_tags[turn_idx % 2]:
+ logger.warning_rank0(f"Invalid role tag in {messages}.")
+ broken_data = True
+ break
+
+ aligned_messages.append(
+ {
+ "role": tag_mapping[message[self.dataset_attr.role_tag]],
+ "content": message[self.dataset_attr.content_tag],
+ }
+ )
+
+ if (not self.dataset_attr.ranking and len(aligned_messages) % 2 != 0) or (
+ self.dataset_attr.ranking and len(aligned_messages) % 2 == 0
+ ):
+ logger.warning_rank0(f"Invalid message count in {messages}.")
+ broken_data = True
+
+ if broken_data:
+ logger.warning_rank0("Skipping this abnormal example.")
+ prompt, response = [], []
+ elif self.dataset_attr.kto_tag and isinstance(example[self.dataset_attr.kto_tag], bool): # kto example
+ prompt = aligned_messages[:-1]
+ response = aligned_messages[-1:]
+ if example[self.dataset_attr.kto_tag]:
+ response = response + [{"role": Role.ASSISTANT.value, "content": ""}]
+ else:
+ response = [{"role": Role.ASSISTANT.value, "content": ""}] + response
+ elif (
+ self.dataset_attr.ranking
+ and isinstance(example[self.dataset_attr.chosen], dict)
+ and isinstance(example[self.dataset_attr.rejected], dict)
+ ): # pairwise example
+ chosen = example[self.dataset_attr.chosen]
+ rejected = example[self.dataset_attr.rejected]
+ if (
+ chosen[self.dataset_attr.role_tag] not in accept_tags[-1]
+ or rejected[self.dataset_attr.role_tag] not in accept_tags[-1]
+ ):
+ logger.warning_rank0(f"Invalid role tag in {[chosen, rejected]}.")
+ broken_data = True
+
+ prompt = aligned_messages
+ response = [
+ {
+ "role": tag_mapping[chosen[self.dataset_attr.role_tag]],
+ "content": chosen[self.dataset_attr.content_tag],
+ },
+ {
+ "role": tag_mapping[rejected[self.dataset_attr.role_tag]],
+ "content": rejected[self.dataset_attr.content_tag],
+ },
+ ]
+ else: # normal example
+ prompt = aligned_messages[:-1]
+ response = aligned_messages[-1:]
+
+ output = {
+ "_prompt": prompt,
+ "_response": response,
+ "_system": system,
+ "_tools": example[self.dataset_attr.tools] if self.dataset_attr.tools else "",
+ "_images": self._find_medias(example[self.dataset_attr.images]) if self.dataset_attr.images else None,
+ "_videos": self._find_medias(example[self.dataset_attr.videos]) if self.dataset_attr.videos else None,
+ "_audios": self._find_medias(example[self.dataset_attr.audios]) if self.dataset_attr.audios else None,
+ }
+ return output
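+
+# Sharegpt-style data alternates user/observation and assistant/function turns. With the
+# default tags (an assumption), a minimal record looks like:
+#   {"conversations": [{"from": "human", "value": "Hi"}, {"from": "gpt", "value": "Hello!"}]}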
+
+
+@dataclass
+class OpenAIDatasetConverter(DatasetConverter):
+ def __call__(self, example: dict[str, Any]) -> dict[str, Any]:
+ tag_mapping = {
+ self.dataset_attr.user_tag: Role.USER.value,
+ self.dataset_attr.assistant_tag: Role.ASSISTANT.value,
+ self.dataset_attr.observation_tag: Role.OBSERVATION.value,
+ self.dataset_attr.function_tag: Role.FUNCTION.value,
+ self.dataset_attr.system_tag: Role.SYSTEM.value,
+ }
+
+ messages = example[self.dataset_attr.messages]
+ if (
+ self.dataset_attr.system_tag
+ and len(messages) != 0
+ and messages[0][self.dataset_attr.role_tag] == self.dataset_attr.system_tag
+ ):
+ system = messages[0][self.dataset_attr.content_tag]
+ messages = messages[1:]
+ else:
+ system = example.get(self.dataset_attr.system, "") if self.dataset_attr.system else ""
+
+ aligned_messages = []
+ tool_responses = []
+ broken_data = False
+ for turn_idx, message in enumerate(messages):
+ role = message[self.dataset_attr.role_tag]
+ content = message[self.dataset_attr.content_tag]
+
+ if role in [self.dataset_attr.assistant_tag, self.dataset_attr.function_tag]:
+ if "tool_calls" in message and len(message["tool_calls"]) > 0:
+ tool_calls_list = [tool["function"] for tool in message["tool_calls"]]
+ content = json.dumps(tool_calls_list, ensure_ascii=False)
+ role = self.dataset_attr.function_tag
+
+ if role == self.dataset_attr.observation_tag:
+ tool_responses.append(content)
+ continue
+ elif len(tool_responses) > 0:
+ _content = "\n\n\n".join(tool_responses)
+ aligned_messages.append(
+ {
+ "role": Role.OBSERVATION.value,
+ "content": _content,
+ }
+ )
+ tool_responses = []
+
+ aligned_messages.append(
+ {
+ "role": tag_mapping[role],
+ "content": content,
+ }
+ )
+
+ odd_tags = (Role.USER.value, Role.OBSERVATION.value)
+ even_tags = (Role.ASSISTANT.value, Role.FUNCTION.value)
+ accept_tags = (odd_tags, even_tags)
+ for turn_idx, message in enumerate(aligned_messages):
+ if message["role"] not in accept_tags[turn_idx % 2]:
+ logger.warning_rank0(f"Invalid role tag in {messages}.")
+ broken_data = True
+ break
+
+ if (not self.dataset_attr.ranking and len(aligned_messages) % 2 != 0) or (
+ self.dataset_attr.ranking and len(aligned_messages) % 2 == 0
+ ):
+ logger.warning_rank0(f"Invalid message count in {messages}.")
+ broken_data = True
+
+ if broken_data:
+ logger.warning_rank0("Skipping this abnormal example.")
+ prompt, response = [], []
+ elif self.dataset_attr.kto_tag and isinstance(example[self.dataset_attr.kto_tag], bool): # kto example
+ prompt = aligned_messages[:-1]
+ response = aligned_messages[-1:]
+ if example[self.dataset_attr.kto_tag]:
+ response = response + [{"role": Role.ASSISTANT.value, "content": ""}]
+ else:
+ response = [{"role": Role.ASSISTANT.value, "content": ""}] + response
+ elif (
+ self.dataset_attr.ranking
+ and isinstance(example[self.dataset_attr.chosen], dict)
+ and isinstance(example[self.dataset_attr.rejected], dict)
+ ): # pairwise example
+ chosen = example[self.dataset_attr.chosen]
+ rejected = example[self.dataset_attr.rejected]
+ if (
+ chosen[self.dataset_attr.role_tag] not in accept_tags[-1]
+ or rejected[self.dataset_attr.role_tag] not in accept_tags[-1]
+ ):
+ logger.warning_rank0(f"Invalid role tag in {[chosen, rejected]}.")
+ broken_data = True
+
+ prompt = aligned_messages
+ response = [
+ {
+ "role": tag_mapping[chosen[self.dataset_attr.role_tag]],
+ "content": chosen[self.dataset_attr.content_tag],
+ },
+ {
+ "role": tag_mapping[rejected[self.dataset_attr.role_tag]],
+ "content": rejected[self.dataset_attr.content_tag],
+ },
+ ]
+ else: # normal example
+ prompt = aligned_messages[:-1]
+ response = aligned_messages[-1:]
+
+ tools = example.get(self.dataset_attr.tools, "") if self.dataset_attr.tools else ""
+ if isinstance(tools, dict) or isinstance(tools, list):
+ tools = json.dumps(tools, ensure_ascii=False)
+
+        short_system_prompt = "detailed thinking off"
+        if not system:
+            if not tools:
+                system = short_system_prompt
+        else:
+            if not tools:
+                # only add the default when the system prompt does not already set a thinking mode
+                if "detailed thinking on" not in system and "detailed thinking off" not in system:
+                    system += "\n" + short_system_prompt
+            else:
+                system += "\n"
+
+ output = {
+ "_prompt": prompt,
+ "_response": response,
+ "_system": system,
+ "_tools": tools,
+ "_images": self._find_medias(example[self.dataset_attr.images]) if self.dataset_attr.images else None,
+ "_videos": self._find_medias(example[self.dataset_attr.videos]) if self.dataset_attr.videos else None,
+ "_audios": self._find_medias(example[self.dataset_attr.audios]) if self.dataset_attr.audios else None,
+ }
+ return output
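+
+# An OpenAI-style assistant turn with tool calls (illustrative), e.g.
+#   {"role": "assistant", "tool_calls": [{"function": {"name": "get_weather", "arguments": {"city": "Beijing"}}}]}
+# has its inner "function" dicts serialized to JSON and is re-tagged as a function message.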
+
+
+DATASET_CONVERTERS = {
+ "alpaca": AlpacaDatasetConverter,
+ "sharegpt": SharegptDatasetConverter,
+ "openai": OpenAIDatasetConverter,
+}
+
+
+def register_dataset_converter(name: str, dataset_converter: type["DatasetConverter"]) -> None:
+ r"""Register a new dataset converter."""
+ if name in DATASET_CONVERTERS:
+ raise ValueError(f"Dataset converter {name} already exists.")
+
+ DATASET_CONVERTERS[name] = dataset_converter
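+
+# A registration sketch for a custom format (names are illustrative):
+#
+#   @dataclass
+#   class MyDatasetConverter(DatasetConverter):
+#       def __call__(self, example: dict[str, Any]) -> dict[str, Any]:
+#           return {"_prompt": [...], "_response": [...], "_system": "", "_tools": "",
+#                   "_images": None, "_videos": None, "_audios": None}
+#
+#   register_dataset_converter("my_format", MyDatasetConverter)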
+
+
+def get_dataset_converter(name: str, dataset_attr: "DatasetAttr", data_args: "DataArguments") -> "DatasetConverter":
+ r"""Get a dataset converter."""
+ if name not in DATASET_CONVERTERS:
+ raise ValueError(f"Dataset converter {name} not found.")
+
+ return DATASET_CONVERTERS[name](dataset_attr, data_args)
+
+
+def align_dataset(
+ dataset: Union["Dataset", "IterableDataset"],
+ dataset_attr: "DatasetAttr",
+ data_args: "DataArguments",
+ training_args: "Seq2SeqTrainingArguments",
+) -> Union["Dataset", "IterableDataset"]:
+ r"""Align the dataset to a specific format.
+
+ Aligned dataset:
+ _prompt: [{"role": "user", "content": "..."}] * (2T - 1)
+ _response: [{"role": "assistant", "content": "..."}] * N (N > 1 for ranking dataset)
+ _system: "..."
+ _tools: "..."
+ _images: []
+ _videos: []
+ _audios: []
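+
+    e.g. a minimal supervised example after alignment (media fields are None when the
+    dataset has no such columns):
+    ```python
+    {
+        "_prompt": [{"role": "user", "content": "What is 1 + 1?"}],
+        "_response": [{"role": "assistant", "content": "2"}],
+        "_system": "",
+        "_tools": "",
+        "_images": None,
+        "_videos": None,
+        "_audios": None,
+    }
+    ```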
+ """
+ column_names = list(next(iter(dataset)).keys())
+ kwargs = {}
+ if not data_args.streaming:
+ kwargs = dict(
+ num_proc=data_args.preprocessing_num_workers,
+ load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0),
+ desc="Converting format of dataset",
+ )
+
+ dataset_converter = get_dataset_converter(dataset_attr.formatting, dataset_attr, data_args)
+ return dataset.map(
+ dataset_converter,
+ batched=False,
+ remove_columns=column_names,
+ **kwargs,
+ )
diff --git a/llamafactory/data/data_utils.py b/llamafactory/data/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..14e261290a032466c87772915eb9f472e08d14fa
--- /dev/null
+++ b/llamafactory/data/data_utils.py
@@ -0,0 +1,190 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from enum import Enum, unique
+from typing import TYPE_CHECKING, Any, Optional, TypedDict, Union
+
+import fsspec
+from datasets import DatasetDict, concatenate_datasets, interleave_datasets
+
+from ..extras import logging
+
+
+if TYPE_CHECKING:
+ from datasets import Dataset, IterableDataset
+
+ from ..hparams import DataArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+SLOTS = list[Union[str, set[str], dict[str, str]]]
+
+
+@unique
+class Role(str, Enum):
+ USER = "user"
+ ASSISTANT = "assistant"
+ SYSTEM = "system"
+ FUNCTION = "function"
+ OBSERVATION = "observation"
+
+
+class DatasetModule(TypedDict):
+ train_dataset: Optional[Union["Dataset", "IterableDataset"]]
+ eval_dataset: Optional[Union["Dataset", "IterableDataset", dict[str, "Dataset"]]]
+
+
+def merge_dataset(
+ all_datasets: list[Union["Dataset", "IterableDataset"]], data_args: "DataArguments", seed: int
+) -> Union["Dataset", "IterableDataset"]:
+ r"""Merge multiple datasets to a unified dataset."""
+ if len(all_datasets) == 1:
+ return all_datasets[0]
+
+ elif data_args.mix_strategy == "concat":
+ if data_args.streaming:
+            logger.warning_rank0_once("Samples from different datasets will not be mixed in streaming mode.")
+
+ return concatenate_datasets(all_datasets)
+
+ elif data_args.mix_strategy.startswith("interleave"):
+ if not data_args.streaming:
+ logger.warning_rank0_once("We recommend using `mix_strategy=concat` in non-streaming mode.")
+
+ return interleave_datasets(
+ datasets=all_datasets,
+ probabilities=data_args.interleave_probs,
+ seed=seed,
+ stopping_strategy="first_exhausted" if data_args.mix_strategy.endswith("under") else "all_exhausted",
+ )
+
+ else:
+ raise ValueError(f"Unknown mixing strategy: {data_args.mix_strategy}.")
+
+
+def split_dataset(
+ dataset: Optional[Union["Dataset", "IterableDataset"]],
+ eval_dataset: Optional[Union["Dataset", "IterableDataset", dict[str, "Dataset"]]],
+ data_args: "DataArguments",
+ seed: int,
+) -> "DatasetDict":
+ r"""Split the dataset and returns a dataset dict containing train set and validation set.
+
+ Support both map dataset and iterable dataset.
+ """
+ if eval_dataset is not None and data_args.val_size > 1e-6:
+ raise ValueError("Cannot specify `val_size` if `eval_dataset` is not None.")
+
+ dataset_dict = {}
+ if dataset is not None:
+ if data_args.streaming:
+ dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=seed)
+
+ if data_args.val_size > 1e-6:
+ if data_args.streaming:
+ dataset_dict["validation"] = dataset.take(int(data_args.val_size))
+ dataset_dict["train"] = dataset.skip(int(data_args.val_size))
+ else:
+                val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size
+                # `val_size` > 1 denotes an absolute number of samples, otherwise a split ratio
+                dataset = dataset.train_test_split(test_size=val_size, seed=seed)
+                dataset_dict = {"train": dataset["train"], "validation": dataset["test"]}
+ else:
+ dataset_dict["train"] = dataset
+
+ if eval_dataset is not None:
+ if isinstance(eval_dataset, dict):
+ dataset_dict.update({f"validation_{name}": data for name, data in eval_dataset.items()})
+ else:
+ if data_args.streaming:
+ eval_dataset = eval_dataset.shuffle(buffer_size=data_args.buffer_size, seed=seed)
+
+ dataset_dict["validation"] = eval_dataset
+
+ return DatasetDict(dataset_dict)
+
+
+def get_dataset_module(dataset: Union["Dataset", "DatasetDict"]) -> "DatasetModule":
+ r"""Convert dataset or dataset dict to dataset module."""
+ dataset_module: DatasetModule = {}
+ if isinstance(dataset, DatasetDict): # dataset dict
+ if "train" in dataset:
+ dataset_module["train_dataset"] = dataset["train"]
+
+ if "validation" in dataset:
+ dataset_module["eval_dataset"] = dataset["validation"]
+ else:
+ eval_dataset = {}
+ for key in dataset.keys():
+ if key.startswith("validation_"):
+ eval_dataset[key[len("validation_") :]] = dataset[key]
+
+ if len(eval_dataset):
+ dataset_module["eval_dataset"] = eval_dataset
+
+ else: # single dataset
+ dataset_module["train_dataset"] = dataset
+
+ return dataset_module
+
+
+def setup_fs(path: str, anon: bool = False) -> "fsspec.AbstractFileSystem":
+ r"""Set up a filesystem object based on the path protocol."""
+ storage_options = {"anon": anon} if anon else {}
+ if path.startswith("s3://"):
+ fs = fsspec.filesystem("s3", **storage_options)
+ elif path.startswith(("gs://", "gcs://")):
+ fs = fsspec.filesystem("gcs", **storage_options)
+ else:
+ raise ValueError(f"Unsupported protocol in path: {path}. Use 's3://' or 'gs://'.")
+
+ if not fs.exists(path):
+ raise ValueError(f"Path does not exist: {path}.")
+
+ return fs
+
+
+def _read_json_with_fs(fs: "fsspec.AbstractFileSystem", path: str) -> list[Any]:
+ r"""Helper function to read JSON/JSONL files using fsspec."""
+ with fs.open(path, "r") as f:
+ if path.endswith(".jsonl"):
+ return [json.loads(line) for line in f if line.strip()]
+ else:
+ return json.load(f)
+
+
+def read_cloud_json(cloud_path: str) -> list[Any]:
+ r"""Read a JSON/JSONL file from cloud storage (S3 or GCS).
+
+ Args:
+ cloud_path: str
+ Cloud path in the format:
+ - 's3://bucket-name/file.json' for AWS S3
+ - 'gs://bucket-name/file.jsonl' or 'gcs://bucket-name/file.jsonl' for Google Cloud Storage
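+
+    Example (bucket and file names are illustrative):
+    ```python
+    examples = read_cloud_json("s3://bucket-name/file.jsonl")  # -> list of dicts
+    ```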
+ """
+ try:
+ fs = setup_fs(cloud_path, anon=True) # try with anonymous access first
+ except Exception:
+ fs = setup_fs(cloud_path) # try again with credentials
+
+ # filter out non-JSON files
+ files = [x["Key"] for x in fs.listdir(cloud_path)] if fs.isdir(cloud_path) else [cloud_path]
+    files = [file for file in files if file.endswith((".json", ".jsonl"))]
+ if not files:
+ raise ValueError(f"No JSON/JSONL files found in the specified path: {cloud_path}.")
+
+ return sum([_read_json_with_fs(fs, file) for file in files], [])
diff --git a/llamafactory/data/formatter.py b/llamafactory/data/formatter.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b527a7ef0093ef1f7d462639780d5330023c4e9
--- /dev/null
+++ b/llamafactory/data/formatter.py
@@ -0,0 +1,145 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Optional, Union
+
+from typing_extensions import override
+
+from .data_utils import SLOTS
+from .tool_utils import FunctionCall, get_tool_utils
+
+
+@dataclass
+class Formatter(ABC):
+ slots: SLOTS = field(default_factory=list)
+ tool_format: Optional[str] = None
+
+ @abstractmethod
+ def apply(self, **kwargs) -> SLOTS:
+ r"""Forms a list of slots according to the inputs to encode."""
+ ...
+
+ def extract(self, content: str) -> Union[str, list["FunctionCall"]]:
+ r"""Extract a list of tuples from the response message if using tools.
+
+ Each tuple consists of function name and function arguments.
+ """
+ raise NotImplementedError
+
+
+@dataclass
+class EmptyFormatter(Formatter):
+ def __post_init__(self):
+ has_placeholder = False
+ for slot in filter(lambda s: isinstance(s, str), self.slots):
+ if re.search(r"\{\{[a-zA-Z_][a-zA-Z0-9_]*\}\}", slot):
+ has_placeholder = True
+
+ if has_placeholder:
+ raise ValueError("Empty formatter should not contain any placeholder.")
+
+ @override
+ def apply(self, **kwargs) -> SLOTS:
+ return self.slots
+
+
+@dataclass
+class StringFormatter(Formatter):
+ def __post_init__(self):
+ has_placeholder = False
+ for slot in filter(lambda s: isinstance(s, str), self.slots):
+ if re.search(r"\{\{[a-zA-Z_][a-zA-Z0-9_]*\}\}", slot):
+ has_placeholder = True
+
+ if not has_placeholder:
+ raise ValueError("A placeholder is required in the string formatter.")
+
+ @override
+ def apply(self, **kwargs) -> SLOTS:
+ elements = []
+ for slot in self.slots:
+ if isinstance(slot, str):
+ for name, value in kwargs.items():
+ if not isinstance(value, str):
+ raise RuntimeError(f"Expected a string, got {value}")
+
+ slot = slot.replace("{{" + name + "}}", value, 1)
+ elements.append(slot)
+ elif isinstance(slot, (dict, set)):
+ elements.append(slot)
+ else:
+ raise RuntimeError(f"Input must be string, set[str] or dict[str, str], got {type(slot)}.")
+
+ return elements
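+
+# A minimal sketch of the slot mechanism (the slot string is illustrative):
+#
+#   formatter = StringFormatter(slots=["<|user|>\n{{content}}"])
+#   formatter.apply(content="Hello")  # -> ["<|user|>\nHello"]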
+
+
+@dataclass
+class FunctionFormatter(StringFormatter):
+ def __post_init__(self):
+ super().__post_init__()
+ self.tool_utils = get_tool_utils(self.tool_format)
+
+ @override
+ def apply(self, **kwargs) -> SLOTS:
+ content: str = kwargs.pop("content")
+ thought_words, thought = kwargs.pop("thought_words", None), None
+ if thought_words and len(thought_words) == 2:
+ regex = re.compile(rf"{re.escape(thought_words[0])}(.*?){re.escape(thought_words[1])}", re.DOTALL)
+ thought = re.search(regex, content)
+
+ if thought:
+ content = content.replace(thought.group(0), "")
+
+ functions: list[FunctionCall] = []
+ try:
+ tool_calls = json.loads(content)
+ if not isinstance(tool_calls, list): # parallel function call
+ tool_calls = [tool_calls]
+
+ for tool_call in tool_calls:
+ functions.append(
+ FunctionCall(tool_call["name"], json.dumps(tool_call["arguments"], ensure_ascii=False))
+ )
+
+ except json.JSONDecodeError:
+ raise RuntimeError(f"Invalid JSON format in function message: {str([content])}.") # flat string
+
+ function_str = self.tool_utils.function_formatter(functions)
+ if thought:
+ function_str = thought.group(0) + function_str
+
+ return super().apply(content=function_str)
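+
+# A content sketch for function messages (tool format and values are illustrative):
+#
+#   formatter = FunctionFormatter(slots=["{{content}}"], tool_format="default")
+#   formatter.apply(content='[{"name": "get_weather", "arguments": {"city": "Beijing"}}]')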
+
+
+@dataclass
+class ToolFormatter(Formatter):
+ def __post_init__(self):
+ self.tool_utils = get_tool_utils(self.tool_format)
+
+ @override
+ def apply(self, **kwargs) -> SLOTS:
+ content = kwargs.pop("content")
+ try:
+ tools = json.loads(content)
+ return [self.tool_utils.tool_formatter(tools) if len(tools) != 0 else ""]
+ except json.JSONDecodeError:
+ raise RuntimeError(f"Invalid JSON format in tool description: {str([content])}.") # flat string
+
+ @override
+ def extract(self, content: str) -> Union[str, list["FunctionCall"]]:
+ return self.tool_utils.tool_extractor(content)
diff --git a/llamafactory/data/loader.py b/llamafactory/data/loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbb13455beb1172930afd26154db72bbe387df16
--- /dev/null
+++ b/llamafactory/data/loader.py
@@ -0,0 +1,334 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import TYPE_CHECKING, Literal, Optional, Union
+
+import numpy as np
+from datasets import Dataset, load_dataset, load_from_disk
+
+from ..extras import logging
+from ..extras.constants import FILEEXT2TYPE
+from ..extras.misc import check_version, has_tokenized_data
+from .converter import align_dataset
+from .data_utils import get_dataset_module, merge_dataset, read_cloud_json, split_dataset
+from .parser import get_dataset_list
+from .processor import (
+ FeedbackDatasetProcessor,
+ PackedSupervisedDatasetProcessor,
+ PairwiseDatasetProcessor,
+ PretrainDatasetProcessor,
+ SupervisedDatasetProcessor,
+ UnsupervisedDatasetProcessor,
+)
+
+
+if TYPE_CHECKING:
+ from datasets import Dataset, IterableDataset
+ from transformers import PreTrainedTokenizer, ProcessorMixin, Seq2SeqTrainingArguments
+
+ from ..hparams import DataArguments, ModelArguments
+ from .data_utils import DatasetModule
+ from .parser import DatasetAttr
+ from .processor import DatasetProcessor
+ from .template import Template
+
+
+logger = logging.get_logger(__name__)
+
+
+def _load_single_dataset(
+ dataset_attr: "DatasetAttr",
+ model_args: "ModelArguments",
+ data_args: "DataArguments",
+ training_args: "Seq2SeqTrainingArguments",
+) -> Union["Dataset", "IterableDataset"]:
+ r"""Load a single dataset and aligns it to the standard format."""
+ logger.info_rank0(f"Loading dataset {dataset_attr}...")
+ data_path, data_name, data_dir, data_files = None, None, None, None
+ if dataset_attr.load_from in ["hf_hub", "ms_hub", "om_hub"]:
+ data_path = dataset_attr.dataset_name
+ data_name = dataset_attr.subset
+ data_dir = dataset_attr.folder
+
+ elif dataset_attr.load_from == "script":
+ data_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
+ data_name = dataset_attr.subset
+ data_dir = dataset_attr.folder
+
+ elif dataset_attr.load_from == "cloud_file":
+ data_path = dataset_attr.dataset_name
+
+ elif dataset_attr.load_from == "file":
+ data_files = []
+ local_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
+ if os.path.isdir(local_path): # is directory
+ for file_name in os.listdir(local_path):
+ data_files.append(os.path.join(local_path, file_name))
+ elif os.path.isfile(local_path): # is file
+ data_files.append(local_path)
+ else:
+ raise ValueError(f"File {local_path} not found.")
+
+ data_path = FILEEXT2TYPE.get(os.path.splitext(data_files[0])[-1][1:], None)
+ if data_path is None:
+ raise ValueError("Allowed file types: {}.".format(",".join(FILEEXT2TYPE.keys())))
+
+ if any(data_path != FILEEXT2TYPE.get(os.path.splitext(data_file)[-1][1:], None) for data_file in data_files):
+ raise ValueError("File types should be identical.")
+ else:
+ raise NotImplementedError(f"Unknown load type: {dataset_attr.load_from}.")
+
+ if dataset_attr.load_from == "ms_hub":
+ check_version("modelscope>=1.14.0", mandatory=True)
+ from modelscope import MsDataset # type: ignore
+ from modelscope.utils.config_ds import MS_DATASETS_CACHE # type: ignore
+
+ cache_dir = model_args.cache_dir or MS_DATASETS_CACHE
+ dataset = MsDataset.load(
+ dataset_name=data_path,
+ subset_name=data_name,
+ data_dir=data_dir,
+ data_files=data_files,
+ split=dataset_attr.split,
+ cache_dir=cache_dir,
+ token=model_args.ms_hub_token,
+ use_streaming=data_args.streaming,
+ )
+ if isinstance(dataset, MsDataset):
+ dataset = dataset.to_hf_dataset()
+
+ elif dataset_attr.load_from == "om_hub":
+ check_version("openmind>=0.8.0", mandatory=True)
+ from openmind import OmDataset # type: ignore
+ from openmind.utils.hub import OM_DATASETS_CACHE # type: ignore
+
+ cache_dir = model_args.cache_dir or OM_DATASETS_CACHE
+ dataset = OmDataset.load_dataset(
+ path=data_path,
+ name=data_name,
+ data_dir=data_dir,
+ data_files=data_files,
+ split=dataset_attr.split,
+ cache_dir=cache_dir,
+ token=model_args.om_hub_token,
+ streaming=data_args.streaming,
+ )
+ elif dataset_attr.load_from == "cloud_file":
+ dataset = Dataset.from_list(read_cloud_json(data_path), split=dataset_attr.split)
+ else:
+ dataset = load_dataset(
+ path=data_path,
+ name=data_name,
+ data_dir=data_dir,
+ data_files=data_files,
+ split=dataset_attr.split,
+ cache_dir=model_args.cache_dir,
+ token=model_args.hf_hub_token,
+ num_proc=data_args.preprocessing_num_workers,
+ streaming=data_args.streaming and dataset_attr.load_from != "file",
+ )
+ if data_args.streaming and dataset_attr.load_from == "file":
+ dataset = dataset.to_iterable_dataset(num_shards=training_args.dataloader_num_workers)
+
+ if dataset_attr.num_samples is not None and not data_args.streaming:
+ target_num = dataset_attr.num_samples
+ indexes = np.random.permutation(len(dataset))[:target_num] # all samples should be included
+ target_num -= len(indexes)
+ if target_num > 0:
+ expand_indexes = np.random.choice(len(dataset), target_num)
+ indexes = np.concatenate((indexes, expand_indexes), axis=0)
+
+ assert len(indexes) == dataset_attr.num_samples, "Sample num mismatched."
+ dataset = dataset.select(indexes)
+ logger.info_rank0(f"Sampled {dataset_attr.num_samples} examples from dataset {dataset_attr}.")
+
+ if data_args.max_samples is not None: # truncate dataset
+ max_samples = min(data_args.max_samples, len(dataset))
+ dataset = dataset.select(range(max_samples))
+
+ return align_dataset(dataset, dataset_attr, data_args, training_args)
+
+
+def _get_merged_dataset(
+ dataset_names: Optional[list[str]],
+ model_args: "ModelArguments",
+ data_args: "DataArguments",
+ training_args: "Seq2SeqTrainingArguments",
+ stage: Literal["pt", "sft", "rm", "ppo", "kto"],
+ return_dict: bool = False,
+) -> Optional[Union["Dataset", "IterableDataset", dict[str, "Dataset"]]]:
+ r"""Return the merged datasets in the standard format."""
+ if dataset_names is None:
+ return None
+
+ datasets = {}
+ for dataset_name, dataset_attr in zip(dataset_names, get_dataset_list(dataset_names, data_args.dataset_dir)):
+ if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True):
+ raise ValueError("The dataset is not applicable in the current training stage.")
+
+ datasets[dataset_name] = _load_single_dataset(dataset_attr, model_args, data_args, training_args)
+
+ if return_dict:
+ return datasets
+ else:
+ return merge_dataset(list(datasets.values()), data_args, seed=training_args.seed)
+
+
+def _get_dataset_processor(
+ data_args: "DataArguments",
+ stage: Literal["pt", "sft", "rm", "ppo", "kto"],
+ template: "Template",
+ tokenizer: "PreTrainedTokenizer",
+ processor: Optional["ProcessorMixin"],
+ do_generate: bool = False,
+) -> "DatasetProcessor":
+ r"""Return the corresponding dataset processor."""
+ if stage == "pt":
+ dataset_processor_class = PretrainDatasetProcessor
+ elif stage == "sft" and not do_generate:
+ if data_args.packing:
+ if data_args.neat_packing: # hack datasets to have int32 attention mask
+ from datasets.arrow_writer import OptimizedTypedSequence, TypedSequence
+
+ def __init__(self, data, **kwargs):
+ return TypedSequence.__init__(
+ self,
+ data,
+ type=kwargs.pop("type", None),
+ try_type=kwargs.pop("try_type", None),
+ optimized_int_type=kwargs.pop("optimized_int_type", None),
+ )
+
+ OptimizedTypedSequence.__init__ = __init__
+ dataset_processor_class = PackedSupervisedDatasetProcessor
+ else:
+ dataset_processor_class = SupervisedDatasetProcessor
+
+ elif stage == "rm":
+ dataset_processor_class = PairwiseDatasetProcessor
+ elif stage == "kto":
+ dataset_processor_class = FeedbackDatasetProcessor
+ else:
+ dataset_processor_class = UnsupervisedDatasetProcessor
+
+ return dataset_processor_class(template=template, tokenizer=tokenizer, processor=processor, data_args=data_args)
+
+
+def _get_preprocessed_dataset(
+ dataset: Optional[Union["Dataset", "IterableDataset"]],
+ data_args: "DataArguments",
+ training_args: "Seq2SeqTrainingArguments",
+ stage: Literal["pt", "sft", "rm", "ppo", "kto"],
+ template: "Template",
+ tokenizer: "PreTrainedTokenizer",
+ processor: Optional["ProcessorMixin"] = None,
+ is_eval: bool = False,
+) -> Optional[Union["Dataset", "IterableDataset"]]:
+ r"""Preprocesses the dataset, including format checking and tokenization."""
+ if dataset is None:
+ return None
+
+ dataset_processor = _get_dataset_processor(
+ data_args, stage, template, tokenizer, processor, do_generate=(training_args.predict_with_generate and is_eval)
+ )
+ column_names = list(next(iter(dataset)).keys())
+ kwargs = {}
+ if not data_args.streaming:
+ kwargs = dict(
+ num_proc=data_args.preprocessing_num_workers,
+ load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0),
+ desc="Running tokenizer on dataset",
+ )
+
+ dataset = dataset.map(
+ dataset_processor.preprocess_dataset,
+ batched=True,
+ batch_size=data_args.preprocessing_batch_size,
+ remove_columns=column_names,
+ **kwargs,
+ )
+
+ if training_args.should_log:
+ try:
+ print("eval example:" if is_eval else "training example:")
+ dataset_processor.print_data_example(next(iter(dataset)))
+ except StopIteration:
+ if stage == "pt":
+ raise RuntimeError("Cannot find sufficient samples, consider increasing dataset size.")
+ else:
+ raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.")
+
+ return dataset
+
+
+def get_dataset(
+ template: "Template",
+ model_args: "ModelArguments",
+ data_args: "DataArguments",
+ training_args: "Seq2SeqTrainingArguments",
+ stage: Literal["pt", "sft", "rm", "ppo", "kto"],
+ tokenizer: "PreTrainedTokenizer",
+ processor: Optional["ProcessorMixin"] = None,
+) -> "DatasetModule":
+ r"""Get the train dataset and optionally gets the evaluation dataset."""
+ # Load tokenized dataset if path exists
+ if data_args.tokenized_path is not None:
+ if has_tokenized_data(data_args.tokenized_path):
+ logger.warning_rank0("Loading dataset from disk will ignore other data arguments.")
+ tokenized_data = load_from_disk(data_args.tokenized_path)
+ dataset_module = get_dataset_module(tokenized_data)
+ if data_args.streaming:
+ dataset_module["train_dataset"] = dataset_module["train_dataset"].to_iterable_dataset()
+
+ logger.info_rank0(f"Loaded tokenized dataset from {data_args.tokenized_path}.")
+ return dataset_module
+
+ if data_args.streaming:
+ raise ValueError("Turn off `streaming` when saving dataset to disk.")
+
+ # Load and preprocess dataset
+ with training_args.main_process_first(desc="load dataset", local=(not data_args.data_shared_file_system)):
+ dataset = _get_merged_dataset(data_args.dataset, model_args, data_args, training_args, stage)
+ eval_dataset = _get_merged_dataset(
+ data_args.eval_dataset,
+ model_args,
+ data_args,
+ training_args,
+ stage,
+ return_dict=data_args.eval_on_each_dataset,
+ )
+
+ with training_args.main_process_first(desc="pre-process dataset", local=(not data_args.data_shared_file_system)):
+ dataset = _get_preprocessed_dataset(
+ dataset, data_args, training_args, stage, template, tokenizer, processor, is_eval=False
+ )
+ if isinstance(eval_dataset, dict):
+ for eval_name, eval_data in eval_dataset.items():
+ eval_dataset[eval_name] = _get_preprocessed_dataset(
+ eval_data, data_args, training_args, stage, template, tokenizer, processor, is_eval=True
+ )
+ else:
+ eval_dataset = _get_preprocessed_dataset(
+ eval_dataset, data_args, training_args, stage, template, tokenizer, processor, is_eval=True
+ )
+
+ dataset_dict = split_dataset(dataset, eval_dataset, data_args, seed=training_args.seed)
+ if data_args.tokenized_path is not None: # save tokenized dataset to disk
+ if training_args.should_save:
+ dataset_dict.save_to_disk(data_args.tokenized_path)
+ logger.info_rank0(f"Tokenized dataset is saved at {data_args.tokenized_path}.")
+ logger.info_rank0(f"Please launch the training with `tokenized_path: {data_args.tokenized_path}`.")
+
+ return get_dataset_module(dataset_dict)
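+
+
+# A rough call sketch (arguments are built elsewhere in the training pipeline; the names
+# here are illustrative):
+#
+#   dataset_module = get_dataset(
+#       template, model_args, data_args, training_args, stage="sft",
+#       tokenizer=tokenizer, processor=processor,
+#   )
+#   train_dataset = dataset_module.get("train_dataset")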
diff --git a/llamafactory/data/mm_plugin.py b/llamafactory/data/mm_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..91b801dc1105c31fe9fe8a76807f4126198556a2
--- /dev/null
+++ b/llamafactory/data/mm_plugin.py
@@ -0,0 +1,2082 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's Transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/processing_llava.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import math
+import os
+import re
+from copy import deepcopy
+from dataclasses import dataclass
+from io import BytesIO
+from typing import TYPE_CHECKING, BinaryIO, Literal, Optional, TypedDict, Union
+
+import numpy as np
+import torch
+from transformers.image_utils import get_image_size, is_valid_image, to_numpy_array
+from transformers.models.mllama.processing_mllama import (
+ convert_sparse_cross_attention_mask_to_dense,
+ get_cross_attention_token_mask,
+)
+from typing_extensions import NotRequired, override
+
+from ..extras.constants import AUDIO_PLACEHOLDER, IGNORE_INDEX, IMAGE_PLACEHOLDER, VIDEO_PLACEHOLDER
+from ..extras.packages import (
+ is_librosa_available,
+ is_pillow_available,
+ is_pyav_available,
+ is_transformers_version_greater_than,
+)
+
+
+if is_librosa_available():
+ import librosa
+
+
+if is_pillow_available():
+ from PIL import Image
+ from PIL.Image import Image as ImageObject
+
+
+if is_pyav_available():
+ import av
+
+
+if is_transformers_version_greater_than("4.52.0"):
+ from transformers.image_utils import make_flat_list_of_images
+ from transformers.video_utils import make_batched_videos
+else:
+ from transformers.image_utils import make_batched_videos, make_flat_list_of_images
+
+
+if TYPE_CHECKING:
+ from av.stream import Stream
+ from numpy.typing import NDArray
+ from transformers import PreTrainedTokenizer, ProcessorMixin
+ from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
+ from transformers.image_processing_utils import BaseImageProcessor
+ from transformers.video_processing_utils import BaseVideoProcessor
+
+ class EncodedImage(TypedDict):
+ path: Optional[str]
+ bytes: Optional[bytes]
+
+ ImageInput = Union[str, bytes, EncodedImage, BinaryIO, ImageObject]
+ VideoInput = Union[str, BinaryIO, list[list[ImageInput]]]
+ AudioInput = Union[str, BinaryIO, NDArray]
+
+ class RegularizedImageOutput(TypedDict):
+ images: list[ImageObject]
+
+ class RegularizedVideoOutput(TypedDict):
+ videos: list[list[ImageObject]]
+ durations: list[float]
+ fps_per_video: NotRequired[list[float]]
+
+ class RegularizedAudioOutput(TypedDict):
+ audios: list[NDArray]
+ sampling_rates: list[float]
+
+ class MMProcessor(ProcessorMixin):
+ patch_size: int
+ image_seq_length: int
+ num_additional_image_tokens: int
+ vision_feature_select_strategy: Literal["default", "full"]
+
+ def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
+ pass
+
+
+def _get_paligemma_token_type_ids(imglens: list[int], seqlens: list[int], processor: "MMProcessor") -> list[list[int]]:
+ r"""Get paligemma token type ids for computing loss.
+
+    It is slightly different from the original token type ids, where the prompt part is 0.
+
+ Returns:
+ batch_token_type_ids: shape (batch_size, seq_length)
+
+ """
+ batch_token_type_ids = []
+ for imglen, seqlen in zip(imglens, seqlens):
+ image_seqlen = imglen * processor.image_seq_length
+ batch_token_type_ids.append([0] * image_seqlen + [1] * (seqlen - image_seqlen))
+
+ return batch_token_type_ids
+
+
+def _get_gemma3_token_type_ids(batch_ids: list[list[int]], processor: "MMProcessor"):
+ r"""Get gemma3 token type ids for computing loss.
+
+ Returns:
+ batch_token_type_ids: shape (batch_size, seq_length)
+
+ """
+ image_token_id: int = getattr(processor, "image_token_id")
+ batch_token_type_ids = []
+ for token_ids in batch_ids:
+ token_ids = np.array(token_ids)
+ token_type_ids = np.zeros_like(token_ids)
+ token_type_ids[token_ids == image_token_id] = 1
+ batch_token_type_ids.append(token_type_ids.tolist())
+
+ return batch_token_type_ids
+
+
+def _make_batched_images(images: list["ImageObject"], imglens: list[int]) -> list[list["ImageObject"]]:
+ r"""Make nested list of images."""
+ batch_images = []
+ for imglen in imglens:
+ batch_images.append(images[:imglen])
+ images = images[imglen:]
+
+ return batch_images
+
+
+def _check_video_is_nested_images(video: "VideoInput") -> bool:
+ r"""Check if the video is nested images."""
+ return isinstance(video, list) and all(isinstance(frame, (str, BinaryIO, dict, ImageObject)) for frame in video)
+
+
+@dataclass
+class MMPluginMixin:
+ image_token: Optional[str]
+ video_token: Optional[str]
+ audio_token: Optional[str]
+ expand_mm_tokens: bool = True
+
+ def _validate_input(
+ self,
+ processor: Optional["MMProcessor"],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ ) -> None:
+ r"""Validate if this model accepts the input modalities."""
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+ video_processor: BaseImageProcessor = getattr(
+ processor, "video_processor", getattr(processor, "image_processor", None)
+ )
+ feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None)
+ if len(images) != 0 and self.image_token is None:
+ raise ValueError(
+ "This model does not support image input. Please check whether the correct `template` is used."
+ )
+
+ if len(videos) != 0 and self.video_token is None:
+ raise ValueError(
+ "This model does not support video input. Please check whether the correct `template` is used."
+ )
+
+ if len(audios) != 0 and self.audio_token is None:
+ raise ValueError(
+ "This model does not support audio input. Please check whether the correct `template` is used."
+ )
+
+ if self.image_token is not None and processor is None:
+ raise ValueError("Processor was not found, please check and update your model file.")
+
+ if self.image_token is not None and image_processor is None:
+ raise ValueError("Image processor was not found, please check and update your model file.")
+
+ if self.video_token is not None and video_processor is None:
+ raise ValueError("Video processor was not found, please check and update your model file.")
+
+ if self.audio_token is not None and feature_extractor is None:
+ raise ValueError("Audio feature extractor was not found, please check and update your model file.")
+
+ def _validate_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ ):
+ r"""Validate if the number of images, videos and audios match the number of placeholders in messages."""
+ num_image_tokens, num_video_tokens, num_audio_tokens = 0, 0, 0
+ for message in messages:
+ num_image_tokens += message["content"].count(IMAGE_PLACEHOLDER)
+ num_video_tokens += message["content"].count(VIDEO_PLACEHOLDER)
+ num_audio_tokens += message["content"].count(AUDIO_PLACEHOLDER)
+
+ if len(images) != num_image_tokens:
+ raise ValueError(
+ f"The number of images does not match the number of {IMAGE_PLACEHOLDER} tokens in {messages}."
+ )
+
+ if len(videos) != num_video_tokens:
+ raise ValueError(
+ f"The number of videos does not match the number of {VIDEO_PLACEHOLDER} tokens in {messages}."
+ )
+
+ if len(audios) != num_audio_tokens:
+ raise ValueError(
+ f"The number of audios does not match the number of {AUDIO_PLACEHOLDER} tokens in {messages}."
+ )
+
+ def _preprocess_image(
+ self, image: "ImageObject", image_max_pixels: int, image_min_pixels: int, **kwargs
+ ) -> "ImageObject":
+ r"""Pre-process a single image."""
+ if (image.width * image.height) > image_max_pixels:
+ resize_factor = math.sqrt(image_max_pixels / (image.width * image.height))
+ width, height = int(image.width * resize_factor), int(image.height * resize_factor)
+ image = image.resize((width, height))
+
+ if (image.width * image.height) < image_min_pixels:
+ resize_factor = math.sqrt(image_min_pixels / (image.width * image.height))
+ width, height = int(image.width * resize_factor), int(image.height * resize_factor)
+ image = image.resize((width, height))
+
+ if image.mode != "RGB":
+ image = image.convert("RGB")
+
+ return image
+
+ def _get_video_sample_indices(
+ self, video_stream: "Stream", video_fps: float, video_maxlen: int, **kwargs
+ ) -> list[int]:
+ r"""Compute video sample indices according to fps."""
+ total_frames = video_stream.frames
+ if total_frames == 0: # infinite video
+ return np.linspace(0, video_maxlen - 1, video_maxlen).astype(np.int32)
+
+ sample_frames = max(1, math.floor(float(video_stream.duration * video_stream.time_base) * video_fps))
+ sample_frames = min(total_frames, video_maxlen, sample_frames)
+ return np.linspace(0, total_frames - 1, sample_frames).astype(np.int32)
+
+ def _regularize_images(self, images: list["ImageInput"], **kwargs) -> "RegularizedImageOutput":
+ r"""Regularize images to avoid error. Including reading and pre-processing."""
+ results = []
+ for image in images:
+ if isinstance(image, (str, BinaryIO)):
+ image = Image.open(image)
+ elif isinstance(image, bytes):
+ image = Image.open(BytesIO(image))
+ elif isinstance(image, dict):
+ if image["bytes"] is not None:
+ image = Image.open(BytesIO(image["bytes"]))
+ else:
+ image = Image.open(image["path"])
+
+ if not isinstance(image, ImageObject):
+ raise ValueError(f"Expect input is a list of images, but got {type(image)}.")
+
+ results.append(self._preprocess_image(image, **kwargs))
+
+ return {"images": results}
+
+ def _regularize_videos(self, videos: list["VideoInput"], **kwargs) -> "RegularizedVideoOutput":
+ r"""Regularizes videos to avoid error. Including reading, resizing and converting."""
+ results = []
+ durations = []
+ for video in videos:
+ frames: list[ImageObject] = []
+ if _check_video_is_nested_images(video):
+ for frame in video:
+ if not is_valid_image(frame) and not isinstance(frame, dict) and not os.path.exists(frame):
+ raise ValueError("Invalid image found in video frames.")
+ frames = video
+ durations.append(len(frames) / kwargs.get("video_fps", 2.0))
+ else:
+ container = av.open(video, "r")
+ video_stream = next(stream for stream in container.streams if stream.type == "video")
+ sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+ container.seek(0)
+ for frame_idx, frame in enumerate(container.decode(video_stream)):
+ if frame_idx in sample_indices:
+ frames.append(frame.to_image())
+
+ if video_stream.duration is None:
+ durations.append(len(frames) / kwargs.get("video_fps", 2.0))
+ else:
+ durations.append(float(video_stream.duration * video_stream.time_base))
+
+ frames = self._regularize_images(frames, **kwargs)["images"]
+ results.append(frames)
+
+ return {"videos": results, "durations": durations}
+
+ def _regularize_audios(
+ self, audios: list["AudioInput"], sampling_rate: float, **kwargs
+ ) -> "RegularizedAudioOutput":
+ r"""Regularizes audios to avoid error. Including reading and resampling."""
+ results, sampling_rates = [], []
+ for audio in audios:
+ if not isinstance(audio, np.ndarray):
+ audio, sampling_rate = librosa.load(audio, sr=sampling_rate)
+
+ results.append(audio)
+ sampling_rates.append(sampling_rate)
+
+ return {"audios": results, "sampling_rates": sampling_rates}
+
+ def _get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: "MMProcessor",
+ imglens: Optional[list[int]] = None,
+ ) -> dict[str, "torch.Tensor"]:
+ r"""Process visual inputs.
+
+ Returns: (llava and paligemma)
+ pixel_values: tensor with shape (B, C, H, W)
+
+ Returns: (qwen2-vl)
+ pixel_values: tensor with shape (num_patches, patch_dim)
+            image_grid_thw: tensor with shape (num_images, 3), where the three numbers are time, height, width
+ where num_patches == torch.prod(image_grid_thw)
+
+ Returns: (mllama)
+ pixel_values: tensor with shape
+ (batch_size, max_num_images, max_image_tiles, channels, tile_height, tile_width)
+ For example, (2, 1, 4, 3, 560, 560).
+ aspect_ratio_ids: tensor with shape (batch_size, max_num_images). For example, (2, 1).
+ aspect_ratio_mask: tensor with shape (batch_size, max_num_images, max_image_tiles). For example, (2, 1, 4).
+ num_tiles: List[List[int]] with shape (batch_size, num_images_in_batch). For example, (2, 1).
+
+ """
+ mm_inputs = {}
+ if len(images) != 0:
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+ images = self._regularize_images(
+ images,
+ image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+ image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+ )["images"]
+ if imglens is not None: # if imglens are provided, make batched images
+ images = _make_batched_images(images, imglens)
+
+ image_processor_kwargs = {}
+ if getattr(processor, "image_do_pan_and_scan", False): # gemma3 image processor
+ image_processor_kwargs.update(
+ {
+ "do_pan_and_scan": True,
+ "pan_and_scan_min_crop_size": 256,
+ "pan_and_scan_max_num_crops": 4,
+ "pan_and_scan_min_ratio_to_activate": 1.2,
+ }
+ )
+
+ mm_inputs.update(image_processor(images, return_tensors="pt", **image_processor_kwargs))
+
+ if len(videos) != 0:
+ video_processor: BaseImageProcessor = getattr(
+ processor, "video_processor", getattr(processor, "image_processor", None)
+ )
+ videos = self._regularize_videos(
+ videos,
+ image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+ image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+ video_fps=getattr(processor, "video_fps", 2.0),
+ video_maxlen=getattr(processor, "video_maxlen", 128),
+ )["videos"]
+ if "videos" in inspect.signature(video_processor.preprocess).parameters: # for qwen2_vl and video_llava
+ mm_inputs.update(video_processor(images=None, videos=videos, return_tensors="pt"))
+ else: # for llava_next_video
+ mm_inputs.update(video_processor(videos, return_tensors="pt"))
+
+ if len(audios) != 0:
+ feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None)
+ audios = self._regularize_audios(
+ audios,
+ sampling_rate=getattr(processor, "audio_sampling_rate", 16000),
+ )["audios"]
+ mm_inputs.update(
+ feature_extractor(
+ audios,
+ sampling_rate=getattr(processor, "audio_sampling_rate", 16000),
+ return_attention_mask=True,
+ padding="max_length",
+ return_tensors="pt",
+ )
+ )
+ mm_inputs["feature_attention_mask"] = mm_inputs.pop("attention_mask", None) # prevent conflicts
+
+ return mm_inputs
+
+
+@dataclass
+class BasePlugin(MMPluginMixin):
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ r"""Pre-process input messages before tokenization for VLMs."""
+ self._validate_input(processor, images, videos, audios)
+ return messages
+
+ def process_token_ids(
+ self,
+ input_ids: list[int],
+ labels: Optional[list[int]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ tokenizer: "PreTrainedTokenizer",
+ processor: Optional["MMProcessor"],
+ ) -> tuple[list[int], Optional[list[int]]]:
+ r"""Pre-process token ids after tokenization for VLMs."""
+ self._validate_input(processor, images, videos, audios)
+ return input_ids, labels
+
+ def get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ imglens: list[int],
+ vidlens: list[int],
+ audlens: list[int],
+ batch_ids: list[list[int]],
+ processor: Optional["MMProcessor"],
+ ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+ r"""Build batched multimodal inputs for VLMs.
+
+ Arguments:
+ images: a list of image inputs, shape (num_images,)
+ videos: a list of video inputs, shape (num_videos,)
+ audios: a list of audio inputs, shape (num_audios,)
+ imglens: number of images in each sample, shape (batch_size,)
+ vidlens: number of videos in each sample, shape (batch_size,)
+ audlens: number of audios in each sample, shape (batch_size,)
+ batch_ids: token ids of input samples, shape (batch_size, seq_len)
+ processor: a processor for pre-processing images and videos
+
+ """
+ self._validate_input(processor, images, videos, audios)
+ return self._get_mm_inputs(images, videos, audios, processor)
+
+
+@dataclass
+class Gemma3Plugin(BasePlugin):
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ num_image_tokens = 0
+ messages = deepcopy(messages)
+ boi_token: str = getattr(processor, "boi_token")
+ full_image_sequence: str = getattr(processor, "full_image_sequence")
+ image_str = full_image_sequence if self.expand_mm_tokens else boi_token
+
+ do_pan_and_scan: bool = getattr(processor, "image_do_pan_and_scan", False)
+ if do_pan_and_scan:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ if do_pan_and_scan:
+ image_placeholder_str = (
+ "Here is the original image {{image}} and here are some crops to help you see better "
+ + " ".join(["{{image}}"] * mm_inputs["num_crops"][0][num_image_tokens])
+ )
+ else:
+ image_placeholder_str = "{{image}}"
+
+ content = content.replace(IMAGE_PLACEHOLDER, image_placeholder_str, 1)
+ num_image_tokens += 1
+
+ message["content"] = content.replace("{{image}}", image_str)
+
+ return messages
+
+ @override
+ def get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ imglens: list[int],
+ vidlens: list[int],
+ audlens: list[int],
+ batch_ids: list[list[int]],
+ processor: Optional["MMProcessor"],
+ ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+ self._validate_input(processor, images, videos, audios)
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ mm_inputs.pop("num_crops", None)
+ mm_inputs["token_type_ids"] = _get_gemma3_token_type_ids(batch_ids, processor)
+ return mm_inputs
+
+
+class Gemma3nPlugin(Gemma3Plugin):
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ messages = deepcopy(messages)
+ boi_token: str = getattr(processor, "boi_token")
+ boa_token: str = getattr(processor, "boa_token")
+ full_image_sequence: str = getattr(processor, "full_image_sequence")
+ full_audio_sequence: str = getattr(processor, "full_audio_sequence")
+ image_str = full_image_sequence if self.expand_mm_tokens else boi_token
+ audio_str = full_audio_sequence if self.expand_mm_tokens else boa_token
+
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ content = content.replace(IMAGE_PLACEHOLDER, image_str, 1)
+
+ while AUDIO_PLACEHOLDER in content:
+ content = content.replace(AUDIO_PLACEHOLDER, audio_str, 1)
+
+ message["content"] = content
+
+ return messages
+
+
+@dataclass
+class InternVLPlugin(BasePlugin):
+ @override
+ def _get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: "ProcessorMixin",
+ **kwargs,
+ ) -> dict[str, "torch.Tensor"]:
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+ image_processor_kwargs = {}
+ if getattr(processor, "crop_to_patches", False):
+ image_processor_kwargs.update(
+ {
+ "crop_to_patches": True,
+ "max_patches": 12,
+ "min_patches": 1,
+ }
+ )
+
+ mm_inputs = {}
+ image_video_patches = []
+
+ if len(images) != 0:
+ images = self._regularize_images(
+ images,
+ image_max_pixels=getattr(processor, "image_max_pixels", 1024 * 1024),
+ image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+ )["images"]
+
+ if len(videos) != 0:
+ videos = self._regularize_videos(
+ videos,
+ image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+ image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+ video_fps=getattr(processor, "video_fps", 2.0),
+ video_maxlen=getattr(processor, "video_maxlen", 128),
+ )["videos"]
+
+ if len(images) != 0:
+ images = make_flat_list_of_images(images)
+ image_inputs = image_processor(images=images, return_tensors="pt", **image_processor_kwargs)
+ image_num_patches = image_inputs.pop("num_patches")
+ image_pixel_values = image_inputs.pop("pixel_values")
+ image_num_patches_indices = np.cumsum(image_num_patches)
+
+ if len(videos) != 0:
+ videos = make_batched_videos(videos)
+ num_frames_per_video = [len(video) for video in videos]
+ patch_indices = np.cumsum(num_frames_per_video)
+ image_processor_kwargs["crop_to_patches"] = False
+ video_inputs = image_processor(images=videos, return_tensors="pt", **image_processor_kwargs)
+ video_num_patches = video_inputs.pop("num_patches")
+ video_pixel_values = video_inputs.pop("pixel_values")
+ video_num_patches_indices = np.cumsum(video_num_patches)
+
+        # NOTE: interleaved image and video inputs are not supported
+ if len(images) != 0 and image_pixel_values is not None:
+ for i in range(len(images)):
+ start_index = image_num_patches_indices[i - 1] if i > 0 else 0
+ end_index = image_num_patches_indices[i]
+ image_video_patches.append(image_pixel_values[start_index:end_index])
+
+ if len(videos) != 0 and video_pixel_values is not None:
+ patch_indices_with_prefix = [0] + list(patch_indices)
+ for i in range(len(videos)):
+ current_patch_index = patch_indices_with_prefix[i]
+ end_patch_index = patch_indices_with_prefix[i + 1]
+ start_index = video_num_patches_indices[current_patch_index - 1] if i > 0 else 0
+ end_index = video_num_patches_indices[end_patch_index - 1]
+ image_video_patches.append(video_pixel_values[start_index:end_index])
+
+ if len(images) != 0 or len(videos) != 0:
+ mm_inputs["pixel_values"] = torch.cat(image_video_patches, dim=0)
+
+ if len(images) != 0:
+ mm_inputs.update({"image_num_patches": image_num_patches})
+
+ if len(videos) != 0:
+ mm_inputs.update({"video_patch_indices": patch_indices})
+ mm_inputs.update({"video_num_patches": video_num_patches})
+
+ return mm_inputs
+
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["ProcessorMixin"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ num_image_tokens, num_video_tokens = 0, 0
+ image_seqlen = getattr(processor, "image_seq_length") if self.expand_mm_tokens else 1
+ messages = deepcopy(messages)
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+
+        image_pixel_patch_list = mm_inputs.get("image_num_patches")  # number of patches for each image
+        video_num_patches = mm_inputs.get("video_num_patches")  # number of patches for each video frame
+        video_patch_indices = mm_inputs.get("video_patch_indices")  # cumulative frame counts per video
+
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ content = content.replace(
+ IMAGE_PLACEHOLDER,
+ f"{'' * image_seqlen * image_pixel_patch_list[num_image_tokens]}",
+ 1,
+ )
+ num_image_tokens += 1
+
+ while VIDEO_PLACEHOLDER in content:
+ current_patch_index = video_patch_indices[num_video_tokens - 1] if num_video_tokens > 0 else 0
+ end_patch_index = video_patch_indices[num_video_tokens]
+ num_patches = list(video_num_patches[current_patch_index:end_patch_index])
+ video_replaced_prompt = "\n".join(
+ f"Frame{i + 1}: {'' * image_seqlen * num_patches[i]}"
+ for i in range(len(num_patches))
+ )
+ content = content.replace(VIDEO_PLACEHOLDER, video_replaced_prompt, 1)
+ num_video_tokens += 1
+
+ message["content"] = content
+
+ return messages
+
+ @override
+ def get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ imglens: list[int],
+ vidlens: list[int],
+ audlens: list[int],
+ batch_ids: list[list[int]],
+ processor: Optional["ProcessorMixin"],
+ ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+ self._validate_input(processor, images, videos, audios)
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ mm_inputs.pop("image_num_patches", None)
+ mm_inputs.pop("video_patch_indices", None)
+ mm_inputs.pop("video_num_patches", None)
+ return mm_inputs
+
+
+class KimiVLPlugin(BasePlugin):
+ @override
+ def process_messages(self, messages, images, videos, audios, processor):
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ image_grid_hws = mm_inputs.get("image_grid_hws", [])
+ else:
+ image_grid_hws = [None] * len(images)
+
+ num_image_tokens = 0
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+ merge_length = math.prod(image_processor.merge_kernel_size)
+ messages = deepcopy(messages)
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ image_seqlen = image_grid_hws[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
+ content = content.replace(
+ IMAGE_PLACEHOLDER,
+ f"<|media_start|>image<|media_content|>{self.image_token * image_seqlen}<|media_end|>",
+ 1,
+ )
+ num_image_tokens += 1
+
+ message["content"] = content
+
+ return messages
+
+
+@dataclass
+class Llama4Plugin(BasePlugin):
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ if "pixel_values" in mm_inputs:
+ image_height, image_width = mm_inputs["pixel_values"][0].shape[-2:]
+ num_patches_per_chunk = int(
+ (image_height // processor.patch_size)
+ * (image_width // processor.patch_size)
+ // processor.downsample_ratio
+ )
+ aspect_ratios = mm_inputs.pop("aspect_ratios")
+
+ num_image_tokens = 0
+ messages = deepcopy(messages)
+ for message in messages:
+ content = message["content"]
+ if self.expand_mm_tokens:
+ placeholder_count = content.count(IMAGE_PLACEHOLDER)
+ prompt_splits = content.split(IMAGE_PLACEHOLDER)
+ new_content = []
+ for local_image_index, split_part in enumerate(prompt_splits):
+ new_content.append(split_part)
+ if local_image_index < placeholder_count:
+ tokens_for_this_image = processor._prompt_split_image(
+ aspect_ratios[num_image_tokens], num_patches_per_chunk
+ )
+ num_image_tokens += 1
+ new_content.append(tokens_for_this_image)
+
+ content = "".join(new_content)
+ else:
+ content = content.replace(IMAGE_PLACEHOLDER, self.image_token)
+
+ message["content"] = content
+
+ return messages
+
+ @override
+ def get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ imglens: list[int],
+ vidlens: list[int],
+ audlens: list[int],
+ batch_ids: list[list[int]],
+ processor: Optional["MMProcessor"],
+ ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+ self._validate_input(processor, images, videos, audios)
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ mm_inputs.pop("aspect_ratios", None)
+ return mm_inputs
+
+
+@dataclass
+class LlavaPlugin(BasePlugin):
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ messages = deepcopy(messages)
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ if "pixel_values" in mm_inputs:
+ height, width = get_image_size(to_numpy_array(mm_inputs["pixel_values"][0]))
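+                # e.g. a 336x336 input with patch_size=14 gives 24 * 24 = 576 patches plus the additional
+                # image tokens, minus one again under the "default" feature selection strategy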
+ image_seqlen = (height // processor.patch_size) * (
+ width // processor.patch_size
+ ) + processor.num_additional_image_tokens
+ if processor.vision_feature_select_strategy == "default":
+ image_seqlen -= 1
+ else:
+ image_seqlen = 1
+
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ content = content.replace(IMAGE_PLACEHOLDER, "{{image}}" * image_seqlen, 1)
+
+ message["content"] = content.replace("{{image}}", self.image_token)
+
+ return messages
+
+
+@dataclass
+class LlavaNextPlugin(BasePlugin):
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ num_image_tokens = 0
+ messages = deepcopy(messages)
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ if "pixel_values" in mm_inputs:
+ image_sizes = iter(mm_inputs["image_sizes"].tolist())
+ height, width = get_image_size(to_numpy_array(mm_inputs["pixel_values"][0][0]))
+
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ if self.expand_mm_tokens:
+ orig_height, orig_width = next(image_sizes)
+ image_seqlen = processor._get_number_of_features(orig_height, orig_width, height, width)
+ if processor.vision_feature_select_strategy == "default":
+ image_seqlen -= 1
+ else:
+ image_seqlen = 1
+
+ content = content.replace(IMAGE_PLACEHOLDER, "{{image}}" * image_seqlen, 1)
+ num_image_tokens += 1
+
+ message["content"] = content.replace("{{image}}", self.image_token)
+
+ return messages
+
+
+@dataclass
+class LlavaNextVideoPlugin(BasePlugin):
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ messages = deepcopy(messages)
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ if "pixel_values" in mm_inputs:
+ image_sizes = iter(mm_inputs["image_sizes"].tolist())
+ height, width = get_image_size(to_numpy_array(mm_inputs["pixel_values"][0][0]))
+
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ if self.expand_mm_tokens:
+ orig_height, orig_width = next(image_sizes)
+ image_seqlen = processor._get_number_of_features(orig_height, orig_width, height, width)
+ if processor.vision_feature_select_strategy == "default":
+ image_seqlen -= 1
+ else:
+ image_seqlen = 1
+
+ content = content.replace(IMAGE_PLACEHOLDER, "{{image}}" * image_seqlen, 1)
+
+ message["content"] = content.replace("{{image}}", self.image_token)
+
+ if self.expand_mm_tokens:
+ if "pixel_values_videos" in mm_inputs:
+ one_video = to_numpy_array(mm_inputs.get("pixel_values_videos")[0])
+ height, width = get_image_size(one_video[0])
+ num_frames = one_video.shape[0] # frame dim is always after batch dim
+ image_seqlen = (height // processor.patch_size) * (width // processor.patch_size)
+                video_seqlen = image_seqlen // 4 * num_frames  # dividing by 4 accounts for the average pooling layer
+ else:
+ video_seqlen = 1
+
+ for message in messages:
+ content = message["content"]
+ while VIDEO_PLACEHOLDER in content:
+ content = content.replace(VIDEO_PLACEHOLDER, "{{video}}" * video_seqlen, 1)
+
+ message["content"] = content.replace("{{video}}", self.video_token)
+
+ return messages
+
+
+@dataclass
+class MiniCPMVPlugin(BasePlugin):
+ @override
+ def _get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: "MMProcessor",
+ **kwargs,
+ ) -> dict[str, "torch.Tensor"]:
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+ mm_inputs = {}
+ if len(images) != 0:
+ images = self._regularize_images(
+ images,
+ image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+ image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+ )["images"]
+ if "valid_image_nums_ls" in kwargs:
+ valid_image_nums_ls = kwargs["valid_image_nums_ls"]
+ new_images = []
+ idx = 0
+ for valid_image_nums in valid_image_nums_ls:
+ new_images.append(images[idx : idx + valid_image_nums])
+ idx += valid_image_nums
+
+ images = new_images
+
+ image_inputs = image_processor(
+ images, do_pad=True, max_slice_nums=image_processor.max_slice_nums, return_tensors="pt"
+ )
+ mm_inputs.update(image_inputs)
+
+ if len(videos) != 0:
+ videos = self._regularize_videos(
+ videos,
+ image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+ image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+ video_fps=getattr(processor, "video_fps", 2.0),
+ video_maxlen=getattr(processor, "video_maxlen", 128),
+ )["videos"]
+ video_inputs = image_processor(videos, do_pad=True, max_slice_nums=2, return_tensors="pt")
+ mm_inputs.update(video_inputs)
+
+ if len(audios) != 0:
+ audios = self._regularize_audios(
+ audios,
+ sampling_rate=getattr(processor, "audio_sampling_rate", 16000),
+ )["audios"]
+ if "valid_audio_nums_ls" in kwargs:
+ valid_audio_nums_ls = kwargs["valid_audio_nums_ls"]
+ audios_ls = []
+ idx = 0
+ for valid_audio_nums in valid_audio_nums_ls:
+ audios_ls.append(audios[idx : idx + valid_audio_nums])
+ idx += valid_audio_nums
+ else:
+ audios_ls = [audios]
+
+ audio_features, audio_feature_lens, audio_phs = processor.audio_feature_extract(
+ audios_ls,
+ chunk_input=True,
+ sampling_rate=getattr(processor, "audio_sampling_rate", 16000),
+ )
+ audio_feature_lens = [torch.tensor(audio_feature_len) for audio_feature_len in audio_feature_lens]
+ mm_inputs.update({"audio_features": audio_features, "audio_feature_lens": audio_feature_lens})
+ if kwargs.get("ret_phs", False):
+ mm_inputs.update({"audio_phs": audio_phs})
+
+ return mm_inputs
+
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ num_image_tokens, num_video_tokens, num_audio_tokens = 0, 0, 0
+ messages = deepcopy(messages)
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+ mm_inputs, audio_inputs = {}, {}
+ if len(images) != 0 and len(videos) != 0:
+ raise ValueError("MiniCPM-V model does not support input images and videos at the same time.")
+
+ if len(videos) != 0:
+ max_slice_nums = 2
+ use_image_id = False
+ mm_inputs = self._get_mm_inputs([], videos, [], processor)
+ else:
+ max_slice_nums = image_processor.max_slice_nums
+ use_image_id = image_processor.use_image_id
+
+ for i, message in enumerate(messages):
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ content = content.replace(IMAGE_PLACEHOLDER, "{{image}}", 1)
+ num_image_tokens += 1
+
+ while VIDEO_PLACEHOLDER in content:
+ video_seqlen = len(mm_inputs["pixel_values"][num_video_tokens]) if self.expand_mm_tokens else 1
+ content = content.replace(VIDEO_PLACEHOLDER, "{{image}}" * video_seqlen, 1)
+ num_video_tokens += 1
+
+ while AUDIO_PLACEHOLDER in content:
+ content = content.replace(AUDIO_PLACEHOLDER, "{{audio}}", 1)
+ num_audio_tokens += 1
+
+ message["content"] = content.replace("{{image}}", "(./)").replace(
+ "{{audio}}", "()"
+ )
+
+ if len(images):
+ mm_inputs = self._get_mm_inputs(images, [], [], processor)
+
+ if len(audios):
+ audio_inputs = self._get_mm_inputs([], [], audios, processor, ret_phs=True)
+
+ if self.expand_mm_tokens and mm_inputs:
+ pattern = "(./)"
+ image_sizes = mm_inputs["image_sizes"]
+ idx = 0
+ for index, message in enumerate(messages):
+ text = message["content"]
+ image_tags = re.findall(pattern, text)
+ text_chunks = text.split(pattern)
+ final_text = ""
+ for i in range(len(image_tags)):
+ final_text = (
+ final_text
+ + text_chunks[i]
+ + image_processor.get_slice_image_placeholder(
+ image_sizes[0][idx], idx, max_slice_nums, use_image_id
+ )
+ )
+ idx += 1
+
+ final_text += text_chunks[-1]
+ messages[index]["content"] = final_text
+
+ if self.expand_mm_tokens and audio_inputs:
+ pattern = "()"
+ idx = 0
+ for index, message in enumerate(messages):
+ text = message["content"]
+ audio_tags = re.findall(pattern, text)
+ text_chunks = text.split(pattern)
+ final_text = ""
+ for i in range(len(audio_tags)):
+ audio_placeholder = audio_inputs["audio_phs"][0][idx]
+ final_text = final_text + text_chunks[i] + audio_placeholder
+ idx += 1
+
+ final_text += text_chunks[-1]
+ messages[index]["content"] = final_text
+
+ return messages
+
+ @override
+ def get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ imglens: list[int],
+ vidlens: list[int],
+ audlens: list[int],
+ batch_ids: list[list[int]],
+ processor: Optional["MMProcessor"],
+ ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+ self._validate_input(processor, images, videos, audios)
+ # image bound
+ image_bounds_list = []
+ valid_image_nums_ls = []
+ for i, input_ids in enumerate(batch_ids):
+ input_ids_ = torch.tensor(input_ids)
+ start_cond = (input_ids_ == processor.tokenizer.im_start_id) | (
+ input_ids_ == processor.tokenizer.slice_start_id
+ )
+ end_cond = (input_ids_ == processor.tokenizer.im_end_id) | (input_ids_ == processor.tokenizer.slice_end_id)
+ image_start_tokens = torch.where(start_cond)[0]
+ image_start_tokens += 1
+ image_end_tokens = torch.where(end_cond)[0]
+ valid_image_nums_ls.append(imglens[i])
+ image_bounds = torch.hstack(
+ [
+ image_start_tokens.unsqueeze(-1),
+ image_end_tokens.unsqueeze(-1),
+ ]
+ )
+ image_bounds_list.append(image_bounds)
+
+ mm_inputs = self._get_mm_inputs(images, videos, [], processor, valid_image_nums_ls=valid_image_nums_ls)
+ if "tgt_sizes" not in mm_inputs:
+ dummy_data = [torch.empty(0) for _ in range(len(batch_ids))]
+ mm_inputs.update({"tgt_sizes": dummy_data, "pixel_values": dummy_data, "image_sizes": dummy_data})
+
+ mm_inputs.update({"image_bound": image_bounds_list})
+
+ if len(audios) > 0:
+ # audio bound
+ audio_bounds_ls = []
+ spk_bounds_ls = []
+ valid_audio_nums_ls = []
+
+ for input_ids, audiolen in zip(batch_ids, audlens):
+ input_ids_ = torch.tensor(input_ids)
+ audio_start_idx = torch.where(input_ids_ == processor.tokenizer.audio_start_id)[0]
+ audio_end_idx = torch.where(input_ids_ == processor.tokenizer.audio_end_id)[0]
+ assert len(audio_start_idx) == len(audio_end_idx)
+ audio_bounds = torch.hstack([(audio_start_idx + 1).unsqueeze(-1), audio_end_idx.unsqueeze(-1)])
+ audio_bounds_ls.append(audio_bounds)
+ valid_audio_nums_ls.append(audiolen)
+
+ spk_start_idx = torch.where(input_ids_ == processor.tokenizer.spk_start_id)[0]
+ spk_end_idx = torch.where(input_ids_ == processor.tokenizer.spk_end_id)[0]
+ assert len(spk_start_idx) == len(spk_end_idx)
+ spk_bounds = torch.hstack([(spk_start_idx + 1).unsqueeze(-1), spk_end_idx.unsqueeze(-1)])
+ spk_bounds_ls.append(spk_bounds)
+
+ audio_inputs = self._get_mm_inputs([], [], audios, processor, valid_audio_nums_ls=valid_audio_nums_ls)
+ mm_inputs.update(audio_inputs)
+ mm_inputs.update({"audio_bounds": audio_bounds_ls, "spk_bounds": spk_bounds_ls})
+
+ return mm_inputs
+
+
+@dataclass
+class MllamaPlugin(BasePlugin):
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ num_image_tokens = 0
+ messages = deepcopy(messages)
+ for message in messages:
+ content = message["content"]
+ num_image_tokens += content.count(IMAGE_PLACEHOLDER)
+ message["content"] = content.replace(IMAGE_PLACEHOLDER, self.image_token)
+
+ return messages
+
+ @override
+ def get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ imglens: list[int],
+ vidlens: list[int],
+ audlens: list[int],
+ batch_ids: list[list[int]],
+ processor: Optional["MMProcessor"],
+ ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+ self._validate_input(processor, images, videos, audios)
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor, imglens)
+ if mm_inputs:
+ num_tiles = mm_inputs.pop("num_tiles")
+ image_token_id: int = getattr(processor, "image_token_id")
+ max_image_tiles: int = getattr(processor.image_processor, "max_image_tiles")
+ cross_attention_token_mask = [
+ get_cross_attention_token_mask(input_ids, image_token_id) for input_ids in batch_ids
+ ]
+ mm_inputs["cross_attention_mask"] = torch.from_numpy(
+ convert_sparse_cross_attention_mask_to_dense(
+ cross_attention_token_mask,
+ num_tiles=num_tiles,
+ max_num_tiles=max_image_tiles,
+ length=max(len(input_ids) for input_ids in batch_ids),
+ )
+ ) # shape: (batch_size, length, max_num_images, max_num_tiles)
+
+ return mm_inputs
+
+
+@dataclass
+class PaliGemmaPlugin(BasePlugin):
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ num_image_tokens = 0
+ messages = deepcopy(messages)
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ content = content.replace(IMAGE_PLACEHOLDER, "", 1)
+ num_image_tokens += 1
+
+ message["content"] = content
+
+ return messages
+
+ @override
+ def process_token_ids(
+ self,
+ input_ids: list[int],
+ labels: Optional[list[int]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ tokenizer: "PreTrainedTokenizer",
+ processor: Optional["MMProcessor"],
+ ) -> tuple[list[int], Optional[list[int]]]:
+ self._validate_input(processor, images, videos, audios)
+ num_images = len(images)
+ image_seqlen = processor.image_seq_length if self.expand_mm_tokens else 0 # skip mm token
+ image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+ input_ids = [image_token_id] * num_images * image_seqlen + input_ids
+ if labels is not None:
+ labels = [IGNORE_INDEX] * num_images * image_seqlen + labels
+
+ return input_ids, labels
+
+ @override
+ def get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ imglens: list[int],
+ vidlens: list[int],
+ audlens: list[int],
+ batch_ids: list[list[int]],
+ processor: Optional["MMProcessor"],
+ ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+ self._validate_input(processor, images, videos, audios)
+ seqlens = [len(input_ids) for input_ids in batch_ids]
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ mm_inputs["token_type_ids"] = _get_paligemma_token_type_ids(imglens, seqlens, processor)
+ return mm_inputs
+
+
+@dataclass
+class PixtralPlugin(BasePlugin):
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ messages = deepcopy(messages)
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ if "pixel_values" in mm_inputs:
+ # BC for transformers < 4.49.0
+ if isinstance(mm_inputs["image_sizes"], list):
+ image_sizes = iter(mm_inputs["image_sizes"][0])
+ else:
+ image_sizes = iter(mm_inputs["image_sizes"].tolist())
+
+ image_break_token: str = getattr(processor, "image_break_token")
+ image_end_token: str = getattr(processor, "image_end_token")
+
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ if self.expand_mm_tokens:
+ patch_size = processor.patch_size * getattr(processor, "spatial_merge_size", 1)
+ height, width = next(image_sizes)
+ num_height_tokens = height // patch_size
+ num_width_tokens = width // patch_size
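+                    # lay out image tokens row by row, append image_break_token after each row,
+                    # and replace the trailing break with image_end_token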
+ replace_tokens = [[self.image_token] * num_width_tokens + [image_break_token]] * num_height_tokens
+ replace_tokens = [item for sublist in replace_tokens for item in sublist] # flatten list
+ replace_tokens[-1] = image_end_token
+ replace_str = "".join(replace_tokens)
+ else:
+ replace_str = self.image_token
+
+ content = content.replace(IMAGE_PLACEHOLDER, replace_str, 1)
+
+ message["content"] = content
+
+ return messages
+
+ @override
+ def get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ imglens: list[int],
+ vidlens: list[int],
+ audlens: list[int],
+ batch_ids: list[list[int]],
+ processor: Optional["MMProcessor"],
+ ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+ self._validate_input(processor, images, videos, audios)
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+        # ref: https://github.com/huggingface/transformers/pull/35122
+        # since transformers 4.49.0, `image_sizes` is a mandatory input for the Pixtral vision encoder forward pass
+        # and can be passed to `LlavaConditionalGeneration` as a parameter.
+ if not is_transformers_version_greater_than("4.49.0"):
+ mm_inputs.pop("image_sizes", None)
+ return mm_inputs
+
+
+@dataclass
+class Qwen2AudioPlugin(BasePlugin):
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ bos_token: str = getattr(processor, "audio_bos_token")
+ eos_token: str = getattr(processor, "audio_eos_token")
+ messages = deepcopy(messages)
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs([], [], audios, processor)
+ if "feature_attention_mask" in mm_inputs:
+ audio_lengths = mm_inputs["feature_attention_mask"].sum(-1).tolist()
+
+ for message in messages:
+ content = message["content"]
+ while AUDIO_PLACEHOLDER in content:
+ if self.expand_mm_tokens:
+ audio_length = audio_lengths.pop(0)
+ input_length = (audio_length - 1) // 2 + 1
+ audio_seqlen = (input_length - 2) // 2 + 1
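+                    # e.g. audio_length=3000 -> input_length=1500 -> audio_seqlen=750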
+ else:
+ audio_seqlen = 1
+
+ content = content.replace(
+ AUDIO_PLACEHOLDER, f"{bos_token}{self.audio_token * audio_seqlen}{eos_token}", 1
+ )
+
+ message["content"] = content
+
+ return messages
+
+ @override
+ def get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ imglens: list[int],
+ vidlens: list[int],
+ audlens: list[int],
+ batch_ids: list[list[int]],
+ processor: Optional["MMProcessor"],
+ ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+ self._validate_input(processor, images, videos, audios)
+ return self._get_mm_inputs(images, videos, audios, processor)
+
+
+@dataclass
+class Qwen2VLPlugin(BasePlugin):
+ vision_bos_token: str = "<|vision_start|>"
+ vision_eos_token: str = "<|vision_end|>"
+
+ @override
+ def _preprocess_image(self, image: "ImageObject", **kwargs) -> "ImageObject":
+ image = super()._preprocess_image(image, **kwargs)
+ if min(image.width, image.height) < 28:
+ width, height = max(image.width, 28), max(image.height, 28)
+ image = image.resize((width, height))
+
+ if image.width / image.height > 200:
+ width, height = image.height * 180, image.height
+ image = image.resize((width, height))
+
+ if image.height / image.width > 200:
+ width, height = image.width, image.width * 180
+ image = image.resize((width, height))
+
+ return image
+
+ @override
+ def _regularize_videos(self, videos: list["VideoInput"], **kwargs) -> "RegularizedVideoOutput":
+ results, fps_per_video, durations = [], [], []
+ for video in videos:
+ frames: list[ImageObject] = []
+ if _check_video_is_nested_images(video):
+ for frame in video:
+ if not is_valid_image(frame) and not isinstance(frame, dict) and not os.path.exists(frame):
+ raise ValueError("Invalid image found in video frames.")
+
+ frames = video
+ fps_per_video.append(kwargs.get("video_fps", 2.0))
+ durations.append(len(frames) / kwargs.get("video_fps", 2.0))
+ else:
+ container = av.open(video, "r")
+ video_stream = next(stream for stream in container.streams if stream.type == "video")
+ sample_indices = self._get_video_sample_indices(video_stream, **kwargs)
+ container.seek(0)
+ for frame_idx, frame in enumerate(container.decode(video_stream)):
+ if frame_idx in sample_indices:
+ frames.append(frame.to_image())
+
+ if video_stream.duration is None:
+ fps_per_video.append(kwargs.get("video_fps", 2.0))
+ durations.append(len(frames) / kwargs.get("video_fps", 2.0))
+ else:
+ fps_per_video.append(len(sample_indices) / float(video_stream.duration * video_stream.time_base))
+ durations.append(float(video_stream.duration * video_stream.time_base))
+
+ if len(frames) % 2 != 0:
+ frames.append(frames[-1])
+
+ frames = self._regularize_images(frames, **kwargs)["images"]
+ results.append(frames)
+
+ return {"videos": results, "fps_per_video": fps_per_video, "durations": durations}
+
+ @override
+ def _get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: "MMProcessor",
+ ) -> dict[str, "torch.Tensor"]:
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+ video_processor: BaseVideoProcessor = getattr(processor, "video_processor", None)
+ mm_inputs = {}
+ if len(images) != 0:
+ images = self._regularize_images(
+ images,
+ image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+ image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+ )["images"]
+ mm_inputs.update(image_processor(images, return_tensors="pt"))
+
+ if len(videos) != 0:
+ video_data = self._regularize_videos(
+ videos,
+ image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+ image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+ video_fps=getattr(processor, "video_fps", 2.0),
+ video_maxlen=getattr(processor, "video_maxlen", 128),
+ )
+ mm_inputs.update(video_processor(videos=video_data["videos"], return_tensors="pt"))
+ temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
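+            # each temporal grid step spans temporal_patch_size / fps seconds, e.g. 2 / 2.0 = 1.0 second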
+ if "second_per_grid_ts" in processor.model_input_names:
+ mm_inputs["second_per_grid_ts"] = [temporal_patch_size / fps for fps in video_data["fps_per_video"]]
+
+ return mm_inputs
+
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ num_image_tokens, num_video_tokens = 0, 0
+ messages = deepcopy(messages)
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+
+ merge_length: int = getattr(image_processor, "merge_size") ** 2
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ image_grid_thw = mm_inputs.get("image_grid_thw", [])
+ video_grid_thw = mm_inputs.get("video_grid_thw", [])
+ else:
+ image_grid_thw = [None] * len(images)
+ video_grid_thw = [None] * len(videos)
+
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
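+                # e.g. image_grid_thw = (1, 32, 32) with merge_size=2 -> 1 * 32 * 32 // 4 = 256 image tokens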
+ content = content.replace(
+ IMAGE_PLACEHOLDER,
+ f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+ 1,
+ )
+ num_image_tokens += 1
+
+ while VIDEO_PLACEHOLDER in content:
+ video_seqlen = video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
+ content = content.replace(
+ VIDEO_PLACEHOLDER,
+ f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+ 1,
+ )
+ num_video_tokens += 1
+
+ message["content"] = content
+
+ return messages
+
+
+@dataclass
+class Qwen3VLPlugin(Qwen2VLPlugin):
+ @override
+ def _get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: "MMProcessor",
+ ) -> dict[str, "torch.Tensor"]:
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+ video_processor: BaseImageProcessor = getattr(processor, "video_processor", None)
+ mm_inputs = {}
+ if len(images) != 0:
+ images = self._regularize_images(
+ images,
+ image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+ image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+ )["images"]
+ mm_inputs.update(image_processor(images, return_tensors="pt"))
+
+ if len(videos) != 0:
+ videos = self._regularize_videos(
+ videos,
+ image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+ image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+ video_fps=getattr(processor, "video_fps", 2.0),
+ video_maxlen=getattr(processor, "video_maxlen", 128),
+ )
+ video_metadata = [
+ {"fps": getattr(processor, "video_fps", 24.0), "duration": duration, "total_num_frames": len(video)}
+ for video, duration in zip(videos["videos"], videos["durations"])
+ ]
+ mm_inputs.update(
+ video_processor(videos=videos["videos"], video_metadata=video_metadata, return_metadata=True)
+ )
+ temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
+ if "second_per_grid_ts" in processor.model_input_names:
+ mm_inputs["second_per_grid_ts"] = [temporal_patch_size / fps for fps in videos["fps_per_video"]]
+
+ return mm_inputs
+
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ num_image_tokens, num_video_tokens = 0, 0
+ messages = deepcopy(messages)
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+ video_processor: BaseImageProcessor = getattr(processor, "video_processor")
+
+ image_merge_length: int = getattr(image_processor, "merge_size") ** 2
+ video_merge_length: int = getattr(video_processor, "merge_size") ** 2
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ image_grid_thw = mm_inputs.get("image_grid_thw", [])
+ video_grid_thw = mm_inputs.get("video_grid_thw", [])
+            num_frames = video_grid_thw[0][0] if len(video_grid_thw) > 0 else 0  # hard-coded to the first video for now
+ video_metadata = mm_inputs.get("video_metadata", {})
+
+ else:
+ image_grid_thw = [None] * len(images)
+ video_grid_thw = [None] * len(videos)
+ num_frames = 0
+ timestamps = [0]
+
+ for idx, message in enumerate(messages):
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ image_seqlen = (
+ image_grid_thw[num_image_tokens].prod() // image_merge_length if self.expand_mm_tokens else 1
+ )
+ content = content.replace(
+ IMAGE_PLACEHOLDER,
+ f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+ 1,
+ )
+ num_image_tokens += 1
+
+ while VIDEO_PLACEHOLDER in content:
+ if self.expand_mm_tokens:
+ metadata = video_metadata[idx]
+ timestamps = processor._calculate_timestamps(
+ metadata.frames_indices,
+ metadata.fps,
+ video_processor.merge_size,
+ )
+ video_structure = ""
+ for frame_index in range(num_frames):
+ video_seqlen = (
+ video_grid_thw[num_video_tokens][1:].prod() // video_merge_length
+ if self.expand_mm_tokens
+ else 1
+ )
+ timestamp_sec = timestamps[frame_index]
+ frame_structure = (
+ f"<{timestamp_sec:.1f} seconds>"
+ f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}"
+ )
+ video_structure += frame_structure
+ else:
+ video_structure = f"{self.vision_bos_token}{self.video_token}{self.vision_eos_token}"
+
+ content = content.replace(VIDEO_PLACEHOLDER, video_structure, 1)
+ num_video_tokens += 1
+
+ message["content"] = content
+
+ return messages
+
+
+@dataclass
+class GLM4VPlugin(Qwen2VLPlugin):
+ @override
+ def _get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: "MMProcessor",
+ ) -> dict[str, "torch.Tensor"]:
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+ video_processor: BaseImageProcessor = getattr(processor, "video_processor", None)
+ mm_inputs = {}
+ if len(images) != 0:
+ images = self._regularize_images(
+ images,
+ image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+ image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+ )["images"]
+ mm_inputs.update(image_processor(images, return_tensors="pt"))
+
+ if len(videos) != 0:
+ video_data = self._regularize_videos(
+ videos,
+ image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+ image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+ video_fps=getattr(processor, "video_fps", 2.0),
+ video_maxlen=getattr(processor, "video_maxlen", 128),
+ )
+ # prepare video metadata
+ video_metadata = [
+ {"fps": 2, "duration": duration, "total_frames": len(video)}
+ for video, duration in zip(video_data["videos"], video_data["durations"])
+ ]
+ mm_inputs.update(video_processor(images=None, videos=video_data["videos"], video_metadata=video_metadata))
+
+ return mm_inputs
+
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ num_image_tokens, num_video_tokens = 0, 0
+ messages = deepcopy(messages)
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor")
+
+ merge_length: int = getattr(image_processor, "merge_size") ** 2
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ image_grid_thw = mm_inputs.get("image_grid_thw", [])
+ video_grid_thw = mm_inputs.get("video_grid_thw", [])
+ num_frames = video_grid_thw[0][0] if len(video_grid_thw) > 0 else 0 # hardcoded for now: use the first video's frame count
+ timestamps = mm_inputs.get("timestamps", [])
+
+ if hasattr(timestamps, "tolist"):
+ timestamps = timestamps.tolist()
+
+ if not timestamps:
+ timestamps_list = []
+ elif isinstance(timestamps[0], list):
+ timestamps_list = timestamps[0]
+ else:
+ timestamps_list = timestamps
+
+ unique_timestamps = timestamps_list.copy()
+ selected_timestamps = unique_timestamps[:num_frames]
+ while len(selected_timestamps) < num_frames:
+ selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)
+
+ else:
+ image_grid_thw = [None] * len(images)
+ video_grid_thw = [None] * len(videos)
+ num_frames = 0
+ selected_timestamps = [0]
+
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
+ content = content.replace(
+ IMAGE_PLACEHOLDER, f"<|begin_of_image|>{self.image_token * image_seqlen}<|end_of_image|>", 1
+ )
+ num_image_tokens += 1
+
+ while VIDEO_PLACEHOLDER in content:
+ video_structure = ""
+ for frame_index in range(num_frames):
+ video_seqlen = (
+ video_grid_thw[num_video_tokens][1:].prod() // merge_length if self.expand_mm_tokens else 1
+ )
+ timestamp_sec = selected_timestamps[frame_index]
+ frame_structure = (
+ f"<|begin_of_image|>{self.image_token * video_seqlen}<|end_of_image|>{timestamp_sec}"
+ )
+ video_structure += frame_structure
+
+ if not self.expand_mm_tokens:
+ video_structure = self.video_token
+
+ content = content.replace(VIDEO_PLACEHOLDER, f"<|begin_of_video|>{video_structure}<|end_of_video|>", 1)
+ num_video_tokens += 1
+
+ message["content"] = content
+
+ return messages
+
+ @override
+ def get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ imglens: list[int],
+ vidlens: list[int],
+ audlens: list[int],
+ batch_ids: list[list[int]],
+ processor: Optional["ProcessorMixin"],
+ ) -> dict[str, Union[list[int], "torch.Tensor"]]:
+ self._validate_input(processor, images, videos, audios)
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ mm_inputs.pop("timestamps", None)
+ return mm_inputs
+
+
+@dataclass
+class Qwen2OmniPlugin(Qwen2VLPlugin):
+ audio_bos_token: str = "<|audio_start|>"
+ audio_eos_token: str = "<|audio_end|>"
+
+ @override
+ def _get_mm_inputs(
+ self,
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: "MMProcessor",
+ ) -> dict[str, "torch.Tensor"]:
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+ video_processor: BaseVideoProcessor = getattr(processor, "video_processor", None)
+ feature_extractor: SequenceFeatureExtractor = getattr(processor, "feature_extractor", None)
+ mm_inputs = {}
+ if len(images) != 0:
+ images = self._regularize_images(
+ images,
+ image_max_pixels=getattr(processor, "image_max_pixels", 768 * 768),
+ image_min_pixels=getattr(processor, "image_min_pixels", 32 * 32),
+ )["images"]
+ mm_inputs.update(image_processor(images, return_tensors="pt"))
+
+ if len(videos) != 0:
+ video_dict = self._regularize_videos(
+ videos,
+ image_max_pixels=getattr(processor, "video_max_pixels", 256 * 256),
+ image_min_pixels=getattr(processor, "video_min_pixels", 16 * 16),
+ video_fps=getattr(processor, "video_fps", 2.0),
+ video_maxlen=getattr(processor, "video_maxlen", 128),
+ )
+ mm_inputs.update(video_processor(videos=video_dict["videos"], return_tensors="pt"))
+ temporal_patch_size: int = getattr(image_processor, "temporal_patch_size", 2)
+ mm_inputs["video_second_per_grid"] = torch.tensor(
+ [temporal_patch_size / fps for fps in video_dict["fps_per_video"]]
+ )
+
+ if len(audios) != 0:
+ audios = self._regularize_audios(
+ audios,
+ sampling_rate=getattr(processor, "audio_sampling_rate", 16000),
+ )["audios"]
+ mm_inputs.update(
+ feature_extractor(
+ audios,
+ sampling_rate=getattr(processor, "audio_sampling_rate", 16000),
+ return_attention_mask=True,
+ padding="max_length",
+ return_tensors="pt",
+ )
+ )
+ mm_inputs["feature_attention_mask"] = mm_inputs.pop("attention_mask") # prevent conflicts
+
+ return mm_inputs
+
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ num_image_tokens, num_video_tokens, num_audio_tokens = 0, 0, 0
+ messages = deepcopy(messages)
+ image_processor: BaseImageProcessor = getattr(processor, "image_processor", None)
+
+ merge_length = processor.image_processor.merge_size**2
+ use_audio_in_video = getattr(processor, "use_audio_in_video", False)
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ image_grid_thw = mm_inputs.get("image_grid_thw", [])
+ video_grid_thw = mm_inputs.get("video_grid_thw", [])
+ if "feature_attention_mask" in mm_inputs:
+ if processor.__class__.__name__ == "Qwen3OmniMoeProcessor": # for qwen3omni
+ input_lengths = mm_inputs["feature_attention_mask"].sum(-1)
+ input_lengths_leave = input_lengths % 100
+ feature_lengths = (input_lengths_leave - 1) // 2 + 1
+ audio_lengths = ((feature_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
+ else:
+ input_lengths = (mm_inputs["feature_attention_mask"].sum(-1).numpy() - 1) // 2 + 1
+ audio_lengths = (input_lengths - 2) // 2 + 1
+ else:
+ mm_inputs = {}
+ image_grid_thw = [None] * len(images)
+ video_grid_thw = [None] * len(videos)
+ audio_lengths = [None] * len(audios)
+
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ image_seqlen = image_grid_thw[num_image_tokens].prod() // merge_length if self.expand_mm_tokens else 1
+ content = content.replace(
+ IMAGE_PLACEHOLDER,
+ f"{self.vision_bos_token}{self.image_token * image_seqlen}{self.vision_eos_token}",
+ 1,
+ )
+ num_image_tokens += 1
+
+ if (
+ use_audio_in_video and len(audios) and len(videos)
+ ): # use the audio track of the video: handle video tokens and audio tokens together
+ if len(videos) != len(audios):
+ raise ValueError(
+ f"Number of videos ({len(videos)}) must match number of audios ({len(audios)}) when using audio in video."
+ )
+
+ while VIDEO_PLACEHOLDER in content:
+ video_pos = content.find(VIDEO_PLACEHOLDER)
+ audio_pos = content.find(AUDIO_PLACEHOLDER, video_pos)
+ if audio_pos == -1 or audio_pos < video_pos:
+ raise ValueError(
+ f"Each {VIDEO_PLACEHOLDER} must be followed by an {AUDIO_PLACEHOLDER} when using audio in video."
+ )
+
+ audio_t_index = torch.arange(audio_lengths[num_audio_tokens])
+ video_t_index = (
+ torch.arange(video_grid_thw[num_video_tokens][0])
+ .view(-1, 1, 1)
+ .expand(
+ -1,
+ video_grid_thw[num_video_tokens][1] // image_processor.merge_size,
+ video_grid_thw[num_video_tokens][2] // image_processor.merge_size,
+ )
+ .flatten()
+ * mm_inputs["video_second_per_grid"][num_video_tokens]
+ * 25 # FIXME hardcode of position_id_per_seconds=25
+ ).long()
+ t_ntoken_per_chunk = 50 # FIXME hardcode: [25 * 2]
+ video_chunk_indices = processor.get_chunked_index(video_t_index, t_ntoken_per_chunk)
+ audio_chunk_indices = processor.get_chunked_index(audio_t_index, t_ntoken_per_chunk)
+ placeholder_string = ""
+ placeholder_string += self.vision_bos_token + self.audio_bos_token
+ for j in range(max(len(video_chunk_indices), len(audio_chunk_indices))):
+ video_chunk_index = video_chunk_indices[j] if j < len(video_chunk_indices) else None
+ audio_chunk_index = audio_chunk_indices[j] if j < len(audio_chunk_indices) else None
+ if video_chunk_index is not None:
+ placeholder_string += self.video_token * (video_chunk_index[1] - video_chunk_index[0])
+
+ if audio_chunk_index is not None:
+ placeholder_string += self.audio_token * (audio_chunk_index[1] - audio_chunk_index[0])
+
+ placeholder_string += self.audio_eos_token + self.vision_eos_token
+ content = content.replace(VIDEO_PLACEHOLDER, placeholder_string, 1)
+ content = content.replace(AUDIO_PLACEHOLDER, "", 1)
+ num_audio_tokens += 1
+ num_video_tokens += 1
+ else:
+ while AUDIO_PLACEHOLDER in content:
+ audio_seqlen = audio_lengths[num_audio_tokens] if self.expand_mm_tokens else 1
+ content = content.replace(
+ AUDIO_PLACEHOLDER,
+ f"{self.audio_bos_token}{self.audio_token * audio_seqlen}{self.audio_eos_token}",
+ 1,
+ )
+ num_audio_tokens += 1
+
+ while VIDEO_PLACEHOLDER in content:
+ video_seqlen = (
+ video_grid_thw[num_video_tokens].prod() // merge_length if self.expand_mm_tokens else 1
+ )
+ content = content.replace(
+ VIDEO_PLACEHOLDER,
+ f"{self.vision_bos_token}{self.video_token * video_seqlen}{self.vision_eos_token}",
+ 1,
+ )
+ num_video_tokens += 1
+
+ message["content"] = content
+
+ return messages
+
+
+@dataclass
+class VideoLlavaPlugin(BasePlugin):
+ @override
+ def process_messages(
+ self,
+ messages: list[dict[str, str]],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ processor: Optional["MMProcessor"],
+ ) -> list[dict[str, str]]:
+ self._validate_input(processor, images, videos, audios)
+ self._validate_messages(messages, images, videos, audios)
+ num_image_tokens, num_video_tokens = 0, 0
+ messages = deepcopy(messages)
+ num_frames = 0
+ if self.expand_mm_tokens:
+ mm_inputs = self._get_mm_inputs(images, videos, audios, processor)
+ if "pixel_values_images" in mm_inputs:
+ height, width = get_image_size(to_numpy_array(mm_inputs["pixel_values_images"][0]))
+ num_frames = 1
+
+ if "pixel_values_videos" in mm_inputs:
+ one_video = to_numpy_array(mm_inputs["pixel_values_videos"][0])
+ height, width = get_image_size(one_video[0])
+ num_frames = one_video.shape[0] # frame dim is always after batch dim
+
+ if "pixel_values_images" in mm_inputs or "pixel_values_videos" in mm_inputs:
+ image_seqlen = (height // processor.patch_size) * (
+ width // processor.patch_size
+ ) + processor.num_additional_image_tokens
+ video_seqlen = image_seqlen * num_frames
+ if processor.vision_feature_select_strategy == "default":
+ image_seqlen -= 1
+ else:
+ image_seqlen, video_seqlen = 1, 1
+
+ for message in messages:
+ content = message["content"]
+ while IMAGE_PLACEHOLDER in content:
+ content = content.replace(IMAGE_PLACEHOLDER, "{{image}}" * image_seqlen, 1)
+ num_image_tokens += 1
+
+ while VIDEO_PLACEHOLDER in content:
+ content = content.replace(VIDEO_PLACEHOLDER, "{{video}}" * video_seqlen, 1)
+ num_video_tokens += 1
+
+ content = content.replace("{{image}}", self.image_token)
+ message["content"] = content.replace("{{video}}", self.video_token)
+
+ return messages
+
+
+PLUGINS = {
+ "base": BasePlugin,
+ "gemma3": Gemma3Plugin,
+ "glm4v": GLM4VPlugin,
+ "gemma3n": Gemma3nPlugin,
+ "intern_vl": InternVLPlugin,
+ "kimi_vl": KimiVLPlugin,
+ "llama4": Llama4Plugin,
+ "llava": LlavaPlugin,
+ "llava_next": LlavaNextPlugin,
+ "llava_next_video": LlavaNextVideoPlugin,
+ "minicpm_v": MiniCPMVPlugin,
+ "mllama": MllamaPlugin,
+ "paligemma": PaliGemmaPlugin,
+ "pixtral": PixtralPlugin,
+ "qwen2_audio": Qwen2AudioPlugin,
+ "qwen2_omni": Qwen2OmniPlugin,
+ "qwen2_vl": Qwen2VLPlugin,
+ "qwen3_vl": Qwen3VLPlugin,
+ "video_llava": VideoLlavaPlugin,
+}
+
+
+def register_mm_plugin(name: str, plugin_class: type["BasePlugin"]) -> None:
+ r"""Register a multimodal plugin."""
+ if name in PLUGINS:
+ raise ValueError(f"Multimodal plugin {name} already exists.")
+
+ PLUGINS[name] = plugin_class
+
+
+def get_mm_plugin(
+ name: str,
+ image_token: Optional[str] = None,
+ video_token: Optional[str] = None,
+ audio_token: Optional[str] = None,
+ **kwargs,
+) -> "BasePlugin":
+ r"""Get plugin for multimodal inputs."""
+ if name not in PLUGINS:
+ raise ValueError(f"Multimodal plugin `{name}` not found.")
+
+ return PLUGINS[name](image_token, video_token, audio_token, **kwargs)
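As a usage sketch (not part of this patch): the registry above can be extended with `register_mm_plugin` and queried with `get_mm_plugin`, assuming the `llamafactory.data.mm_plugin` module path introduced here; the plugin name and token strings below are illustrative only.

```python
# Illustrative sketch only -- not part of this PR's code.
from llamafactory.data.mm_plugin import BasePlugin, get_mm_plugin, register_mm_plugin


class MyPlugin(BasePlugin):
    """Custom plugin; override process_messages() and friends as needed."""


register_mm_plugin("my_plugin", MyPlugin)  # raises ValueError if the name already exists
plugin = get_mm_plugin("my_plugin", image_token="<image>", video_token="<video>", audio_token="<audio>")
```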
diff --git a/llamafactory/data/parser.py b/llamafactory/data/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a865fd83dfe6221f312a2030cd08a6b38366cfe
--- /dev/null
+++ b/llamafactory/data/parser.py
@@ -0,0 +1,149 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from dataclasses import dataclass
+from typing import Any, Literal, Optional, Union
+
+from huggingface_hub import hf_hub_download
+
+from ..extras.constants import DATA_CONFIG
+from ..extras.misc import use_modelscope, use_openmind
+
+
+@dataclass
+class DatasetAttr:
+ r"""Dataset attributes."""
+
+ # basic configs
+ load_from: Literal["hf_hub", "ms_hub", "om_hub", "script", "cloud_file", "file"]
+ dataset_name: str
+ formatting: Literal["alpaca", "sharegpt", "openai"] = "alpaca"
+ ranking: bool = False
+ # extra configs
+ subset: Optional[str] = None
+ split: str = "train"
+ folder: Optional[str] = None
+ num_samples: Optional[int] = None
+ # common columns
+ system: Optional[str] = None
+ tools: Optional[str] = None
+ images: Optional[str] = None
+ videos: Optional[str] = None
+ audios: Optional[str] = None
+ # dpo columns
+ chosen: Optional[str] = None
+ rejected: Optional[str] = None
+ kto_tag: Optional[str] = None
+ # alpaca columns
+ prompt: Optional[str] = "instruction"
+ query: Optional[str] = "input"
+ response: Optional[str] = "output"
+ history: Optional[str] = None
+ # sharegpt columns
+ messages: Optional[str] = "conversations"
+ # sharegpt tags
+ role_tag: Optional[str] = "from"
+ content_tag: Optional[str] = "value"
+ user_tag: Optional[str] = "human"
+ assistant_tag: Optional[str] = "gpt"
+ observation_tag: Optional[str] = "observation"
+ function_tag: Optional[str] = "function_call"
+ system_tag: Optional[str] = "system"
+
+ def __repr__(self) -> str:
+ return self.dataset_name
+
+ def set_attr(self, key: str, obj: dict[str, Any], default: Optional[Any] = None) -> None:
+ setattr(self, key, obj.get(key, default))
+
+ def join(self, attr: dict[str, Any]) -> None:
+ self.set_attr("formatting", attr, default="alpaca")
+ self.set_attr("ranking", attr, default=False)
+ self.set_attr("subset", attr)
+ self.set_attr("split", attr, default="train")
+ self.set_attr("folder", attr)
+ self.set_attr("num_samples", attr)
+
+ if "columns" in attr:
+ column_names = ["prompt", "query", "response", "history", "messages", "system", "tools"]
+ column_names += ["images", "videos", "audios", "chosen", "rejected", "kto_tag"]
+ for column_name in column_names:
+ self.set_attr(column_name, attr["columns"])
+
+ if "tags" in attr:
+ tag_names = ["role_tag", "content_tag"]
+ tag_names += ["user_tag", "assistant_tag", "observation_tag", "function_tag", "system_tag"]
+ for tag in tag_names:
+ self.set_attr(tag, attr["tags"])
+
+
+def get_dataset_list(dataset_names: Optional[list[str]], dataset_dir: Union[str, dict]) -> list["DatasetAttr"]:
+ r"""Get the attributes of the datasets."""
+ if dataset_names is None:
+ dataset_names = []
+
+ if isinstance(dataset_dir, dict):
+ dataset_info = dataset_dir
+ elif dataset_dir == "ONLINE":
+ dataset_info = None
+ else:
+ if dataset_dir.startswith("REMOTE:"):
+ config_path = hf_hub_download(repo_id=dataset_dir[7:], filename=DATA_CONFIG, repo_type="dataset")
+ else:
+ config_path = os.path.join(dataset_dir, DATA_CONFIG)
+
+ try:
+ with open(config_path) as f:
+ dataset_info = json.load(f)
+ except Exception as err:
+ if len(dataset_names) != 0:
+ raise ValueError(f"Cannot open {config_path} due to {str(err)}.")
+
+ dataset_info = None
+
+ dataset_list: list[DatasetAttr] = []
+ for name in dataset_names:
+ if dataset_info is None: # dataset_dir is ONLINE
+ load_from = "ms_hub" if use_modelscope() else "om_hub" if use_openmind() else "hf_hub"
+ dataset_attr = DatasetAttr(load_from, dataset_name=name)
+ dataset_list.append(dataset_attr)
+ continue
+
+ if name not in dataset_info:
+ raise ValueError(f"Undefined dataset {name} in {DATA_CONFIG}.")
+
+ has_hf_url = "hf_hub_url" in dataset_info[name]
+ has_ms_url = "ms_hub_url" in dataset_info[name]
+ has_om_url = "om_hub_url" in dataset_info[name]
+
+ if has_hf_url or has_ms_url or has_om_url:
+ if has_ms_url and (use_modelscope() or not has_hf_url):
+ dataset_attr = DatasetAttr("ms_hub", dataset_name=dataset_info[name]["ms_hub_url"])
+ elif has_om_url and (use_openmind() or not has_hf_url):
+ dataset_attr = DatasetAttr("om_hub", dataset_name=dataset_info[name]["om_hub_url"])
+ else:
+ dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
+ elif "script_url" in dataset_info[name]:
+ dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
+ elif "cloud_file_name" in dataset_info[name]:
+ dataset_attr = DatasetAttr("cloud_file", dataset_name=dataset_info[name]["cloud_file_name"])
+ else:
+ dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"])
+
+ dataset_attr.join(dataset_info[name])
+ dataset_list.append(dataset_attr)
+
+ return dataset_list
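A usage sketch for `get_dataset_list` above: passing a dict as `dataset_dir` skips the `dataset_info.json` lookup, so the example stays self-contained; the dataset entry below is hypothetical.

```python
# Illustrative sketch only -- hypothetical dataset entry.
from llamafactory.data.parser import get_dataset_list

dataset_info = {
    "alpaca_demo": {
        "file_name": "alpaca_demo.json",
        "formatting": "alpaca",
        "columns": {"prompt": "instruction", "query": "input", "response": "output"},
    }
}
(attr,) = get_dataset_list(["alpaca_demo"], dataset_info)
print(attr.load_from, attr.formatting, attr.prompt)  # file alpaca instruction
```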
diff --git a/llamafactory/data/processor/__init__.py b/llamafactory/data/processor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..357ab7899f9eecbd29344482d109b89af274ea2e
--- /dev/null
+++ b/llamafactory/data/processor/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .feedback import FeedbackDatasetProcessor
+from .pairwise import PairwiseDatasetProcessor
+from .pretrain import PretrainDatasetProcessor
+from .processor_utils import DatasetProcessor
+from .supervised import PackedSupervisedDatasetProcessor, SupervisedDatasetProcessor
+from .unsupervised import UnsupervisedDatasetProcessor
+
+
+__all__ = [
+ "DatasetProcessor",
+ "FeedbackDatasetProcessor",
+ "PackedSupervisedDatasetProcessor",
+ "PairwiseDatasetProcessor",
+ "PretrainDatasetProcessor",
+ "SupervisedDatasetProcessor",
+ "UnsupervisedDatasetProcessor",
+]
diff --git a/llamafactory/data/processor/__pycache__/__init__.cpython-312.pyc b/llamafactory/data/processor/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e85ff951b87bd8d7cd85626bac8829ab0a727db
Binary files /dev/null and b/llamafactory/data/processor/__pycache__/__init__.cpython-312.pyc differ
diff --git a/llamafactory/data/processor/__pycache__/feedback.cpython-312.pyc b/llamafactory/data/processor/__pycache__/feedback.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01a84077c2946d6287b73a6f0decf5ab36fe133f
Binary files /dev/null and b/llamafactory/data/processor/__pycache__/feedback.cpython-312.pyc differ
diff --git a/llamafactory/data/processor/__pycache__/pairwise.cpython-312.pyc b/llamafactory/data/processor/__pycache__/pairwise.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..448f8f1ced997cd41e862fe07863214e4be3f68d
Binary files /dev/null and b/llamafactory/data/processor/__pycache__/pairwise.cpython-312.pyc differ
diff --git a/llamafactory/data/processor/__pycache__/pretrain.cpython-312.pyc b/llamafactory/data/processor/__pycache__/pretrain.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85208325c8814f094b4466853fa293154e398576
Binary files /dev/null and b/llamafactory/data/processor/__pycache__/pretrain.cpython-312.pyc differ
diff --git a/llamafactory/data/processor/__pycache__/processor_utils.cpython-312.pyc b/llamafactory/data/processor/__pycache__/processor_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84248d8f528b3a457311a4d90c73173eef406822
Binary files /dev/null and b/llamafactory/data/processor/__pycache__/processor_utils.cpython-312.pyc differ
diff --git a/llamafactory/data/processor/__pycache__/supervised.cpython-312.pyc b/llamafactory/data/processor/__pycache__/supervised.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2619e0ba420f7ea44190d6d70bafb8b0245106d4
Binary files /dev/null and b/llamafactory/data/processor/__pycache__/supervised.cpython-312.pyc differ
diff --git a/llamafactory/data/processor/__pycache__/unsupervised.cpython-312.pyc b/llamafactory/data/processor/__pycache__/unsupervised.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..656d5b93ad6bf756fb5ba1cfd4a027954fa2e783
Binary files /dev/null and b/llamafactory/data/processor/__pycache__/unsupervised.cpython-312.pyc differ
diff --git a/llamafactory/data/processor/feedback.py b/llamafactory/data/processor/feedback.py
new file mode 100644
index 0000000000000000000000000000000000000000..871615b9266e501f25f68e84e4536c0d24617803
--- /dev/null
+++ b/llamafactory/data/processor/feedback.py
@@ -0,0 +1,129 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...extras import logging
+from ...extras.constants import IGNORE_INDEX
+from .processor_utils import DatasetProcessor, infer_seqlen
+
+
+if TYPE_CHECKING:
+ from ..mm_plugin import AudioInput, ImageInput, VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+class FeedbackDatasetProcessor(DatasetProcessor):
+ def _encode_data_example(
+ self,
+ prompt: list[dict[str, str]],
+ response: list[dict[str, str]],
+ kl_response: list[dict[str, str]],
+ system: Optional[str],
+ tools: Optional[str],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ ) -> tuple[list[int], list[int], list[int], list[int], bool]:
+ if response[0]["content"]: # desired example
+ kto_tag = True
+ messages = prompt + [response[0]]
+ else: # undesired example
+ kto_tag = False
+ messages = prompt + [response[1]]
+
+ if kl_response[0]["content"]:
+ kl_messages = prompt + [kl_response[0]]
+ else:
+ kl_messages = prompt + [kl_response[1]]
+
+ messages = self.template.mm_plugin.process_messages(messages, images, videos, audios, self.processor)
+ kl_messages = self.template.mm_plugin.process_messages(kl_messages, images, videos, audios, self.processor)
+ prompt_ids, response_ids = self.template.encode_oneturn(self.tokenizer, messages, system, tools)
+ kl_prompt_ids, kl_response_ids = self.template.encode_oneturn(self.tokenizer, kl_messages, system, tools)
+
+ if self.template.efficient_eos:
+ response_ids += [self.tokenizer.eos_token_id]
+ kl_response_ids += [self.tokenizer.eos_token_id]
+
+ prompt_ids, _ = self.template.mm_plugin.process_token_ids(
+ prompt_ids, None, images, videos, audios, self.tokenizer, self.processor
+ )
+ kl_prompt_ids, _ = self.template.mm_plugin.process_token_ids(
+ kl_prompt_ids, None, images, videos, audios, self.tokenizer, self.processor
+ )
+
+ source_len, target_len = infer_seqlen(len(prompt_ids), len(response_ids), self.data_args.cutoff_len)
+ prompt_ids = prompt_ids[:source_len]
+ response_ids = response_ids[:target_len]
+ kl_source_len, kl_target_len = infer_seqlen(
+ len(kl_prompt_ids), len(kl_response_ids), self.data_args.cutoff_len
+ )
+ kl_prompt_ids = kl_prompt_ids[:kl_source_len]
+ kl_response_ids = kl_response_ids[:kl_target_len]
+
+ input_ids = prompt_ids + response_ids
+ labels = [IGNORE_INDEX] * source_len + response_ids
+ kl_input_ids = kl_prompt_ids + kl_response_ids
+ kl_labels = [IGNORE_INDEX] * kl_source_len + kl_response_ids
+ return input_ids, labels, kl_input_ids, kl_labels, kto_tag
+
+ def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+ # Create mismatched prompt/completion pairs for the KL dataset by rotating the completions one position.
+ kl_response = [examples["_response"][-1]] + examples["_response"][:-1]
+ model_inputs = defaultdict(list)
+ for i in range(len(examples["_prompt"])):
+ if len(examples["_prompt"][i]) % 2 != 1 or len(examples["_response"][i]) < 2:
+ logger.warning_rank0(
+ "Dropped invalid example: {}".format(examples["_prompt"][i] + examples["_response"][i])
+ )
+ continue
+
+ input_ids, labels, kl_input_ids, kl_labels, kto_tag = self._encode_data_example(
+ prompt=examples["_prompt"][i],
+ response=examples["_response"][i],
+ kl_response=kl_response[i],
+ system=examples["_system"][i],
+ tools=examples["_tools"][i],
+ images=examples["_images"][i] or [],
+ videos=examples["_videos"][i] or [],
+ audios=examples["_audios"][i] or [],
+ )
+ model_inputs["input_ids"].append(input_ids)
+ model_inputs["attention_mask"].append([1] * len(input_ids))
+ model_inputs["labels"].append(labels)
+ model_inputs["kl_input_ids"].append(kl_input_ids)
+ model_inputs["kl_attention_mask"].append([1] * len(kl_input_ids))
+ model_inputs["kl_labels"].append(kl_labels)
+ model_inputs["kto_tags"].append(kto_tag)
+ model_inputs["images"].append(examples["_images"][i])
+ model_inputs["videos"].append(examples["_videos"][i])
+ model_inputs["audios"].append(examples["_audios"][i])
+
+ desirable_num = sum([1 for tag in model_inputs["kto_tags"] if tag])
+ undesirable_num = len(model_inputs["kto_tags"]) - desirable_num
+ if desirable_num == 0 or undesirable_num == 0:
+ logger.warning_rank0("Your dataset only has one preference type.")
+
+ return model_inputs
+
+ def print_data_example(self, example: dict[str, list[int]]) -> None:
+ valid_labels = list(filter(lambda x: x != IGNORE_INDEX, example["labels"]))
+ print("input_ids:\n{}".format(example["input_ids"]))
+ print("inputs:\n{}".format(self.tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
+ print("label_ids:\n{}".format(example["labels"]))
+ print(f"labels:\n{self.tokenizer.decode(valid_labels, skip_special_tokens=False)}")
diff --git a/llamafactory/data/processor/pairwise.py b/llamafactory/data/processor/pairwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..94101deb8e75af73c1851720604994a11f2eb87d
--- /dev/null
+++ b/llamafactory/data/processor/pairwise.py
@@ -0,0 +1,118 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...extras import logging
+from ...extras.constants import IGNORE_INDEX
+from .processor_utils import DatasetProcessor, infer_seqlen
+
+
+if TYPE_CHECKING:
+ from ..mm_plugin import AudioInput, ImageInput, VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+class PairwiseDatasetProcessor(DatasetProcessor):
+ def _encode_data_example(
+ self,
+ prompt: list[dict[str, str]],
+ response: list[dict[str, str]],
+ system: Optional[str],
+ tools: Optional[str],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ ) -> tuple[list[int], list[int], list[int], list[int]]:
+ chosen_messages = self.template.mm_plugin.process_messages(
+ prompt + [response[0]], images, videos, audios, self.processor
+ )
+ rejected_messages = self.template.mm_plugin.process_messages(
+ prompt + [response[1]], images, videos, audios, self.processor
+ )
+ prompt_ids, chosen_ids = self.template.encode_oneturn(self.tokenizer, chosen_messages, system, tools)
+ _, rejected_ids = self.template.encode_oneturn(self.tokenizer, rejected_messages, system, tools)
+
+ if self.template.efficient_eos:
+ chosen_ids += [self.tokenizer.eos_token_id]
+ rejected_ids += [self.tokenizer.eos_token_id]
+
+ prompt_ids, _ = self.template.mm_plugin.process_token_ids(
+ prompt_ids, None, images, videos, audios, self.tokenizer, self.processor
+ )
+ # the response is considered more important than the prompt
+ source_len, target_len = infer_seqlen(
+ len(prompt_ids), max(len(chosen_ids), len(rejected_ids)), self.data_args.cutoff_len
+ )
+ prompt_ids = prompt_ids[:source_len]
+ chosen_ids = chosen_ids[:target_len]
+ rejected_ids = rejected_ids[:target_len]
+
+ chosen_input_ids = prompt_ids + chosen_ids
+ chosen_labels = [IGNORE_INDEX] * source_len + chosen_ids
+ rejected_input_ids = prompt_ids + rejected_ids
+ rejected_labels = [IGNORE_INDEX] * source_len + rejected_ids
+ return chosen_input_ids, chosen_labels, rejected_input_ids, rejected_labels
+
+ def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+ # build input pairs with format `<bos> X`, `Y1 <eos>` and `Y2 <eos>`
+ model_inputs = defaultdict(list)
+ for i in range(len(examples["_prompt"])):
+ if len(examples["_prompt"][i]) % 2 != 1 or len(examples["_response"][i]) < 2:
+ logger.warning_rank0(
+ "Dropped invalid example: {}".format(examples["_prompt"][i] + examples["_response"][i])
+ )
+ continue
+
+ chosen_input_ids, chosen_labels, rejected_input_ids, rejected_labels = self._encode_data_example(
+ prompt=examples["_prompt"][i],
+ response=examples["_response"][i],
+ system=examples["_system"][i],
+ tools=examples["_tools"][i],
+ images=examples["_images"][i] or [],
+ videos=examples["_videos"][i] or [],
+ audios=examples["_audios"][i] or [],
+ )
+ model_inputs["chosen_input_ids"].append(chosen_input_ids)
+ model_inputs["chosen_attention_mask"].append([1] * len(chosen_input_ids))
+ model_inputs["chosen_labels"].append(chosen_labels)
+ model_inputs["rejected_input_ids"].append(rejected_input_ids)
+ model_inputs["rejected_attention_mask"].append([1] * len(rejected_input_ids))
+ model_inputs["rejected_labels"].append(rejected_labels)
+ model_inputs["images"].append(examples["_images"][i])
+ model_inputs["videos"].append(examples["_videos"][i])
+ model_inputs["audios"].append(examples["_audios"][i])
+
+ return model_inputs
+
+ def print_data_example(self, example: dict[str, list[int]]) -> None:
+ valid_chosen_labels = list(filter(lambda x: x != IGNORE_INDEX, example["chosen_labels"]))
+ valid_rejected_labels = list(filter(lambda x: x != IGNORE_INDEX, example["rejected_labels"]))
+ print("chosen_input_ids:\n{}".format(example["chosen_input_ids"]))
+ print(
+ "chosen_inputs:\n{}".format(self.tokenizer.decode(example["chosen_input_ids"], skip_special_tokens=False))
+ )
+ print("chosen_label_ids:\n{}".format(example["chosen_labels"]))
+ print(f"chosen_labels:\n{self.tokenizer.decode(valid_chosen_labels, skip_special_tokens=False)}")
+ print("rejected_input_ids:\n{}".format(example["rejected_input_ids"]))
+ print(
+ "rejected_inputs:\n{}".format(
+ self.tokenizer.decode(example["rejected_input_ids"], skip_special_tokens=False)
+ )
+ )
+ print("rejected_label_ids:\n{}".format(example["rejected_labels"]))
+ print(f"rejected_labels:\n{self.tokenizer.decode(valid_rejected_labels, skip_special_tokens=False)}")
diff --git a/llamafactory/data/processor/pretrain.py b/llamafactory/data/processor/pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fa6b1ca58a8d59493cd4b43c51cb268080cc506
--- /dev/null
+++ b/llamafactory/data/processor/pretrain.py
@@ -0,0 +1,57 @@
+# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
+#
+# This code is inspired by the HuggingFace's transformers library.
+# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from itertools import chain
+from typing import Any
+
+from .processor_utils import DatasetProcessor
+
+
+@dataclass
+class PretrainDatasetProcessor(DatasetProcessor):
+ def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+ # build grouped texts with format `X1 X2 X3 ...` if packing is enabled
+ eos_token = "<|end_of_text|>" if self.data_args.template == "llama3" else self.tokenizer.eos_token
+ text_examples = [messages[0]["content"] + eos_token for messages in examples["_prompt"]]
+
+ if not self.data_args.packing:
+ if getattr(self.tokenizer, "add_bos_token", False):
+ text_examples = [self.tokenizer.bos_token + example for example in text_examples]
+
+ result = self.tokenizer(
+ text_examples, add_special_tokens=False, truncation=True, max_length=self.data_args.cutoff_len
+ )
+ else:
+ tokenized_examples = self.tokenizer(text_examples, add_special_tokens=False)
+ concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
+ total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
+ block_size = self.data_args.cutoff_len
+ total_length = (total_length // block_size) * block_size
+ result = {
+ k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+ for k, t in concatenated_examples.items()
+ }
+ if getattr(self.tokenizer, "add_bos_token", False):
+ for i in range(len(result["input_ids"])):
+ result["input_ids"][i][0] = self.tokenizer.bos_token_id
+
+ return result
+
+ def print_data_example(self, example: dict[str, list[int]]) -> None:
+ print("input_ids:\n{}".format(example["input_ids"]))
+ print("inputs:\n{}".format(self.tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
diff --git a/llamafactory/data/processor/processor_utils.py b/llamafactory/data/processor/processor_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..db44b19cf6fc84d6551fb7cce82283774ae72030
--- /dev/null
+++ b/llamafactory/data/processor/processor_utils.py
@@ -0,0 +1,88 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bisect
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional
+
+
+if TYPE_CHECKING:
+ from transformers import PreTrainedTokenizer, ProcessorMixin
+
+ from ...hparams import DataArguments
+ from ..template import Template
+
+
+@dataclass
+class DatasetProcessor(ABC):
+ r"""A class for data processors."""
+
+ template: "Template"
+ tokenizer: "PreTrainedTokenizer"
+ processor: Optional["ProcessorMixin"]
+ data_args: "DataArguments"
+
+ @abstractmethod
+ def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+ r"""Build model inputs from the examples."""
+ ...
+
+ @abstractmethod
+ def print_data_example(self, example: dict[str, list[int]]) -> None:
+ r"""Print a data example to stdout."""
+ ...
+
+
+def search_for_fit(numbers: list[int], capacity: int) -> int:
+ r"""Find the index of largest number that fits into the knapsack with the given capacity."""
+ index = bisect.bisect(numbers, capacity)
+ return -1 if index == 0 else (index - 1)
+
+
+def greedy_knapsack(numbers: list[int], capacity: int) -> list[list[int]]:
+ r"""Implement efficient greedy algorithm with binary search for the knapsack problem."""
+ numbers.sort() # sort numbers in ascending order for binary search
+ knapsacks = []
+
+ while numbers:
+ current_knapsack = []
+ remaining_capacity = capacity
+
+ while True:
+ index = search_for_fit(numbers, remaining_capacity)
+ if index == -1:
+ break # no more numbers fit in this knapsack
+
+ remaining_capacity -= numbers[index] # update the remaining capacity
+ current_knapsack.append(numbers.pop(index)) # add the number to knapsack
+
+ knapsacks.append(current_knapsack)
+
+ return knapsacks
+
+
+def infer_seqlen(source_len: int, target_len: int, cutoff_len: int) -> tuple[int, int]:
+ r"""Compute the real sequence length after truncation by the cutoff_len."""
+ if target_len * 2 < cutoff_len: # truncate source
+ max_target_len = cutoff_len
+ elif source_len * 2 < cutoff_len: # truncate target
+ max_target_len = cutoff_len - source_len
+ else: # truncate both
+ max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))
+
+ new_target_len = min(max_target_len, target_len)
+ max_source_len = max(cutoff_len - new_target_len, 0)
+ new_source_len = min(max_source_len, source_len)
+ return new_source_len, new_target_len
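A worked example of the two helpers above (assuming the module path from this diff): `infer_seqlen` splits a shared token budget between prompt and response roughly in proportion to their lengths, and `greedy_knapsack` bins sequence lengths under a capacity.

```python
# Illustrative sketch only -- values chosen to show the behavior.
from llamafactory.data.processor.processor_utils import greedy_knapsack, infer_seqlen

# A 100-token prompt and a 300-token response under a 128-token cutoff:
# the response keeps roughly 3/4 of the budget.
assert infer_seqlen(source_len=100, target_len=300, cutoff_len=128) == (32, 96)

# Pack sequence lengths into bins of at most 100 tokens (the input list is sorted in place).
assert greedy_knapsack([60, 50, 40, 30, 20], capacity=100) == [[60, 40], [50, 30, 20]]
```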
diff --git a/llamafactory/data/processor/supervised.py b/llamafactory/data/processor/supervised.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5aba11b6535078f46bdf6aca743c6ae262e1fc6
--- /dev/null
+++ b/llamafactory/data/processor/supervised.py
@@ -0,0 +1,203 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...extras import logging
+from ...extras.constants import IGNORE_INDEX
+from .processor_utils import DatasetProcessor, greedy_knapsack, infer_seqlen
+
+
+if TYPE_CHECKING:
+ from ..mm_plugin import AudioInput, ImageInput, VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class SupervisedDatasetProcessor(DatasetProcessor):
+ def _encode_data_example(
+ self,
+ prompt: list[dict[str, str]],
+ response: list[dict[str, str]],
+ system: Optional[str],
+ tools: Optional[str],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ ) -> tuple[list[int], list[int]]:
+ messages = self.template.mm_plugin.process_messages(prompt + response, images, videos, audios, self.processor)
+ input_ids, labels = self.template.mm_plugin.process_token_ids(
+ [], [], images, videos, audios, self.tokenizer, self.processor
+ )
+ encoded_pairs = self.template.encode_multiturn(self.tokenizer, messages, system, tools)
+ total_length = len(input_ids) + (1 if self.template.efficient_eos else 0)
+ if self.data_args.mask_history:
+ encoded_pairs = encoded_pairs[::-1] # high priority for last turns
+
+ for turn_idx, (source_ids, target_ids) in enumerate(encoded_pairs):
+ if total_length >= self.data_args.cutoff_len:
+ break
+
+ source_len, target_len = infer_seqlen(
+ len(source_ids), len(target_ids), self.data_args.cutoff_len - total_length
+ )
+ source_ids = source_ids[:source_len]
+ target_ids = target_ids[:target_len]
+ total_length += source_len + target_len
+
+ if self.data_args.train_on_prompt:
+ source_label = source_ids
+ elif self.template.efficient_eos and turn_idx != 0:
+ source_label = [self.tokenizer.eos_token_id] + [IGNORE_INDEX] * (source_len - 1)
+ else:
+ source_label = [IGNORE_INDEX] * source_len
+
+ if self.data_args.mask_history and turn_idx != 0: # train on the last turn only
+ target_label = [IGNORE_INDEX] * target_len
+ else:
+ target_label = target_ids
+
+ if self.data_args.mask_history: # reversed sequences
+ input_ids = source_ids + target_ids + input_ids
+ labels = source_label + target_label + labels
+ else:
+ input_ids += source_ids + target_ids
+ labels += source_label + target_label
+
+ if self.template.efficient_eos:
+ input_ids += [self.tokenizer.eos_token_id]
+ labels += [self.tokenizer.eos_token_id]
+
+ return input_ids, labels
+
+ def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+ # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
+ # for multiturn examples, we only mask the prompt part in each prompt-response pair.
+ model_inputs = defaultdict(list)
+ for i in range(len(examples["_prompt"])):
+ if len(examples["_prompt"][i]) % 2 != 1 or len(examples["_response"][i]) != 1:
+ logger.warning_rank0(
+ "Dropped invalid example: {}".format(examples["_prompt"][i] + examples["_response"][i])
+ )
+ continue
+
+ input_ids, labels = self._encode_data_example(
+ prompt=examples["_prompt"][i],
+ response=examples["_response"][i],
+ system=examples["_system"][i],
+ tools=examples["_tools"][i],
+ images=examples["_images"][i] or [],
+ videos=examples["_videos"][i] or [],
+ audios=examples["_audios"][i] or [],
+ )
+ model_inputs["input_ids"].append(input_ids)
+ model_inputs["attention_mask"].append([1] * len(input_ids))
+ model_inputs["labels"].append(labels)
+ model_inputs["images"].append(examples["_images"][i])
+ model_inputs["videos"].append(examples["_videos"][i])
+ model_inputs["audios"].append(examples["_audios"][i])
+
+ return model_inputs
+
+ def print_data_example(self, example: dict[str, list[int]]) -> None:
+ valid_labels = list(filter(lambda x: x != IGNORE_INDEX, example["labels"]))
+ print("input_ids:\n{}".format(example["input_ids"]))
+ print("inputs:\n{}".format(self.tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
+ print("label_ids:\n{}".format(example["labels"]))
+ print(f"labels:\n{self.tokenizer.decode(valid_labels, skip_special_tokens=False)}")
+
+
+@dataclass
+class PackedSupervisedDatasetProcessor(SupervisedDatasetProcessor):
+ def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+ # TODO: use `position_ids` to achieve packing
+ # build inputs with format `<bos> X1 Y1 <eos> <bos> X2 Y2 <eos>`
+ # and labels with format `<ignore> ... <ignore> Y1 <eos> <ignore> ... <ignore> Y2 <eos>`
+ valid_num = 0
+ batch_input_ids, batch_labels, batch_images, batch_videos, batch_audios = [], [], [], [], []
+ lengths = []
+ length2indexes = defaultdict(list)
+ for i in range(len(examples["_prompt"])):
+ if len(examples["_prompt"][i]) % 2 != 1 or len(examples["_response"][i]) != 1:
+ logger.warning_rank0(
+ "Dropped invalid example: {}".format(examples["_prompt"][i] + examples["_response"][i])
+ )
+ continue
+
+ input_ids, labels = self._encode_data_example(
+ prompt=examples["_prompt"][i],
+ response=examples["_response"][i],
+ system=examples["_system"][i],
+ tools=examples["_tools"][i],
+ images=examples["_images"][i] or [],
+ videos=examples["_videos"][i] or [],
+ audios=examples["_audios"][i] or [],
+ )
+ length = len(input_ids)
+ if length > self.data_args.cutoff_len:
+ logger.warning_rank0(f"Dropped lengthy example with length {length} > {self.data_args.cutoff_len}.")
+ else:
+ lengths.append(length)
+ length2indexes[length].append(valid_num)
+ batch_input_ids.append(input_ids)
+ batch_labels.append(labels)
+ batch_images.append(examples["_images"][i] or [])
+ batch_videos.append(examples["_videos"][i] or [])
+ batch_audios.append(examples["_audios"][i] or [])
+ valid_num += 1
+
+ model_inputs = defaultdict(list)
+ knapsacks = greedy_knapsack(lengths, self.data_args.cutoff_len)
+ for knapsack in knapsacks:
+ packed_input_ids, packed_attention_masks, packed_position_ids, packed_labels = [], [], [], []
+ packed_images, packed_videos, packed_audios = [], [], []
+ for i, length in enumerate(knapsack):
+ index = length2indexes[length].pop()
+ packed_input_ids += batch_input_ids[index]
+ packed_position_ids += list(range(len(batch_input_ids[index]))) # NOTE: pad_to_multiple_of ignores this
+ packed_labels += batch_labels[index]
+ packed_images += batch_images[index]
+ packed_videos += batch_videos[index]
+ packed_audios += batch_audios[index]
+ if self.data_args.neat_packing:
+ packed_attention_masks += [i + 1] * len(batch_input_ids[index]) # start from 1
+ else:
+ packed_attention_masks += [1] * len(batch_input_ids[index])
+
+ if len(packed_input_ids) < self.data_args.cutoff_len + 1: # avoid flash_attn dropping the attention mask
+ pad_length = self.data_args.cutoff_len - len(packed_input_ids) + 1
+ packed_input_ids += [self.tokenizer.pad_token_id] * pad_length
+ packed_position_ids += [0] * pad_length
+ packed_labels += [IGNORE_INDEX] * pad_length
+ if self.data_args.neat_packing:
+ packed_attention_masks += [0] * pad_length
+ else:
+ packed_attention_masks += [1] * pad_length # more efficient flash_attn
+
+ if len(packed_input_ids) != self.data_args.cutoff_len + 1:
+ raise ValueError("The length of packed example should be identical to the cutoff length.")
+
+ model_inputs["input_ids"].append(packed_input_ids)
+ model_inputs["attention_mask"].append(packed_attention_masks)
+ model_inputs["position_ids"].append(packed_position_ids)
+ model_inputs["labels"].append(packed_labels)
+ model_inputs["images"].append(packed_images or None)
+ model_inputs["videos"].append(packed_videos or None)
+ model_inputs["audios"].append(packed_audios or None)
+
+ return model_inputs
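A short sketch of the neat-packing attention mask built above: each packed example gets its own segment id starting from 1 and padding gets 0, so attention can be restricted to example boundaries; the lengths and cutoff below are arbitrary.

```python
# Illustrative sketch only -- arbitrary lengths; mirrors the neat_packing branch above.
packed_lengths = [5, 3]  # lengths of the examples packed into one sequence
cutoff_len = 10          # packed sequences are padded to cutoff_len + 1 tokens

attention_mask = []
for segment_id, length in enumerate(packed_lengths, start=1):
    attention_mask += [segment_id] * length
attention_mask += [0] * (cutoff_len + 1 - len(attention_mask))  # zero-pad the rest
assert attention_mask == [1, 1, 1, 1, 1, 2, 2, 2, 0, 0, 0]
```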
diff --git a/llamafactory/data/processor/unsupervised.py b/llamafactory/data/processor/unsupervised.py
new file mode 100644
index 0000000000000000000000000000000000000000..256174b6dd38696b5b180501102af40ff395d0a9
--- /dev/null
+++ b/llamafactory/data/processor/unsupervised.py
@@ -0,0 +1,91 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...extras import logging
+from ..data_utils import Role
+from .processor_utils import DatasetProcessor, infer_seqlen
+
+
+if TYPE_CHECKING:
+ from ..mm_plugin import AudioInput, ImageInput, VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+class UnsupervisedDatasetProcessor(DatasetProcessor):
+ def _encode_data_example(
+ self,
+ prompt: list[dict[str, str]],
+ response: list[dict[str, str]],
+ system: Optional[str],
+ tools: Optional[str],
+ images: list["ImageInput"],
+ videos: list["VideoInput"],
+ audios: list["AudioInput"],
+ ) -> tuple[list[int], list[int]]:
+ if len(response) == 1:
+ messages = prompt + response
+ else:
+ messages = prompt + [{"role": Role.ASSISTANT.value, "content": ""}]
+
+ messages = self.template.mm_plugin.process_messages(messages, images, videos, audios, self.processor)
+ input_ids, labels = self.template.encode_oneturn(self.tokenizer, messages, system, tools)
+ if self.template.efficient_eos:
+ labels += [self.tokenizer.eos_token_id]
+
+ input_ids, _ = self.template.mm_plugin.process_token_ids(
+ input_ids, None, images, videos, audios, self.tokenizer, self.processor
+ )
+ source_len, target_len = infer_seqlen(len(input_ids), len(labels), self.data_args.cutoff_len)
+ input_ids = input_ids[:source_len]
+ labels = labels[:target_len]
+ return input_ids, labels
+
+ def preprocess_dataset(self, examples: dict[str, list[Any]]) -> dict[str, list[Any]]:
+ # build inputs with format `<bos> X` and labels with format `Y <eos>`
+ model_inputs = defaultdict(list)
+ for i in range(len(examples["_prompt"])):
+ if len(examples["_prompt"][i]) % 2 != 1:
+ logger.warning_rank0(
+ "Dropped invalid example: {}".format(examples["_prompt"][i] + examples["_response"][i])
+ )
+ continue
+
+ input_ids, labels = self._encode_data_example(
+ prompt=examples["_prompt"][i],
+ response=examples["_response"][i],
+ system=examples["_system"][i],
+ tools=examples["_tools"][i],
+ images=examples["_images"][i] or [],
+ videos=examples["_videos"][i] or [],
+ audios=examples["_audios"][i] or [],
+ )
+ model_inputs["input_ids"].append(input_ids)
+ model_inputs["attention_mask"].append([1] * len(input_ids))
+ model_inputs["labels"].append(labels)
+ model_inputs["images"].append(examples["_images"][i])
+ model_inputs["videos"].append(examples["_videos"][i])
+ model_inputs["audios"].append(examples["_audios"][i])
+
+ return model_inputs
+
+ def print_data_example(self, example: dict[str, list[int]]) -> None:
+ print("input_ids:\n{}".format(example["input_ids"]))
+ print("inputs:\n{}".format(self.tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
+ print("label_ids:\n{}".format(example["labels"]))
+ print("labels:\n{}".format(self.tokenizer.decode(example["labels"], skip_special_tokens=False)))
diff --git a/llamafactory/data/template.py b/llamafactory/data/template.py
new file mode 100644
index 0000000000000000000000000000000000000000..56e32dd203cf753e6ffe486df291e2f71c29a11c
--- /dev/null
+++ b/llamafactory/data/template.py
@@ -0,0 +1,2209 @@
+# Copyright 2025 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+from typing_extensions import override
+
+from ..extras import logging
+from .data_utils import Role
+from .formatter import EmptyFormatter, FunctionFormatter, StringFormatter, ToolFormatter
+from .mm_plugin import get_mm_plugin
+
+
+if TYPE_CHECKING:
+ from transformers import PreTrainedTokenizer
+
+ from ..hparams import DataArguments
+ from .formatter import SLOTS, Formatter
+ from .mm_plugin import BasePlugin
+ from .tool_utils import FunctionCall
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class Template:
+ format_user: "Formatter"
+ format_assistant: "Formatter"
+ format_system: "Formatter"
+ format_function: "Formatter"
+ format_observation: "Formatter"
+ format_tools: "Formatter"
+ format_prefix: "Formatter"
+ default_system: str
+ stop_words: list[str]
+ thought_words: tuple[str, str]
+ efficient_eos: bool
+ replace_eos: bool
+ replace_jinja_template: bool
+ enable_thinking: Optional[bool]
+ mm_plugin: "BasePlugin"
+
+ def encode_oneturn(
+ self,
+ tokenizer: "PreTrainedTokenizer",
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ ) -> tuple[list[int], list[int]]:
+ r"""Return a single pair of token ids representing prompt and response respectively."""
+ encoded_messages = self._encode(tokenizer, messages, system, tools)
+ prompt_ids = []
+ for encoded_ids in encoded_messages[:-1]:
+ prompt_ids += encoded_ids
+
+ response_ids = encoded_messages[-1]
+ return prompt_ids, response_ids
+
+ def encode_multiturn(
+ self,
+ tokenizer: "PreTrainedTokenizer",
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ ) -> list[tuple[list[int], list[int]]]:
+ r"""Return multiple pairs of token ids representing prompts and responses respectively."""
+ encoded_messages = self._encode(tokenizer, messages, system, tools)
+ return [(encoded_messages[i], encoded_messages[i + 1]) for i in range(0, len(encoded_messages), 2)]
+
+ def extract_tool(self, content: str) -> Union[str, list["FunctionCall"]]:
+ r"""Extract tool message."""
+ return self.format_tools.extract(content)
+
+ def get_stop_token_ids(self, tokenizer: "PreTrainedTokenizer") -> list[int]:
+ r"""Return stop token ids."""
+ stop_token_ids = {tokenizer.eos_token_id}
+ for token in self.stop_words:
+ stop_token_ids.add(tokenizer.convert_tokens_to_ids(token))
+
+ return list(stop_token_ids)
+
+ def add_thought(self, content: str = "") -> str:
+ r"""Add empty thought to assistant message."""
+ return f"{self.thought_words[0]}{self.thought_words[1]}" + content
+
+ def remove_thought(self, content: str) -> str:
+ r"""Remove thought from assistant message."""
+ pattern = re.compile(f"{re.escape(self.thought_words[0])}(.*?){re.escape(self.thought_words[1])}", re.DOTALL)
+ return re.sub(pattern, "", content).lstrip("\n")
+
+ def get_thought_word_ids(self, tokenizer: "PreTrainedTokenizer") -> list[int]:
+ r"""Get the token ids of thought words."""
+ return tokenizer.encode(self.add_thought(), add_special_tokens=False)
+
+ def _convert_elements_to_ids(self, tokenizer: "PreTrainedTokenizer", elements: "SLOTS") -> list[int]:
+ r"""Convert elements to token ids."""
+ token_ids = []
+ for elem in elements:
+ if isinstance(elem, str):
+ if len(elem) != 0:
+ token_ids += tokenizer.encode(elem, add_special_tokens=False)
+ elif isinstance(elem, dict):
+ token_ids += [tokenizer.convert_tokens_to_ids(elem.get("token"))]
+ elif isinstance(elem, set):
+ if "bos_token" in elem and tokenizer.bos_token_id is not None:
+ token_ids += [tokenizer.bos_token_id]
+ elif "eos_token" in elem and tokenizer.eos_token_id is not None:
+ token_ids += [tokenizer.eos_token_id]
+ else:
+ raise ValueError(f"Input must be string, set[str] or dict[str, str], got {type(elem)}")
+
+ return token_ids
+
+ def _encode(
+ self,
+ tokenizer: "PreTrainedTokenizer",
+ messages: list[dict[str, str]],
+ system: Optional[str],
+ tools: Optional[str],
+ ) -> list[list[int]]:
+ r"""Encode formatted inputs to pairs of token ids.
+
+ Turn 0: prefix + system + query -> resp
+ Turn t: query -> resp.
+ """
+ system = system or self.default_system
+ encoded_messages = []
+ for i, message in enumerate(messages):
+ elements = []
+
+ if i == 0:
+ elements += self.format_prefix.apply()
+ if system or tools:
+ tool_text = self.format_tools.apply(content=tools)[0] if tools else ""
+ elements += self.format_system.apply(content=(system + tool_text))
+
+ if message["role"] == Role.USER:
+ elements += self.format_user.apply(content=message["content"], idx=str(i // 2))
+ elif message["role"] == Role.ASSISTANT:
+ elements += self.format_assistant.apply(content=message["content"])
+ elif message["role"] == Role.OBSERVATION:
+ elements += self.format_observation.apply(content=message["content"])
+ elif message["role"] == Role.FUNCTION:
+ elements += self.format_function.apply(content=message["content"], thought_words=self.thought_words)
+ else:
+ raise NotImplementedError("Unexpected role: {}".format(message["role"]))
+
+ encoded_messages.append(self._convert_elements_to_ids(tokenizer, elements))
+
+ return encoded_messages
+
+ @staticmethod
+ def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str) -> None:
+ r"""Add or replace eos token to the tokenizer."""
+ if tokenizer.eos_token == eos_token:
+ return
+
+ is_added = tokenizer.eos_token_id is None
+ num_added_tokens = tokenizer.add_special_tokens({"eos_token": eos_token})
+
+ if is_added:
+ logger.info_rank0(f"Add eos token: {tokenizer.eos_token}.")
+ else:
+ logger.info_rank0(f"Replace eos token: {tokenizer.eos_token}.")
+
+ if num_added_tokens > 0:
+ logger.warning_rank0("New tokens have been added, make sure `resize_vocab` is True.")
+
+ def fix_special_tokens(self, tokenizer: "PreTrainedTokenizer") -> None:
+ r"""Add eos token and pad token to the tokenizer."""
+ stop_words = self.stop_words
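+ # when replace_eos is set, the first stop word becomes the new eos token and the rest are kept as extra special tokens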
+ if self.replace_eos:
+ if not stop_words:
+ raise ValueError("Stop words are required to replace the EOS token.")
+
+ self._add_or_replace_eos_token(tokenizer, eos_token=stop_words[0])
+ stop_words = stop_words[1:]
+
+ if tokenizer.eos_token_id is None:
+ self._add_or_replace_eos_token(tokenizer, eos_token="<|endoftext|>")
+
+ if tokenizer.pad_token_id is None:
+ tokenizer.pad_token = tokenizer.eos_token
+ logger.info_rank0(f"Add pad token: {tokenizer.pad_token}")
+
+ if stop_words:
+ num_added_tokens = tokenizer.add_special_tokens(
+ dict(additional_special_tokens=stop_words), replace_additional_special_tokens=False
+ )
+ logger.info_rank0("Add {} to stop words.".format(",".join(stop_words)))
+ if num_added_tokens > 0:
+ logger.warning_rank0("New tokens have been added, make sure `resize_vocab` is True.")
+
+ @staticmethod
+ def _jinja_escape(content: str) -> str:
+ r"""Escape single quotes in content."""
+ return content.replace("'", r"\'")
+
+ @staticmethod
+ def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content") -> str:
+ r"""Convert slots to jinja template."""
+ slot_items = []
+ for slot in slots:
+ if isinstance(slot, str):
+ slot_pieces = slot.split("{{content}}")
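+ # split around the {{content}} placeholder so literal text and the placeholder can be concatenated in the jinja expression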
+ if slot_pieces[0]:
+ slot_items.append("'" + Template._jinja_escape(slot_pieces[0]) + "'")
+ if len(slot_pieces) > 1:
+ slot_items.append(placeholder)
+ if slot_pieces[1]:
+ slot_items.append("'" + Template._jinja_escape(slot_pieces[1]) + "'")
+ elif isinstance(slot, set): # do not use {{ eos_token }} since it may be replaced
+ if "bos_token" in slot and tokenizer.bos_token_id is not None:
+ slot_items.append("'" + tokenizer.bos_token + "'")
+ elif "eos_token" in slot and tokenizer.eos_token_id is not None:
+ slot_items.append("'" + tokenizer.eos_token + "'")
+ elif isinstance(slot, dict):
+ raise ValueError("Dict is not supported.")
+
+ return " + ".join(slot_items)
+
+ def _get_jinja_template(self, tokenizer: "PreTrainedTokenizer") -> str:
+ r"""Return the jinja template."""
+ prefix = self._convert_slots_to_jinja(self.format_prefix.apply(), tokenizer)
+ system = self._convert_slots_to_jinja(self.format_system.apply(), tokenizer, placeholder="system_message")
+ user = self._convert_slots_to_jinja(self.format_user.apply(), tokenizer)
+ assistant = self._convert_slots_to_jinja(self.format_assistant.apply(), tokenizer)
+ jinja_template = ""
+ if prefix:
+ jinja_template += "{{ " + prefix + " }}"
+
+ if self.default_system:
+ jinja_template += "{% set system_message = '" + self._jinja_escape(self.default_system) + "' %}"
+
+ jinja_template += (
+ "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}"
+ "{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}"
+ "{% if system_message is defined %}{{ " + system + " }}{% endif %}"
+ "{% for message in loop_messages %}"
+ "{% set content = message['content'] %}"
+ "{% if message['role'] == 'user' %}"
+ "{{ " + user + " }}"
+ "{% elif message['role'] == 'assistant' %}"
+ "{{ " + assistant + " }}"
+ "{% endif %}"
+ "{% endfor %}"
+ )
+ return jinja_template
+
+ def fix_jinja_template(self, tokenizer: "PreTrainedTokenizer") -> None:
+ r"""Replace the jinja template in the tokenizer."""
+ if tokenizer.chat_template is None or self.replace_jinja_template:
+ try:
+ tokenizer.chat_template = self._get_jinja_template(tokenizer)
+ except ValueError as e:
+ logger.info_rank0(f"Cannot add this chat template to tokenizer: {e}.")
+
+ @staticmethod
+ def _convert_slots_to_ollama(
+ slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content"
+ ) -> str:
+ r"""Convert slots to ollama template."""
+ slot_items = []
+ for slot in slots:
+ if isinstance(slot, str):
+ slot_pieces = slot.split("{{content}}")
+ if slot_pieces[0]:
+ slot_items.append(slot_pieces[0])
+ if len(slot_pieces) > 1:
+ slot_items.append("{{ " + placeholder + " }}")
+ if slot_pieces[1]:
+ slot_items.append(slot_pieces[1])
+ elif isinstance(slot, set): # do not use {{ eos_token }} since it may be replaced
+ if "bos_token" in slot and tokenizer.bos_token_id is not None:
+ slot_items.append(tokenizer.bos_token)
+ elif "eos_token" in slot and tokenizer.eos_token_id is not None:
+ slot_items.append(tokenizer.eos_token)
+ elif isinstance(slot, dict):
+ raise ValueError("Dict is not supported.")
+
+ return "".join(slot_items)
+
+ def _get_ollama_template(self, tokenizer: "PreTrainedTokenizer") -> str:
+ r"""Return the ollama template."""
+ prefix = self._convert_slots_to_ollama(self.format_prefix.apply(), tokenizer)
+ system = self._convert_slots_to_ollama(self.format_system.apply(), tokenizer, placeholder=".System")
+ user = self._convert_slots_to_ollama(self.format_user.apply(), tokenizer, placeholder=".Content")
+ assistant = self._convert_slots_to_ollama(self.format_assistant.apply(), tokenizer, placeholder=".Content")
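+ # doubled braces in the f-string below become literal {{ }} in ollama's Go template syntax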
+ return (
+ f"{prefix}{{{{ if .System }}}}{system}{{{{ end }}}}"
+ f"""{{{{ range .Messages }}}}{{{{ if eq .Role "user" }}}}{user}"""
+ f"""{{{{ else if eq .Role "assistant" }}}}{assistant}{{{{ end }}}}{{{{ end }}}}"""
+ )
+
+ def get_ollama_modelfile(self, tokenizer: "PreTrainedTokenizer") -> str:
+ r"""Return the ollama modelfile.
+
+ TODO: support function calling.
+ """
+ modelfile = "# ollama modelfile auto-generated by llamafactory\n\n"
+ modelfile += f'FROM .\n\nTEMPLATE """{self._get_ollama_template(tokenizer)}"""\n\n'
+
+ if self.default_system:
+ modelfile += f'SYSTEM """{self.default_system}"""\n\n'
+
+ for stop_token_id in self.get_stop_token_ids(tokenizer):
+ modelfile += f'PARAMETER stop "{tokenizer.convert_ids_to_tokens(stop_token_id)}"\n'
+
+ modelfile += "PARAMETER num_ctx 4096\n"
+ return modelfile
+
+
+@dataclass
+class Llama2Template(Template):
+ r"""A template that fuse the system message to first user message."""
+
+ @override
+ def _encode(
+ self,
+ tokenizer: "PreTrainedTokenizer",
+ messages: list[dict[str, str]],
+ system: str,
+ tools: str,
+ ) -> list[list[int]]:
+ system = system or self.default_system
+ encoded_messages = []
+ for i, message in enumerate(messages):
+ elements = []
+
+ system_text = ""
+ if i == 0:
+ elements += self.format_prefix.apply()
+ if system or tools:
+ tool_text = self.format_tools.apply(content=tools)[0] if tools else ""
+ system_text = self.format_system.apply(content=(system + tool_text))[0]
+
+ if message["role"] == Role.USER:
+ elements += self.format_user.apply(content=system_text + message["content"])
+ elif message["role"] == Role.ASSISTANT:
+ elements += self.format_assistant.apply(content=message["content"])
+ elif message["role"] == Role.OBSERVATION:
+ elements += self.format_observation.apply(content=message["content"])
+ elif message["role"] == Role.FUNCTION:
+ elements += self.format_function.apply(content=message["content"])
+ else:
+ raise NotImplementedError("Unexpected role: {}".format(message["role"]))
+
+ encoded_messages.append(self._convert_elements_to_ids(tokenizer, elements))
+
+ return encoded_messages
+
+ def _get_jinja_template(self, tokenizer: "PreTrainedTokenizer") -> str:
+ prefix = self._convert_slots_to_jinja(self.format_prefix.apply(), tokenizer)
+ system_message = self._convert_slots_to_jinja(
+ self.format_system.apply(), tokenizer, placeholder="system_message"
+ )
+ user_message = self._convert_slots_to_jinja(self.format_user.apply(), tokenizer)
+ assistant_message = self._convert_slots_to_jinja(self.format_assistant.apply(), tokenizer)
+ jinja_template = ""
+ if prefix:
+ jinja_template += "{{ " + prefix + " }}"
+
+ if self.default_system:
+ jinja_template += "{% set system_message = '" + self._jinja_escape(self.default_system) + "' %}"
+
+ jinja_template += (
+ "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}"
+ "{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}"
+ "{% for message in loop_messages %}"
+ "{% if loop.index0 == 0 and system_message is defined %}"
+ "{% set content = " + system_message + " + message['content'] %}"
+ "{% else %}{% set content = message['content'] %}{% endif %}"
+ "{% if message['role'] == 'user' %}"
+ "{{ " + user_message + " }}"
+ "{% elif message['role'] == 'assistant' %}"
+ "{{ " + assistant_message + " }}"
+ "{% endif %}"
+ "{% endfor %}"
+ )
+ return jinja_template
+
+
+@dataclass
+class ReasoningTemplate(Template):
+ r"""A template that add thought to assistant message."""
+
+ @override
+ def encode_oneturn(
+ self,
+ tokenizer: "PreTrainedTokenizer",
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ ) -> tuple[list[int], list[int]]:
+ messages = deepcopy(messages)
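+ # strip thoughts from historical assistant turns; only the final assistant message may keep its thought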
+ for i in range(1, len(messages) - 2, 2):
+ messages[i]["content"] = self.remove_thought(messages[i]["content"])
+
+ if self.enable_thinking is False: # remove all cot
+ messages[-1]["content"] = self.remove_thought(messages[-1]["content"])
+
+ prompt_ids, response_ids = super().encode_oneturn(tokenizer, messages, system, tools)
+ if (
+ self.thought_words[0].strip() not in messages[-1]["content"]
+ and self.thought_words[1].strip() not in messages[-1]["content"]
+ ): # add empty cot
+ if not self.enable_thinking: # do not compute loss
+ prompt_ids += self.get_thought_word_ids(tokenizer)
+ else: # do compute loss
+ response_ids = self.get_thought_word_ids(tokenizer) + response_ids
+
+ return prompt_ids, response_ids
+
+ @override
+ def encode_multiturn(
+ self,
+ tokenizer: "PreTrainedTokenizer",
+ messages: list[dict[str, str]],
+ system: Optional[str] = None,
+ tools: Optional[str] = None,
+ ) -> list[tuple[list[int], list[int]]]:
+ messages = deepcopy(messages)
+ if self.enable_thinking is False: # remove all cot
+ for i in range(1, len(messages), 2):
+ messages[i]["content"] = self.remove_thought(messages[i]["content"])
+
+ encoded_messages = self._encode(tokenizer, messages, system, tools)
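+ # when an assistant turn has no thought, prepend an empty one: to the prompt side (not trained on) if thinking is disabled, otherwise to the response (trained on)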
+ for i in range(0, len(messages), 2):
+ if (
+ self.thought_words[0].strip() not in messages[i + 1]["content"]
+ and self.thought_words[1].strip() not in messages[i + 1]["content"]
+ ): # add empty cot
+ if not self.enable_thinking: # do not compute loss
+ encoded_messages[i] += self.get_thought_word_ids(tokenizer)
+ else: # do compute loss
+ encoded_messages[i + 1] = self.get_thought_word_ids(tokenizer) + encoded_messages[i + 1]
+
+ return [(encoded_messages[i], encoded_messages[i + 1]) for i in range(0, len(encoded_messages), 2)]
+
+
+TEMPLATES: dict[str, "Template"] = {}
+
+
+def register_template(
+ name: str,
+ format_user: Optional["Formatter"] = None,
+ format_assistant: Optional["Formatter"] = None,
+ format_system: Optional["Formatter"] = None,
+ format_function: Optional["Formatter"] = None,
+ format_observation: Optional["Formatter"] = None,
+ format_tools: Optional["Formatter"] = None,
+ format_prefix: Optional["Formatter"] = None,
+ default_system: str = "",
+ stop_words: Optional[list[str]] = None,
+ thought_words: Optional[tuple[str, str]] = None,
+ efficient_eos: bool = False,
+ replace_eos: bool = False,
+ replace_jinja_template: bool = False,
+ enable_thinking: Optional[bool] = True,
+ mm_plugin: "BasePlugin" = get_mm_plugin(name="base"),
+ template_class: type["Template"] = Template,
+) -> None:
+ r"""Register a chat template.
+
+ To add the following chat template:
+ ```
+ <s><user>user prompt here
+ <model>model response here</s>
+ <user>user prompt here
+ <model>model response here</s>
+ ```
+
+ The corresponding code should be:
+ ```
+ register_template(
+ name="custom",
+ format_user=StringFormatter(slots=["{{content}}\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}\n"]),
+ format_prefix=EmptyFormatter(""),
+ )
+ ```
+ """
+ if name in TEMPLATES:
+ raise ValueError(f"Template {name} already exists.")
+
+ default_slots = ["{{content}}"] if efficient_eos else ["{{content}}", {"eos_token"}]
+ default_user_formatter = StringFormatter(slots=["{{content}}"])
+ default_assistant_formatter = StringFormatter(slots=default_slots)
+ if format_assistant is not None:
+ default_function_formatter = FunctionFormatter(slots=format_assistant.slots, tool_format="default")
+ else:
+ default_function_formatter = FunctionFormatter(slots=default_slots, tool_format="default")
+
+ default_tool_formatter = ToolFormatter(tool_format="default")
+ default_prefix_formatter = EmptyFormatter()
+ TEMPLATES[name] = template_class(
+ format_user=format_user or default_user_formatter,
+ format_assistant=format_assistant or default_assistant_formatter,
+ format_system=format_system or default_user_formatter,
+ format_function=format_function or default_function_formatter,
+ format_observation=format_observation or format_user or default_user_formatter,
+ format_tools=format_tools or default_tool_formatter,
+ format_prefix=format_prefix or default_prefix_formatter,
+ default_system=default_system,
+ stop_words=stop_words or [],
+ thought_words=thought_words or ("<think>\n", "\n</think>\n\n"),
+ efficient_eos=efficient_eos,
+ replace_eos=replace_eos,
+ replace_jinja_template=replace_jinja_template,
+ enable_thinking=enable_thinking,
+ mm_plugin=mm_plugin,
+ )
+
+
+def parse_template(tokenizer: "PreTrainedTokenizer") -> "Template":
+ r"""Extract a chat template from the tokenizer."""
+
+ def find_diff(short_str: str, long_str: str) -> str:
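+ # greedy left-to-right scan: collect the characters of long_str that are not matched in short_str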
+ i, j = 0, 0
+ diff = ""
+ while i < len(short_str) and j < len(long_str):
+ if short_str[i] == long_str[j]:
+ i += 1
+ j += 1
+ else:
+ diff += long_str[j]
+ j += 1
+
+ return diff
+
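+ # encoding an empty string yields only the tokens that the tokenizer always prepends (e.g. the bos token)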
+ prefix = tokenizer.decode(tokenizer.encode(""))
+
+ messages = [{"role": "system", "content": "{{content}}"}]
+ system_slot = tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)[len(prefix) :]
+
+ messages = [{"role": "system", "content": ""}, {"role": "user", "content": "{{content}}"}]
+ user_slot_empty_system = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+ user_slot_empty_system = user_slot_empty_system[len(prefix) :]
+
+ messages = [{"role": "user", "content": "{{content}}"}]
+ user_slot = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+ user_slot = user_slot[len(prefix) :]
+
+ messages = [{"role": "user", "content": "{{content}}"}, {"role": "assistant", "content": "{{content}}"}]
+ assistant_slot = tokenizer.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)
+ assistant_slot = assistant_slot[len(prefix) + len(user_slot) :]
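+ # a <think> tag in the rendered assistant turn indicates a reasoning chat template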
+ template_class = ReasoningTemplate if "<think>" in assistant_slot else Template
+ assistant_slot = assistant_slot.replace("<think>", "").replace("</think>", "").lstrip("\n") # remove thought tags
+
+ if len(user_slot) > len(user_slot_empty_system):
+ default_system = find_diff(user_slot_empty_system, user_slot)
+ sole_system = system_slot.replace("{{content}}", default_system, 1)
+ user_slot = user_slot[len(sole_system) :]
+ else: # if default_system is empty, user_slot_empty_system will be longer than user_slot
+ default_system = ""
+
+ return template_class(
+ format_user=StringFormatter(slots=[user_slot]),
+ format_assistant=StringFormatter(slots=[assistant_slot]),
+ format_system=StringFormatter(slots=[system_slot]),
+ format_function=FunctionFormatter(slots=[assistant_slot], tool_format="default"),
+ format_observation=StringFormatter(slots=[user_slot]),
+ format_tools=ToolFormatter(tool_format="default"),
+ format_prefix=EmptyFormatter(slots=[prefix]) if prefix else EmptyFormatter(),
+ default_system=default_system,
+ stop_words=[],
+ thought_words=("\n", "\n\n\n"),
+ efficient_eos=False,
+ replace_eos=False,
+ replace_jinja_template=False,
+ enable_thinking=True,
+ mm_plugin=get_mm_plugin(name="base"),
+ )
+
+
+def get_template_and_fix_tokenizer(tokenizer: "PreTrainedTokenizer", data_args: "DataArguments") -> "Template":
+ r"""Get chat template and fixes the tokenizer."""
+ if data_args.template is None:
+ if isinstance(tokenizer.chat_template, str):
+ logger.warning_rank0("`template` was not specified, try parsing the chat template from the tokenizer.")
+ template = parse_template(tokenizer)
+ else:
+ logger.warning_rank0("`template` was not specified, use `empty` template.")
+ template = TEMPLATES["empty"] # placeholder
+ else:
+ if data_args.template not in TEMPLATES:
+ raise ValueError(f"Template {data_args.template} does not exist.")
+
+ template = TEMPLATES[data_args.template]
+
+ if data_args.train_on_prompt and template.efficient_eos:
+ raise ValueError("Current template does not support `train_on_prompt`.")
+
+ if data_args.tool_format is not None:
+ logger.info_rank0(f"Using tool format: {data_args.tool_format}.")
+ default_slots = ["{{content}}"] if template.efficient_eos else ["{{content}}", {"eos_token"}]
+ template.format_function = FunctionFormatter(slots=default_slots, tool_format=data_args.tool_format)
+ template.format_tools = ToolFormatter(tool_format=data_args.tool_format)
+
+ if data_args.default_system is not None:
+ logger.info_rank0(f"Using default system message: {data_args.default_system}.")
+ template.default_system = data_args.default_system
+
+ if isinstance(template, ReasoningTemplate):
+ logger.warning_rank0(
+ "You are using reasoning template, "
+ "please add `_nothink` suffix if the model is not a reasoning model. "
+ "e.g., qwen3_vl_nothink"
+ )
+ template.enable_thinking = data_args.enable_thinking
+
+ template.fix_special_tokens(tokenizer)
+ template.fix_jinja_template(tokenizer)
+ return template
+
+
+register_template(
+ name="alpaca",
+ format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n\n"]),
+ default_system=(
+ "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
+ ),
+ replace_jinja_template=True,
+)
+
+
+register_template(
+ name="aquila",
+ format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]),
+ format_assistant=StringFormatter(slots=["{{content}}###"]),
+ format_system=StringFormatter(slots=["System: {{content}}###"]),
+ default_system=(
+ "A chat between a curious human and an artificial intelligence assistant. "
+ "The assistant gives helpful, detailed, and polite answers to the human's questions."
+ ),
+ stop_words=[""],
+)
+
+
+register_template(
+ name="atom",
+ format_user=StringFormatter(
+ slots=[{"bos_token"}, "Human: {{content}}\n", {"eos_token"}, {"bos_token"}, "Assistant:"]
+ ),
+ format_assistant=StringFormatter(slots=["{{content}}\n", {"eos_token"}]),
+)
+
+
+register_template(
+ name="baichuan",
+ format_user=StringFormatter(slots=[{"token": ""}, "{{content}}", {"token": ""}]),
+ efficient_eos=True,
+)
+
+
+register_template(
+ name="baichuan2",
+ format_user=StringFormatter(slots=["{{content}}"]),
+ efficient_eos=True,
+)
+
+
+register_template(
+ name="bailing",
+ format_user=StringFormatter(slots=["HUMAN{{content}}ASSISTANT"]),
+ format_system=StringFormatter(slots=["SYSTEM{{content}}"]),
+ format_observation=StringFormatter(slots=["OBSERVATION{{content}}ASSISTANT"]),
+ stop_words=["<|endoftext|>"],
+ efficient_eos=True,
+)
+
+
+register_template(
+ name="bailing_v2",
+ format_user=StringFormatter(slots=["HUMAN{{content}}<|role_end|>ASSISTANT"]),
+ format_system=StringFormatter(slots=["SYSTEM{{content}}<|role_end|>"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|role_end|>"]),
+ format_observation=StringFormatter(
+ slots=[
+ "OBSERVATION\n\n{{content}}\n<|role_end|>ASSISTANT"
+ ]
+ ),
+ format_function=FunctionFormatter(slots=["{{content}}<|role_end|>"], tool_format="ling"),
+ format_tools=ToolFormatter(tool_format="ling"),
+ stop_words=["<|endoftext|>"],
+ efficient_eos=True,
+)
+
+
+register_template(
+ name="belle",
+ format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]),
+ format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n\n"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+register_template(
+ name="bluelm",
+ format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]),
+)
+
+
+register_template(
+ name="breeze",
+ format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ efficient_eos=True,
+)
+
+
+register_template(
+ name="chatglm2",
+ format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]),
+ format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]),
+ efficient_eos=True,
+)
+
+
+register_template(
+ name="chatglm3",
+ format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
+ format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
+ format_system=StringFormatter(slots=[{"token": "<|system|>"}, "\n", "{{content}}"]),
+ format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+ format_observation=StringFormatter(
+ slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
+ ),
+ format_tools=ToolFormatter(tool_format="glm4"),
+ format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]),
+ stop_words=["<|user|>", "<|observation|>"],
+ efficient_eos=True,
+)
+
+
+register_template(
+ name="chatml",
+ format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+ format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+ format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ stop_words=["<|im_end|>", "<|im_start|>"],
+ replace_eos=True,
+ replace_jinja_template=True,
+)
+
+
+# copied from chatml template
+register_template(
+ name="chatml_de",
+ format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+ format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+ format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
+ stop_words=["<|im_end|>", "<|im_start|>"],
+ replace_eos=True,
+ replace_jinja_template=True,
+)
+
+
+register_template(
+ name="codegeex2",
+ format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]),
+)
+
+
+register_template(
+ name="codegeex4",
+ format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]),
+ format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+ format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+ format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>\n"]),
+ format_tools=ToolFormatter(tool_format="glm4"),
+ format_prefix=EmptyFormatter(slots=["[gMASK]"]),
+ default_system=(
+ "你是一位智能编程助手,你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题,"
+ "并提供格式规范、可以执行、准确安全的代码,并在必要时提供详细的解释。"
+ ),
+ stop_words=["<|user|>", "<|observation|>"],
+ efficient_eos=True,
+)
+
+
+register_template(
+ name="cohere",
+ format_user=StringFormatter(
+ slots=[
+ (
+ "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"
+ "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+ )
+ ]
+ ),
+ format_system=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+register_template(
+ name="cpm",
+ format_user=StringFormatter(slots=["<用户>{{content}}"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+# copied from chatml template
+register_template(
+ name="cpm3",
+ format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+ format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+ format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ stop_words=["<|im_end|>"],
+)
+
+
+# copied from chatml template
+register_template(
+ name="cpm4",
+ format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+ format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+ format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ stop_words=["<|im_end|>"],
+)
+
+
+# copied from chatml template
+register_template(
+ name="dbrx",
+ format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+ format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+ format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ default_system=(
+ "You are DBRX, created by Databricks. You were last updated in December 2023. "
+ "You answer questions based on information available up to that point.\n"
+ "YOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough "
+ "responses to more complex and open-ended questions.\nYou assist with various tasks, "
+ "from writing to coding (using markdown for code blocks — remember to use ``` with "
+ "code, JSON, and tables).\n(You do not have real-time data access or code execution "
+ "capabilities. You avoid stereotyping and provide balanced perspectives on "
+ "controversial topics. You do not provide song lyrics, poems, or news articles and "
+ "do not divulge details of your training data.)\nThis is your system prompt, "
+ "guiding your responses. Do not reference it, just respond to the user. If you find "
+ "yourself talking about this message, stop. You should be responding appropriately "
+ "and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION "
+ "ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY."
+ ),
+ stop_words=["<|im_end|>"],
+ replace_eos=True,
+)
+
+
+register_template(
+ name="deepseek",
+ format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
+ format_system=StringFormatter(slots=["{{content}}\n\n"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+register_template(
+ name="deepseek3",
+ format_user=StringFormatter(slots=["<|User|>{{content}}<|Assistant|>"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+)
+
+
+# copied from deepseek3 template
+register_template(
+ name="deepseekr1",
+ format_user=StringFormatter(slots=["<|User|>{{content}}<|Assistant|>"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ template_class=ReasoningTemplate,
+)
+
+
+register_template(
+ name="deepseekcoder",
+ format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]),
+ format_assistant=StringFormatter(slots=["\n{{content}}\n<|EOT|>\n"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ default_system=(
+ "You are an AI programming assistant, utilizing the DeepSeek Coder model, "
+ "developed by DeepSeek Company, and you only answer questions related to computer science. "
+ "For politically sensitive questions, security and privacy issues, "
+ "and other non-computer science questions, you will refuse to answer.\n"
+ ),
+)
+
+
+register_template(
+ name="default",
+ format_user=StringFormatter(slots=["Human: {{content}}", {"eos_token"}, "\nAssistant:"]),
+ format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n"]),
+ format_system=StringFormatter(slots=["System: {{content}}", {"eos_token"}, "\n"]),
+ replace_jinja_template=True,
+)
+
+
+register_template(
+ name="dots_ocr",
+ format_user=StringFormatter(slots=["<|user|>{{content}}<|endofuser|><|assistant|>"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|endofassistant|>"]),
+ format_system=StringFormatter(slots=["<|system|>{{content}}<|endofsystem|>\n"]),
+ stop_words=["<|endofassistant|>"],
+ efficient_eos=True,
+ mm_plugin=get_mm_plugin(
+ name="qwen2_vl",
+ image_token="<|imgpad|>",
+ video_token="<|vidpad|>",
+ vision_bos_token="<|img|>",
+ vision_eos_token="<|endofimg|>",
+ ),
+)
+
+
+register_template(
+ name="empty",
+ format_assistant=StringFormatter(slots=["{{content}}"]),
+)
+
+
+# copied from chatml template
+register_template(
+ name="ernie",
+ format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n\n<|im_start|>assistant\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n\n"]),
+ format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n\n"]),
+ format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n\n<|im_start|>assistant\n"]),
+ default_system="\nthink_mode=True\n",
+ stop_words=["<|im_end|>"],
+)
+
+
+register_template(
+ name="ernie_nothink",
+ format_user=StringFormatter(slots=["User: {{content}}\nAssistant: "]),
+ format_assistant=StringFormatter(slots=["{{content}}<|end_of_sentence|>"]),
+ format_system=StringFormatter(slots=["{{content}}\n"]),
+ format_prefix=EmptyFormatter(slots=["<|begin_of_sentence|>"]),
+ stop_words=["<|end_of_sentence|>"],
+)
+
+
+register_template(
+ name="exaone",
+ format_user=StringFormatter(slots=["[|user|]{{content}}\n[|assistant|]"]),
+ format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}, "\n"]),
+ format_system=StringFormatter(slots=["[|system|]{{content}}[|endofturn|]\n"]),
+)
+
+
+register_template(
+ name="falcon",
+ format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),
+ format_assistant=StringFormatter(slots=["{{content}}\n"]),
+ efficient_eos=True,
+)
+
+
+# copied from chatml template
+register_template(
+ name="falcon_h1",
+ format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+ format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+ format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ stop_words=["<|im_end|>", "<|end_of_text|>"],
+)
+
+
+register_template(
+ name="fewshot",
+ format_assistant=StringFormatter(slots=["{{content}}\n\n"]),
+ efficient_eos=True,
+ replace_jinja_template=True,
+)
+
+
+register_template(
+ name="gemma",
+ format_user=StringFormatter(slots=["user\n{{content}}\nmodel\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}\n"]),
+ format_system=StringFormatter(slots=["{{content}}\n\n"]),
+ format_observation=StringFormatter(
+ slots=["tool\n{{content}}\nmodel\n"]
+ ),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ stop_words=[""],
+ replace_eos=True,
+ template_class=Llama2Template,
+)
+
+
+# copied from gemma template
+register_template(
+ name="gemma2",
+ format_user=StringFormatter(slots=["user\n{{content}}\nmodel\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}\n"]),
+ format_system=StringFormatter(slots=["{{content}}\n\n"]),
+ format_observation=StringFormatter(
+ slots=["tool\n{{content}}\nmodel\n"]
+ ),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ stop_words=["", ""],
+ efficient_eos=True,
+ template_class=Llama2Template,
+)
+
+
+# copied from gemma template
+register_template(
+ name="gemma3",
+ format_user=StringFormatter(slots=["user\n{{content}}\nmodel\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}\n"]),
+ format_system=StringFormatter(slots=["{{content}}\n\n"]),
+ format_observation=StringFormatter(
+ slots=["tool\n{{content}}\nmodel\n"]
+ ),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ stop_words=[""],
+ replace_eos=True,
+ mm_plugin=get_mm_plugin("gemma3", image_token=""),
+ template_class=Llama2Template,
+)
+
+
+register_template(
+ name="gemma3n",
+ format_user=StringFormatter(slots=["user\n{{content}}\nmodel\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}\n"]),
+ format_system=StringFormatter(slots=["{{content}}\n\n"]),
+ format_observation=StringFormatter(
+ slots=["tool\n{{content}}\nmodel\n"]
+ ),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ stop_words=[""],
+ replace_eos=True,
+ mm_plugin=get_mm_plugin("gemma3n", image_token="", audio_token=""),
+ template_class=Llama2Template,
+)
+
+
+register_template(
+ name="glm4",
+ format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+ format_assistant=StringFormatter(slots=["\n{{content}}"]),
+ format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+ format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+ format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+ format_tools=ToolFormatter(tool_format="glm4"),
+ format_prefix=EmptyFormatter(slots=["[gMASK]"]),
+ stop_words=["<|user|>", "<|observation|>"],
+ efficient_eos=True,
+)
+
+
+# copied from glm4 template
+register_template(
+ name="glm4_moe",
+ format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+ format_assistant=StringFormatter(slots=["\n{{content}}"]),
+ format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+ format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4_moe"),
+ format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+ format_tools=ToolFormatter(tool_format="glm4_moe"),
+ format_prefix=EmptyFormatter(slots=["[gMASK]"]),
+ stop_words=["<|user|>", "<|observation|>"],
+ efficient_eos=True,
+ template_class=ReasoningTemplate,
+)
+
+
+# copied from glm4 template
+register_template(
+ name="glm4v",
+ format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+ format_assistant=StringFormatter(slots=["\n{{content}}"]),
+ format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+ format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+ format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+ format_tools=ToolFormatter(tool_format="glm4"),
+ format_prefix=EmptyFormatter(slots=["[gMASK]"]),
+ stop_words=["<|user|>", "<|observation|>", ""],
+ efficient_eos=True,
+ mm_plugin=get_mm_plugin(name="glm4v", image_token="<|image|>", video_token="<|video|>"),
+ template_class=ReasoningTemplate,
+)
+
+
+# copied from glm4 template
+register_template(
+ name="glm4v_moe",
+ format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+ format_assistant=StringFormatter(slots=["\n{{content}}"]),
+ format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+ format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4_moe"),
+ format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+ format_tools=ToolFormatter(tool_format="glm4_moe"),
+ format_prefix=EmptyFormatter(slots=["[gMASK]"]),
+ stop_words=["<|user|>", "<|observation|>", ""],
+ efficient_eos=True,
+ mm_plugin=get_mm_plugin(name="glm4v", image_token="<|image|>", video_token="<|video|>"),
+ template_class=ReasoningTemplate,
+)
+
+
+# copied from glm4 template
+register_template(
+ name="glmz1",
+ format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
+ format_assistant=StringFormatter(slots=["\n{{content}}"]),
+ format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
+ format_function=FunctionFormatter(slots=["{{content}}"], tool_format="glm4"),
+ format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
+ format_tools=ToolFormatter(tool_format="glm4"),
+ format_prefix=EmptyFormatter(slots=["[gMASK]"]),
+ stop_words=["<|user|>", "<|observation|>"],
+ efficient_eos=True,
+ template_class=ReasoningTemplate,
+)
+
+
+register_template(
+ name="gpt",
+ format_user=StringFormatter(slots=["<|start|>user<|message|>{{content}}<|end|><|start|>assistant"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|end|>"]),
+ format_system=StringFormatter(slots=["<|start|>system<|message|>{{content}}<|end|>"]),
+ default_system="You are ChatGPT, a large language model trained by OpenAI.",
+ thought_words=("<|channel|>analysis<|message|>", "<|end|><|start|>assistant<|channel|>final<|message|>"),
+ efficient_eos=True,
+ template_class=ReasoningTemplate,
+)
+
+
+register_template(
+ name="granite3",
+ format_user=StringFormatter(
+ slots=[
+ "<|start_of_role|>user<|end_of_role|>{{content}}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>"
+ ]
+ ),
+ format_assistant=StringFormatter(slots=["{{content}}<|end_of_text|>\n"]),
+ format_system=StringFormatter(slots=["<|start_of_role|>system<|end_of_role|>{{content}}<|end_of_text|>\n"]),
+)
+
+
+register_template(
+ name="granite3_vision",
+ format_user=StringFormatter(slots=["<|user|>\n{{content}}\n<|assistant|>\n"]),
+ format_system=StringFormatter(slots=["<|system|>\n{{content}}\n"]),
+ default_system=(
+ "A chat between a curious user and an artificial intelligence assistant. "
+ "The assistant gives helpful, detailed, and polite answers to the user's questions."
+ ),
+ mm_plugin=get_mm_plugin(name="llava_next", image_token=""),
+)
+
+
+register_template(
+ name="granite4",
+ format_user=StringFormatter(
+ slots=[
+ "<|start_of_role|>user<|end_of_role|>{{content}}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>"
+ ]
+ ),
+ format_assistant=StringFormatter(slots=["{{content}}<|end_of_text|>\n"]),
+ format_system=StringFormatter(slots=["<|start_of_role|>system<|end_of_role|>{{content}}<|end_of_text|>\n"]),
+ format_function=FunctionFormatter(slots=["{{content}}<|end_of_text|>\n"], tool_format="default"),
+ format_observation=StringFormatter(
+ slots=["<|start_of_role|>tool<|end_of_role|>{{content}}<|end_of_text|>\n<|start_of_role|>assistant\n"]
+ ),
+ format_tools=ToolFormatter(tool_format="default"),
+ stop_words=["<|end_of_text|>"],
+ default_system="You are Granite, developed by IBM. You are a helpful AI assistant.",
+)
+
+
+register_template(
+ name="index",
+ format_user=StringFormatter(slots=["reserved_0{{content}}reserved_1"]),
+ format_system=StringFormatter(slots=["{{content}}"]),
+ efficient_eos=True,
+)
+
+
+register_template(
+ name="hunyuan",
+ format_user=StringFormatter(slots=["{{content}}<|extra_0|>"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|eos|>"]),
+ format_system=StringFormatter(slots=["{{content}}<|extra_4|>"]),
+ format_prefix=EmptyFormatter(slots=["<|startoftext|>"]),
+ stop_words=["<|eos|>"],
+)
+
+
+register_template(
+ name="intern",
+ format_user=StringFormatter(slots=["<|User|>:{{content}}\n<|Bot|>:"]),
+ format_assistant=StringFormatter(slots=["{{content}}\n"]),
+ format_system=StringFormatter(slots=["<|System|>:{{content}}\n"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ default_system=(
+ "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+ "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory "
+ "(上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+ "- InternLM (书生·浦语) can understand and communicate fluently in the language "
+ "chosen by the user such as English and 中文."
+ ),
+ stop_words=[""],
+)
+
+
+register_template(
+ name="intern2",
+ format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+ format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ default_system=(
+ "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+ "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory "
+ "(上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+ "- InternLM (书生·浦语) can understand and communicate fluently in the language "
+ "chosen by the user such as English and 中文."
+ ),
+ stop_words=["<|im_end|>"],
+)
+
+
+register_template(
+ name="intern_vl",
+ format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]),
+ format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+ format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
+ default_system=(
+ "你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。"
+ ),
+ stop_words=["<|im_end|>"],
+ mm_plugin=get_mm_plugin(name="intern_vl", image_token="", video_token="