Commit 0722acf1 authored by chenych

Update 0604

parent c4ba4563
...@@ -20,27 +20,36 @@ LLaMA Factory is a framework for training and inference of large language models, with support for ModelScope
| ----------------------------------------------------------------- | -------------------------------- | ------------------- |
| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma |
| [Gemma 3](https://huggingface.co/google) | 1B/4B/12B/27B | gemma3/gemma (1B) |
| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/THUDM) | 9B/32B | glm4 |
| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
| [InternVL 2.5-3](https://huggingface.co/OpenGVLab) | 1B/2B/8B/14B/38B/78B | intern_vl |
| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
| [OLMo](https://hf-mirror.com/allenai) | 1B/7B | olmo |
| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
| [Qwen3 (MoE)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/235B | qwen3 |
| [XVERSE](https://hf-mirror.com/xverse) | 7B/13B | xverse |
Continuously updated...

> [!NOTE]
>
> Note: this release only supports supervised fine-tuning (SFT) of the DeepSeek distilled models; see [deepseek-r1-distill_vllm](https://developer.sourcefind.cn/codes/modelzoo/deepseek-r1-distill_vllm) for reference.
>
> For all "Base" models, the `template` argument can be any value such as `default`, `alpaca`, or `vicuna`. For "Instruct/Chat" models, however, be sure to use the **corresponding template**.
>
> Make sure to use **exactly the same** template for training and inference.
> You can also add your own chat template in [template.py](src/llamafactory/data/template.py).
>
> \*: You need to install `transformers` from the main branch and set `DISABLE_VERSION_CHECK=1` to skip the version check.
>
> \*\*: You need to install a specific version of `transformers` to use the corresponding model.
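
As an illustration, a minimal sketch of keeping the template consistent between training and inference (the yaml path and adapter directory are placeholders, and the `chat` flags assume the standard `llamafactory-cli` argument names):

```bash
# Fine-tune an Instruct model with its matching template (template: llama3 is set inside the yaml).
llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml

# Chat with the SAME template; a mismatched template silently degrades the outputs.
llamafactory-cli chat --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --adapter_name_or_path saves/llama3-8b/lora/sft \
    --template llama3
```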
> **Known issues and workarounds**
> 1. `Baichuan 2` requires uninstalling the xformers library from the environment; it currently only supports LoRA training.
>
...@@ -64,9 +73,7 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dt
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
cd /your_code_path/llama_factory
pip install -e ".[torch,metrics]" --no-build-isolation
```
#### Dockerfile (Method 2)
...@@ -77,9 +84,7 @@ docker build --no-cache -t llama-factory:latest .
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
cd /your_code_path/llama_factory
pip install -e ".[torch,metrics]" --no-build-isolation
```
#### Anaconda (Method 3)
...@@ -102,9 +107,7 @@ deepspeed: 0.14.2+das.opt2.dtk2504

```bash
git clone http://developer.hpccube.com/codes/OpenDAS/llama-factory.git
cd /your_code_path/llama_factory
pip install -e ".[torch,metrics]" --no-build-isolation

# (Optional) DeepSpeed multi-node training
# Install pdsh first; skip this step if it is already installed.
...@@ -244,6 +247,12 @@ llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
>
> For validating inference accuracy on your own dataset, we recommend generating results with `python scripts/vllm_infer.py` and computing scores with `python scripts/eval_bleu_rouge.py`; see the scripts for detailed parameter information.
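
For example, a minimal sketch (the model path, template, and dataset name are placeholders for your own setup):

```bash
# Generate predictions with vLLM, then score them against the reference answers.
python scripts/vllm_infer.py --model_name_or_path path_to_your_model --template llama3 --dataset your_dataset
python scripts/eval_bleu_rouge.py generated_predictions.jsonl
```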
### Fine-Tuning with LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio))
```bash
llamafactory-cli webui
```
## References

- [README_zh](README_zh.md)
......
...@@ -5,8 +5,8 @@
[![GitHub contributors](https://img.shields.io/github/contributors/hiyouga/LLaMA-Factory?color=orange)](https://github.com/hiyouga/LLaMA-Factory/graphs/contributors)
[![GitHub workflow](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml/badge.svg)](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml)
[![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/)
[![Citation](https://img.shields.io/badge/citation-544-green)](https://scholar.google.com/scholar?cites=12620864006390196564)
[![Docker Pulls](https://img.shields.io/docker/pulls/hiyouga/llamafactory)](https://hub.docker.com/r/hiyouga/llamafactory/tags)
[![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
[![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK)
...@@ -14,18 +14,31 @@
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)
[![Open in DSW](https://gallery.pai-ml.com/assets/open-in-dsw.svg)](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory)
[![Open in Spaces](https://img.shields.io/badge/🤗-Open%20in%20Spaces-blue)](https://huggingface.co/spaces/hiyouga/LLaMA-Board)
[![Open in Studios](https://img.shields.io/badge/ModelScope-Open%20in%20Studios-blue)](https://modelscope.cn/studios/hiyouga/LLaMA-Board)
[![Open in Novita](https://img.shields.io/badge/Novita-Deploy%20Template-blue)](https://novita.ai/templates-library/105981?sharer=88115474-394e-4bda-968e-b88e123d0c47)

### Used by [Amazon](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/), [NVIDIA](https://developer.nvidia.com/rtx/ai-toolkit), [Aliyun](https://help.aliyun.com/zh/pai/use-cases/fine-tune-a-llama-3-model-with-llama-factory), etc.
<div align="center" markdown="1">

### Supporters ❤️

<a href="https://warp.dev/llama-factory">
<img alt="Warp sponsorship" width="400" src="https://github.com/user-attachments/assets/ab8dd143-b0fd-4904-bdc5-dd7ecac94eae">
</a>
</a>
#### [Warp, the agentic terminal for developers](https://warp.dev/llama-factory)
[Available for MacOS, Linux, & Windows](https://warp.dev/llama-factory)
----
### Easily fine-tune 100+ large language models with zero-code [CLI](#quickstart) and [Web UI](#fine-tuning-with-llama-board-gui-powered-by-gradio)
![GitHub Trend](https://trendshift.io/api/badge/repositories/4535)
</div>
👋 Join our [WeChat](assets/wechat.jpg) or [NPU user group](assets/wechat_npu.jpg).

...@@ -40,9 +53,7 @@ Choose your path:

- **Documentation**: https://llamafactory.readthedocs.io/en/latest/
- **Colab (free)**: https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing
- **Local machine**: Please refer to [usage](#getting-started)
- **PAI-DSW (free trial)**: https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory

> [!NOTE]
> Except for the above links, all other websites are unauthorized third-party websites. Please use them with caution.
...@@ -50,7 +61,7 @@ Choose your path:

## Table of Contents

- [Features](#features)
- [Blogs](#blogs)
- [Changelog](#changelog)
- [Supported Models](#supported-models)
- [Supported Training Approaches](#supported-training-approaches)

...@@ -90,18 +101,17 @@ Choose your path:

| Day 0 | Qwen3 / Qwen2.5-VL / Gemma 3 / InternLM 3 / MiniCPM-o-2.6 |
| Day 1 | Llama 3 / GLM-4 / Mistral Small / PaliGemma2 / Llama 4 |
## Blogs

- [How Apoidea Group enhances visual information extraction from banking documents with multimodal models using LLaMA-Factory on Amazon SageMaker HyperPod](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/) (English)
- [Easy Dataset × LLaMA Factory: Enabling LLMs to Efficiently Learn Domain Knowledge](https://buaa-act.feishu.cn/wiki/GVzlwYcRFiR8OLkHbL6cQpYin7g) (English)
- [LLaMA Factory: Fine-tuning the DeepSeek-R1-Distill-Qwen-7B Model for News Classifier](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory_deepseek_r1_distill_7b) (Chinese)

<details><summary>All Blogs</summary>

- [A One-Stop Code-Free Model Fine-Tuning \& Deployment Platform based on SageMaker and LLaMA-Factory](https://aws.amazon.com/cn/blogs/china/a-one-stop-code-free-model-fine-tuning-deployment-platform-based-on-sagemaker-and-llama-factory/) (Chinese)
- [LLaMA Factory Multi-Modal Fine-Tuning Practice: Fine-Tuning Qwen2-VL for Personal Tourist Guide](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory_qwen2vl) (Chinese)
- [LLaMA Factory: Fine-tuning the LLaMA3 Model for Role-Playing](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory) (Chinese)

</details>
...@@ -233,6 +243,9 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/

</details>
> [!TIP]
> If you cannot use the latest feature, please pull the latest code and install LLaMA-Factory again.
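
For example, assuming you installed from a source checkout, a typical refresh looks like:

```bash
# Fetch the latest code, then reinstall the package in place.
git pull
pip install -e ".[torch,metrics]" --no-build-isolation
```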
## Supported Models

| Model | Model size | Template |
...@@ -243,17 +256,17 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere |
| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon |
| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma |
| [Gemma 3](https://huggingface.co/google) | 1B/4B/12B/27B | gemma3/gemma (1B) |
| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/THUDM) | 9B/32B | glm4/glmz1 |
| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - |
| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 |
| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
| [Index](https://huggingface.co/IndexTeam) | 1.9B | index |
| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
| [InternVL 2.5-3](https://huggingface.co/OpenGVLab) | 1B/2B/8B/14B/38B/78B | intern_vl |
| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl |
| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - |
| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
...@@ -263,6 +276,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava |
| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next |
| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video |
| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo |
| [MiniCPM](https://huggingface.co/openbmb) | 1B/2B/4B | cpm/cpm3 |
| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v |
| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral |
...@@ -278,8 +292,9 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
| [Qwen3 (MoE)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/235B | qwen3 |
| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio |
| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni |
| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl |
| [Seed Coder](https://huggingface.co/ByteDance-Seed) | 8B | seed_coder |
| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 |
| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - |
| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 |
...@@ -402,6 +417,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t

- [COIG-P (en&zh)](https://huggingface.co/datasets/m-a-p/COIG-P)
- [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset)
- [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback)
- [RLAIF-V (en)](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)
- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs)
- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf)
- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
...@@ -423,6 +439,7 @@ huggingface-cli login

| ------------ | ------- | --------- |
| python | 3.9 | 3.10 |
| torch | 2.0.0 | 2.6.0 |
| torchvision | 0.15.0 | 0.21.0 |
| transformers | 4.45.0 | 4.50.0 |
| datasets | 2.16.0 | 3.2.0 |
| accelerate | 0.34.0 | 1.2.1 |
...@@ -457,16 +474,25 @@ huggingface-cli login

> [!IMPORTANT]
> Installation is mandatory.
#### Install from Source
```bash
git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
cd LLaMA-Factory
pip install -e ".[torch,metrics]" --no-build-isolation
```

Extra dependencies available: torch, torch-npu, metrics, deepspeed, liger-kernel, bitsandbytes, hqq, eetq, gptq, aqlm, vllm, sglang, galore, apollo, badam, adam-mini, qwen, minicpm_v, modelscope, openmind, swanlab, dev
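
For instance, extras can be combined at install time (a sketch; choose only the extras your hardware actually supports):

```bash
# Add DeepSpeed and bitsandbytes support on top of the default torch/metrics install.
pip install -e ".[torch,metrics,deepspeed,bitsandbytes]" --no-build-isolation
```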
#### Install from Docker Image
```bash
docker run -it --rm --gpus=all --ipc=host hiyouga/llamafactory:latest
```
Find the pre-built images: https://hub.docker.com/r/hiyouga/llamafactory/tags
Please refer to [build docker](#build-docker) to build the image yourself.
<details><summary>Setting up a virtual environment with <b>uv</b></summary>

...@@ -486,6 +512,20 @@ uv run --prerelease=allow llamafactory-cli train examples/train_lora/llama3_lora

<details><summary>For Windows users</summary>
#### Install PyTorch
You need to manually install the GPU version of PyTorch on the Windows platform. Please refer to the [official website](https://pytorch.org/get-started/locally/) and the following command to install PyTorch with CUDA support:
```bash
pip uninstall torch torchvision torchaudio
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
python -c "import torch; print(torch.cuda.is_available())"
```
If you see `True` then you have successfully installed PyTorch with CUDA support.
Try setting `dataloader_num_workers: 0` if you encounter a `Can't pickle local object` error.
#### Install BitsAndBytes

If you want to enable quantized LoRA (QLoRA) on the Windows platform, you need to install a pre-built version of the `bitsandbytes` library that supports CUDA 11.1 to 12.2. Please select the appropriate [release version](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels) based on your CUDA version.
...@@ -575,7 +615,7 @@ Please refer to [data/README.md](data/README.md) for checking the details about

> [!NOTE]
> Please update `data/dataset_info.json` to use your custom dataset.

You can also use **[Easy Dataset](https://github.com/ConardLi/easy-dataset)** or **[GraphGen](https://github.com/open-sciencelab/GraphGen)** to create synthetic data for fine-tuning.
### Quickstart

...@@ -632,22 +672,18 @@ For CUDA users:

```bash
docker build -f ./docker/docker-cuda/Dockerfile \
    --build-arg PIP_INDEX=https://pypi.org/simple \
    --build-arg EXTRAS=metrics \
    -t llamafactory:latest .

docker run -dit --ipc=host --gpus=all \
    -v ./hf_cache:/root/.cache/huggingface \
    -v ./ms_cache:/root/.cache/modelscope \
    -v ./om_cache:/root/.cache/openmind \
    -v ./shared_data:/app/shared_data \
    -v ./output:/app/output \
    -p 7860:7860 \
    -p 8000:8000 \
    --name llamafactory \
    llamafactory:latest
...@@ -657,18 +693,16 @@ docker exec -it llamafactory bash

For Ascend NPU users:

```bash
docker build -f ./docker/docker-npu/Dockerfile \
    --build-arg PIP_INDEX=https://pypi.org/simple \
    --build-arg EXTRAS=torch-npu,metrics \
    -t llamafactory:latest .

docker run -dit --ipc=host \
    -v ./hf_cache:/root/.cache/huggingface \
    -v ./ms_cache:/root/.cache/modelscope \
    -v ./om_cache:/root/.cache/openmind \
    -v ./shared_data:/app/shared_data \
    -v ./output:/app/output \
    -v /usr/local/dcmi:/usr/local/dcmi \
    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
...@@ -680,7 +714,6 @@ docker run -dit \
    --device /dev/davinci_manager \
    --device /dev/devmm_svm \
    --device /dev/hisi_hdc \
    --name llamafactory \
    llamafactory:latest
...@@ -691,25 +724,20 @@ For AMD ROCm users:

```bash
docker build -f ./docker/docker-rocm/Dockerfile \
    --build-arg PIP_INDEX=https://pypi.org/simple \
    --build-arg EXTRAS=metrics \
    -t llamafactory:latest .

docker run -dit --ipc=host \
    -v ./hf_cache:/root/.cache/huggingface \
    -v ./ms_cache:/root/.cache/modelscope \
    -v ./om_cache:/root/.cache/openmind \
    -v ./shared_data:/app/shared_data \
    -v ./output:/app/output \
    -p 7860:7860 \
    -p 8000:8000 \
    --device /dev/kfd \
    --device /dev/dri \
    --name llamafactory \
    llamafactory:latest
...@@ -723,7 +751,7 @@ docker exec -it llamafactory bash

- `hf_cache`: Utilize Hugging Face cache on the host machine. Reassignable if a cache already exists in a different directory.
- `ms_cache`: Similar to Hugging Face cache but for ModelScope users.
- `om_cache`: Similar to Hugging Face cache but for Modelers users.
- `shared_data`: Place datasets in this directory on the host machine so that they can be selected in the LLaMA Board GUI.
- `output`: Set the export dir to this location so that the merged result can be accessed directly on the host machine.
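
As a quick sketch, a dataset dropped into the mounted directory on the host becomes visible inside the container (the file name is a placeholder; remember to register it in `data/dataset_info.json` as described above):

```bash
mkdir -p ./shared_data
cp my_dataset.json ./shared_data/
docker exec -it llamafactory ls /app/shared_data   # the file is now visible to LLaMA Board
```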
</details>

...@@ -886,6 +914,7 @@ If you have a project that should be incorporated, please contact via email or c

1. **[RAG-Retrieval](https://github.com/NLPJCL/RAG-Retrieval)**: A full pipeline for RAG retrieval model fine-tuning, inference, and distillation. [[blog]](https://zhuanlan.zhihu.com/p/987727357)
1. **[360-LLaMA-Factory](https://github.com/Qihoo360/360-LLaMA-Factory)**: A modified library that supports long sequence SFT & DPO using ring attention.
1. **[Sky-T1](https://novasky-ai.github.io/posts/sky-t1/)**: An o1-like model fine-tuned by NovaSky AI with very small cost.
1. **[WeClone](https://github.com/xming521/WeClone)**: One-stop solution for creating your digital avatar from chat logs.
</details>
......
assets/wechat.jpg: image updated (161 KB → 169 KB).
The [dataset_info.json](dataset_info.json) contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it.

Currently we support datasets in **alpaca** and **sharegpt** format. Allowed file types include json, jsonl, csv, parquet, arrow.

```json
"dataset_name": {
...@@ -48,7 +50,9 @@ Currently we support datasets in **alpaca** and **sharegpt** format.

* [Example dataset](alpaca_en_demo.json)
In supervised fine-tuning, the `instruction` column will be concatenated with the `input` column and used as the user prompt, i.e. the user prompt will be `instruction\ninput`. The `output` column represents the model response.
For reasoning models, if the dataset contains chain-of-thought (CoT), the CoT needs to be placed in the model responses, such as `<think>cot</think>output`.
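
For instance, a single alpaca-format record with CoT could be written as follows (a sketch; the file name is arbitrary):

```bash
cat > cot_demo.json <<'EOF'
[
  {
    "instruction": "What is 17 * 3?",
    "input": "",
    "output": "<think>17 * 3 = 51</think>51"
  }
]
EOF
```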
The `system` column will be used as the system prompt if specified.

...@@ -57,13 +61,13 @@ The `history` column is a list consisting of string tuples representing prompt-r
```json
[
  {
    "instruction": "user instruction (required)",
    "input": "user input (optional)",
    "output": "model response (required)",
    "system": "system prompt (optional)",
    "history": [
      ["user instruction in the first round (optional)", "model response in the first round (optional)"],
      ["user instruction in the second round (optional)", "model response in the second round (optional)"]
    ]
  }
]
...@@ -84,6 +88,11 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
}
```
> [!TIP]
> If the model has reasoning capabilities (e.g. Qwen3) but the dataset does not contain chain-of-thought (CoT), LLaMA-Factory will automatically add an empty CoT to the data. When `enable_thinking` is `True` (slow thinking, the default), the empty CoT is added to the model response and included in the loss computation; otherwise (fast thinking), it is added to the user prompt and excluded from the loss computation. Please keep the `enable_thinking` parameter consistent between training and inference.
>
> If you want to train data containing CoT with slow thinking and data without CoT with fast thinking, you can set `enable_thinking` to `None`. However, this feature is relatively complicated and should be used with caution.
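
As a sketch, the parameter can be pinned in a training config like this (the yaml below is illustrative, assuming a Qwen3-style model, and is not a file shipped in `examples/`):

```bash
cat > qwen3_lora_sft.yaml <<'EOF'
model_name_or_path: Qwen/Qwen3-8B
template: qwen3
enable_thinking: true   # keep the same value at inference time
stage: sft
do_train: true
finetuning_type: lora
dataset: identity
output_dir: saves/qwen3-8b/lora/sft
EOF
llamafactory-cli train qwen3_lora_sft.yaml
```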
### Pre-training Dataset

- [Example dataset](c4_demo.jsonl)

...@@ -117,8 +126,8 @@ It requires a better response in `chosen` column and a worse response in `reject
```json
[
  {
    "instruction": "user instruction (required)",
    "input": "user input (optional)",
    "chosen": "chosen answer (required)",
    "rejected": "rejected answer (required)"
  }
...@@ -172,7 +181,7 @@ Note that the human and observation should appear in odd positions, while gpt an
"conversations": [
  {
    "from": "human",
    "value": "user instruction"
  },
  {
    "from": "function_call",
...@@ -223,7 +232,7 @@ Preference datasets in sharegpt format also require a better message in `chosen`
"conversations": [
  {
    "from": "human",
    "value": "user instruction"
  },
  {
    "from": "gpt",
...@@ -231,7 +240,7 @@ Preference datasets in sharegpt format also require a better message in `chosen`
  },
  {
    "from": "human",
    "value": "user instruction"
  }
],
"chosen": {
...@@ -273,7 +282,7 @@ KTO datasets require a extra `kto_tag` column containing the boolean human feedb
"conversations": [
  {
    "from": "human",
    "value": "user instruction"
  },
  {
    "from": "gpt",
...@@ -312,7 +321,7 @@ The number of images should be identical to the `<image>` tokens in the conversa
"conversations": [
  {
    "from": "human",
    "value": "<image>user instruction"
  },
  {
    "from": "gpt",
...@@ -353,7 +362,7 @@ The number of videos should be identical to the `<video>` tokens in the conversa
"conversations": [
  {
    "from": "human",
    "value": "<video>user instruction"
  },
  {
    "from": "gpt",
...@@ -394,7 +403,7 @@ The number of audios should be identical to the `<audio>` tokens in the conversa
"conversations": [
  {
    "from": "human",
    "value": "<audio>user instruction"
  },
  {
    "from": "gpt",
...@@ -435,7 +444,7 @@ The openai format is simply a special case of the sharegpt format, where the fir
  },
  {
    "role": "user",
    "content": "user instruction"
  },
  {
    "role": "assistant",
......
[dataset_info.json](dataset_info.json) contains all the available datasets. If you want to use a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and use it by setting `dataset: dataset_name`.

Currently we support datasets in the **alpaca** and **sharegpt** formats. Allowed file types include json, jsonl, csv, parquet, and arrow.

```json
"dataset_name": {
...@@ -47,7 +49,9 @@

- [Example dataset](alpaca_zh_demo.json)

In supervised fine-tuning, the content of the `instruction` column is concatenated with the content of the `input` column to form the prompt, i.e. the prompt is `instruction\ninput`, and the content of the `output` column is the model response.

When fine-tuning reasoning models, if the dataset contains chain-of-thought (CoT), the CoT needs to be placed in the model responses, e.g. `<think>cot</think>output`.

If specified, the content of the `system` column will be used as the system prompt.
...@@ -56,8 +60,8 @@

```json
[
  {
    "instruction": "user instruction (required)",
    "input": "user input (optional)",
    "output": "model response (required)",
    "system": "system prompt (optional)",
    "history": [
...@@ -83,6 +87,11 @@
}
```
> [!TIP]
> If the model itself has reasoning capabilities (e.g. Qwen3) but the dataset does not contain chain-of-thought, LLaMA-Factory automatically adds an empty CoT to the data. When `enable_thinking` is `True` (slow thinking, the default), the empty CoT is added to the model response and included in the loss computation; otherwise (fast thinking) it is added to the user prompt and excluded from the loss computation. Please keep the `enable_thinking` parameter consistent between training and inference.
>
> If you want to train CoT data with slow thinking and non-CoT data with fast thinking, you can set `enable_thinking` to `None`. However, this feature is relatively complicated and should be used with caution.
### Pre-training Dataset

- [Example dataset](c4_demo.jsonl)

...@@ -116,8 +125,8 @@

```json
[
  {
    "instruction": "user instruction (required)",
    "input": "user input (optional)",
    "chosen": "chosen answer (required)",
    "rejected": "rejected answer (required)"
  }
...@@ -171,7 +180,7 @@ KTO datasets require an additional `kto_tag` column. For details, see [sharegpt](#s
"conversations": [
  {
    "from": "human",
    "value": "user instruction"
  },
  {
    "from": "function_call",
...@@ -222,7 +231,7 @@ Preference datasets in sharegpt format likewise require the better response in the `chosen`
"conversations": [
  {
    "from": "human",
    "value": "user instruction"
  },
  {
    "from": "gpt",
...@@ -230,7 +239,7 @@ Preference datasets in sharegpt format likewise require the better response in the `chosen`
  },
  {
    "from": "human",
    "value": "user instruction"
  }
],
"chosen": {
...@@ -272,7 +281,7 @@ KTO datasets require an extra `kto_tag` column containing boolean human
"conversations": [
  {
    "from": "human",
    "value": "user instruction"
  },
  {
    "from": "gpt",
...@@ -311,7 +320,7 @@ KTO datasets require an extra `kto_tag` column containing boolean human
"conversations": [
  {
    "from": "human",
    "value": "<image><image>user instruction"
  },
  {
    "from": "gpt",
...@@ -319,6 +328,7 @@ KTO datasets require an extra `kto_tag` column containing boolean human
  }
],
"images": [
  "image path (required)",
  "image path (required)"
]
}
...@@ -352,7 +362,7 @@ KTO datasets require an extra `kto_tag` column containing boolean human
"conversations": [
  {
    "from": "human",
    "value": "<video><video>user instruction"
  },
  {
    "from": "gpt",
...@@ -360,6 +370,7 @@ KTO datasets require an extra `kto_tag` column containing boolean human
  }
],
"videos": [
  "video path (required)",
  "video path (required)"
]
}
...@@ -393,7 +404,7 @@ KTO datasets require an extra `kto_tag` column containing boolean human
"conversations": [
  {
    "from": "human",
    "value": "<audio><audio>user instruction"
  },
  {
    "from": "gpt",
...@@ -401,6 +412,7 @@ KTO datasets require an extra `kto_tag` column containing boolean human
  }
],
"audios": [
  "audio path (required)",
  "audio path (required)"
]
}
...@@ -435,7 +447,7 @@ The openai format is simply a special case of the sharegpt format, where the first message
  },
  {
    "role": "user",
    "content": "user instruction"
  },
  {
    "role": "assistant",
......
...@@ -274,7 +274,7 @@
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
...@@ -559,6 +559,16 @@
"images": "images"
}
},
"rlaif_v": {
"hf_hub_url": "openbmb/RLAIF-V-Dataset",
"ranking": true,
"columns": {
"prompt": "question",
"chosen": "chosen",
"rejected": "rejected",
"images": "image"
}
},
"orca_pairs": { "orca_pairs": {
"hf_hub_url": "Intel/orca_dpo_pairs", "hf_hub_url": "Intel/orca_dpo_pairs",
"ranking": true, "ranking": true,
......
...@@ -52,7 +52,7 @@ llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml

#### Multimodal Supervised Fine-Tuning

```bash
llamafactory-cli train examples/train_lora/qwen2_5vl_lora_sft.yaml
```

#### DPO/ORPO/SimPO Training

...@@ -64,7 +64,7 @@ llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml

#### Multimodal DPO/ORPO/SimPO Training

```bash
llamafactory-cli train examples/train_lora/qwen2_5vl_lora_dpo.yaml
```

#### Reward Modeling

...@@ -168,7 +168,7 @@ FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500

#### Multimodal Supervised Fine-Tuning

```bash
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2_5vl_full_sft.yaml
```

### Merging LoRA Adapters and Quantization

...@@ -195,10 +195,11 @@ llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml

### Inferring LoRA Fine-Tuned Models

#### Evaluation using vLLM's Multi-GPU Inference

```
python scripts/vllm_infer.py --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct --template llama3 --dataset alpaca_en_demo
python scripts/eval_bleu_rouge.py generated_predictions.jsonl
```

#### Use CLI ChatBox

...@@ -281,9 +282,3 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml

```bash
bash examples/extras/fsdp_qlora/train.sh
```
...@@ -52,7 +52,7 @@ llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml

#### Multimodal Supervised Fine-Tuning

```bash
llamafactory-cli train examples/train_lora/qwen2_5vl_lora_sft.yaml
```

#### DPO/ORPO/SimPO Training

...@@ -64,7 +64,7 @@ llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml

#### Multimodal DPO/ORPO/SimPO Training

```bash
llamafactory-cli train examples/train_lora/qwen2_5vl_lora_dpo.yaml
```

#### Reward Modeling

...@@ -168,7 +168,7 @@ FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500

#### Multimodal Supervised Fine-Tuning

```bash
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2_5vl_full_sft.yaml
```

### Merging LoRA Adapters and Model Quantization

...@@ -195,10 +195,11 @@ llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml

### Inferring LoRA Models

#### Evaluation using vLLM's Multi-GPU Inference

```
python scripts/vllm_infer.py --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct --template llama3 --dataset alpaca_en_demo
python scripts/eval_bleu_rouge.py generated_predictions.jsonl
```

#### Use CLI ChatBox

...@@ -281,9 +282,3 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml

```bash
bash examples/extras/fsdp_qlora/train.sh
```
...@@ -2,12 +2,12 @@

### model
model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
adapter_name_or_path: saves/qwen2_5vl-7b/lora/sft
template: qwen2_vl
trust_remote_code: true

### export
export_dir: output/qwen2_5vl_lora_sft
export_size: 5
export_device: cpu # choices: [cpu, auto]
export_legacy_format: false

### model
model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
image_max_pixels: 262144
video_max_pixels: 16384
trust_remote_code: true
...@@ -23,7 +23,7 @@ preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/qwen2_5vl-7b/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
......
...@@ -23,7 +23,7 @@ preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/qwen2_5vl-7b/lora/dpo
logging_steps: 10
save_steps: 500
plot_loss: true
......
...@@ -21,7 +21,7 @@ preprocessing_num_workers: 16
dataloader_num_workers: 4

### output
output_dir: saves/qwen2_5vl-7b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
......
transformers>=4.45.0,<=4.52.4,!=4.46.*,!=4.47.*,!=4.48.0,!=4.52.0; sys_platform != 'darwin'
transformers>=4.45.0,<=4.51.3,!=4.46.*,!=4.47.*,!=4.48.0,!=4.52.0; sys_platform == 'darwin'
datasets>=2.16.0,<=3.6.0
accelerate>=0.34.0,<=1.7.0
peft>=0.14.0,<=0.15.2
trl>=0.8.6,<=0.9.6
tokenizers>=0.19.0,<=0.21.1
gradio>=4.38.0,<=5.31.0
scipy
einops
sentencepiece
......
...@@ -17,7 +17,11 @@ import shutil

import fire
from peft import PeftModel
from transformers import (
    AutoProcessor,
    Qwen2_5OmniForConditionalGeneration,  # type: ignore
    Qwen2_5OmniThinkerForConditionalGeneration,
)


def merge_lora(
...@@ -27,7 +31,7 @@ def merge_lora(
    submodule_name: str = "thinker",
    save_path: str = "./merged_model_checkpoint",
):
    """Load the original model, merge the LoRA weights for a specified submodule,
    and save the final merged model along with its configurations.

...@@ -38,10 +42,9 @@ def merge_lora(
        submodule_name (str): Name of the submodule to merge (default: "thinker").
        save_path (str): Directory where the merged model and configurations will be saved.
    """
    # 1. Load the original model
    model = Qwen2_5OmniForConditionalGeneration.from_pretrained(base_model_path, torch_dtype="auto", device_map="cpu")
    print("Successfully loaded the original model.")

    # 2. Extract the submodule to be merged (e.g., model.thinker)
    if not hasattr(model, submodule_name):
...@@ -52,7 +55,8 @@ def merge_lora(
    # 3. Load the LoRA weights onto the extracted submodule
    lora_model = PeftModel.from_pretrained(base_submodule, lora_checkpoint_path)
    processor = AutoProcessor.from_pretrained(lora_checkpoint_path)
    print("LoRA weights and processor loaded successfully.")

    # 4. Merge the LoRA weights into the submodule and unload the LoRA modules
    merged_submodule = lora_model.merge_and_unload()
...@@ -95,14 +99,16 @@ def save_full_model(
    thinker = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
        saved_thinker_path, torch_dtype="auto", device_map="cpu"
    )
    base_model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
        base_model_path, torch_dtype="auto", device_map="cpu"
    )
    base_model.thinker = thinker

    # 2. Save the complete model along with its tokenizer and processor configuration
    processor = AutoProcessor.from_pretrained(saved_thinker_path)
    base_model.save_pretrained(save_path)
    processor.save_pretrained(save_path)
    print(f"Merged model and processor saved to {save_path}.")

    # 3. Copy the extra file from the base model directory to the save_path
    source_file = os.path.join(base_model_path, extra_file)
......
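With these changes the script loads the full Omni model through its concrete `Qwen2_5OmniForConditionalGeneration` class (`AutoModel` does not reliably resolve this architecture) and takes the processor from the LoRA checkpoint, which carries any settings saved during fine-tuning. The file imports `fire`, so the two functions are presumably also exposed on the command line; here they are called directly as a sketch — the module name, checkpoint paths, and defaults are assumptions for illustration:

```python
# Hypothetical direct use of the two steps (paths are placeholders).
from qwen_omni_merge import merge_lora, save_full_model  # assumed module name

# 1) Merge the LoRA weights into the "thinker" submodule.
merge_lora(
    base_model_path="Qwen/Qwen2.5-Omni-7B",
    lora_checkpoint_path="saves/qwen2_5omni-7b/lora/sft",
    submodule_name="thinker",
    save_path="./merged_model_checkpoint",
)

# 2) Stitch the merged thinker back into a full Omni checkpoint.
save_full_model(
    saved_thinker_path="./merged_model_checkpoint",
    base_model_path="Qwen/Qwen2.5-Omni-7B",
    save_path="./full_model_checkpoint",
)
```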
@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import gc
 import json
 from typing import Optional
 
 import fire
+from tqdm import tqdm
 from transformers import Seq2SeqTrainingArguments
 
 from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
@@ -47,10 +49,15 @@ def vllm_infer(
     max_new_tokens: int = 1024,
     repetition_penalty: float = 1.0,
     skip_special_tokens: bool = True,
+    default_system: Optional[str] = None,
+    enable_thinking: bool = True,
     seed: Optional[int] = None,
     pipeline_parallel_size: int = 1,
     image_max_pixels: int = 768 * 768,
     image_min_pixels: int = 32 * 32,
+    video_fps: float = 2.0,
+    video_maxlen: int = 128,
+    batch_size: int = 1024,
 ):
     r"""Perform batch generation using vLLM engine, which supports tensor parallelism.
@@ -69,6 +76,8 @@ def vllm_infer(
         cutoff_len=cutoff_len,
         max_samples=max_samples,
         preprocessing_num_workers=16,
+        default_system=default_system,
+        enable_thinking=enable_thinking,
         vllm_config=vllm_config,
         temperature=temperature,
         top_p=top_p,
@@ -83,78 +92,106 @@ def vllm_infer(
     tokenizer = tokenizer_module["tokenizer"]
     template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
     template_obj.mm_plugin.expand_mm_tokens = False  # for vllm generate
 
+    engine_args = {
+        "model": model_args.model_name_or_path,
+        "trust_remote_code": True,
+        "dtype": model_args.infer_dtype,
+        "max_model_len": cutoff_len + max_new_tokens,
+        "tensor_parallel_size": (get_device_count() // pipeline_parallel_size) or 1,
+        "pipeline_parallel_size": pipeline_parallel_size,
+        "disable_log_stats": True,
+        "enable_lora": model_args.adapter_name_or_path is not None,
+    }
+    if template_obj.mm_plugin.__class__.__name__ != "BasePlugin":
+        engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}
+    if isinstance(model_args.vllm_config, dict):
+        engine_args.update(model_args.vllm_config)
+
+    llm = LLM(**engine_args)
+
+    # load datasets
     dataset_module = get_dataset(template_obj, model_args, data_args, training_args, "ppo", **tokenizer_module)
+    train_dataset = dataset_module["train_dataset"]
+
+    sampling_params = SamplingParams(
+        repetition_penalty=generating_args.repetition_penalty or 1.0,  # repetition_penalty must > 0
+        temperature=generating_args.temperature,
+        top_p=generating_args.top_p or 1.0,  # top_p must > 0
+        top_k=generating_args.top_k or -1,  # top_k must > 0
+        stop_token_ids=template_obj.get_stop_token_ids(tokenizer),
+        max_tokens=generating_args.max_new_tokens,
+        skip_special_tokens=skip_special_tokens,
+        seed=seed,
+    )
+    if model_args.adapter_name_or_path is not None:
+        lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0])
+    else:
+        lora_request = None
+
+    # Store all results in these lists
+    all_prompts, all_preds, all_labels = [], [], []
 
-    inputs, prompts, labels = [], [], []
-    for sample in dataset_module["train_dataset"]:
-        if sample["images"]:
-            multi_modal_data = {
-                "image": template_obj.mm_plugin._regularize_images(
-                    sample["images"], image_max_pixels=image_max_pixels, image_min_pixels=image_min_pixels
-                )["images"]
-            }
-        elif sample["videos"]:
-            multi_modal_data = {
-                "video": template_obj.mm_plugin._regularize_videos(
-                    sample["videos"], image_max_pixels=image_max_pixels, image_min_pixels=image_min_pixels
-                )["videos"]
-            }
-        elif sample["audios"]:
-            audio_data = template_obj.mm_plugin._regularize_audios(
-                sample["audios"],
-                sampling_rate=16000,
-            )
-            multi_modal_data = {"audio": zip(audio_data["audios"], audio_data["sampling_rates"])}
-        else:
-            multi_modal_data = None
+    # Add batch process to avoid the issue of too many files opened
+    for i in tqdm(range(0, len(train_dataset), batch_size), desc="Processing batched inference"):
+        vllm_inputs, prompts, labels = [], [], []
+        batch = train_dataset[i : min(i + batch_size, len(train_dataset))]
+        for j in range(len(batch["input_ids"])):
+            if batch["images"][j] is not None:
+                image = batch["images"][j]
+                multi_modal_data = {
+                    "image": template_obj.mm_plugin._regularize_images(
                        image, image_max_pixels=image_max_pixels, image_min_pixels=image_min_pixels
+                    )["images"]
+                }
+            elif batch["videos"][j] is not None:
+                video = batch["videos"][j]
+                multi_modal_data = {
+                    "video": template_obj.mm_plugin._regularize_videos(
+                        video,
+                        image_max_pixels=image_max_pixels,
+                        image_min_pixels=image_min_pixels,
+                        video_fps=video_fps,
+                        video_maxlen=video_maxlen,
+                    )["videos"]
+                }
+            elif batch["audios"][j] is not None:
+                audio = batch["audios"][j]
+                audio_data = template_obj.mm_plugin._regularize_audios(
+                    audio,
+                    sampling_rate=16000,
+                )
+                multi_modal_data = {"audio": zip(audio_data["audios"], audio_data["sampling_rates"])}
+            else:
+                multi_modal_data = None
 
-        inputs.append({"prompt_token_ids": sample["input_ids"], "multi_modal_data": multi_modal_data})
-        prompts.append(tokenizer.decode(sample["input_ids"], skip_special_tokens=skip_special_tokens))
-        labels.append(
-            tokenizer.decode(
-                list(filter(lambda x: x != IGNORE_INDEX, sample["labels"])), skip_special_tokens=skip_special_tokens
-            )
-        )
+            vllm_inputs.append({"prompt_token_ids": batch["input_ids"][j], "multi_modal_data": multi_modal_data})
+            prompts.append(tokenizer.decode(batch["input_ids"][j], skip_special_tokens=skip_special_tokens))
+            labels.append(
+                tokenizer.decode(
+                    list(filter(lambda x: x != IGNORE_INDEX, batch["labels"][j])),
+                    skip_special_tokens=skip_special_tokens,
+                )
+            )
 
-    sampling_params = SamplingParams(
-        repetition_penalty=generating_args.repetition_penalty or 1.0,  # repetition_penalty must > 0
-        temperature=generating_args.temperature,
-        top_p=generating_args.top_p or 1.0,  # top_p must > 0
-        top_k=generating_args.top_k or -1,  # top_k must > 0
-        stop_token_ids=template_obj.get_stop_token_ids(tokenizer),
-        max_tokens=generating_args.max_new_tokens,
-        skip_special_tokens=skip_special_tokens,
-        seed=seed,
-    )
-    if model_args.adapter_name_or_path is not None:
-        lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0])
-    else:
-        lora_request = None
-
-    engine_args = {
-        "model": model_args.model_name_or_path,
-        "trust_remote_code": True,
-        "dtype": model_args.infer_dtype,
-        "max_model_len": cutoff_len + max_new_tokens,
-        "tensor_parallel_size": (get_device_count() // pipeline_parallel_size) or 1,
-        "pipeline_parallel_size": pipeline_parallel_size,
-        "disable_log_stats": True,
-        "enable_lora": model_args.adapter_name_or_path is not None,
-    }
-    if template_obj.mm_plugin.__class__.__name__ != "BasePlugin":
-        engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}
-    if isinstance(model_args.vllm_config, dict):
-        engine_args.update(model_args.vllm_config)
-
-    results = LLM(**engine_args).generate(inputs, sampling_params, lora_request=lora_request)
-    preds = [result.outputs[0].text for result in results]
+        results = llm.generate(vllm_inputs, sampling_params, lora_request=lora_request)
+        preds = [result.outputs[0].text for result in results]
+
+        # Accumulate results
+        all_prompts.extend(prompts)
+        all_preds.extend(preds)
+        all_labels.extend(labels)
+        gc.collect()
 
+    # Write all results at once outside the loop
     with open(save_name, "w", encoding="utf-8") as f:
-        for text, pred, label in zip(prompts, preds, labels):
+        for text, pred, label in zip(all_prompts, all_preds, all_labels):
             f.write(json.dumps({"prompt": text, "predict": pred, "label": label}, ensure_ascii=False) + "\n")
 
     print("*" * 70)
-    print(f"{len(prompts)} generated results have been saved at {save_name}.")
+    print(f"{len(all_prompts)} total generated results have been saved at {save_name}.")
    print("*" * 70)
...
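The restructuring above builds the `LLM` engine and `SamplingParams` once, then submits prompts in chunks of `batch_size`, collecting each chunk's outputs before the per-sample image/video/audio buffers go out of scope; this is what resolves the "too many files opened" failure on large multimodal datasets. The pattern in isolation, as a minimal sketch independent of LLaMA-Factory's data plumbing (`llm` and `sampling_params` are assumed to be constructed as in the diff):

```python
import gc

def batched_generate(llm, requests, sampling_params, batch_size=1024):
    """Submit vLLM requests in fixed-size chunks and collect the outputs."""
    outputs = []
    for start in range(0, len(requests), batch_size):
        chunk = requests[start : start + batch_size]
        results = llm.generate(chunk, sampling_params)
        outputs.extend(result.outputs[0].text for result in results)
        gc.collect()  # release per-chunk buffers (decoded images/videos) before the next chunk
    return outputs
```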
@@ -198,6 +198,7 @@ async def create_chat_completion_response(
         top_p=request.top_p,
         max_new_tokens=request.max_tokens,
         num_return_sequences=request.n,
+        repetition_penalty=request.presence_penalty,
         stop=request.stop,
     )
@@ -259,6 +260,7 @@ async def create_stream_chat_completion_response(
         temperature=request.temperature,
         top_p=request.top_p,
         max_new_tokens=request.max_tokens,
+        repetition_penalty=request.presence_penalty,
         stop=request.stop,
     ):
         if len(new_token) != 0:
...
@@ -103,6 +103,7 @@ class ChatCompletionRequest(BaseModel):
     temperature: Optional[float] = None
     top_p: Optional[float] = None
     n: int = 1
+    presence_penalty: Optional[float] = None
     max_tokens: Optional[int] = None
     stop: Optional[Union[str, list[str]]] = None
     stream: bool = False
...
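Together, these two hunks accept an OpenAI-style `presence_penalty` field and forward it to the engine as `repetition_penalty` on both the blocking and streaming paths. Note the semantics differ: OpenAI's `presence_penalty` is additive in [-2, 2], while the HF/vLLM `repetition_penalty` is multiplicative around 1.0, so callers of this server should send values like 1.1 rather than 0.5. A minimal request against the OpenAI-compatible endpoint (host, port, and model name are placeholders):

```python
import json
from urllib.request import Request, urlopen

payload = {
    "model": "test",  # placeholder
    "messages": [{"role": "user", "content": "Hello!"}],
    "presence_penalty": 1.1,  # forwarded to the engine as repetition_penalty
}
req = Request(
    "http://localhost:8000/v1/chat/completions",  # assumed default address
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])
```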
@@ -104,7 +104,6 @@ class HuggingfaceEngine(BaseEngine):
             messages, mm_input_dict["images"], mm_input_dict["videos"], mm_input_dict["audios"], processor
         )
         paired_messages = messages + [{"role": "assistant", "content": ""}]
-        system = system or generating_args["default_system"]
         prompt_ids, _ = template.encode_oneturn(tokenizer, paired_messages, system, tools)
         prompt_ids, _ = template.mm_plugin.process_token_ids(
             prompt_ids,
@@ -117,7 +116,7 @@ class HuggingfaceEngine(BaseEngine):
         )
         prompt_length = len(prompt_ids)
         inputs = torch.tensor([prompt_ids], device=model.device)
-        attention_mask = torch.ones_like(inputs, dtype=torch.bool)
+        attention_mask = torch.ones_like(inputs, dtype=torch.long)
 
         do_sample: Optional[bool] = input_kwargs.pop("do_sample", None)
         temperature: Optional[float] = input_kwargs.pop("temperature", None)
...
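Two independent fixes here: the engine no longer overrides an empty `system` with `generating_args["default_system"]` (the `default_system` plumbing added to the vLLM script above suggests the default system prompt is now applied upstream, at the template/data layer), and the synthesized attention mask switches from `torch.bool` to `torch.long`, matching the 0/1 integer masks HF tokenizers return and avoiding dtype-sensitive attention code paths. A tiny sketch of the difference:

```python
import torch

inputs = torch.tensor([[101, 2009, 2003, 102]])  # arbitrary prompt ids
mask_bool = torch.ones_like(inputs, dtype=torch.bool)  # tensor([[True, True, True, True]])
mask_long = torch.ones_like(inputs, dtype=torch.long)  # tensor([[1, 1, 1, 1]])
# The integer form matches tokenizer output; some kernels compute (1 - mask) * min_value,
# which assumes numeric masks rather than booleans.
```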