Commit 0722acf1 authored by chenych

Update 0604

parent c4ba4563
......@@ -20,27 +20,36 @@ LLaMA Factory是一个大语言模型训练和推理的框架,支持了魔搭
| ----------------------------------------------------------------- | -------------------------------- | ------------------- |
| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 |
| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 |
| [Gemma 2](https://huggingface.co/google) | 2B/9B | gemma |
| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek |
| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 |
| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 |
| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma |
| [Gemma 3](https://huggingface.co/google) | 1B/4B/12B/27B | gemma3/gemma (1B) |
| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/THUDM) | 9B/32B | glm4 |
| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan |
| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 |
| [InternVL 2.5-3](https://huggingface.co/OpenGVLab) | 1B/2B/8B/14B/38B/78B | intern_vl |
| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 |
| [Llama 3/Llama 3.1](https://huggingface.co/meta-llama) | 8B/70B | llama3 |
| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 |
| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 |
| [OLMo](https://hf-mirror.com/allenai) | 1B/7B | olmo |
| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen |
| [Qwen3 (MoE)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/30B/32B/235B | qwen3 |
| [Qwen3 (MoE)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/235B | qwen3 |
| [XVERSE](https://hf-mirror.com/xverse) | 7B/13B | xverse |
持续更新中...
> [!NOTE]
>
> 注意:本版本仅支持deepseek蒸馏模型的监督微调(SFT),可参考[deepseek-r1-distill_vllm](https://developer.sourcefind.cn/codes/modelzoo/deepseek-r1-distill_vllm)
>
> 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。
>
> 请务必在训练和推理时采用**完全一致**的模板。
> 您也可以在 [template.py](src/llamafactory/data/template.py) 中添加自己的对话模板。
>
> \*:您需要从 main 分支安装 `transformers` 并使用 `DISABLE_VERSION_CHECK=1` 来跳过版本检查。
>
> \*\*:您需要安装特定版本的 `transformers` 以使用该模型。
> **已知问题及解决方案**
> 1. `Baichuan 2` 需要卸载环境中的 xformers 库,当前仅支持 LoRA 方式训练。
>
......@@ -64,9 +73,7 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dt
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
cd /your_code_path/llama_factory
pip install -e ".[torch,metrics]"
## llama4 需要单独安装以下包
pip install git+https://github.com/hiyouga/transformers.git@llama4_train
pip install -e ".[torch,metrics]" --no-build-isolation
```
#### Dockerfile(方法二)
......@@ -77,9 +84,7 @@ docker build --no-cache -t llama-factory:latest .
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
cd /your_code_path/llama_factory
pip install -e ".[torch,metrics]"
## llama4 需要单独安装以下包
pip install git+https://github.com/hiyouga/transformers.git@llama4_train
pip install -e ".[torch,metrics]" --no-build-isolation
```
#### Anaconda(方法三)
......@@ -102,9 +107,7 @@ deepspeed: 0.14.2+das.opt2.dtk2504
```bash
git clone http://developer.hpccube.com/codes/OpenDAS/llama-factory.git
cd /your_code_path/llama_factory
pip install -e ".[torch,metrics]"
## llama4 需要单独安装以下包
pip install git+https://github.com/hiyouga/transformers.git@llama4_train
pip install -e ".[torch,metrics]" --no-build-isolation
# (可选)deepspeed多机训练
# pdsh安装,若已安装,可忽略。
......@@ -244,6 +247,12 @@ llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
>
> 自有数据集推理精度验证方法推荐使用:`python scripts/vllm_infer.py`生成结果,`python scripts/eval_bleu_rouge.py`计算得分,具体参数信息请参考脚本内容。
### LLaMA Board 可视化微调(由 [Gradio](https://github.com/gradio-app/gradio) 驱动)
```bash
llamafactory-cli webui
```
## 参考资料
- [README_zh](README_zh.md)
......
This diff is collapsed.
assets/wechat.jpg replaced (161 KB → 169 KB).
The [dataset_info.json](dataset_info.json) contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it.
Currently we support datasets in **alpaca** and **sharegpt** format.
The `dataset_info.json` file should be put in the `dataset_dir` directory. You can change `dataset_dir` to use another directory. The default value is `./data`.
Currently we support datasets in **alpaca** and **sharegpt** format. Allowed file types include json, jsonl, csv, parquet, arrow.
```json
"dataset_name": {
......@@ -48,7 +50,9 @@ Currently we support datasets in **alpaca** and **sharegpt** format.
* [Example dataset](alpaca_en_demo.json)
In supervised fine-tuning, the `instruction` column will be concatenated with the `input` column and used as the human prompt, then the human prompt would be `instruction\ninput`. The `output` column represents the model response.
In supervised fine-tuning, the `instruction` column will be concatenated with the `input` column and used as the user prompt, then the user prompt would be `instruction\ninput`. The `output` column represents the model response.
For reasoning models, if the dataset contains chain-of-thought (CoT), the CoT needs to be placed in the model responses, such as `<think>cot</think>output`.
The `system` column will be used as the system prompt if specified.
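A minimal Python sketch (illustrative only, not LLaMA-Factory's actual preprocessing code) of how these columns map to a prompt/response pair; the sample field values are hypothetical:

```python
# Illustrative mapping of an alpaca-format record to an SFT training pair.
record = {
    "instruction": "Summarize the passage.",            # hypothetical sample
    "input": "LLaMA Factory is a fine-tuning framework.",
    "output": "<think>The user wants a one-line summary.</think>A framework for fine-tuning LLMs.",
    "system": "You are a helpful assistant.",
}

# The user prompt is `instruction\ninput` (the input part is optional).
user_prompt = record["instruction"] + ("\n" + record["input"] if record.get("input") else "")

# The model response is the `output` column; for reasoning data the
# chain-of-thought stays inside the response, e.g. <think>cot</think>output.
response = record["output"]

print(user_prompt)
print(response)
```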
......@@ -57,13 +61,13 @@ The `history` column is a list consisting of string tuples representing prompt-r
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"instruction": "user instruction (required)",
"input": "user input (optional)",
"output": "model response (required)",
"system": "system prompt (optional)",
"history": [
["human instruction in the first round (optional)", "model response in the first round (optional)"],
["human instruction in the second round (optional)", "model response in the second round (optional)"]
["user instruction in the first round (optional)", "model response in the first round (optional)"],
["user instruction in the second round (optional)", "model response in the second round (optional)"]
]
}
]
......@@ -84,6 +88,11 @@ Regarding the above dataset, the *dataset description* in `dataset_info.json` sh
}
```
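For reference, a minimal Python sketch of registering such an entry programmatically; the dataset name and file name below are hypothetical, and the column mapping follows the usual alpaca fields described above:

```python
# Illustrative only: append a dataset description to data/dataset_info.json.
import json

entry = {
    "my_dataset": {                      # hypothetical name, used as `dataset: my_dataset` in the training config
        "file_name": "my_dataset.json",  # file placed under `dataset_dir` (default: ./data)
        "columns": {
            "prompt": "instruction",
            "query": "input",
            "response": "output",
            "system": "system",
            "history": "history",
        },
    }
}

with open("data/dataset_info.json", "r+", encoding="utf-8") as f:
    info = json.load(f)
    info.update(entry)
    f.seek(0)
    json.dump(info, f, ensure_ascii=False, indent=2)
    f.truncate()
```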
> [!TIP]
> If the model has reasoning capabilities (e.g. Qwen3) but the dataset does not contain chain-of-thought (CoT), LLaMA-Factory will automatically add an empty CoT to the data. When `enable_thinking` is `True` (slow thinking, the default), the empty CoT is added to the model response and included in the loss computation; otherwise (fast thinking), it is added to the user prompt and excluded from the loss computation. Please keep the `enable_thinking` parameter consistent between training and inference.
>
> If you want to train data containing CoT with slow thinking and data without CoT with fast thinking, you can set `enable_thinking` to `None`. However, this feature is relatively complicated and should be used with caution.
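A rough sketch of the behavior described above (a hypothetical helper, not LLaMA-Factory internals; the exact empty-CoT string depends on the model's chat template):

```python
# Hypothetical illustration of where the empty CoT is placed for a sample without chain-of-thought.
EMPTY_COT = "<think>\n\n</think>\n\n"  # stand-in; the real string comes from the model's template

def place_empty_cot(prompt: str, response: str, enable_thinking: bool):
    if enable_thinking:
        # slow thinking: empty CoT becomes part of the response and is included in the loss
        return prompt, EMPTY_COT + response
    # fast thinking: empty CoT is appended to the prompt and excluded from the loss
    return prompt + EMPTY_COT, response

print(place_empty_cot("user prompt", "model response", enable_thinking=True))
print(place_empty_cot("user prompt", "model response", enable_thinking=False))
```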
### Pre-training Dataset
- [Example dataset](c4_demo.jsonl)
......@@ -117,8 +126,8 @@ It requires a better response in `chosen` column and a worse response in `reject
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"instruction": "user instruction (required)",
"input": "user input (optional)",
"chosen": "chosen answer (required)",
"rejected": "rejected answer (required)"
}
......@@ -172,7 +181,7 @@ Note that the human and observation should appear in odd positions, while gpt an
"conversations": [
{
"from": "human",
"value": "human instruction"
"value": "user instruction"
},
{
"from": "function_call",
......@@ -223,7 +232,7 @@ Preference datasets in sharegpt format also require a better message in `chosen`
"conversations": [
{
"from": "human",
"value": "human instruction"
"value": "user instruction"
},
{
"from": "gpt",
......@@ -231,7 +240,7 @@ Preference datasets in sharegpt format also require a better message in `chosen`
},
{
"from": "human",
"value": "human instruction"
"value": "user instruction"
}
],
"chosen": {
......@@ -273,7 +282,7 @@ KTO datasets require a extra `kto_tag` column containing the boolean human feedb
"conversations": [
{
"from": "human",
"value": "human instruction"
"value": "user instruction"
},
{
"from": "gpt",
......@@ -312,7 +321,7 @@ The number of images should be identical to the `<image>` tokens in the conversa
"conversations": [
{
"from": "human",
"value": "<image>human instruction"
"value": "<image>user instruction"
},
{
"from": "gpt",
......@@ -353,7 +362,7 @@ The number of videos should be identical to the `<video>` tokens in the conversa
"conversations": [
{
"from": "human",
"value": "<video>human instruction"
"value": "<video>user instruction"
},
{
"from": "gpt",
......@@ -394,7 +403,7 @@ The number of audios should be identical to the `<audio>` tokens in the conversa
"conversations": [
{
"from": "human",
"value": "<audio>human instruction"
"value": "<audio>user instruction"
},
{
"from": "gpt",
......@@ -435,7 +444,7 @@ The openai format is simply a special case of the sharegpt format, where the fir
},
{
"role": "user",
"content": "human instruction"
"content": "user instruction"
},
{
"role": "assistant",
......
[dataset_info.json](dataset_info.json) 包含了所有可用的数据集。如果您希望使用自定义数据集,请**务必**在 `dataset_info.json` 文件中添加*数据集描述*,并通过修改 `dataset: 数据集名称` 配置来使用数据集。
目前我们支持 **alpaca** 格式和 **sharegpt** 格式的数据集。
其中 `dataset_info.json` 文件应放置在 `dataset_dir` 目录下。您可以通过修改 `dataset_dir` 参数来使用其他目录。默认值为 `./data`
目前我们支持 **alpaca** 格式和 **sharegpt** 格式的数据集。允许的文件类型包括 json、jsonl、csv、parquet 和 arrow。
```json
"数据集名称": {
......@@ -47,7 +49,9 @@
- [样例数据集](alpaca_zh_demo.json)
在指令监督微调时,`instruction` 列对应的内容会与 `input` 列对应的内容拼接后作为人类指令,即人类指令为 `instruction\ninput`。而 `output` 列对应的内容为模型回答。
在指令监督微调时,`instruction` 列对应的内容会与 `input` 列对应的内容拼接后作为提示词,即提示词为 `instruction\ninput`。而 `output` 列对应的内容为模型回答。
对于推理类模型的微调,如果数据集包含思维链,则需要把思维链放在模型回答中,例如 `<think>cot</think>output`
如果指定,`system` 列对应的内容将被作为系统提示词。
......@@ -56,8 +60,8 @@
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"instruction": "用户指令(必填)",
"input": "用户输入(选填)",
"output": "模型回答(必填)",
"system": "系统提示词(选填)",
"history": [
......@@ -83,6 +87,11 @@
}
```
> [!TIP]
> 如果模型本身具备推理能力(如 Qwen3)而数据集不包含思维链,LLaMA-Factory 会自动为数据添加空思维链。当 `enable_thinking` 为 `True` 时(慢思考,默认),空思维链会添加到模型回答中并且计算损失,否则会添加到用户指令中并且不计算损失(快思考)。请在训练和推理时保持 `enable_thinking` 参数一致。
>
> 如果您希望训练包含思维链的数据时使用慢思考,训练不包含思维链的数据时使用快思考,可以设置 `enable_thinking` 为 `None`。但该功能较为复杂,请谨慎使用。
### 预训练数据集
- [样例数据集](c4_demo.jsonl)
......@@ -116,8 +125,8 @@
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"instruction": "用户指令(必填)",
"input": "用户输入(选填)",
"chosen": "优质回答(必填)",
"rejected": "劣质回答(必填)"
}
......@@ -171,7 +180,7 @@ KTO 数据集需要提供额外的 `kto_tag` 列。详情请参阅 [sharegpt](#s
"conversations": [
{
"from": "human",
"value": "人类指令"
"value": "用户指令"
},
{
"from": "function_call",
......@@ -222,7 +231,7 @@ Sharegpt 格式的偏好数据集同样需要在 `chosen` 列中提供更优的
"conversations": [
{
"from": "human",
"value": "人类指令"
"value": "用户指令"
},
{
"from": "gpt",
......@@ -230,7 +239,7 @@ Sharegpt 格式的偏好数据集同样需要在 `chosen` 列中提供更优的
},
{
"from": "human",
"value": "人类指令"
"value": "用户指令"
}
],
"chosen": {
......@@ -272,7 +281,7 @@ KTO 数据集需要额外添加一个 `kto_tag` 列,包含 bool 类型的人
"conversations": [
{
"from": "human",
"value": "人类指令"
"value": "用户指令"
},
{
"from": "gpt",
......@@ -311,7 +320,7 @@ KTO 数据集需要额外添加一个 `kto_tag` 列,包含 bool 类型的人
"conversations": [
{
"from": "human",
"value": "<image>人类指令"
"value": "<image><image>用户指令"
},
{
"from": "gpt",
......@@ -319,6 +328,7 @@ KTO 数据集需要额外添加一个 `kto_tag` 列,包含 bool 类型的人
}
],
"images": [
"图像路径(必填)",
"图像路径(必填)"
]
}
......@@ -352,7 +362,7 @@ KTO 数据集需要额外添加一个 `kto_tag` 列,包含 bool 类型的人
"conversations": [
{
"from": "human",
"value": "<video>人类指令"
"value": "<video><video>用户指令"
},
{
"from": "gpt",
......@@ -360,6 +370,7 @@ KTO 数据集需要额外添加一个 `kto_tag` 列,包含 bool 类型的人
}
],
"videos": [
"视频路径(必填)",
"视频路径(必填)"
]
}
......@@ -393,7 +404,7 @@ KTO 数据集需要额外添加一个 `kto_tag` 列,包含 bool 类型的人
"conversations": [
{
"from": "human",
"value": "<audio>人类指令"
"value": "<audio><audio>用户指令"
},
{
"from": "gpt",
......@@ -401,6 +412,7 @@ KTO 数据集需要额外添加一个 `kto_tag` 列,包含 bool 类型的人
}
],
"audios": [
"音频路径(必填)",
"音频路径(必填)"
]
}
......@@ -435,7 +447,7 @@ OpenAI 格式仅仅是 sharegpt 格式的一种特殊情况,其中第一条消
},
{
"role": "user",
"content": "人类指令"
"content": "用户指令"
},
{
"role": "assistant",
......
......@@ -274,7 +274,7 @@
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "human",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
......@@ -559,6 +559,16 @@
"images": "images"
}
},
"rlaif_v": {
"hf_hub_url": "openbmb/RLAIF-V-Dataset",
"ranking": true,
"columns": {
"prompt": "question",
"chosen": "chosen",
"rejected": "rejected",
"images": "image"
}
},
"orca_pairs": {
"hf_hub_url": "Intel/orca_dpo_pairs",
"ranking": true,
......
......@@ -52,7 +52,7 @@ llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
#### Multimodal Supervised Fine-Tuning
```bash
llamafactory-cli train examples/train_lora/qwen2vl_lora_sft.yaml
llamafactory-cli train examples/train_lora/qwen2_5vl_lora_sft.yaml
```
#### DPO/ORPO/SimPO Training
......@@ -64,7 +64,7 @@ llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
#### Multimodal DPO/ORPO/SimPO Training
```bash
llamafactory-cli train examples/train_lora/qwen2vl_lora_dpo.yaml
llamafactory-cli train examples/train_lora/qwen2_5vl_lora_dpo.yaml
```
#### Reward Modeling
......@@ -168,7 +168,7 @@ FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500
#### Multimodal Supervised Fine-Tuning
```bash
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2_5vl_full_sft.yaml
```
### Merging LoRA Adapters and Quantization
......@@ -195,10 +195,11 @@ llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml
### Inferring LoRA Fine-Tuned Models
#### Batch Generation using vLLM Tensor Parallel
#### Evaluation using vLLM's Multi-GPU Inference
```
python scripts/vllm_infer.py --model_name_or_path path_to_merged_model --dataset alpaca_en_demo
python scripts/vllm_infer.py --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct --template llama3 --dataset alpaca_en_demo
python scripts/eval_bleu_rouge.py generated_predictions.jsonl
```
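The `generated_predictions.jsonl` file written by `vllm_infer.py` stores one JSON object per sample with `prompt`, `predict`, and `label` fields, so it can be sanity-checked before scoring; a short sketch:

```python
# Inspect vllm_infer.py output before running eval_bleu_rouge.py on it.
import json

with open("generated_predictions.jsonl", encoding="utf-8") as f:
    rows = [json.loads(line) for line in f]

print(f"{len(rows)} samples")
print("prompt :", rows[0]["prompt"][:200])   # decoded user prompt
print("predict:", rows[0]["predict"][:200])  # model generation
print("label  :", rows[0]["label"][:200])    # reference answer
```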
#### Use CLI ChatBox
......@@ -281,9 +282,3 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
```bash
bash examples/extras/fsdp_qlora/train.sh
```
#### Computing BLEU and ROUGE Scores
```bash
llamafactory-cli train examples/extras/nlg_eval/llama3_lora_predict.yaml
```
......@@ -52,7 +52,7 @@ llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
#### 多模态指令监督微调
```bash
llamafactory-cli train examples/train_lora/qwen2vl_lora_sft.yaml
llamafactory-cli train examples/train_lora/qwen2_5vl_lora_sft.yaml
```
#### DPO/ORPO/SimPO 训练
......@@ -64,7 +64,7 @@ llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
#### 多模态 DPO/ORPO/SimPO 训练
```bash
llamafactory-cli train examples/train_lora/qwen2vl_lora_dpo.yaml
llamafactory-cli train examples/train_lora/qwen2_5vl_lora_dpo.yaml
```
#### 奖励模型训练
......@@ -168,7 +168,7 @@ FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500
#### 多模态指令监督微调
```bash
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2_5vl_full_sft.yaml
```
### 合并 LoRA 适配器与模型量化
......@@ -195,10 +195,11 @@ llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml
### 推理 LoRA 模型
#### 使用 vLLM+TP 批量推理
#### 使用 vLLM 多卡推理评估
```
python scripts/vllm_infer.py --model_name_or_path path_to_merged_model --dataset alpaca_en_demo
python scripts/vllm_infer.py --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct --template llama3 --dataset alpaca_en_demo
python scripts/eval_bleu_rouge.py generated_predictions.jsonl
```
#### 使用命令行对话框
......@@ -281,9 +282,3 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
```bash
bash examples/extras/fsdp_qlora/train.sh
```
#### 计算 BLEU 和 ROUGE 分数
```bash
llamafactory-cli train examples/extras/nlg_eval/llama3_lora_predict.yaml
```
......@@ -2,12 +2,12 @@
### model
model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
adapter_name_or_path: saves/qwen2_vl-7b/lora/sft
adapter_name_or_path: saves/qwen2_5vl-7b/lora/sft
template: qwen2_vl
trust_remote_code: true
### export
export_dir: output/qwen2_vl_lora_sft
export_dir: output/qwen2_5vl_lora_sft
export_size: 5
export_device: cpu # choices: [cpu, auto]
export_legacy_format: false
### model
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
image_max_pixels: 262144
video_max_pixels: 16384
trust_remote_code: true
......@@ -23,7 +23,7 @@ preprocessing_num_workers: 16
dataloader_num_workers: 4
### output
output_dir: saves/qwen2_vl-7b/full/sft
output_dir: saves/qwen2_5vl-7b/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
......
......@@ -23,7 +23,7 @@ preprocessing_num_workers: 16
dataloader_num_workers: 4
### output
output_dir: saves/qwen2_vl-7b/lora/dpo
output_dir: saves/qwen2_5vl-7b/lora/dpo
logging_steps: 10
save_steps: 500
plot_loss: true
......
......@@ -21,7 +21,7 @@ preprocessing_num_workers: 16
dataloader_num_workers: 4
### output
output_dir: saves/qwen2_vl-7b/lora/sft
output_dir: saves/qwen2_5vl-7b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
......
transformers>=4.45.0,<=4.51.3,!=4.46.*,!=4.47.*,!=4.48.0
datasets>=2.16.0,<=3.5.0
accelerate>=0.34.0,<=1.6.0
peft>=0.14.0,<=0.15.1
transformers>=4.45.0,<=4.52.4,!=4.46.*,!=4.47.*,!=4.48.0,!=4.52.0; sys_platform != 'darwin'
transformers>=4.45.0,<=4.51.3,!=4.46.*,!=4.47.*,!=4.48.0,!=4.52.0; sys_platform == 'darwin'
datasets>=2.16.0,<=3.6.0
accelerate>=0.34.0,<=1.7.0
peft>=0.14.0,<=0.15.2
trl>=0.8.6,<=0.9.6
tokenizers>=0.19.0,<=0.21.1
gradio>=4.38.0,<=5.25.0
gradio>=4.38.0,<=5.31.0
scipy
einops
sentencepiece
......
......@@ -17,7 +17,11 @@ import shutil
import fire
from peft import PeftModel
from transformers import AutoModel, AutoProcessor, Qwen2_5OmniThinkerForConditionalGeneration # type: ignore
from transformers import (
AutoProcessor,
Qwen2_5OmniForConditionalGeneration, # type: ignore
Qwen2_5OmniThinkerForConditionalGeneration,
)
def merge_lora(
......@@ -27,7 +31,7 @@ def merge_lora(
submodule_name: str = "thinker",
save_path: str = "./merged_model_checkpoint",
):
"""Load the original model, tokenizer, and processor configuration, merge the LoRA weights.
"""Load the original model, merge the LoRA weights.
For a specified submodule, and save the final merged model along with its configurations.
......@@ -38,10 +42,9 @@ def merge_lora(
submodule_name (str): Name of the submodule to merge (default: "thinker").
save_path (str): Directory where the merged model and configurations will be saved.
"""
# 1. Load the original model, tokenizer, and processor
model = AutoModel.from_pretrained(base_model_path, torch_dtype="auto", device_map="cpu")
processor = AutoProcessor.from_pretrained(base_model_path)
print("Successfully loaded the original model and tokenizer.")
# 1. Load the original model
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(base_model_path, torch_dtype="auto", device_map="cpu")
print("Successfully loaded the original model.")
# 2. Extract the submodule to be merged (e.g., model.thinker)
if not hasattr(model, submodule_name):
......@@ -52,7 +55,8 @@ def merge_lora(
# 3. Load the LoRA weights onto the extracted submodule
lora_model = PeftModel.from_pretrained(base_submodule, lora_checkpoint_path)
print("LoRA weights loaded successfully.")
processor = AutoProcessor.from_pretrained(lora_checkpoint_path)
print("LoRA weights and processor loaded successfully.")
# 4. Merge the LoRA weights into the submodule and unload the LoRA modules
merged_submodule = lora_model.merge_and_unload()
......@@ -95,14 +99,16 @@ def save_full_model(
thinker = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
saved_thinker_path, torch_dtype="auto", device_map="cpu"
)
base_model = AutoModel.from_pretrained(base_model_path, torch_dtype="auto", device_map="cpu")
base_model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
base_model_path, torch_dtype="auto", device_map="cpu"
)
base_model.thinker = thinker
# 2. Save the complete model along with its tokenizer and processor configuration
processor = AutoProcessor.from_pretrained(base_model_path)
processor = AutoProcessor.from_pretrained(saved_thinker_path)
base_model.save_pretrained(save_path)
processor.save_pretrained(save_path)
print(f"Merged model and tokenizer saved to {save_path}.")
print(f"Merged model and processor saved to {save_path}.")
# 3. Copy the extra file from the base model directory to the save_path
source_file = os.path.join(base_model_path, extra_file)
......
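For orientation, a hedged usage sketch of this merge helper (assuming it is the `scripts/qwen_omni_merge.py` script and that the repository root is on `PYTHONPATH`; all paths and the model id below are placeholders):

```python
# Illustrative only: merge a LoRA adapter into the "thinker" submodule, then rebuild the full model.
from scripts.qwen_omni_merge import merge_lora, save_full_model  # assumed module path

merge_lora(
    base_model_path="Qwen/Qwen2.5-Omni-7B",                # placeholder base model
    lora_checkpoint_path="saves/qwen2_5omni-7b/lora/sft",  # placeholder adapter path
    submodule_name="thinker",
    save_path="./merged_model_checkpoint",
)

save_full_model(
    saved_thinker_path="./merged_model_checkpoint",
    base_model_path="Qwen/Qwen2.5-Omni-7B",
    save_path="./full_model_checkpoint",                   # placeholder output directory
)
```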
......@@ -12,10 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import json
from typing import Optional
import fire
from tqdm import tqdm
from transformers import Seq2SeqTrainingArguments
from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
......@@ -47,10 +49,15 @@ def vllm_infer(
max_new_tokens: int = 1024,
repetition_penalty: float = 1.0,
skip_special_tokens: bool = True,
default_system: Optional[str] = None,
enable_thinking: bool = True,
seed: Optional[int] = None,
pipeline_parallel_size: int = 1,
image_max_pixels: int = 768 * 768,
image_min_pixels: int = 32 * 32,
video_fps: float = 2.0,
video_maxlen: int = 128,
batch_size: int = 1024,
):
r"""Perform batch generation using vLLM engine, which supports tensor parallelism.
......@@ -69,6 +76,8 @@ def vllm_infer(
cutoff_len=cutoff_len,
max_samples=max_samples,
preprocessing_num_workers=16,
default_system=default_system,
enable_thinking=enable_thinking,
vllm_config=vllm_config,
temperature=temperature,
top_p=top_p,
......@@ -83,78 +92,106 @@ def vllm_infer(
tokenizer = tokenizer_module["tokenizer"]
template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
template_obj.mm_plugin.expand_mm_tokens = False # for vllm generate
engine_args = {
"model": model_args.model_name_or_path,
"trust_remote_code": True,
"dtype": model_args.infer_dtype,
"max_model_len": cutoff_len + max_new_tokens,
"tensor_parallel_size": (get_device_count() // pipeline_parallel_size) or 1,
"pipeline_parallel_size": pipeline_parallel_size,
"disable_log_stats": True,
"enable_lora": model_args.adapter_name_or_path is not None,
}
if template_obj.mm_plugin.__class__.__name__ != "BasePlugin":
engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}
if isinstance(model_args.vllm_config, dict):
engine_args.update(model_args.vllm_config)
llm = LLM(**engine_args)
# load datasets
dataset_module = get_dataset(template_obj, model_args, data_args, training_args, "ppo", **tokenizer_module)
train_dataset = dataset_module["train_dataset"]
sampling_params = SamplingParams(
repetition_penalty=generating_args.repetition_penalty or 1.0, # repetition_penalty must > 0
temperature=generating_args.temperature,
top_p=generating_args.top_p or 1.0, # top_p must > 0
top_k=generating_args.top_k or -1, # top_k must > 0
stop_token_ids=template_obj.get_stop_token_ids(tokenizer),
max_tokens=generating_args.max_new_tokens,
skip_special_tokens=skip_special_tokens,
seed=seed,
)
if model_args.adapter_name_or_path is not None:
lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0])
else:
lora_request = None
# Store all results in these lists
all_prompts, all_preds, all_labels = [], [], []
inputs, prompts, labels = [], [], []
for sample in dataset_module["train_dataset"]:
if sample["images"]:
# Add batch process to avoid the issue of too many files opened
for i in tqdm(range(0, len(train_dataset), batch_size), desc="Processing batched inference"):
vllm_inputs, prompts, labels = [], [], []
batch = train_dataset[i : min(i + batch_size, len(train_dataset))]
for j in range(len(batch["input_ids"])):
if batch["images"][j] is not None:
image = batch["images"][j]
multi_modal_data = {
"image": template_obj.mm_plugin._regularize_images(
sample["images"], image_max_pixels=image_max_pixels, image_min_pixels=image_min_pixels
image, image_max_pixels=image_max_pixels, image_min_pixels=image_min_pixels
)["images"]
}
elif sample["videos"]:
elif batch["videos"][j] is not None:
video = batch["videos"][j]
multi_modal_data = {
"video": template_obj.mm_plugin._regularize_videos(
sample["videos"], image_max_pixels=image_max_pixels, image_min_pixels=image_min_pixels
video,
image_max_pixels=image_max_pixels,
image_min_pixels=image_min_pixels,
video_fps=video_fps,
video_maxlen=video_maxlen,
)["videos"]
}
elif sample["audios"]:
elif batch["audios"][j] is not None:
audio = batch["audios"][j]
audio_data = template_obj.mm_plugin._regularize_audios(
sample["audios"],
audio,
sampling_rate=16000,
)
multi_modal_data = {"audio": zip(audio_data["audios"], audio_data["sampling_rates"])}
else:
multi_modal_data = None
inputs.append({"prompt_token_ids": sample["input_ids"], "multi_modal_data": multi_modal_data})
prompts.append(tokenizer.decode(sample["input_ids"], skip_special_tokens=skip_special_tokens))
vllm_inputs.append({"prompt_token_ids": batch["input_ids"][j], "multi_modal_data": multi_modal_data})
prompts.append(tokenizer.decode(batch["input_ids"][j], skip_special_tokens=skip_special_tokens))
labels.append(
tokenizer.decode(
list(filter(lambda x: x != IGNORE_INDEX, sample["labels"])), skip_special_tokens=skip_special_tokens
)
)
sampling_params = SamplingParams(
repetition_penalty=generating_args.repetition_penalty or 1.0, # repetition_penalty must > 0
temperature=generating_args.temperature,
top_p=generating_args.top_p or 1.0, # top_p must > 0
top_k=generating_args.top_k or -1, # top_k must > 0
stop_token_ids=template_obj.get_stop_token_ids(tokenizer),
max_tokens=generating_args.max_new_tokens,
list(filter(lambda x: x != IGNORE_INDEX, batch["labels"][j])),
skip_special_tokens=skip_special_tokens,
seed=seed,
)
if model_args.adapter_name_or_path is not None:
lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0])
else:
lora_request = None
)
engine_args = {
"model": model_args.model_name_or_path,
"trust_remote_code": True,
"dtype": model_args.infer_dtype,
"max_model_len": cutoff_len + max_new_tokens,
"tensor_parallel_size": (get_device_count() // pipeline_parallel_size) or 1,
"pipeline_parallel_size": pipeline_parallel_size,
"disable_log_stats": True,
"enable_lora": model_args.adapter_name_or_path is not None,
}
if template_obj.mm_plugin.__class__.__name__ != "BasePlugin":
engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}
results = llm.generate(vllm_inputs, sampling_params, lora_request=lora_request)
preds = [result.outputs[0].text for result in results]
if isinstance(model_args.vllm_config, dict):
engine_args.update(model_args.vllm_config)
# Accumulate results
all_prompts.extend(prompts)
all_preds.extend(preds)
all_labels.extend(labels)
gc.collect()
results = LLM(**engine_args).generate(inputs, sampling_params, lora_request=lora_request)
preds = [result.outputs[0].text for result in results]
# Write all results at once outside the loop
with open(save_name, "w", encoding="utf-8") as f:
for text, pred, label in zip(prompts, preds, labels):
for text, pred, label in zip(all_prompts, all_preds, all_labels):
f.write(json.dumps({"prompt": text, "predict": pred, "label": label}, ensure_ascii=False) + "\n")
print("*" * 70)
print(f"{len(prompts)} generated results have been saved at {save_name}.")
print(f"{len(all_prompts)} total generated results have been saved at {save_name}.")
print("*" * 70)
......
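A hedged sketch of driving the updated entry point directly (equivalent to `python scripts/vllm_infer.py ...` via fire), mainly to show the newly added parameters; the model path, template, and dataset are placeholders, and importing the script assumes the repository root is on `PYTHONPATH`:

```python
# Illustrative only: batched vLLM inference with the new parameters from this change.
from scripts.vllm_infer import vllm_infer  # assumed module path

vllm_infer(
    model_name_or_path="path_to_merged_model",  # placeholder
    dataset="alpaca_en_demo",
    template="default",        # use the template the model was trained with
    enable_thinking=True,      # new: keep consistent with training
    default_system=None,       # new: optional system prompt override
    video_fps=2.0,             # new: frame sampling rate for video inputs
    video_maxlen=128,          # new: maximum number of sampled video frames
    batch_size=1024,           # new: generate in batches to avoid "too many open files"
)
```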
......@@ -198,6 +198,7 @@ async def create_chat_completion_response(
top_p=request.top_p,
max_new_tokens=request.max_tokens,
num_return_sequences=request.n,
repetition_penalty=request.presence_penalty,
stop=request.stop,
)
......@@ -259,6 +260,7 @@ async def create_stream_chat_completion_response(
temperature=request.temperature,
top_p=request.top_p,
max_new_tokens=request.max_tokens,
repetition_penalty=request.presence_penalty,
stop=request.stop,
):
if len(new_token) != 0:
......
......@@ -103,6 +103,7 @@ class ChatCompletionRequest(BaseModel):
temperature: Optional[float] = None
top_p: Optional[float] = None
n: int = 1
presence_penalty: Optional[float] = None
max_tokens: Optional[int] = None
stop: Optional[Union[str, list[str]]] = None
stream: bool = False
......
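To exercise the newly exposed `presence_penalty` field of `ChatCompletionRequest` (forwarded to the engine as `repetition_penalty` in the handlers above), a minimal client-side sketch is shown below, assuming the OpenAI-compatible server is running locally via `llamafactory-cli api`; the port, API key, and model id are placeholders:

```python
# Illustrative client call against the OpenAI-compatible API with presence_penalty set.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="empty")  # dummy key if none is configured
response = client.chat.completions.create(
    model="your-model",  # placeholder; use the id returned by GET /v1/models
    messages=[{"role": "user", "content": "Hello!"}],
    presence_penalty=1.1,  # forwarded as repetition_penalty by the server
    max_tokens=128,
)
print(response.choices[0].message.content)
```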
......@@ -104,7 +104,6 @@ class HuggingfaceEngine(BaseEngine):
messages, mm_input_dict["images"], mm_input_dict["videos"], mm_input_dict["audios"], processor
)
paired_messages = messages + [{"role": "assistant", "content": ""}]
system = system or generating_args["default_system"]
prompt_ids, _ = template.encode_oneturn(tokenizer, paired_messages, system, tools)
prompt_ids, _ = template.mm_plugin.process_token_ids(
prompt_ids,
......@@ -117,7 +116,7 @@ class HuggingfaceEngine(BaseEngine):
)
prompt_length = len(prompt_ids)
inputs = torch.tensor([prompt_ids], device=model.device)
attention_mask = torch.ones_like(inputs, dtype=torch.bool)
attention_mask = torch.ones_like(inputs, dtype=torch.long)
do_sample: Optional[bool] = input_kwargs.pop("do_sample", None)
temperature: Optional[float] = input_kwargs.pop("temperature", None)
......