update to 0.9.2.dev0

8293100a · luopl · 2778a3d0 · 8293100a · 8293100a · 8293100a
Commit 8293100a authored Jan 16, 2025 by luopl
20 changed files
--- a/examples/train_lora/qwen2vl_lora_sft.yaml
+++ b/examples/train_lora/qwen2vl_lora_sft.yaml
 ### model
 model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
+trust_remote_code: true

 ### method
 stage: sft
@@ -8,7 +9,7 @@ finetuning_type: lora
 lora_target: all

 ### dataset
-dataset: mllm_demo,identity  # video: mllm_video_demo
+dataset: mllm_demo,identity,alpaca_en_demo  # video: mllm_video_demo
 template: qwen2_vl
 cutoff_len: 2048
 max_samples: 1000

--- a/examples/train_qlora/llama3_lora_sft_aqlm.yaml
+++ b/examples/train_qlora/llama3_lora_sft_aqlm.yaml
 ### model
 model_name_or_path: ISTA-DASLab/Meta-Llama-3-8B-Instruct-AQLM-2Bit-1x16
+trust_remote_code: true

 ### method
 stage: sft

--- a/examples/train_qlora/llama3_lora_sft_awq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_awq.yaml
 ### model
 model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
+trust_remote_code: true

 ### method
 stage: sft

--- a/examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+++ b/examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+quantization_bit: 4
+quantization_method: bitsandbytes
+double_quantization: false
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: all
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+eval_strategy: steps
+eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_gptq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_gptq.yaml
 ### model
 model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ
+trust_remote_code: true

 ### method
 stage: sft

--- a/examples/train_qlora/llama3_lora_sft_otfq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_otfq.yaml
@@ -2,6 +2,7 @@
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 quantization_bit: 4
 quantization_method: bitsandbytes  # choices: [bitsandbytes (4/8), hqq (2/3/4/5/6/8), eetq (8)]
+trust_remote_code: true

 ### method
 stage: sft

--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@ datasets>=2.16.0,<=3.1.0
 accelerate>=0.34.0,<=1.0.1
 peft>=0.11.1,<=0.12.0
 trl>=0.8.6,<=0.9.6
+tokenizers>=0.19.0,<0.20.4
 gradio>=4.0.0,<5.0.0
 pandas>=2.0.0
 scipy

--- a/scripts/api_example/test_image.py
+++ b/scripts/api_example/test_image.py
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from openai import OpenAI
+from transformers.utils.versions import require_version
+
+
+require_version("openai>=1.5.0", "To fix: pip install openai>=1.5.0")
+
+
+def main():
+    r"""
+    Runs a two-round image chat against a local OpenAI-compatible endpoint.
+    Every round sends one text prompt together with one image URL and prints the reply.
+    The endpoint is configured through the `API_KEY` and `API_PORT` environment variables.
+    """
+    api_key = os.environ.get("API_KEY", "0")
+    api_port = os.environ.get("API_PORT", 8000)
+    client = OpenAI(api_key="{}".format(api_key), base_url="http://localhost:{}/v1".format(api_port))
+
+    # Sample replies recorded in the original script:
+    #   Round 1: The image shows a pyramid of colored blocks with numbers on them. Here are the colors and numbers of ...
+    #   Round 2: The image shows a cluster of forget-me-not flowers. Forget-me-nots are small ...
+    rounds = (
+        (
+            "Output the color and number of each box.",
+            "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/boxes.png",
+        ),
+        (
+            "What kind of flower is this?",
+            "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/flowers.jpg",
+        ),
+    )
+    messages = []
+    for round_idx, (prompt, image_url) in enumerate(rounds, start=1):
+        user_turn = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": image_url},
+                },
+            ],
+        }
+        messages.append(user_turn)
+        reply = client.chat.completions.create(messages=messages, model="test").choices[0].message
+        messages.append(reply)  # keep the assistant turn so the next round sees the whole history
+        print(f"Round {round_idx}:", reply.content)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/api_example/test_toolcall.py
+++ b/scripts/api_example/test_toolcall.py
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from typing import Sequence
+
+from openai import OpenAI
+from transformers.utils.versions import require_version
+
+
+require_version("openai>=1.5.0", "To fix: pip install openai>=1.5.0")
+
+
+def calculate_gpa(grades: Sequence[str], hours: Sequence[int]) -> float:
+    r"""
+    Computes the credit-hour-weighted Grade Point Average, rounded to two decimals.
+
+    Args:
+        grades: letter grades, each one of "A", "B", "C", "D" or "F".
+        hours: credit hours of each course, aligned with `grades`.
+
+    Returns:
+        The weighted GPA, or 0.0 when the total credit hours are zero.
+
+    Raises:
+        ValueError: if `grades` and `hours` differ in length.
+        KeyError: if a grade is not a known letter grade.
+    """
+    if len(grades) != len(hours):
+        # zip() would silently drop the unmatched tail and report a wrong GPA
+        raise ValueError(f"Got {len(grades)} grades but {len(hours)} credit hours.")
+
+    grade_to_score = {"A": 4, "B": 3, "C": 2, "D": 1, "F": 0}
+    total_score = sum(grade_to_score[grade] * hour for grade, hour in zip(grades, hours))
+    total_hour = sum(hours)
+    if total_hour == 0:  # the arguments are decoded from a model-generated tool call and may be empty
+        return 0.0
+
+    return round(total_score / total_hour, 2)
+
+
+def main():
+    r"""
+    Demonstrates one tool-call round trip against a local OpenAI-compatible endpoint:
+    ask a question, run the function requested by the model, then send the result back
+    and print the final answer.
+    """
+    client = OpenAI(
+        api_key="{}".format(os.environ.get("API_KEY", "0")),
+        base_url="http://localhost:{}/v1".format(os.environ.get("API_PORT", 8000)),
+    )
+    gpa_parameters = {
+        "type": "object",
+        "properties": {
+            "grades": {"type": "array", "items": {"type": "string"}, "description": "The grades"},
+            "hours": {"type": "array", "items": {"type": "integer"}, "description": "The credit hours"},
+        },
+        "required": ["grades", "hours"],
+    }
+    gpa_function = {
+        "name": "calculate_gpa",
+        "description": "Calculate the Grade Point Average (GPA) based on grades and credit hours",
+        "parameters": gpa_parameters,
+    }
+    tools = [{"type": "function", "function": gpa_function}]
+    tool_map = {"calculate_gpa": calculate_gpa}  # maps the function name in the reply to a local callable
+
+    question = "My grades are A, A, B, and C. The credit hours are 3, 4, 3, and 2."
+    messages = [{"role": "user", "content": question}]
+    first_response = client.chat.completions.create(messages=messages, model="test", tools=tools)
+    first_reply = first_response.choices[0].message
+    if first_reply.tool_calls is None:
+        raise ValueError("Cannot retrieve function call from the response.")
+
+    messages.append(first_reply)
+    tool_call = first_reply.tool_calls[0].function
+    print(tool_call)
+    # Sample output recorded in the original script:
+    # Function(arguments='{"grades": ["A", "A", "B", "C"], "hours": [3, 4, 3, 2]}', name='calculate_gpa')
+    arguments = json.loads(tool_call.arguments)
+    tool_result = tool_map[tool_call.name](**arguments)
+    tool_message = {"role": "tool", "content": json.dumps({"gpa": tool_result}, ensure_ascii=False)}
+    messages.append(tool_message)
+    final_response = client.chat.completions.create(messages=messages, model="test", tools=tools)
+    print(final_response.choices[0].message.content)
+    # Sample output recorded in the original script:
+    # Based on the grades and credit hours you provided, your Grade Point Average (GPA) is 3.42.
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/convert_ckpt/llamafy_baichuan2.py
+++ b/scripts/convert_ckpt/llamafy_baichuan2.py
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from collections import OrderedDict
+from typing import Any, Dict
+
+import fire
+import torch
+from safetensors.torch import save_file
+from tqdm import tqdm
+from transformers.modeling_utils import (
+    SAFE_WEIGHTS_INDEX_NAME,
+    SAFE_WEIGHTS_NAME,
+    WEIGHTS_INDEX_NAME,
+    WEIGHTS_NAME,
+    shard_checkpoint,
+)
+
+
+CONFIG_NAME = "config.json"
+
+
+def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetensors: bool):
+    r"""
+    Loads every `.bin` shard found in `input_dir`, converts the tensors and writes them to `output_dir`.
+
+    Args:
+        input_dir: directory holding the source `.bin` weight files.
+        output_dir: existing directory that receives the converted shards (and the index, if sharded).
+        shard_size: maximum shard size forwarded to `shard_checkpoint`, e.g. "2GB".
+        save_safetensors: write safetensors files when true, `torch.save` files otherwise.
+    """
+    baichuan2_state_dict: Dict[str, torch.Tensor] = OrderedDict()
+    for filepath in tqdm(os.listdir(input_dir), desc="Load weights"):
+        if os.path.isfile(os.path.join(input_dir, filepath)) and filepath.endswith(".bin"):
+            # NOTE: torch.load unpickles the file, so only convert checkpoints from a trusted source.
+            shard_weight = torch.load(os.path.join(input_dir, filepath), map_location="cpu")
+            baichuan2_state_dict.update(shard_weight)
+
+    llama2_state_dict: Dict[str, torch.Tensor] = OrderedDict()
+    for key, value in tqdm(baichuan2_state_dict.items(), desc="Convert format"):
+        if "W_pack" in key:
+            # the packed tensor is cut into three equal slices along dim 0: q, k and v in that order
+            proj_size = value.size(0) // 3
+            llama2_state_dict[key.replace("W_pack", "q_proj")] = value[:proj_size, :]
+            llama2_state_dict[key.replace("W_pack", "k_proj")] = value[proj_size : 2 * proj_size, :]
+            llama2_state_dict[key.replace("W_pack", "v_proj")] = value[2 * proj_size :, :]
+        elif "lm_head" in key:
+            # NOTE(review): presumably bakes in the normalization of Baichuan2's output head - confirm
+            llama2_state_dict[key] = torch.nn.functional.normalize(value)
+        else:
+            llama2_state_dict[key] = value
+
+    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
+    shards, index = shard_checkpoint(llama2_state_dict, max_shard_size=shard_size, weights_name=weights_name)
+
+    for shard_file, shard in tqdm(shards.items(), desc="Save weights"):
+        if save_safetensors:
+            save_file(shard, os.path.join(output_dir, shard_file), metadata={"format": "pt"})
+        else:
+            torch.save(shard, os.path.join(output_dir, shard_file))
+
+    if index is None:
+        # report the file that was actually written (safetensors or bin), not always the `.bin` name
+        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}")
+    else:
+        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
+        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
+            json.dump(index, f, indent=2, sort_keys=True)
+        print(f"Model weights saved in {output_dir}")
+
+
+def save_config(input_dir: str, output_dir: str):
+    r"""
+    Copies the model config from `input_dir` to `output_dir`, rewritten to declare a LLaMA model.
+    The `auto_map` and `tokenizer_class` entries are removed when present.
+    """
+    source_path = os.path.join(input_dir, CONFIG_NAME)
+    target_path = os.path.join(output_dir, CONFIG_NAME)
+    with open(source_path, encoding="utf-8") as reader:
+        llama2_config_dict: Dict[str, Any] = json.load(reader)
+
+    for stale_key in ("auto_map", "tokenizer_class"):
+        llama2_config_dict.pop(stale_key, None)
+
+    llama2_config_dict.update({"architectures": ["LlamaForCausalLM"], "model_type": "llama"})
+
+    with open(target_path, "w", encoding="utf-8") as writer:
+        json.dump(llama2_config_dict, writer, indent=2)
+    print(f"Model config saved in {target_path}")
+
+
+def llamafy_baichuan2(
+    input_dir: str,
+    output_dir: str,
+    shard_size: str = "2GB",
+    save_safetensors: bool = True,
+):
+    r"""
+    Converts the Baichuan2-7B model in the same format as LLaMA2-7B.
+    Usage: python llamafy_baichuan2.py --input_dir input --output_dir output
+    Converted model: https://huggingface.co/hiyouga/Baichuan2-7B-Base-LLaMAfied
+
+    Raises:
+        FileExistsError: if `output_dir` already exists; nothing is overwritten.
+    """
+    try:
+        os.makedirs(output_dir, exist_ok=False)
+    except FileExistsError as e:
+        # `raise print(...)` used to end in a TypeError because print() returns None
+        raise FileExistsError(f"Output dir already exists: {output_dir}") from e
+
+    save_weight(input_dir, output_dir, shard_size, save_safetensors)
+    save_config(input_dir, output_dir)
+
+
+if __name__ == "__main__":
+    fire.Fire(llamafy_baichuan2)
--- a/scripts/convert_ckpt/llamafy_qwen.py
+++ b/scripts/convert_ckpt/llamafy_qwen.py
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from collections import OrderedDict
+from typing import Any, Dict
+
+import fire
+import torch
+from safetensors import safe_open
+from safetensors.torch import save_file
+from tqdm import tqdm
+from transformers.modeling_utils import (
+    SAFE_WEIGHTS_INDEX_NAME,
+    SAFE_WEIGHTS_NAME,
+    WEIGHTS_INDEX_NAME,
+    WEIGHTS_NAME,
+    shard_checkpoint,
+)
+from transformers.utils import check_min_version
+
+
+# Abort at import time with an actionable message when the installed `transformers` fails the version check.
+try:
+    check_min_version("4.34.0")
+except Exception:
+    raise ValueError("Please upgrade `transformers` to 4.34.0")
+
+
+CONFIG_NAME = "config.json"
+
+
+def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetensors: bool) -> str:
+    r"""
+    Loads every `.safetensors` file in `input_dir`, renames the tensors to LLaMA-style keys
+    and writes the (possibly sharded) result to `output_dir`.
+
+    Args:
+        input_dir: directory holding the source `.safetensors` files.
+        output_dir: existing directory that receives the converted shards (and the index, if sharded).
+        shard_size: maximum shard size forwarded to `shard_checkpoint`, e.g. "2GB".
+        save_safetensors: write safetensors files when true, `torch.save` files otherwise.
+
+    Returns:
+        The dtype of the first loaded tensor as a string without the "torch." prefix.
+
+    Raises:
+        KeyError: if a tensor name matches none of the known patterns.
+    """
+    qwen_state_dict: Dict[str, torch.Tensor] = OrderedDict()
+    for filepath in tqdm(os.listdir(input_dir), desc="Load weights"):
+        if os.path.isfile(os.path.join(input_dir, filepath)) and filepath.endswith(".safetensors"):
+            with safe_open(os.path.join(input_dir, filepath), framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    qwen_state_dict[key] = f.get_tensor(key)
+
+    llama2_state_dict: Dict[str, torch.Tensor] = OrderedDict()
+    torch_dtype = None
+    for key, value in tqdm(qwen_state_dict.items(), desc="Convert format"):
+        # remember the dtype of the first tensor; it becomes the return value
+        if torch_dtype is None:
+            torch_dtype = value.dtype
+        if "wte" in key:
+            llama2_state_dict["model.embed_tokens.weight"] = value
+        elif "ln_f" in key:
+            llama2_state_dict["model.norm.weight"] = value
+        else:
+            key = key.replace("transformer.h", "model.layers")
+            if "attn.c_attn" in key:
+                # the fused tensor is cut into three equal slices along dim 0: q, k and v in that order
+                proj_size = value.size(0) // 3
+                llama2_state_dict[key.replace("attn.c_attn", "self_attn.q_proj")] = value[:proj_size, ...]
+                llama2_state_dict[key.replace("attn.c_attn", "self_attn.k_proj")] = value[
+                    proj_size : 2 * proj_size, ...
+                ]
+                llama2_state_dict[key.replace("attn.c_attn", "self_attn.v_proj")] = value[2 * proj_size :, ...]
+            elif "attn.c_proj" in key:
+                llama2_state_dict[key.replace("attn.c_proj", "self_attn.o_proj")] = value
+                # also emit an all-zero o_proj bias shaped like the first column of the weight
+                # NOTE(review): presumably required because save_config sets `attention_bias` to True - confirm
+                llama2_state_dict[key.replace("attn.c_proj.weight", "self_attn.o_proj.bias")] = torch.zeros_like(
+                    value[:, 0]
+                ).squeeze()
+            elif "ln_1" in key:
+                llama2_state_dict[key.replace("ln_1", "input_layernorm")] = value
+            elif "ln_2" in key:
+                llama2_state_dict[key.replace("ln_2", "post_attention_layernorm")] = value
+            elif "mlp.w1" in key:
+                # note the mapping: w1 -> up_proj and w2 -> gate_proj
+                llama2_state_dict[key.replace("mlp.w1", "mlp.up_proj")] = value
+            elif "mlp.w2" in key:
+                llama2_state_dict[key.replace("mlp.w2", "mlp.gate_proj")] = value
+            elif "mlp.c_proj" in key:
+                llama2_state_dict[key.replace("mlp.c_proj", "mlp.down_proj")] = value
+            elif "lm_head" in key:
+                llama2_state_dict[key] = value
+            else:
+                raise KeyError(f"Unable to process key {key}")
+
+    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
+    shards, index = shard_checkpoint(llama2_state_dict, max_shard_size=shard_size, weights_name=weights_name)
+
+    for shard_file, shard in tqdm(shards.items(), desc="Save weights"):
+        if save_safetensors:
+            save_file(shard, os.path.join(output_dir, shard_file), metadata={"format": "pt"})
+        else:
+            torch.save(shard, os.path.join(output_dir, shard_file))
+
+    # `index` is None when everything fits in one shard, so there is no index file to write
+    if index is None:
+        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}")
+    else:
+        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
+        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
+            json.dump(index, f, indent=2, sort_keys=True)
+        print(f"Model weights saved in {output_dir}")
+
+    return str(torch_dtype).replace("torch.", "")
+
+
+def save_config(input_dir: str, output_dir: str, torch_dtype: str):
+    r"""
+    Builds a LLaMA-style `config.json` from the Qwen config in `input_dir` and saves it to `output_dir`.
+    `torch_dtype` is written verbatim into the new config.
+    """
+    with open(os.path.join(input_dir, CONFIG_NAME), encoding="utf-8") as reader:
+        qwen_config_dict: Dict[str, Any] = json.load(reader)
+
+    # entries are listed in the order they are serialized
+    llama2_config_dict: Dict[str, Any] = OrderedDict(
+        [
+            ("architectures", ["LlamaForCausalLM"]),
+            ("hidden_act", "silu"),
+            ("hidden_size", qwen_config_dict["hidden_size"]),
+            ("initializer_range", qwen_config_dict["initializer_range"]),
+            ("intermediate_size", qwen_config_dict["intermediate_size"] // 2),
+            ("max_position_embeddings", qwen_config_dict["max_position_embeddings"]),
+            ("model_type", "llama"),
+            ("num_attention_heads", qwen_config_dict["num_attention_heads"]),
+            ("num_hidden_layers", qwen_config_dict["num_hidden_layers"]),
+            ("num_key_value_heads", qwen_config_dict["hidden_size"] // qwen_config_dict["kv_channels"]),
+            ("pretraining_tp", 1),
+            ("rms_norm_eps", qwen_config_dict["layer_norm_epsilon"]),
+            ("rope_scaling", None),
+            ("tie_word_embeddings", qwen_config_dict["tie_word_embeddings"]),
+            ("torch_dtype", torch_dtype),
+            ("transformers_version", "4.34.0"),
+            ("use_cache", True),
+            ("vocab_size", qwen_config_dict["vocab_size"]),
+            ("attention_bias", True),
+        ]
+    )
+
+    target_path = os.path.join(output_dir, CONFIG_NAME)
+    with open(target_path, "w", encoding="utf-8") as writer:
+        json.dump(llama2_config_dict, writer, indent=2)
+    print(f"Model config saved in {target_path}")
+
+
+def llamafy_qwen(
+    input_dir: str,
+    output_dir: str,
+    shard_size: str = "2GB",
+    save_safetensors: bool = False,
+):
+    r"""
+    Converts the Qwen models in the same format as LLaMA2.
+    Usage: python llamafy_qwen.py --input_dir input --output_dir output
+    Converted model: https://huggingface.co/hiyouga/Qwen-14B-Chat-LLaMAfied
+
+    Raises:
+        FileExistsError: if `output_dir` already exists; nothing is overwritten.
+    """
+    try:
+        os.makedirs(output_dir, exist_ok=False)
+    except FileExistsError as e:
+        # `raise print(...)` used to end in a TypeError because print() returns None
+        raise FileExistsError(f"Output dir already exists: {output_dir}") from e
+
+    # the dtype detected while converting the weights is recorded in the new config
+    torch_dtype = save_weight(input_dir, output_dir, shard_size, save_safetensors)
+    save_config(input_dir, output_dir, torch_dtype)
+
+
+if __name__ == "__main__":
+    fire.Fire(llamafy_qwen)
--- a/scripts/llama_pro.py
+++ b/scripts/llama_pro.py
@@ -24,7 +24,7 @@ import fire
 import torch
 from safetensors.torch import save_file
 from tqdm import tqdm
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
 from transformers.modeling_utils import (
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
@@ -35,7 +35,7 @@ from transformers.modeling_utils import (


 if TYPE_CHECKING:
-    from transformers import PretrainedConfig, PreTrainedModel
+    from transformers import PretrainedConfig


 def change_name(name: str, old_index: int, new_index: int) -> str:
@@ -61,17 +61,18 @@ def block_expansion(
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    tokenizer.save_pretrained(output_dir)

-    config: "PretrainedConfig" = AutoConfig.from_pretrained(model_name_or_path)  # load the original one
+    config = AutoConfig.from_pretrained(model_name_or_path)  # load the original one
    if save_safetensors:
        setattr(config, "tie_word_embeddings", False)  # safetensors does not allow shared weights

-    model: "PreTrainedModel" = AutoModelForCausalLM.from_pretrained(
+    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        config=config,
        torch_dtype="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
+    assert isinstance(model, PreTrainedModel)  # type hint
    state_dict = model.state_dict()

    if num_layers % num_expand != 0:
@@ -85,7 +86,7 @@ def block_expansion(
            if f".{i:d}." in key:
                output_state_dict[change_name(key, i, layer_cnt)] = value

-        print(f"Add layer {layer_cnt} copied from layer {i}")
+        print(f"Add layer {layer_cnt} copied from layer {i}.")
        layer_cnt += 1
        if (i + 1) % split == 0:
            for key, value in state_dict.items():
@@ -95,7 +96,7 @@ def block_expansion(
                    else:
                        output_state_dict[change_name(key, i, layer_cnt)] = torch.clone(value)

-            print(f"Add layer {layer_cnt} expanded from layer {i}")
+            print(f"Add layer {layer_cnt} expanded from layer {i}.")
            layer_cnt += 1

    for key, value in state_dict.items():
@@ -112,12 +113,13 @@ def block_expansion(
            torch.save(shard, os.path.join(output_dir, shard_file))

    if index is None:
-        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}")
+        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}.")
    else:
        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2, sort_keys=True)
-        print(f"Model weights saved in {output_dir}")
+
+        print(f"Model weights saved in {output_dir}.")

    print("- Fine-tune this model with:")
    print(f"model_name_or_path: {output_dir}")

--- a/scripts/stat_utils/cal_flops.py
+++ b/scripts/stat_utils/cal_flops.py
+# Copyright 2024 Microsoft Corporation and the LlamaFactory team.
+#
+# This code is inspired by the Microsoft's DeepSpeed library.
+# https://www.deepspeed.ai/tutorials/flops-profiler/
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import fire
+import torch
+from deepspeed.accelerator import get_accelerator  # type: ignore
+from deepspeed.profiling.flops_profiler import get_model_profile  # type: ignore
+
+from llamafactory.chat import ChatModel
+
+
+def calculate_flops(
+    model_name_or_path: str,
+    batch_size: int = 1,
+    seq_length: int = 512,
+    flash_attn: str = "auto",
+):
+    r"""
+    Profiles a pre-trained model on a dummy batch of ones and prints its FLOPs, MACs and parameter count.
+    Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512
+    """
+    chat_args = dict(model_name_or_path=model_name_or_path, template="empty", flash_attn=flash_attn)
+    with get_accelerator().device(0):
+        chat_model = ChatModel(chat_args)
+        model = chat_model.engine.model
+        # the same all-ones tensor is used as input ids and (as a copy) as labels
+        fake_input = torch.ones((batch_size, seq_length), dtype=torch.long, device=model.device)
+        profile = get_model_profile(
+            model,
+            kwargs={"input_ids": fake_input, "labels": fake_input.clone()},
+            print_profile=True,
+            detailed=True,
+        )
+        flops, macs, params = profile
+        for label, value in (("FLOPs:", flops), ("MACs:", macs), ("Params:", params)):
+            print(label, value)
+
+
+if __name__ == "__main__":
+    fire.Fire(calculate_flops)
--- a/scripts/stat_utils/cal_lr.py
+++ b/scripts/stat_utils/cal_lr.py
+# Copyright 2024 imoneoi and the LlamaFactory team.
+#
+# This code is inspired by the imoneoi's OpenChat library.
+# https://github.com/imoneoi/openchat/blob/3.6.0/ochat/training_deepspeed/train.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Literal
+
+import fire
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import DataCollatorForLanguageModeling
+
+from llamafactory.data import MultiModalDataCollatorForSeq2Seq, get_dataset, get_template_and_fix_tokenizer
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.hparams import get_train_args
+from llamafactory.model import load_tokenizer
+
+
+# Reference point for the scaling rule in `calculate_lr`: lr = BASE_LR * sqrt(token_batch_size / BASE_BS).
+BASE_LR = 3e-4  # 1.5e-4 for 30B-70B models
+BASE_BS = 4_000_000  # from llama paper
+
+
+def calculate_lr(
+    model_name_or_path: str,
+    batch_size: int,  # total batch size, namely (batch size * gradient accumulation * world size)
+    stage: Literal["pt", "sft"] = "sft",
+    dataset: str = "alpaca_en_demo",
+    dataset_dir: str = "data",
+    template: str = "default",
+    cutoff_len: int = 2048,  # i.e. maximum input length during training
+    is_mistral_or_gemma: bool = False,  # mistral and gemma models opt for a smaller learning rate
+    packing: bool = False,
+):
+    r"""
+    Calculates the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters.
+    The rate scales with the square root of the effective token batch size, which is
+    `cutoff_len * batch_size` weighted by the share of label tokens that are not ignored.
+    Usage:
+    python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en_demo --cutoff_len 1024 --batch_size 16
+    """
+    train_kwargs = dict(
+        stage=stage,
+        model_name_or_path=model_name_or_path,
+        dataset=dataset,
+        dataset_dir=dataset_dir,
+        template=template,
+        cutoff_len=cutoff_len,
+        packing=packing,
+        preprocessing_num_workers=16,
+        output_dir="dummy_dir",
+        overwrite_cache=True,
+        do_train=True,
+    )
+    model_args, data_args, training_args, _, _ = get_train_args(train_kwargs)
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    chat_template = get_template_and_fix_tokenizer(tokenizer, data_args)
+    dataset_module = get_dataset(chat_template, model_args, data_args, training_args, stage, **tokenizer_module)
+    trainset = dataset_module["train_dataset"]
+
+    if stage == "sft":
+        data_collator = MultiModalDataCollatorForSeq2Seq(
+            template=chat_template, tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX
+        )
+    elif stage == "pt":
+        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+    else:
+        raise NotImplementedError(f"Stage does not supported: {stage}.")
+
+    dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
+    valid_tokens = 0
+    total_tokens = 0
+    for batch in tqdm(dataloader, desc="Collecting valid tokens"):
+        labels = batch["labels"]
+        valid_tokens += (labels != IGNORE_INDEX).sum().item()  # label positions that are not ignored
+        total_tokens += labels.numel()
+
+    valid_ratio = valid_tokens / total_tokens
+    token_batch_size = cutoff_len * batch_size * valid_ratio
+    lr = BASE_LR * math.sqrt(token_batch_size / BASE_BS)  # lr ~ sqrt(batch_size)
+    if is_mistral_or_gemma:
+        lr = lr / 6.0
+
+    report = "Optimal learning rate is {:.2e} for valid ratio% {:.2f} and effective token batch size {:.2f}"
+    print(report.format(lr, valid_ratio * 100, token_batch_size))
+
+
+if __name__ == "__main__":
+    fire.Fire(calculate_lr)
--- a/scripts/stat_utils/cal_mfu.py
+++ b/scripts/stat_utils/cal_mfu.py
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+
+import fire
+import torch
+import torch.distributed as dist
+from transformers import AutoConfig
+
+from llamafactory.train.tuner import run_exp
+
+
+BASE = 2  # gemm (add + mul)
+
+
+def compute_model_flops(
+    model_name_or_path: str,
+    total_batch_size: int,
+    seq_length: int,
+    include_backward: bool = True,
+    include_recompute: bool = False,
+    include_flashattn: bool = False,
+) -> int:
+    r"""
+    Calculates the FLOPs of model per forward/backward pass.
+
+    Args:
+        model_name_or_path: model whose config provides the layer sizes.
+        total_batch_size: number of sequences processed per step.
+        seq_length: number of tokens per sequence.
+        include_backward: count the backward pass as twice the forward pass.
+        include_recompute: count one extra forward pass of the non-embedding modules.
+        include_flashattn: count the attention (sdpa) FLOPs one more time.
+
+    Returns:
+        The estimated number of FLOPs for one step over the whole batch.
+    """
+    config = AutoConfig.from_pretrained(model_name_or_path)
+    hidden_size = getattr(config, "hidden_size", None)
+    vocab_size = getattr(config, "vocab_size", None)
+    intermediate_size = getattr(config, "intermediate_size", None)
+    num_attention_heads = getattr(config, "num_attention_heads", None)
+    num_key_value_heads = getattr(config, "num_key_value_heads", None)
+    num_hidden_layers = getattr(config, "num_hidden_layers", None)
+    tie_word_embeddings = getattr(config, "tie_word_embeddings", False)
+    if num_key_value_heads is None:
+        # configs without grouped-query attention do not define this field:
+        # treat them as multi-head attention, i.e. one key/value head per query head
+        num_key_value_heads = num_attention_heads
+
+    # mlp module
+    mlp_flops_per_token = 3 * BASE * hidden_size * intermediate_size  # up, gate, down
+    mlp_flops = total_batch_size * seq_length * num_hidden_layers * mlp_flops_per_token
+
+    # attn projector module
+    q_flops_per_token = BASE * hidden_size * hidden_size
+    o_flops_per_token = BASE * hidden_size * hidden_size
+    # k and v projections are scaled down by the ratio of key/value heads to query heads
+    k_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
+    v_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
+    attn_proj_flops_per_token = q_flops_per_token + o_flops_per_token + k_flops_per_token + v_flops_per_token
+    attn_proj_flops = total_batch_size * seq_length * num_hidden_layers * attn_proj_flops_per_token
+
+    # attn sdpa module
+    sdpa_flops_per_layer = 2 * BASE * hidden_size * seq_length * seq_length  # (q * k^T) * v
+    sdpa_flops = total_batch_size * num_hidden_layers * sdpa_flops_per_layer
+
+    # embedding module
+    embedding_flops_per_token = hidden_size * vocab_size
+    embedding_flops = total_batch_size * seq_length * embedding_flops_per_token
+    if tie_word_embeddings is False:
+        embedding_flops *= 2
+
+    non_embedding_flops = mlp_flops + attn_proj_flops + sdpa_flops
+    non_embedding_coeff, embedding_coeff = 1, 1
+    if include_backward:
+        non_embedding_coeff += 2
+        embedding_coeff += 2
+
+    if include_recompute:
+        non_embedding_coeff += 1
+
+    total_flops = non_embedding_coeff * non_embedding_flops + embedding_coeff * embedding_flops
+
+    if include_flashattn:
+        total_flops += sdpa_flops
+
+    return total_flops
+
+
+def compute_device_flops(world_size: int) -> float:
+    r"""
+    Returns the peak FLOPs per second of ``world_size`` devices of the current CUDA device model.
+
+    The device is identified by substring match on ``torch.cuda.get_device_name()``; the first
+    matching entry of the table wins.
+
+    Raises:
+        NotImplementedError: If the device name matches none of the known models.
+    """
+    # (name markers, per-device peak in units of 1e12 FLOPs per second)
+    # NOTE(review): the precision these peak numbers refer to is not stated here - confirm.
+    peak_by_markers = (
+        (("H100", "H800"), 989),
+        (("A100", "A800"), 312),
+        (("V100",), 125),
+        (("4090",), 98),
+    )
+    device_name = torch.cuda.get_device_name()
+    for markers, peak_tflops in peak_by_markers:
+        if any(marker in device_name for marker in markers):
+            return peak_tflops * 1e12 * world_size
+
+    raise NotImplementedError(f"Device not supported: {device_name}.")
+
+
+def calculate_mfu(
+    model_name_or_path: str,
+    batch_size: int = 1,
+    seq_length: int = 1024,
+    num_steps: int = 100,
+    finetuning_type: str = "lora",
+    flash_attn: str = "auto",
+    deepspeed_stage: int = 0,
+    disable_gc: bool = False,
+    liger_kernel: bool = False,
+    unsloth_gc: bool = False,
+) -> float:
+    r"""
+    Calculates MFU for given model and hyper-params.
+    Usage: python cal_mfu.py --model_name_or_path path_to_model --batch_size 1 --seq_length 1024
+
+    Runs a short pre-training ("pt") job on the ``c4_demo`` dataset, reads the measured
+    ``train_steps_per_second`` from ``all_results.json`` in the output directory and divides the
+    achieved FLOPs per second by the peak FLOPs per second of the devices.
+
+    Args:
+        model_name_or_path: Model to benchmark.
+        batch_size: Per-device train batch size.
+        seq_length: Used as ``cutoff_len`` and as the sequence length of the FLOPs estimate.
+        num_steps: Number of training steps to run (``max_steps``).
+        finetuning_type: Forwarded as ``finetuning_type``.
+        flash_attn: Forwarded as ``flash_attn``.
+        deepspeed_stage: 2 or 3 selects ``examples/deepspeed/ds_z{stage}_config.json``;
+            any other value leaves DeepSpeed unset.
+        disable_gc: Forwarded as ``disable_gradient_checkpointing``.
+        liger_kernel: Forwarded as ``enable_liger_kernel``.
+        unsloth_gc: Forwarded as ``use_unsloth_gc``.
+
+    Returns:
+        The MFU as a fraction (the printed value is this fraction times 100).
+    """
+    output_dir = os.path.join("saves", "test_mfu")
+    args = {
+        "model_name_or_path": model_name_or_path,
+        "flash_attn": flash_attn,
+        "disable_gradient_checkpointing": disable_gc,
+        "enable_liger_kernel": liger_kernel,
+        "use_unsloth_gc": unsloth_gc,
+        "stage": "pt",
+        "do_train": True,
+        "finetuning_type": finetuning_type,
+        "dataset": "c4_demo",
+        "cutoff_len": seq_length,
+        "output_dir": output_dir,
+        "logging_strategy": "no",
+        "save_strategy": "no",
+        "save_only_model": True,
+        "overwrite_output_dir": True,
+        "per_device_train_batch_size": batch_size,
+        "max_steps": num_steps,
+        "bf16": True,
+    }
+    if deepspeed_stage in [2, 3]:
+        args["deepspeed"] = f"examples/deepspeed/ds_z{deepspeed_stage}_config.json"
+
+    run_exp(args)
+    with open(os.path.join(output_dir, "all_results.json"), encoding="utf-8") as f:
+        result = json.load(f)
+
+    world_size = dist.get_world_size() if dist.is_initialized() else 1
+    total_batch_size = batch_size * world_size
+    mfu_value = (
+        result["train_steps_per_second"]
+        * compute_model_flops(model_name_or_path, total_batch_size, seq_length)
+        / compute_device_flops(world_size)
+    )
+    print(f"MFU: {mfu_value * 100:.2f}%")
+    # Honor the declared `-> float` contract so programmatic callers receive the value.
+    return mfu_value
+
+
+if __name__ == "__main__":
+    # Expose calculate_mfu as a command-line entry point through python-fire.
+    fire.Fire(calculate_mfu)
--- a/scripts/stat_utils/cal_ppl.py
+++ b/scripts/stat_utils/cal_ppl.py
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from dataclasses import dataclass
+from typing import Any, Dict, Literal, Optional, Sequence
+
+import fire
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import DataCollatorForLanguageModeling
+
+from llamafactory.data import MultiModalDataCollatorForSeq2Seq, get_dataset, get_template_and_fix_tokenizer
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.hparams import get_train_args
+from llamafactory.model import load_model, load_tokenizer
+
+
+@dataclass
+class PairwiseDataCollatorWithPadding(MultiModalDataCollatorForSeq2Seq):
+    r"""
+    Collator for pairwise examples that forwards only the "chosen" side to the parent collator.
+    """
+
+    # When True, labels are the chosen input ids themselves instead of ``chosen_labels``.
+    train_on_prompt: bool = False
+
+    def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
+        r"""
+        Rebuilds each feature from its ``chosen_*`` fields (plus ``images`` and ``videos``)
+        and delegates batching to the parent collator.
+        """
+        label_key = "chosen_input_ids" if self.train_on_prompt else "chosen_labels"
+        chosen_features = [
+            {
+                "input_ids": feature["chosen_input_ids"],
+                "attention_mask": feature["chosen_attention_mask"],
+                "labels": feature[label_key],
+                "images": feature["images"],
+                "videos": feature["videos"],
+            }
+            for feature in features
+        ]
+        return super().__call__(chosen_features)
+
+
+def calculate_ppl(
+    model_name_or_path: str,
+    save_name: str = "ppl.json",
+    batch_size: int = 4,
+    stage: Literal["pt", "sft", "rm"] = "sft",
+    dataset: str = "alpaca_en_demo",
+    dataset_dir: str = "data",
+    template: str = "default",
+    cutoff_len: int = 2048,
+    max_samples: Optional[int] = None,
+    train_on_prompt: bool = False,
+):
+    r"""
+    Calculates the ppl on the dataset of the pre-trained models.
+    Usage: export CUDA_VISIBLE_DEVICES=0
+    python cal_ppl.py --model_name_or_path path_to_model --dataset alpaca_en_demo --save_name ppl.json
+
+    For every sample, the token-level cross entropy is averaged over the positions whose label is
+    not ``IGNORE_INDEX`` and exponentiated. The per-sample perplexities are written to
+    ``save_name`` as a JSON list and their mean is printed.
+
+    Args:
+        model_name_or_path: Model to evaluate.
+        save_name: Path of the JSON file receiving the per-sample perplexities.
+        batch_size: Number of samples per forward pass.
+        stage: Selects the data processing stage and the data collator ("pt", "sft" or "rm").
+        dataset: Dataset name forwarded to the argument parser.
+        dataset_dir: Dataset directory forwarded to the argument parser.
+        template: Template name forwarded to the argument parser.
+        cutoff_len: Forwarded as ``cutoff_len``.
+        max_samples: Forwarded as ``max_samples``.
+        train_on_prompt: Forwarded as ``train_on_prompt``; for stage "rm" it is also handed
+            to the pairwise collator.
+
+    Raises:
+        NotImplementedError: If ``stage`` is none of "pt", "sft", "rm".
+    """
+    model_args, data_args, training_args, finetuning_args, _ = get_train_args(
+        dict(
+            stage=stage,
+            model_name_or_path=model_name_or_path,
+            dataset=dataset,
+            dataset_dir=dataset_dir,
+            template=template,
+            cutoff_len=cutoff_len,
+            max_samples=max_samples,
+            train_on_prompt=train_on_prompt,
+            preprocessing_num_workers=16,
+            output_dir="dummy_dir",
+            overwrite_cache=True,
+            do_train=True,
+        )
+    )
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    # Keep the template object in its own name instead of rebinding the `template` string argument.
+    template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
+    trainset = get_dataset(template_obj, model_args, data_args, training_args, stage, **tokenizer_module)[
+        "train_dataset"
+    ]
+    model = load_model(tokenizer, model_args, finetuning_args, is_trainable=False)
+    if stage == "pt":
+        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+    elif stage == "sft":
+        data_collator = MultiModalDataCollatorForSeq2Seq(
+            template=template_obj, tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX
+        )
+    elif stage == "rm":
+        data_collator = PairwiseDataCollatorWithPadding(
+            template=template_obj,
+            tokenizer=tokenizer,
+            label_pad_token_id=IGNORE_INDEX,
+            train_on_prompt=train_on_prompt,
+        )
+    else:
+        raise NotImplementedError(f"Stage does not supported: {stage}.")
+
+    dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
+    criterion = torch.nn.CrossEntropyLoss(reduction="none")
+    total_ppl = 0
+    perplexities = []
+    batch: Dict[str, "torch.Tensor"]
+    with torch.no_grad():
+        for batch in tqdm(dataloader, desc="Computing perplexities"):
+            batch = batch.to(model.device)
+            outputs = model(**batch)
+            # Position t predicts token t + 1: drop the last logit and the first label.
+            shift_logits: "torch.Tensor" = outputs["logits"][..., :-1, :]
+            shift_labels: "torch.Tensor" = batch["labels"][..., 1:]
+            loss_mask = shift_labels != IGNORE_INDEX
+            flatten_logits = shift_logits.contiguous().view(shift_labels.size(0) * shift_labels.size(1), -1)
+            flatten_labels = shift_labels.contiguous().view(-1)
+            # Cross entropy is the negative log-likelihood of each target token.
+            token_nll: "torch.Tensor" = criterion(flatten_logits, flatten_labels)
+            token_nll = token_nll.contiguous().view(shift_logits.size(0), -1)
+            # NOTE(review): a sample without any supervised token has a zero denominator here.
+            sentence_nll = (token_nll * loss_mask).sum(-1) / loss_mask.sum(-1)
+            sentence_ppl = sentence_nll.exp()  # computed once, reused for the sum and the list
+            total_ppl += sentence_ppl.sum().item()
+            perplexities.extend(sentence_ppl.tolist())
+
+    with open(save_name, "w", encoding="utf-8") as f:
+        json.dump(perplexities, f, indent=2)
+
+    if perplexities:
+        print(f"Average perplexity is {total_ppl / len(perplexities):.2f}")
+    else:
+        # An empty dataset would otherwise end in a ZeroDivisionError.
+        print("No samples were evaluated; average perplexity is undefined.")
+
+    print(f"Perplexities have been saved at {save_name}.")
+
+
+if __name__ == "__main__":
+    # Expose calculate_ppl as a command-line entry point through python-fire.
+    fire.Fire(calculate_ppl)
--- a/scripts/stat_utils/length_cdf.py
+++ b/scripts/stat_utils/length_cdf.py
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+
+import fire
+from tqdm import tqdm
+
+from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
+from llamafactory.hparams import get_train_args
+from llamafactory.model import load_tokenizer
+
+
+def length_cdf(
+    model_name_or_path: str,
+    dataset: str = "alpaca_en_demo",
+    dataset_dir: str = "data",
+    template: str = "default",
+    interval: int = 1000,
+):
+    r"""
+    Prints the cumulative distribution of tokenized input lengths in the dataset.
+    Usage: export CUDA_VISIBLE_DEVICES=0
+    python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default
+
+    Lengths are grouped into buckets of width ``interval``; one line is printed per non-empty
+    bucket with the running sample count and percentage.
+    """
+    train_kwargs = dict(
+        stage="sft",
+        model_name_or_path=model_name_or_path,
+        dataset=dataset,
+        dataset_dir=dataset_dir,
+        template=template,
+        cutoff_len=1_000_000,
+        preprocessing_num_workers=16,
+        output_dir="dummy_dir",
+        overwrite_cache=True,
+        do_train=True,
+    )
+    model_args, data_args, training_args, _, _ = get_train_args(train_kwargs)
+    tokenizer_module = load_tokenizer(model_args)
+    template_obj = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args)
+    dataset_module = get_dataset(template_obj, model_args, data_args, training_args, "sft", **tokenizer_module)
+    trainset = dataset_module["train_dataset"]
+    total_num = len(trainset)
+
+    # Map each bucket's lower bound to the number of samples falling into it.
+    bucket_counts = defaultdict(int)
+    for input_ids in tqdm(trainset["input_ids"], desc="Collecting lengths"):
+        bucket_start = len(input_ids) // interval * interval
+        bucket_counts[bucket_start] += 1
+
+    count_accu, prob_accu = 0, 0
+    for bucket_start, count in sorted(bucket_counts.items()):
+        count_accu += count
+        prob_accu += count / total_num * 100
+        print(f"{count_accu:d} ({prob_accu:.2f}%) samples have length < {bucket_start + interval}.")
+
+
+if __name__ == "__main__":
+    # Expose length_cdf as a command-line entry point through python-fire.
+    fire.Fire(length_cdf)
--- a/scripts/vllm_infer.py
+++ b/scripts/vllm_infer.py
+# Copyright 2024 the LlamaFactory team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+import fire
+from transformers import Seq2SeqTrainingArguments
+
+from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
+from llamafactory.extras.constants import IGNORE_INDEX
+from llamafactory.extras.misc import check_version, get_device_count
+from llamafactory.extras.packages import is_vllm_available
+from llamafactory.hparams import get_infer_args
+from llamafactory.model import load_tokenizer
+
+
+if is_vllm_available():
+    from vllm import LLM, SamplingParams
+    from vllm.lora.request import LoRARequest
+
+
+def vllm_infer(
+    model_name_or_path: str,
+    adapter_name_or_path: str = None,
+    dataset: str = "alpaca_en_demo",
+    dataset_dir: str = "data",
+    template: str = "default",
+    cutoff_len: int = 2048,
+    max_samples: int = None,
+    vllm_config: str = "{}",
+    save_name: str = "generated_predictions.jsonl",
+    temperature: float = 0.95,
+    top_p: float = 0.7,
+    top_k: int = 50,
+    max_new_tokens: int = 1024,
+    repetition_penalty: float = 1.0,
+    pipeline_parallel_size: int = 1,
+    image_resolution: int = 512 * 512,
+):
+    r"""
+    Performs batch generation using vLLM engine, which supports tensor parallelism.
+    Usage: python vllm_infer.py --model_name_or_path meta-llama/Llama-2-7b-hf --template llama --dataset alpaca_en_demo
+
+    Every sample of the dataset is turned into a vLLM prompt (token ids plus optional images),
+    generated once, and written to ``save_name`` as one JSON object per line with the keys
+    ``prompt``, ``predict`` and ``label``.
+
+    Args:
+        model_name_or_path: Model handed to the vLLM engine.
+        adapter_name_or_path: Optional adapter; when set, LoRA is enabled and the first adapter is used.
+        dataset: Dataset name forwarded to the argument parser.
+        dataset_dir: Dataset directory forwarded to the argument parser.
+        template: Template name forwarded to the argument parser.
+        cutoff_len: Forwarded as ``cutoff_len``.
+        max_samples: Forwarded as ``max_samples``.
+        vllm_config: Forwarded as ``vllm_config``; if the parsed value is a dict, it overrides engine arguments.
+        save_name: Path of the JSON-lines output file.
+        temperature: Sampling temperature.
+        top_p: Nucleus sampling threshold; a falsy value is replaced by 1.0.
+        top_k: Top-k sampling size.
+        max_new_tokens: Maximum number of generated tokens per prompt.
+        repetition_penalty: Repetition penalty; a falsy value is replaced by 1.0.
+        pipeline_parallel_size: Number of pipeline stages; the remaining devices form the tensor parallel group.
+        image_resolution: Forwarded to the multimodal plugin when regularizing images.
+
+    Raises:
+        ValueError: If ``pipeline_parallel_size`` is not positive or exceeds the device count.
+    """
+    check_version("vllm>=0.4.3,<=0.6.5")
+    # Reject non-positive sizes up front: they would otherwise reach the floor division below.
+    if pipeline_parallel_size < 1:
+        raise ValueError("Pipeline parallel size should be a positive integer.")
+
+    # Equality is allowed, so the message says "not larger than" rather than "smaller than".
+    if pipeline_parallel_size > get_device_count():
+        raise ValueError("Pipeline parallel size should not be larger than the number of gpus.")
+
+    model_args, data_args, _, generating_args = get_infer_args(
+        dict(
+            model_name_or_path=model_name_or_path,
+            adapter_name_or_path=adapter_name_or_path,
+            dataset=dataset,
+            dataset_dir=dataset_dir,
+            template=template,
+            cutoff_len=cutoff_len,
+            max_samples=max_samples,
+            preprocessing_num_workers=16,
+            vllm_config=vllm_config,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            max_new_tokens=max_new_tokens,
+            repetition_penalty=repetition_penalty,
+        )
+    )
+
+    training_args = Seq2SeqTrainingArguments(output_dir="dummy_dir")
+    tokenizer_module = load_tokenizer(model_args)
+    tokenizer = tokenizer_module["tokenizer"]
+    template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
+    template_obj.mm_plugin.expand_mm_tokens = False  # for vllm generate
+    dataset_module = get_dataset(template_obj, model_args, data_args, training_args, "ppo", **tokenizer_module)
+
+    inputs, prompts, labels = [], [], []
+    for sample in dataset_module["train_dataset"]:
+        if sample["images"]:
+            multi_modal_data = {
+                "image": template_obj.mm_plugin._regularize_images(sample["images"], image_resolution=image_resolution)
+            }
+        else:
+            multi_modal_data = None
+
+        inputs.append({"prompt_token_ids": sample["input_ids"], "multi_modal_data": multi_modal_data})
+        prompts.append(tokenizer.decode(sample["input_ids"], skip_special_tokens=False))
+        # Ignored label positions cannot be decoded, so drop them first.
+        label_ids = [token_id for token_id in sample["labels"] if token_id != IGNORE_INDEX]
+        labels.append(tokenizer.decode(label_ids, skip_special_tokens=False))
+
+    sampling_params = SamplingParams(
+        repetition_penalty=generating_args.repetition_penalty or 1.0,  # repetition_penalty must > 0
+        temperature=generating_args.temperature,
+        top_p=generating_args.top_p or 1.0,  # top_p must > 0
+        top_k=generating_args.top_k,
+        stop_token_ids=template_obj.get_stop_token_ids(tokenizer),
+        max_tokens=generating_args.max_new_tokens,
+        skip_special_tokens=False,
+    )
+    use_lora = model_args.adapter_name_or_path is not None
+    lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0]) if use_lora else None
+
+    engine_args = {
+        "model": model_args.model_name_or_path,
+        "trust_remote_code": True,
+        "dtype": model_args.infer_dtype,
+        # Devices not used for pipeline stages form the tensor parallel group (at least 1).
+        "tensor_parallel_size": (get_device_count() // pipeline_parallel_size) or 1,
+        "pipeline_parallel_size": pipeline_parallel_size,
+        "disable_log_stats": True,
+        "enable_lora": use_lora,
+    }
+    if template_obj.mm_plugin.__class__.__name__ != "BasePlugin":
+        engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2}
+
+    # User-supplied engine options take precedence over the defaults above.
+    if isinstance(model_args.vllm_config, dict):
+        engine_args.update(model_args.vllm_config)
+
+    results = LLM(**engine_args).generate(inputs, sampling_params, lora_request=lora_request)
+    preds = [result.outputs[0].text for result in results]
+    with open(save_name, "w", encoding="utf-8") as f:
+        for text, pred, label in zip(prompts, preds, labels):
+            f.write(json.dumps({"prompt": text, "predict": pred, "label": label}, ensure_ascii=False) + "\n")
+
+    print("*" * 70)
+    print(f"{len(prompts)} generated results have been saved at {save_name}.")
+    print("*" * 70)
+
+
+if __name__ == "__main__":
+    # Expose vllm_infer as a command-line entry point through python-fire.
+    fire.Fire(vllm_infer)
--- a/setup.py
+++ b/setup.py
@@ -54,13 +54,26 @@ extra_require = {
    "gptq": ["optimum>=1.17.0", "auto-gptq>=0.5.0"],
    "awq": ["autoawq"],
    "aqlm": ["aqlm[gpu]>=1.1.0"],
-    "vllm": ["vllm>=0.4.3,<0.6.4"],
+    "vllm": ["vllm>=0.4.3,<=0.6.5"],
    "galore": ["galore-torch"],
+    "apollo": ["apollo-torch"],
    "badam": ["badam>=1.2.1"],
    "adam-mini": ["adam-mini"],
    "qwen": ["transformers_stream_generator"],
+    "minicpm_v": [
+        "soundfile",
+        "torchvision",
+        "torchaudio",
+        "vector_quantize_pytorch",
+        "vocos",
+        "msgpack",
+        "referencing",
+        "jsonschema_specifications",
+        "librosa",
+    ],
    "modelscope": ["modelscope"],
    "openmind": ["openmind"],
+    "swanlab": ["swanlab"],
    "dev": ["pre-commit", "ruff", "pytest"],
 }

@@ -70,7 +83,7 @@ def main():
        name="llamafactory",
        version=get_version(),
        author="hiyouga",
-        author_email="hiyouga" "@" "buaa.edu.cn",
+        author_email="hiyouga AT buaa.edu.cn",
        description="Easy-to-use LLM fine-tuning framework",
        long_description=open("README.md", encoding="utf-8").read(),
        long_description_content_type="text/markdown",

--- a/src/llamafactory/__init__.py
+++ b/src/llamafactory/__init__.py
@@ -30,7 +30,7 @@ Dependency graph:
  longlora:
    transformers>=4.41.2,<=4.46.1
  packing:
-    transformers>=4.41.2,<=4.46.1
+    transformers>=4.43.0,<=4.46.1

 Disable version checking: DISABLE_VERSION_CHECK=1
 Enable VRAM recording: RECORD_VRAM=1