Commit 27a7ad86 authored by luopl

update to v0.9.1

parent 731cf9b8
@@ -33,24 +33,31 @@ llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml

```bash
llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml
llamafactory-cli train examples/train_lora/qwen2vl_lora_sft.yaml
```

#### DPO/ORPO/SimPO Training

```bash
llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
```

#### Multimodal DPO/ORPO/SimPO Training

```bash
llamafactory-cli train examples/train_lora/qwen2vl_lora_dpo.yaml
```

#### Reward Modeling

```bash
llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml
```

#### PPO Training

```bash
llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml
```

#### KTO Training
@@ -133,6 +140,12 @@ FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llama
FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
```

#### Multimodal Supervised Fine-Tuning

```bash
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
```

#### Batch Predicting and Computing BLEU and ROUGE Scores

```bash
......
We provide diverse examples for fine-tuning large language models.
Make sure to execute these commands in the `LLaMA-Factory` directory.

## Table of Contents

@@ -11,7 +11,7 @@
- [Inferring LoRA Fine-Tuned Models](#推理-lora-模型)
- [Extras](#杂项)

Use `CUDA_VISIBLE_DEVICES` (GPU) or `ASCEND_RT_VISIBLE_DEVICES` (NPU) to choose computing devices.
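For instance, restricting a run to the first two GPUs can be combined with any command below; a minimal illustration using a config that ships with the repository:

```bash
# run LoRA SFT on GPUs 0 and 1 only
CUDA_VISIBLE_DEVICES=0,1 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
```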
## Examples
@@ -33,24 +33,31 @@ llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml

```bash
llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml
llamafactory-cli train examples/train_lora/qwen2vl_lora_sft.yaml
```

#### DPO/ORPO/SimPO Training

```bash
llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
```

#### Multimodal DPO/ORPO/SimPO Training

```bash
llamafactory-cli train examples/train_lora/qwen2vl_lora_dpo.yaml
```

#### Reward Modeling

```bash
llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml
```

#### PPO Training

```bash
llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml
```

#### KTO Training

@@ -133,6 +140,12 @@ FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llama
FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
```

#### Multimodal Supervised Fine-Tuning

```bash
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
```

#### Batch Predicting and Computing BLEU and ROUGE Scores

```bash
......
model_name_or_path: llava-hf/llava-1.5-7b-hf
template: llava
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
template: qwen2_vl
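These inference configs can be passed to the CLI chat entry point. A minimal sketch, assuming the Qwen2-VL config above is saved as `examples/inference/qwen2_vl.yaml` (the path is an assumption, not confirmed by this diff):

```bash
# interactive chat with the Qwen2-VL instruct model (config path assumed)
llamafactory-cli chat examples/inference/qwen2_vl.yaml
```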
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
### model
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
adapter_name_or_path: saves/qwen2_vl-7b/lora/sft
template: qwen2_vl
finetuning_type: lora
### export
export_dir: models/qwen2_vl_lora_sft
export_size: 2
export_device: cpu
export_legacy_format: false
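The adapter-merging config above is consumed by the export command. A hedged example, assuming the file lives at `examples/merge_lora/qwen2vl_lora_sft.yaml` (the path follows the naming convention of the other examples and is not stated in this commit):

```bash
# merge the LoRA adapter into the base Qwen2-VL weights and write them to export_dir
llamafactory-cli export examples/merge_lora/qwen2vl_lora_sft.yaml
```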
### model
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
### method
stage: sft
do_train: true
finetuning_type: full
deepspeed: examples/deepspeed/ds_z3_config.json
### dataset
dataset: mllm_demo,identity
template: qwen2_vl
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/qwen2_vl-7b/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: llava-hf/llava-1.5-7b-hf

### method
stage: sft

@@ -10,7 +9,7 @@ lora_target: all

### dataset
dataset: mllm_demo
template: llava
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
......
### model
model_name_or_path: /data/luopl/Qwen/Qwen2.5-72B
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: q_proj,v_proj
deepspeed: examples/deepspeed/ds_z3_config.json
### dataset
dataset: identity,alpaca_zh_demo,alpaca_en_demo
template: qwen
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 4
### output
output_dir: saves/qwen2.5_72b/lora/sft/
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 250
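The ZeRO-3 config above targets a multi-GPU node; a hypothetical launch (the YAML filename is illustrative, not part of this commit) would follow the same pattern as the other DeepSpeed examples:

```bash
# ZeRO-3 LoRA SFT of Qwen2.5-72B across all visible GPUs (config filename assumed)
FORCE_TORCHRUN=1 llamafactory-cli train qwen2.5_72b_lora_sft_ds3.yaml
```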
### model
model_name_or_path: /data/luopl/Qwen/Qwen2.5-72B
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: q_proj,v_proj
deepspeed: examples/deepspeed/ds_z3_offload_config.json
### dataset
dataset: identity,alpaca_zh_demo,alpaca_en_demo
template: qwen
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 4
### output
output_dir: saves/qwen2.5_72b/lora/sft/
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 250
### model
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
### method
stage: dpo
do_train: true
finetuning_type: lora
lora_target: all
pref_beta: 0.1
pref_loss: sigmoid # choices: [sigmoid (dpo), orpo, simpo]
### dataset
dataset: rlhf_v
template: qwen2_vl
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/qwen2_vl-7b/lora/dpo
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 5.0e-6
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
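For orientation, `pref_beta` in the config above is the β of the preference objective selected by `pref_loss`; with `pref_loss: sigmoid` this is the standard DPO loss (formula quoted from the DPO paper, not from this repository):

$$\mathcal{L}_{\mathrm{DPO}} = -\log \sigma\!\left(\beta \log \frac{\pi_\theta(y_w \mid x)}{\pi_{\mathrm{ref}}(y_w \mid x)} - \beta \log \frac{\pi_\theta(y_l \mid x)}{\pi_{\mathrm{ref}}(y_l \mid x)}\right)$$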
### model
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: mllm_demo,identity # video: mllm_video_demo
template: qwen2_vl
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/qwen2_vl-7b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
transformers==4.41.2
accelerate==0.30.1
datasets==2.16.0
peft==0.11.1
trl==0.8.6
gradio==4.0.0
pandas==2.0.0
scipy
einops
sentencepiece

@@ -14,8 +14,9 @@ uvicorn
pydantic
fastapi
sse-starlette
matplotlib==3.7.0
fire
packaging
pyyaml
numpy<2.0.0
av
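These pins are picked up by `setup.py` (shown further below) when the package is installed; one plausible way to install them together with the optional extras defined there (the extras names are taken from setup.py, the exact command is not stated in this diff):

```bash
# editable install pulling in requirements.txt plus the torch and metrics extras
pip install -e ".[torch,metrics]"
```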
@@ -27,7 +27,7 @@ from llamafactory.chat import ChatModel

def calculate_flops(
    model_name_or_path: str,
    batch_size: int = 1,
    seq_length: int = 512,
    flash_attn: str = "auto",
):
    r"""
......
@@ -25,7 +25,7 @@ from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq

from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
from llamafactory.extras.constants import IGNORE_INDEX
from llamafactory.hparams import get_train_args
from llamafactory.model import load_tokenizer

@@ -39,7 +39,7 @@ def calculate_lr(
    model_name_or_path: str,
    batch_size: int,  # total batch size, namely (batch size * gradient accumulation * world size)
    stage: Literal["pt", "sft"] = "sft",
    dataset: str = "alpaca_en_demo",
    dataset_dir: str = "data",
    template: str = "default",
    cutoff_len: int = 1024,  # i.e. maximum input length during training

@@ -48,7 +48,8 @@ def calculate_lr(
):
    r"""
    Calculates the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters.
    Usage:
    python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en_demo --cutoff_len 1024 --batch_size 16
    """
    model_args, data_args, training_args, _, _ = get_train_args(
        dict(

@@ -66,7 +67,8 @@ def calculate_lr(
    )
    tokenizer_module = load_tokenizer(model_args)
    tokenizer = tokenizer_module["tokenizer"]
    template = get_template_and_fix_tokenizer(tokenizer, data_args)
    trainset = get_dataset(template, model_args, data_args, training_args, stage, **tokenizer_module)["train_dataset"]
    if stage == "pt":
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    elif stage == "sft":
......
# coding=utf-8
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import fire
import torch
import torch.distributed as dist
from transformers import AutoConfig
from llamafactory.train.tuner import run_exp
BASE = 2 # gemm (add + mul)
def compute_model_flops(
model_name_or_path: str,
total_batch_size: int,
seq_length: int,
include_backward: bool = True,
include_recompute: bool = False,
include_flashattn: bool = False,
) -> int:
r"""
Calculates the FLOPs of model per forward/backward pass.
"""
config = AutoConfig.from_pretrained(model_name_or_path)
hidden_size = getattr(config, "hidden_size", None)
vocab_size = getattr(config, "vocab_size", None)
intermediate_size = getattr(config, "intermediate_size", None)
num_attention_heads = getattr(config, "num_attention_heads", None)
num_key_value_heads = getattr(config, "num_key_value_heads", None)
num_hidden_layers = getattr(config, "num_hidden_layers", None)
tie_word_embeddings = getattr(config, "tie_word_embeddings", False)
# mlp module
mlp_flops_per_token = 3 * BASE * hidden_size * intermediate_size # up, gate, down
mlp_flops = total_batch_size * seq_length * num_hidden_layers * mlp_flops_per_token
# attn projector module
q_flops_per_token = BASE * hidden_size * hidden_size
o_flops_per_token = BASE * hidden_size * hidden_size
k_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
v_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
attn_proj_flops_per_token = q_flops_per_token + o_flops_per_token + k_flops_per_token + v_flops_per_token
attn_proj_flops = total_batch_size * seq_length * num_hidden_layers * attn_proj_flops_per_token
# attn sdpa module
sdpa_flops_per_layer = 2 * BASE * hidden_size * seq_length * seq_length # (q * k^T) * v
sdpa_flops = total_batch_size * num_hidden_layers * sdpa_flops_per_layer
# embedding module
embedding_flops_per_token = hidden_size * vocab_size
embedding_flops = total_batch_size * seq_length * embedding_flops_per_token
if tie_word_embeddings is False:
embedding_flops *= 2
non_embedding_flops = mlp_flops + attn_proj_flops + sdpa_flops
non_embedding_coeff, embedding_coeff = 1, 1
if include_backward:
non_embedding_coeff += 2
embedding_coeff += 2
if include_recompute:
non_embedding_coeff += 1
total_flops = non_embedding_coeff * non_embedding_flops + embedding_coeff * embedding_flops
if include_flashattn:
total_flops += sdpa_flops
return total_flops
def compute_device_flops(world_size: int) -> float:
r"""
Calculates the FLOPs of the device capability per second.
"""
device_name = torch.cuda.get_device_name()
if "H100" in device_name or "H800" in device_name:
return 989 * 1e12 * world_size
elif "A100" in device_name or "A800" in device_name:
return 312 * 1e12 * world_size
elif "V100" in device_name:
return 125 * 1e12 * world_size
elif "4090" in device_name:
return 98 * 1e12 * world_size
else:
raise NotImplementedError("Device not supported: {}.".format(device_name))
def calculate_mfu(
model_name_or_path: str,
batch_size: int = 1,
seq_length: int = 1024,
num_steps: int = 100,
finetuning_type: str = "lora",
flash_attn: str = "auto",
deepspeed_stage: int = 0,
disable_gc: bool = False,
liger_kernel: bool = False,
unsloth_gc: bool = False,
) -> float:
r"""
Calculates MFU for given model and hyper-params.
Usage: python cal_mfu.py --model_name_or_path path_to_model --batch_size 1 --seq_length 1024
"""
args = {
"model_name_or_path": model_name_or_path,
"flash_attn": flash_attn,
"disable_gradient_checkpointing": disable_gc,
"enable_liger_kernel": liger_kernel,
"use_unsloth_gc": unsloth_gc,
"stage": "pt",
"do_train": True,
"finetuning_type": finetuning_type,
"dataset": "c4_demo",
"cutoff_len": seq_length,
"output_dir": os.path.join("saves", "test_mfu"),
"logging_strategy": "no",
"save_strategy": "no",
"save_only_model": True,
"overwrite_output_dir": True,
"per_device_train_batch_size": batch_size,
"max_steps": num_steps,
"bf16": True,
}
if deepspeed_stage in [2, 3]:
args["deepspeed"] = "examples/deepspeed/ds_z{}_config.json".format(deepspeed_stage)
run_exp(args)
with open(os.path.join("saves", "test_mfu", "all_results.json"), "r", encoding="utf-8") as f:
result = json.load(f)
if dist.is_initialized():
world_size = dist.get_world_size()
else:
world_size = 1
total_batch_size = batch_size * world_size
mfu_value = (
result["train_steps_per_second"]
* compute_model_flops(model_name_or_path, total_batch_size, seq_length)
/ compute_device_flops(world_size)
)
print("MFU: {:.2f}%".format(mfu_value * 100))
if __name__ == "__main__":
fire.Fire(calculate_mfu)
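The script above reports MFU as achieved training throughput relative to the device peak; in symbols (a restatement of the `mfu_value` expression, with purely illustrative numbers):

$$\mathrm{MFU} = \frac{\text{steps/s} \times \text{model FLOPs per step}}{\text{peak device FLOPs/s}}, \qquad \text{e.g. } \frac{1.0 \times 1.5\times10^{14}}{312\times10^{12}} \approx 48\%.$$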
@@ -23,7 +23,7 @@ from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq

from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
from llamafactory.extras.constants import IGNORE_INDEX
from llamafactory.hparams import get_train_args
from llamafactory.model import load_model, load_tokenizer

@@ -55,12 +55,12 @@ class PairwiseDataCollatorWithPadding(DataCollatorForSeq2Seq):
        return super().__call__(chosen_features)


def calculate_ppl(
    model_name_or_path: str,
    save_name: str,
    batch_size: int = 4,
    stage: Literal["pt", "sft", "rm"] = "sft",
    dataset: str = "alpaca_en_demo",
    dataset_dir: str = "data",
    template: str = "default",
    cutoff_len: int = 1024,

@@ -69,7 +69,7 @@ def cal_ppl(
):
    r"""
    Calculates the ppl on the dataset of the pre-trained models.
    Usage: python cal_ppl.py --model_name_or_path path_to_model --dataset alpaca_en_demo --save_name ppl.json
    """
    model_args, data_args, training_args, finetuning_args, _ = get_train_args(
        dict(

@@ -88,7 +88,8 @@ def cal_ppl(
    )
    tokenizer_module = load_tokenizer(model_args)
    tokenizer = tokenizer_module["tokenizer"]
    template = get_template_and_fix_tokenizer(tokenizer, data_args)
    trainset = get_dataset(template, model_args, data_args, training_args, stage, **tokenizer_module)["train_dataset"]
    model = load_model(tokenizer, model_args, finetuning_args, is_trainable=False)
    if stage == "pt":
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

@@ -129,4 +130,4 @@ def cal_ppl(
if __name__ == "__main__":
    fire.Fire(calculate_ppl)
@@ -18,21 +18,21 @@ from collections import defaultdict
import fire
from tqdm import tqdm

from llamafactory.data import get_dataset, get_template_and_fix_tokenizer
from llamafactory.hparams import get_train_args
from llamafactory.model import load_tokenizer


def length_cdf(
    model_name_or_path: str,
    dataset: str = "alpaca_en_demo",
    dataset_dir: str = "data",
    template: str = "default",
    interval: int = 1000,
):
    r"""
    Calculates the distribution of the input lengths in the dataset.
    Usage: python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default
    """
    model_args, data_args, training_args, _, _ = get_train_args(
        dict(

@@ -48,7 +48,8 @@ def length_cdf(
        )
    )
    tokenizer_module = load_tokenizer(model_args)
    template = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args)
    trainset = get_dataset(template, model_args, data_args, training_args, "sft", **tokenizer_module)["train_dataset"]
    total_num = len(trainset)
    length_dict = defaultdict(int)
    for sample in tqdm(trainset["input_ids"]):
......
@@ -14,11 +14,12 @@
import os
import re
from typing import List

from setuptools import find_packages, setup


def get_version() -> str:
    with open(os.path.join("src", "llamafactory", "extras", "env.py"), "r", encoding="utf-8") as f:
        file_content = f.read()
        pattern = r"{}\W*=\W*\"([^\"]+)\"".format("VERSION")

@@ -26,25 +27,34 @@ def get_version():
        return version


def get_requires() -> List[str]:
    with open("requirements.txt", "r", encoding="utf-8") as f:
        file_content = f.read()
        lines = [line.strip() for line in file_content.strip().split("\n") if not line.startswith("#")]
        return lines


def get_console_scripts() -> List[str]:
    console_scripts = ["llamafactory-cli = llamafactory.cli:main"]
    if os.environ.get("ENABLE_SHORT_CONSOLE", "1").lower() in ["true", "1"]:
        console_scripts.append("lmf = llamafactory.cli:main")
    return console_scripts


extra_require = {
    "torch": ["torch>=1.13.1"],
    "torch-npu": ["torch==2.1.0", "torch-npu==2.1.0.post3", "decorator"],
    "metrics": ["nltk", "jieba", "rouge-chinese"],
    "deepspeed": ["deepspeed>=0.10.0,<=0.14.4"],
    "liger-kernel": ["liger-kernel"],
    "bitsandbytes": ["bitsandbytes>=0.39.0"],
    "hqq": ["hqq"],
    "eetq": ["eetq"],
    "gptq": ["optimum>=1.17.0", "auto-gptq>=0.5.0"],
    "awq": ["autoawq"],
    "aqlm": ["aqlm[gpu]>=1.1.0"],
    "vllm": ["vllm>=0.4.3,<=0.6.2"],
    "galore": ["galore-torch"],
    "badam": ["badam>=1.2.1"],
    "adam-mini": ["adam-mini"],

@@ -71,7 +81,7 @@ def main():
        python_requires=">=3.8.0",
        install_requires=get_requires(),
        extras_require=extra_require,
        entry_points={"console_scripts": get_console_scripts()},
        classifiers=[
            "Development Status :: 4 - Beta",
            "Intended Audience :: Developers",
......
@@ -20,20 +20,27 @@ Level:
Dependency graph:
  main:
    transformers>=4.41.2,<=4.45.2
    datasets>=2.16.0,<=2.21.0
    accelerate>=0.30.1,<=0.34.2
    peft>=0.11.1,<=0.12.0
    trl>=0.8.6,<=0.9.6
  attention:
    transformers>=4.42.4 (gemma+fa2)
  longlora:
    transformers>=4.41.2,<=4.45.2
  packing:
    transformers>=4.41.2,<=4.45.2

Disable version checking: DISABLE_VERSION_CHECK=1
Enable VRAM recording: RECORD_VRAM=1
Force check imports: FORCE_CHECK_IMPORTS=1
Force using torchrun: FORCE_TORCHRUN=1
Set logging verbosity: LLAMAFACTORY_VERBOSITY=WARN
Use modelscope: USE_MODELSCOPE_HUB=1
"""

from .extras.env import VERSION


__version__ = VERSION
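The environment switches documented in the module docstring above compose with the CLI; a hedged illustration combining several of them (the config path is taken from the examples earlier in this commit):

```bash
# quieter logs, skip the dependency version check, and force a torchrun launch
LLAMAFACTORY_VERBOSITY=WARN DISABLE_VERSION_CHECK=1 FORCE_TORCHRUN=1 \
  llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
```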
@@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import os
from contextlib import asynccontextmanager
from functools import partial
from typing import Optional

from typing_extensions import Annotated

@@ -50,14 +52,24 @@ if is_uvicorn_available():
    import uvicorn


async def sweeper() -> None:
    while True:
        torch_gc()
        await asyncio.sleep(300)


@asynccontextmanager
async def lifespan(app: "FastAPI", chat_model: "ChatModel"):  # collects GPU memory
    if chat_model.engine_type == "huggingface":
        asyncio.create_task(sweeper())

    yield
    torch_gc()


def create_app(chat_model: "ChatModel") -> "FastAPI":
    root_path = os.environ.get("FASTAPI_ROOT_PATH", "")
    app = FastAPI(lifespan=partial(lifespan, chat_model=chat_model), root_path=root_path)
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],

@@ -65,7 +77,7 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
        allow_methods=["*"],
        allow_headers=["*"],
    )

    api_key = os.environ.get("API_KEY", None)
    security = HTTPBearer(auto_error=False)

    async def verify_api_key(auth: Annotated[Optional[HTTPAuthorizationCredentials], Depends(security)]):

@@ -79,7 +91,7 @@ def create_app(chat_model: "ChatModel") -> "FastAPI":
        dependencies=[Depends(verify_api_key)],
    )
    async def list_models():
        model_card = ModelCard(id=os.environ.get("API_MODEL_NAME", "gpt-3.5-turbo"))
        return ModelList(data=[model_card])

    @app.post(
......
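The `API_KEY`, `API_MODEL_NAME`, and `FASTAPI_ROOT_PATH` variables introduced above are read at server start-up; a hypothetical launch (the key, model name, and config path are placeholders, not values from this commit):

```bash
# serve an OpenAI-compatible API behind a reverse-proxy prefix with bearer-token auth
API_KEY=sk-example API_MODEL_NAME=qwen2-vl-7b-instruct FASTAPI_ROOT_PATH=/llm \
  llamafactory-cli api examples/inference/qwen2_vl.yaml
```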