Commit 0938ae70 authored by zhaoying1

fix save method of adapter_model.bin

parent 1b73554f
@@ -3,4 +3,4 @@ COPY requirements.txt requirements.txt
 RUN source /opt/dtk-23.04/env.sh
 RUN cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
 ENV LANG C.UTF-8
-RUN pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+RUN pip install -r requirements.txt --no-dependencies -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
@@ -23,9 +23,9 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk
 ```
 Install the dependencies that are not included in the docker image:
 ```
-pip install transformers==4.28.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
-pip install datasets accelerate peft trl tiktoken jieba rouge-chinese nltk gradio matplotlib uvicore fastapi sse-starlette
+pip install transformers==4.31.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+pip install accelerate==0.22.0 --no-dependencies -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+pip install datasets peft trl tiktoken jieba rouge-chinese nltk gradio matplotlib uvicore fastapi sse-starlette
 ```
@@ -51,9 +51,11 @@ conda create -n chatglm python=3.8
 3. Install the other dependencies according to requirements.txt:
 ```
-pip install -r requirements.txt
+pip install -r requirements.txt --no-dependencies -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
 ```
+Note: if accelerate, transformers, or another library complains about a dependency on deepspeed 0.9.3, comment out the corresponding version-check code; deepspeed 0.9.3 has not been adapted yet, while deepspeed 0.9.2 works.
 ## Dataset
 The input data are json files placed in the project's [data](.data) directory and specified with the --dataset option (see the examples below); separate multiple input files with `,`. The example format and fields of a json file are described below:
@@ -79,6 +81,8 @@ The json file stores a list; each element of the list is a sample.
 ```
 For how to use the datasets, see [data/README.md](data/README_zh.md).
+Note: configure the dataset_dir path at L38 of [./src/llmtuner/hparams/data_args.py](src/llmtuner/hparams/data_args.py).
 ## Model download
 Hugging Face model download links:
...
 {
   "train_micro_batch_size_per_gpu": "auto",
+  "train_batch_size": "auto",
   "zero_allow_untested_optimizer": true,
   "fp16": {
     "enabled": "auto",
...
 {
   "train_micro_batch_size_per_gpu": "auto",
+  "train_batch_size": "auto",
   "zero_allow_untested_optimizer": true,
   "fp16": {
     "enabled": "auto",
...
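Both deepspeed.json variants now set `"train_batch_size": "auto"` alongside the per-GPU micro batch size, letting the HF Trainer fill in the global batch size at launch instead of it being hard-coded. A minimal sketch of the identity DeepSpeed enforces when resolving these "auto" values (the helper name is hypothetical):
```
# Hypothetical helper: DeepSpeed requires
# train_batch_size == train_micro_batch_size_per_gpu * gradient_accumulation_steps * world_size
def resolve_train_batch_size(micro_batch_per_gpu: int,
                             gradient_accumulation_steps: int,
                             world_size: int) -> int:
    return micro_batch_per_gpu * gradient_accumulation_steps * world_size

# e.g. the 13B run below: 1 per device, 1 accumulation step, 4 nodes x 4 DCUs = 16 ranks
assert resolve_train_batch_size(1, 1, 16) == 16
```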
 #!/bin/bash
+export HSA_FORCE_FINE_GRAIN_PCIE=1
 export MIOPEN_FIND_MODE=3
-export MIOPEN_COMPILE_PARALLEL_LEVEL=1
-export NCCL_PLUGIN_P2P=ucx
-export RCCL_NCHANNELS=2
-export NCCL_SOCKET_IFNAME=ib0
-export NCCL_P2P_LEVEL=5
+export GPU_MAX_HW_QUEUES=16
 lrank=$OMPI_COMM_WORLD_LOCAL_RANK
-echo "LRANK===============================$lrank"
-RANK=$OMPI_COMM_WORLD_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-export NCCL_IB_HCA=mlx5_0 # NIC 0
+comm_rank=$OMPI_COMM_WORLD_RANK
+comm_size=$OMPI_COMM_WORLD_SIZE
+export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+export RANK=$comm_rank
+export WORLD_SIZE=$comm_size
+export MASTER_ADDR=$1
+export MASTER_PORT=29500
+export NCCL_IB_HCA=mlx5
+export NCCL_SOCKET_IFNAME=ib0
+export HIP_DIRECT_DISPATCH=0
 APP="python3 ../src/train_bash.py --stage sft \
-        --model_name_or_path ../../baichun-7b \
+        --model_name_or_path ../../baichuan-13b-base \
         --do_train \
         --template default \
-        --dataset alpaca_gpt4_en,alpaca_gpt4_zh,codealpaca \
+        --dataset alpaca_gpt4_en \
         --finetuning_type lora \
         --lora_rank 16 \
         --lora_target W_pack,o_proj,gate_proj,down_proj,up_proj \
-        --output_dir output/baichuan-7b-lora-2-3 \
+        --output_dir out/baichuan-7b-lora-test7 \
-        --per_device_train_batch_size 8 \
+        --per_device_train_batch_size 1 \
-        --per_device_eval_batch_size 8 \
+        --per_device_eval_batch_size 1 \
         --gradient_accumulation_steps 1 \
-        --preprocessing_num_workers 16 \
+        --preprocessing_num_workers 8 \
         --lr_scheduler_type cosine \
         --logging_steps 10 \
-        --save_steps 2000 \
+        --save_steps 2 \
+        --eval_steps 2 \
         --learning_rate 1e-4 \
         --max_grad_norm 0.5 \
         --num_train_epochs 1.0 \
+        --val_size 0.001 \
+        --evaluation_strategy steps \
+        --load_best_model_at_end \
         --plot_loss \
         --fp16 \
         --deepspeed deepspeed.json
...
 #!/bin/bash
-#SBATCH -p kshdnormal
+#SBATCH -p kshdexclu11
-#SBATCH -N 32
+#SBATCH -N 4
 #SBATCH --cpus-per-task=1
 #SBATCH --ntasks-per-node=32
 #SBATCH --gres=dcu:4
 #SBATCH -J baichuan
-#SBATCH -o logs-7B/baichuan-lora-%j.out
+#SBATCH -o logs-13B/baichuan-lora-%j.out
-#SBATCH -e logs-7B/baichuan-lora-%j.err
+#SBATCH -e logs-13B/baichuan-lora-%j.err
+#SBATCH --exclusive
-ulimit -u 200000
-ulimit -s unlimited
-export HIP_VISIBLE_DEVICES=0,1,2,3
-export MIOPEN_FIND_MODE=3
-export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
-export MIOPEN_USER_DB_PATH=/tmp/miopen-udb
-export MIOPEN_CUSTOM_CACHE_DIR=/tmp/miopen-cache
-export NCCL_SOCKET_IFNAME=ib0
 export HSA_FORCE_FINE_GRAIN_PCIE=1
 export OMP_NUM_THREADS=1
+export NCCL_IB_HCA=mlx5
 export NCCL_DEBUG=INFO
-export MIOPEN_FIND_MODE=3
-export HSA_FORCE_FINE_GRAIN_PCIE=1
 export MIOPEN_COMPILE_PARALLEL_LEVEL=1
 export NCCL_PLUGIN_P2P=ucx
-export NCCL_SOCKET_IFNAME=ib0
 export NCCL_P2P_LEVEL=5
 echo "START TIME: $(date)"
-hostfile=./hostfile/$SLURM_JOB_ID
-nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST ))
-nodes_array=($nodes)
-head_node=${nodes_array[0]}
-head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
-echo Node IP: $head_node_ip
-echo headnode: $head_node
-NODE_RANK=$SLURM_NODEID
+hostfile=./hostfile/$SLURM_JOB_ID # get the node list for this job
 scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
 rm `pwd`/hostfile-dl -f
 for i in `cat $hostfile`
 do
-echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
+echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID # node name
 done
-np=$(cat $hostfile|sort|uniq |wc -l)
+np=$(cat $hostfile|sort|uniq |wc -l) # deduplicate nodes
 np=$(($np*4))
-nodename=$(cat $hostfile |sed -n "1p")
+nodename=$(cat $hostfile |sed -n "1p") # read the node list; the first entry is the master node
 dist_url=`echo $nodename | awk '{print $1}'`
-mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run-7b-sft-lora-single.sh $dist_url $np
+mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run-7b-single-lora.sh $dist_url
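For context, the sbatch script builds an OpenMPI hostfile with 4 slots per node (one per DCU) and passes the first hostname to the per-rank script as `$1`, which becomes MASTER_ADDR. A hedged sketch of how the exported variables are consumed on each rank; the HF Trainer performs an equivalent initialization internally:
```
import os
import torch
import torch.distributed as dist

def init_from_mpi_env() -> None:
    # Exported by run-7b-single-lora.sh from the OMPI_COMM_WORLD_* variables
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)  # one DCU per local rank
    # MASTER_ADDR ($1, the first hostname) and MASTER_PORT=29500 locate rank 0
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
```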
 import torch
 from typing import Any, Dict, Generator, List, Optional, Tuple
 from threading import Thread
-from transformers import TextIteratorStreamer
+from transformers import GenerationConfig, TextIteratorStreamer
 from llmtuner.extras.misc import dispatch_model, get_logits_processor
 from llmtuner.extras.template import get_template_and_fix_tokenizer
@@ -14,7 +14,6 @@ class ChatModel:
         model_args, data_args, finetuning_args, self.generating_args = get_infer_args(args)
         self.model, self.tokenizer = load_model_and_tokenizer(model_args, finetuning_args)
         self.model = dispatch_model(self.model)
-        self.model = self.model.eval() # enable evaluation mode
         self.template = get_template_and_fix_tokenizer(data_args.template, self.tokenizer)
         self.system_prompt = data_args.system_prompt
@@ -41,26 +40,30 @@ class ChatModel:
         max_length = input_kwargs.pop("max_length", None)
         max_new_tokens = input_kwargs.pop("max_new_tokens", None)
-        gen_kwargs = self.generating_args.to_dict()
-        gen_kwargs.update(dict(
-            input_ids=input_ids,
-            do_sample=do_sample if do_sample is not None else gen_kwargs["do_sample"],
-            temperature=temperature or gen_kwargs["temperature"],
-            top_p=top_p or gen_kwargs["top_p"],
-            top_k=top_k or gen_kwargs["top_k"],
-            repetition_penalty=repetition_penalty or gen_kwargs["repetition_penalty"],
-            eos_token_id=list(set([self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids)),
-            pad_token_id=self.tokenizer.pad_token_id,
-            logits_processor=get_logits_processor()
-        ))
+        generating_args = self.generating_args.to_dict()
+        generating_args.update(dict(
+            do_sample=do_sample if do_sample is not None else generating_args["do_sample"],
+            temperature=temperature or generating_args["temperature"],
+            top_p=top_p or generating_args["top_p"],
+            top_k=top_k or generating_args["top_k"],
+            repetition_penalty=repetition_penalty or generating_args["repetition_penalty"],
+            eos_token_id=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids,
+            pad_token_id=self.tokenizer.pad_token_id
+        ))
         if max_length:
-            gen_kwargs.pop("max_new_tokens", None)
-            gen_kwargs["max_length"] = max_length
+            generating_args.pop("max_new_tokens", None)
+            generating_args["max_length"] = max_length
         if max_new_tokens:
-            gen_kwargs.pop("max_length", None)
-            gen_kwargs["max_new_tokens"] = max_new_tokens
+            generating_args.pop("max_length", None)
+            generating_args["max_new_tokens"] = max_new_tokens
+        gen_kwargs = dict(
+            inputs=input_ids,
+            generation_config=GenerationConfig(**generating_args),
+            logits_processor=get_logits_processor()
+        )
         return gen_kwargs, prompt_length
...
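The refactor above packs all sampling options into a GenerationConfig, so only `inputs`, the config, and the logits processor travel in gen_kwargs. A hedged sketch of how such gen_kwargs are typically consumed for streaming output (the function name and streamer settings are assumptions, not code from this commit):
```
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(model, tokenizer, gen_kwargs):
    # Stream decoded text while model.generate runs in a background thread
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs["streamer"] = streamer
    Thread(target=model.generate, kwargs=gen_kwargs).start()
    for new_text in streamer:
        yield new_text
```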
@@ -31,11 +31,15 @@ def preprocess_dataset(
             yield query, response, history, system

     def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
-        # build grouped texts with format `X1 X2 X3 ...` (without <eos>)
-        if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding): # for tiktoken tokenizer (Qwen)
-            kwargs = dict(allowed_special="all")
+        # build grouped texts with format `X1 X2 X3 ...`
+        if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding):
+            kwargs = dict(allowed_special="all") # for tiktoken tokenizer (Qwen)
         else:
-            kwargs = dict(add_special_tokens=False)
+            kwargs = dict(add_special_tokens=True)
+        if hasattr(tokenizer, "add_bos_token") and hasattr(tokenizer, "add_eos_token"):
+            setattr(tokenizer, "add_bos_token", True) # for LLaMA tokenizer
+            setattr(tokenizer, "add_eos_token", True)
         tokenized_examples = tokenizer(examples["prompt"], **kwargs)
         concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
@@ -59,7 +63,9 @@ def preprocess_dataset(
         for query, response, history, system in construct_example(examples):
             input_ids, labels = [], []
-            for source_ids, target_ids in template.encode_multiturn(tokenizer, query, response, history, system):
+            for turn_idx, (source_ids, target_ids) in enumerate(template.encode_multiturn(
+                tokenizer, query, response, history, system
+            )):
                 if len(source_ids) > data_args.max_source_length:
                     source_ids = source_ids[:data_args.max_source_length]
                 if len(target_ids) > data_args.max_target_length:
@@ -68,8 +74,17 @@ def preprocess_dataset(
                 if len(input_ids) + len(source_ids) + len(target_ids) > max_length:
                     break
+                if turn_idx != 0 and template.efficient_eos:
+                    source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
+                else:
+                    source_mask = [IGNORE_INDEX] * len(source_ids)
                 input_ids += source_ids + target_ids
-                labels += [IGNORE_INDEX] * len(source_ids) + target_ids
+                labels += source_mask + target_ids
+            if template.efficient_eos:
+                input_ids += [tokenizer.eos_token_id]
+                labels += [tokenizer.eos_token_id]
             model_inputs["input_ids"].append(input_ids)
             model_inputs["attention_mask"].append([1] * len(input_ids))
@@ -89,6 +104,9 @@ def preprocess_dataset(
             if len(target_ids) > data_args.max_target_length:
                 target_ids = target_ids[:data_args.max_target_length]
+            if template.efficient_eos:
+                target_ids += [tokenizer.eos_token_id]
             model_inputs["input_ids"].append(source_ids)
             model_inputs["attention_mask"].append([1] * len(source_ids))
             model_inputs["labels"].append(target_ids)
@@ -109,6 +127,10 @@ def preprocess_dataset(
             if len(rejected_ids) > data_args.max_target_length:
                 rejected_ids = rejected_ids[:data_args.max_target_length]
+            if template.efficient_eos:
+                chosen_ids += [tokenizer.eos_token_id]
+                rejected_ids += [tokenizer.eos_token_id]
             model_inputs["prompt_ids"].append(prompt_ids)
             model_inputs["chosen_ids"].append(chosen_ids)
             model_inputs["rejected_ids"].append(rejected_ids)
...
@@ -5,7 +5,9 @@ from typing import TYPE_CHECKING
 from datetime import timedelta
 from transformers import TrainerCallback
-from transformers.trainer_utils import has_length
+from transformers.trainer_callback import TrainerControl, TrainerState
+from transformers.trainer_utils import has_length, PREFIX_CHECKPOINT_DIR
+from transformers.training_args import TrainingArguments
 from llmtuner.extras.constants import LOG_FILE_NAME
 from llmtuner.extras.logging import get_logger
@@ -17,6 +19,24 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
+class SavePeftModelCallback(TrainerCallback):
+
+    def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        r"""
+        Event called after a checkpoint save.
+        """
+        output_dir = os.path.join(args.output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, state.global_step))
+        getattr(kwargs.get("model"), "pretrained_model").save_pretrained(output_dir)
+        return control
+
+    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        r"""
+        Event called at the end of training.
+        """
+        getattr(kwargs.get("model"), "pretrained_model").save_pretrained(args.output_dir)
+        return control
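This callback is the substance of the commit: when the model is a wrapper (for example trl's AutoModelForCausalLMWithValueHead) around a PEFT model, the Trainer's default save serializes the wrapper's state, so the callback additionally calls save_pretrained on the inner pretrained_model to write adapter_model.bin into every checkpoint directory and into output_dir at the end of training. A hedged usage sketch; the trainer construction is assumed, not part of this diff:
```
from transformers import Trainer

def build_trainer(model, training_args, train_dataset):
    # model is assumed to expose .pretrained_model (a PEFT model), so the
    # callback's save_pretrained calls write adapter_model.bin
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        callbacks=[SavePeftModelCallback()]
    )
```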
 class LogCallback(TrainerCallback):

     def __init__(self, runner=None):
...
@@ -2,28 +2,16 @@ IGNORE_INDEX = -100
 LOG_FILE_NAME = "trainer_log.jsonl"
-VALUE_HEAD_FILE_NAME = "value_head.bin"
-FINETUNING_ARGS_NAME = "finetuning_args.json"
 LAYERNORM_NAMES = ["norm", "ln_f", "ln_attn", "ln_mlp"]
 METHODS = ["full", "freeze", "lora"]
-STAGES = [
-    "SFT",
-    "Reward Modeling",
-    "PPO",
-    "DPO",
-    "Pre-Training"
-]
-DATASET_STAGE_MAP = {
-    "SFT": "sft",
-    "Pre-Training": "pt",
+TRAINING_STAGES = {
+    "Supervised Fine-Tuning": "sft",
     "Reward Modeling": "rm",
-    "PPO": "sft",
-    "DPO": "rm"
+    "PPO": "ppo",
+    "DPO": "dpo",
+    "Pre-Training": "pt"
 }
 SUPPORTED_MODELS = {
@@ -54,11 +42,16 @@ SUPPORTED_MODELS = {
     "Baichuan-7B": "baichuan-inc/Baichuan-7B",
     "Baichuan-13B": "baichuan-inc/Baichuan-13B-Base",
     "Baichuan-13B-Chat": "baichuan-inc/Baichuan-13B-Chat",
+    "Baichuan2-7B": "baichuan-inc/Baichuan2-7B-Base",
+    "Baichuan2-13B": "baichuan-inc/Baichuan2-13B-Base",
+    "Baichuan2-7B-Chat": "baichuan-inc/Baichuan2-7B-Chat",
+    "Baichuan2-13B-Chat": "baichuan-inc/Baichuan2-13B-Chat",
     "InternLM-7B": "internlm/internlm-7b",
     "InternLM-7B-Chat": "internlm/internlm-chat-7b",
     "Qwen-7B": "Qwen/Qwen-7B",
     "Qwen-7B-Chat": "Qwen/Qwen-7B-Chat",
     "XVERSE-13B": "xverse/XVERSE-13B",
+    "XVERSE-13B-Chat": "xverse/XVERSE-13B-Chat",
     "ChatGLM2-6B-Chat": "THUDM/chatglm2-6b"
 }
@@ -70,6 +63,7 @@ DEFAULT_MODULE = {
     "BLOOMZ": "query_key_value",
     "Falcon": "query_key_value",
     "Baichuan": "W_pack",
+    "Baichuan2": "W_pack",
     "InternLM": "q_proj,v_proj",
     "Qwen": "c_attn",
     "XVERSE": "q_proj,v_proj",
@@ -80,7 +74,9 @@ DEFAULT_TEMPLATE = {
     "LLaMA2": "llama2",
     "ChineseLLaMA2": "llama2_zh",
     "Baichuan": "baichuan",
+    "Baichuan2": "baichuan2",
     "InternLM": "intern",
     "Qwen": "chatml",
+    "XVERSE": "xverse",
     "ChatGLM2": "chatglm2"
 }
-import gc
 import torch
-from typing import TYPE_CHECKING, List, Optional, Tuple
+from typing import TYPE_CHECKING, Tuple
 from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList
-from llmtuner.extras.constants import LAYERNORM_NAMES
 if TYPE_CHECKING:
     from transformers.modeling_utils import PreTrainedModel
@@ -28,12 +27,6 @@ class AverageMeter:
         self.avg = self.sum / self.count
-def get_logits_processor() -> LogitsProcessorList:
-    logits_processor = LogitsProcessorList()
-    logits_processor.append(InfNanRemoveLogitsProcessor())
-    return logits_processor
 def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
     r"""
     Returns the number of trainable parameters and number of all parameters in the model.
@@ -56,48 +49,17 @@ def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
     return trainable_params, all_param
-# Includes: (1) cast the layernorm in fp32 (2) make output embedding layer require grads (3) upcast the lm_head to fp32
-# Inspired by: https://github.com/huggingface/peft/blob/c0209c35abbf88c63aa267800d98a8e212ed0a42/src/peft/utils/other.py#L35
-def prepare_model_for_training(
-    model: "PreTrainedModel",
-    finetuning_type: str,
-    output_layer_name: Optional[str] = "lm_head",
-    use_gradient_checkpointing: Optional[bool] = True,
-    layer_norm_names: Optional[List[str]] = LAYERNORM_NAMES
-) -> "PreTrainedModel":
-    for name, param in model.named_parameters():
-        if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names):
-            param.data = param.data.to(torch.float32)
-    if use_gradient_checkpointing:
-        if hasattr(model, "enable_input_require_grads"):
-            model.enable_input_require_grads()
-        else:
-            def make_inputs_require_grad(module, input, output):
-                output.requires_grad_(True)
-            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
-        model.gradient_checkpointing_enable()
-        model.config.use_cache = False # turn off when gradient checkpointing is enabled
-    if finetuning_type != "full" and hasattr(model, output_layer_name):
-        output_layer: torch.nn.Linear = getattr(model, output_layer_name)
-        input_dtype = output_layer.weight.dtype
-        class CastOutputToFloat(torch.nn.Sequential):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return super().forward(x.to(input_dtype)).to(torch.float32)
-        setattr(model, output_layer_name, CastOutputToFloat(output_layer))
-    return model
+def get_logits_processor() -> LogitsProcessorList:
+    logits_processor = LogitsProcessorList()
+    logits_processor.append(InfNanRemoveLogitsProcessor())
+    return logits_processor
 def torch_gc() -> None:
     r"""
     Collects GPU memory.
     """
-    gc.collect()
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
         torch.cuda.ipc_collect()
...
This diff is collapsed.
 import os
 import torch
-from typing import Dict
-from transformers.trainer import WEIGHTS_NAME, WEIGHTS_INDEX_NAME
-from transformers.modeling_utils import load_sharded_checkpoint
-from llmtuner.extras.constants import VALUE_HEAD_FILE_NAME
+from transformers.trainer import WEIGHTS_NAME
 from llmtuner.extras.logging import get_logger

 logger = get_logger(__name__)
-def get_state_dict(model: torch.nn.Module) -> Dict[str, torch.Tensor]:
-    state_dict: Dict[str, torch.Tensor] = model.state_dict()
-    filtered_state_dict = {}
-    for k, v in model.named_parameters():
-        if v.requires_grad:
-            filtered_state_dict[k] = state_dict[k].cpu().clone().detach()
-    return filtered_state_dict
-
-def load_trainable_params(model: torch.nn.Module, checkpoint_dir: os.PathLike) -> bool:
-    weights_file = os.path.join(checkpoint_dir, WEIGHTS_NAME)
-    if os.path.exists(weights_file):
-        model_state_dict = torch.load(weights_file, map_location="cpu")
-        model.load_state_dict(model_state_dict, strict=False) # skip missing keys
-    elif os.path.exists(os.path.join(checkpoint_dir, WEIGHTS_INDEX_NAME)):
-        load_sharded_checkpoint(model, checkpoint_dir, strict=False)
-    else:
-        logger.warning("Provided path ({}) does not contain pre-trained weights.".format(checkpoint_dir))
-        return False
-    return True

 def load_valuehead_params(model: torch.nn.Module, checkpoint_dir: os.PathLike) -> bool:
-    valuehead_file = os.path.join(checkpoint_dir, VALUE_HEAD_FILE_NAME)
-    if not os.path.exists(valuehead_file):
+    vhead_file = os.path.join(checkpoint_dir, WEIGHTS_NAME)
+    if not os.path.exists(vhead_file):
         logger.warning("Provided path ({}) does not contain valuehead weights.".format(checkpoint_dir))
         return False
-    valuehead_state_dict = torch.load(valuehead_file, map_location="cpu")
-    model.register_buffer("reward_head_weight", valuehead_state_dict["summary.weight"])
-    model.register_buffer("reward_head_bias", valuehead_state_dict["summary.bias"])
-    model.register_buffer("default_head_weight", torch.zeros_like(valuehead_state_dict["summary.weight"]))
-    model.register_buffer("default_head_bias", torch.zeros_like(valuehead_state_dict["summary.bias"]))
+    vhead_params = torch.load(vhead_file, map_location="cpu")
+    model.register_buffer("reward_head_weight", vhead_params["v_head.summary.weight"], persistent=False)
+    model.register_buffer("reward_head_bias", vhead_params["v_head.summary.bias"], persistent=False)
+    model.register_buffer("default_head_weight", torch.zeros_like(vhead_params["v_head.summary.weight"]), persistent=False)
+    model.register_buffer("default_head_bias", torch.zeros_like(vhead_params["v_head.summary.bias"]), persistent=False)
     return True
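The value head is now read from the regular checkpoint file (WEIGHTS_NAME, i.e. pytorch_model.bin) under `v_head.*` keys, and the buffers are registered with persistent=False so they are never re-serialized into later checkpoints. A hedged usage sketch; the wrapper function and path are illustrative:
```
import torch

def attach_reward_head(model: torch.nn.Module, reward_ckpt_dir: str) -> bool:
    # reward_ckpt_dir is a placeholder path to a trained reward-model checkpoint
    ok = load_valuehead_params(model, reward_ckpt_dir)
    if ok:
        # Buffers are usable for the reward pass but stay out of state_dict()
        print(model.reward_head_weight.shape, model.reward_head_bias.shape)
    return ok
```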
@@ -20,6 +20,7 @@ class Template:
     sep: List[Union[str, Dict[str, str]]]
     stop_words: List[str]
     use_history: bool
+    efficient_eos: bool

     def encode_oneturn(
         self,
@@ -74,19 +75,19 @@ class Template:
         self,
         tokenizer: "PreTrainedTokenizer"
     ) -> Tuple[List[int], List[int]]:
-        if (
-            tokenizer.bos_token_id is not None
-            and getattr(tokenizer, "add_bos_token", True)
-        ): # baichuan-13b has no bos token
+        if tokenizer.bos_token_id is not None and getattr(tokenizer, "add_bos_token", True):
             bos_ids = [tokenizer.bos_token_id]
-        else:
-            bos_ids = [] # bos token is optional
+        else: # baichuan, qwen and gpt2 models have no bos token
+            bos_ids = []
-        if tokenizer.eos_token_id is not None:
-            eos_ids = [tokenizer.eos_token_id]
-        else:
+        if tokenizer.eos_token_id is None:
             raise ValueError("EOS token is required.")
+        if self.efficient_eos: # used in baichuan, qwen, chatglm, etc.
+            eos_ids = []
+        else:
+            eos_ids = [tokenizer.eos_token_id]
         return bos_ids, eos_ids

     def _encode(
@@ -137,6 +138,8 @@ class Template:
         token_ids = []
         for elem in context:
             if isinstance(elem, str):
+                if len(elem) == 0:
+                    continue
                 elem = elem.replace("{{system}}", system, 1) if system is not None else elem
                 elem = elem.replace("{{query}}", query, 1) if query is not None else elem
                 elem = elem.replace("{{idx}}", idx, 1) if idx is not None else elem
@@ -184,7 +187,8 @@ def register_template(
     system: str,
     sep: List[Union[str, Dict[str, str]]],
     stop_words: Optional[List[str]] = [],
-    use_history: Optional[bool] = True
+    use_history: Optional[bool] = True,
+    efficient_eos: Optional[bool] = False
 ) -> None:
     template_class = Llama2Template if "llama2" in name else Template
     templates[name] = template_class(
@@ -193,7 +197,8 @@ def register_template(
         system=system,
         sep=sep,
         stop_words=stop_words,
-        use_history=use_history
+        use_history=use_history,
+        efficient_eos=efficient_eos
     )
@@ -201,31 +206,21 @@ def get_template_and_fix_tokenizer(
     name: str,
     tokenizer: "PreTrainedTokenizer"
 ) -> Template:
-    template = templates.get(name, None)
-    assert template is not None, "Template {} does not exist.".format(name)
-    additional_special_tokens = template.stop_words
-    if len(template.stop_words): # inplace method
-        if tokenizer.eos_token_id is not None:
-            additional_special_tokens.append(tokenizer.eos_token)
-        tokenizer.eos_token = additional_special_tokens[0] # use the first stop word as eos token
-        additional_special_tokens.pop(0)
-        logger.info("Replace eos token: {}".format(tokenizer.eos_token))
     if tokenizer.eos_token_id is None:
         tokenizer.eos_token = "<|endoftext|>"
         logger.info("Add eos token: {}".format(tokenizer.eos_token))
     if tokenizer.pad_token_id is None:
-        if tokenizer.unk_token_id is not None:
-            tokenizer.pad_token = tokenizer.unk_token
-        else:
-            tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.pad_token = tokenizer.eos_token
         logger.info("Add pad token: {}".format(tokenizer.pad_token))
+    if name is None:
+        return None
+    template = templates.get(name, None)
+    assert template is not None, "Template {} does not exist.".format(name)
     tokenizer.add_special_tokens(
-        dict(additional_special_tokens=additional_special_tokens),
+        dict(additional_special_tokens=template.stop_words),
         replace_additional_special_tokens=False
     )
     return template
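After this change the eos/pad fixes are applied before the template lookup, and passing `name=None` skips template resolution entirely. A hedged usage sketch; the model id is only an example, and the encode_oneturn call assumes the signature declared earlier in this file:
```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan2-13B-Chat", trust_remote_code=True)
template = get_template_and_fix_tokenizer("baichuan2", tokenizer)
# tokenizer now has eos/pad tokens set and the template's stop words
# registered as additional special tokens
prompt_ids, answer_ids = template.encode_oneturn(tokenizer, "Hello", "Hi there!")
```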
@@ -464,18 +459,18 @@ register_template(
     ],
     system="",
     sep=[
+        {"token": "<eoa>"},
         "\n"
     ],
     stop_words=[
-        "</s>", # internlm cannot replace eos token
         "<eoa>"
-    ]
+    ],
+    efficient_eos=True
 )
r""" r"""
Supports: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat Supports: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat
Used for training and inference of the fine-tuned models.
""" """
register_template( register_template(
name="baichuan", name="baichuan",
...@@ -485,33 +480,31 @@ register_template( ...@@ -485,33 +480,31 @@ register_template(
prompt=[ prompt=[
{"token": "<reserved_102>"}, # user token {"token": "<reserved_102>"}, # user token
"{{query}}", "{{query}}",
{"token": "<reserved_103>"} # assistant token {"token": "<reserved_103>"} # assistant token
], ],
system="", system="",
sep=[], sep=[],
stop_words=[] efficient_eos=True
) )
r""" r"""
Supports: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat Supports: https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat
Used for inference of the original model. https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat
""" """
register_template( register_template(
name="baichuan_eval", name="baichuan2",
prefix=[ prefix=[
"{{system}}", "{{system}}"
{"token": "<reserved_102>"} # user token
], ],
prompt=[ prompt=[
{"token": "<reserved_106>"}, # user token
"{{query}}", "{{query}}",
{"token": "<reserved_103>"} # assistant token {"token": "<reserved_107>"} # assistant token
], ],
system="", system="",
sep=[], sep=[],
stop_words=[ efficient_eos=True
"<reserved_102>" # user token
]
) )
@@ -524,7 +517,6 @@ register_template(
     prefix=[
         {"token": "<|system|>"},
         "\n{{system}}",
-        {"token": "<|end|>"}
     ],
     prompt=[
         {"token": "<|user|>"},
@@ -535,11 +527,13 @@ register_template(
     ],
     system="",
     sep=[
+        {"token": "<|end|>"},
         "\n"
     ],
     stop_words=[
         "<|end|>"
-    ]
+    ],
+    efficient_eos=True
 )
@@ -550,8 +544,7 @@ register_template(
     name="chatml",
     prefix=[
         {"token": "<|im_start|>"},
-        "system\n{{system}}",
-        {"token": "<|im_end|>"}
+        "system\n{{system}}"
     ],
     prompt=[
         {"token": "<|im_start|>"},
@@ -563,11 +556,13 @@ register_template(
     ],
     system="You are a helpful assistant.",
     sep=[
+        {"token": "<|im_end|>"},
         "\n"
     ],
     stop_words=[
         "<|im_end|>"
-    ]
+    ],
+    efficient_eos=True
 )
@@ -587,7 +582,8 @@ register_template(
     system="",
     sep=[
         "\n\n"
-    ]
+    ],
+    efficient_eos=True
 )
...
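Taken together, `efficient_eos` is how templates whose separators already carry an end-of-turn token (internlm, baichuan, chatml, and so on) avoid emitting a second eos after every turn. A hedged sketch of registering a made-up template with the new flag; "my_chat" and its tokens are illustrative and not part of this repository:
```
register_template(
    name="my_chat",
    prefix=[
        "{{system}}"
    ],
    prompt=[
        {"token": "<user>"},
        "{{query}}",
        {"token": "<assistant>"}
    ],
    system="You are a helpful assistant.",
    sep=[
        {"token": "<end_of_turn>"},
        "\n"
    ],
    stop_words=[
        "<end_of_turn>"
    ],
    efficient_eos=True  # the turn-end token replaces the trailing eos
)
```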
@@ -11,24 +11,23 @@ class DatasetAttr:
     dataset_name: Optional[str] = None
     dataset_sha1: Optional[str] = None
     system_prompt: Optional[str] = None
-    stage: Optional[str] = None
+    ranking: Optional[bool] = False
+    prompt: Optional[str] = "instruction"
+    query: Optional[str] = "input"
+    response: Optional[str] = "output"
+    history: Optional[str] = None

     def __repr__(self) -> str:
         return self.dataset_name

-    def __post_init__(self):
-        self.prompt = "instruction"
-        self.query = "input"
-        self.response = "output"
-        self.history = None

 @dataclass
 class DataArguments:
     r"""
     Arguments pertaining to what data we are going to input our model for training and evaluation.
     """
-    template: str = field(
+    template: Optional[str] = field(
+        default=None,
         metadata={"help": "Which template to use for constructing prompts in training and inference."}
     )
     dataset: Optional[str] = field(
@@ -36,7 +35,7 @@ class DataArguments:
         metadata={"help": "The name of provided dataset(s) to use. Use commas to separate multiple datasets."}
     )
     dataset_dir: Optional[str] = field(
-        default="data",
+        default="/public/home/zhaoying1/work/Baichuan-13B-main/LLaMA-Efficient-Tuning-remove-pe/data",
         metadata={"help": "The name of the folder containing datasets."}
     )
     split: Optional[str] = field(
@@ -48,7 +47,7 @@ class DataArguments:
         metadata={"help": "Enable streaming mode."}
     )
     buffer_size: Optional[int] = field(
-        default=16384,
+        default=1024,
         metadata={"help": "Size of the buffer to randomly sample examples from in streaming mode."}
     )
     mix_strategy: Optional[Literal["concat", "interleave_under", "interleave_over"]] = field(
@@ -114,21 +113,14 @@ class DataArguments:
                 raise ValueError("Undefined dataset {} in dataset_info.json.".format(name))

             if "hf_hub_url" in dataset_info[name]:
-                dataset_attr = DatasetAttr(
-                    "hf_hub",
-                    dataset_name=dataset_info[name]["hf_hub_url"],
-                    stage=dataset_info[name].get("stage", None))
+                dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
             elif "script_url" in dataset_info[name]:
-                dataset_attr = DatasetAttr(
-                    "script",
-                    dataset_name=dataset_info[name]["script_url"],
-                    stage=dataset_info[name].get("stage", None))
+                dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
             else:
                 dataset_attr = DatasetAttr(
                     "file",
                     dataset_name=dataset_info[name]["file_name"],
-                    dataset_sha1=dataset_info[name].get("file_sha1", None),
-                    stage=dataset_info[name].get("stage", None)
+                    dataset_sha1=dataset_info[name].get("file_sha1", None)
                 )

             if "columns" in dataset_info[name]:
@@ -137,5 +129,6 @@ class DataArguments:
                 dataset_attr.response = dataset_info[name]["columns"].get("response", None)
                 dataset_attr.history = dataset_info[name]["columns"].get("history", None)

+            dataset_attr.ranking = dataset_info[name].get("ranking", False)
             dataset_attr.system_prompt = prompt_list[i]
             self.dataset_list.append(dataset_attr)
@@ -16,7 +16,7 @@ class ModelArguments:
         metadata={"help": "Where to store the pretrained models downloaded from huggingface.co."}
     )
     use_fast_tokenizer: Optional[bool] = field(
-        default=False,
+        default=True,
         metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}
     )
     use_auth_token: Optional[bool] = field(
@@ -27,10 +27,6 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}
     )
-    padding_side: Optional[Literal["left", "right"]] = field(
-        default="left",
-        metadata={"help": "The side on which the model should have padding applied."}
-    )
     quantization_bit: Optional[int] = field(
         default=None,
         metadata={"help": "The number of bits to quantize the model."}
@@ -47,6 +43,10 @@ class ModelArguments:
         default=None,
         metadata={"help": "Adopt scaled rotary positional embeddings."}
     )
+    flash_attn: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enable flash attention for faster training."}
+    )
     checkpoint_dir: Optional[str] = field(
         default=None,
         metadata={"help": "Path to the directory(s) containing the delta model checkpoints as well as the configurations."}
...