Commit 0938ae70 authored by zhaoying1

fix save method of adapter_model.bin

parent 1b73554f
@@ -3,4 +3,4 @@ COPY requirements.txt requirements.txt
 RUN source /opt/dtk-23.04/env.sh
 RUN cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
 ENV LANG C.UTF-8
-RUN pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+RUN pip install -r requirements.txt --no-dependencies -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
@@ -23,9 +23,9 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk
 ```
 Install the dependencies that are not included in the docker image:
 ```
-pip install transformers==4.28.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
-pip install datasets accelerate peft trl tiktoken jieba rouge-chinese nltk gradio matplotlib uvicore fastapi sse-starlette
+pip install transformers==4.31.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+pip install accelerate==0.22.0 --no-dependencies -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+pip install datasets peft trl tiktoken jieba rouge-chinese nltk gradio matplotlib uvicore fastapi sse-starlette
 ```
@@ -51,9 +51,11 @@ conda create -n chatglm python=3.8
 3. Install the other dependencies according to requirements.txt:
 ```
-pip install -r requirements.txt
+pip install -r requirements.txt --no-dependencies -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
 ```
+Note: if accelerate, transformers, or another library complains about a dependency on deepspeed 0.9.3, comment out the corresponding version-check code; deepspeed 0.9.3 has not been adapted yet, while deepspeed 0.9.2 works.
 ## Dataset
 The input data are json files placed in the project's [data](.data) directory and specified with the --dataset option (see the examples below); separate multiple input files with `,`. The example format and fields of a json file are described below:
@@ -79,6 +81,8 @@ The json file stores a list; each element of the list is a sample.
 ```
 For how to use the datasets, see [data/README.md](data/README_zh.md).
+Note: configure the dataset_dir path at L38 of [./src/llmtuner/hparams/data_args.py](src/llmtuner/hparams/data_args.py).
 ## Model download
 Hugging Face model download links:
...
 {
   "train_micro_batch_size_per_gpu": "auto",
+  "train_batch_size": "auto",
   "zero_allow_untested_optimizer": true,
   "fp16": {
     "enabled": "auto",
...
 {
   "train_micro_batch_size_per_gpu": "auto",
+  "train_batch_size": "auto",
   "zero_allow_untested_optimizer": true,
   "fp16": {
     "enabled": "auto",
...
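Both deepspeed.json variants now set `"train_batch_size": "auto"` alongside the per-GPU micro batch size, letting the HF Trainer fill in the global batch size at launch instead of it being hard-coded. A minimal sketch of the identity DeepSpeed enforces when resolving these "auto" values (the helper name is hypothetical):
```
# Hypothetical helper: DeepSpeed requires
# train_batch_size == train_micro_batch_size_per_gpu * gradient_accumulation_steps * world_size
def resolve_train_batch_size(micro_batch_per_gpu: int,
                             gradient_accumulation_steps: int,
                             world_size: int) -> int:
    return micro_batch_per_gpu * gradient_accumulation_steps * world_size

# e.g. the 13B run below: 1 per device, 1 accumulation step, 4 nodes x 4 DCUs = 16 ranks
assert resolve_train_batch_size(1, 1, 16) == 16
```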
 #!/bin/bash
+export HSA_FORCE_FINE_GRAIN_PCIE=1
 export MIOPEN_FIND_MODE=3
-export MIOPEN_COMPILE_PARALLEL_LEVEL=1
-export NCCL_PLUGIN_P2P=ucx
-export RCCL_NCHANNELS=2
-export NCCL_SOCKET_IFNAME=ib0
-export NCCL_P2P_LEVEL=5
+export GPU_MAX_HW_QUEUES=16
 lrank=$OMPI_COMM_WORLD_LOCAL_RANK
-echo "LRANK===============================$lrank"
-RANK=$OMPI_COMM_WORLD_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-export NCCL_IB_HCA=mlx5_0 # NIC 0
+comm_rank=$OMPI_COMM_WORLD_RANK
+comm_size=$OMPI_COMM_WORLD_SIZE
+export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+export RANK=$comm_rank
+export WORLD_SIZE=$comm_size
+export MASTER_ADDR=$1
+export MASTER_PORT=29500
+export NCCL_IB_HCA=mlx5
+export NCCL_SOCKET_IFNAME=ib0
+export HIP_DIRECT_DISPATCH=0
 APP="python3 ../src/train_bash.py --stage sft \
-        --model_name_or_path ../../baichun-7b \
+        --model_name_or_path ../../baichuan-13b-base \
         --do_train \
         --template default \
-        --dataset alpaca_gpt4_en,alpaca_gpt4_zh,codealpaca \
+        --dataset alpaca_gpt4_en \
         --finetuning_type lora \
         --lora_rank 16 \
         --lora_target W_pack,o_proj,gate_proj,down_proj,up_proj \
-        --output_dir output/baichuan-7b-lora-2-3 \
+        --output_dir out/baichuan-7b-lora-test7 \
-        --per_device_train_batch_size 8 \
+        --per_device_train_batch_size 1 \
-        --per_device_eval_batch_size 8 \
+        --per_device_eval_batch_size 1 \
         --gradient_accumulation_steps 1 \
-        --preprocessing_num_workers 16 \
+        --preprocessing_num_workers 8 \
         --lr_scheduler_type cosine \
         --logging_steps 10 \
-        --save_steps 2000 \
+        --save_steps 2 \
+        --eval_steps 2 \
         --learning_rate 1e-4 \
         --max_grad_norm 0.5 \
         --num_train_epochs 1.0 \
+        --val_size 0.001 \
+        --evaluation_strategy steps \
+        --load_best_model_at_end \
         --plot_loss \
         --fp16 \
         --deepspeed deepspeed.json
...
 #!/bin/bash
-#SBATCH -p kshdnormal
+#SBATCH -p kshdexclu11
-#SBATCH -N 32
+#SBATCH -N 4
 #SBATCH --cpus-per-task=1
 #SBATCH --ntasks-per-node=32
 #SBATCH --gres=dcu:4
 #SBATCH -J baichuan
-#SBATCH -o logs-7B/baichuan-lora-%j.out
+#SBATCH -o logs-13B/baichuan-lora-%j.out
-#SBATCH -e logs-7B/baichuan-lora-%j.err
+#SBATCH -e logs-13B/baichuan-lora-%j.err
+#SBATCH --exclusive
-ulimit -u 200000
-ulimit -s unlimited
-export HIP_VISIBLE_DEVICES=0,1,2,3
-export MIOPEN_FIND_MODE=3
-export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
-export MIOPEN_USER_DB_PATH=/tmp/miopen-udb
-export MIOPEN_CUSTOM_CACHE_DIR=/tmp/miopen-cache
-export NCCL_SOCKET_IFNAME=ib0
 export HSA_FORCE_FINE_GRAIN_PCIE=1
 export OMP_NUM_THREADS=1
+export NCCL_IB_HCA=mlx5
 export NCCL_DEBUG=INFO
-export MIOPEN_FIND_MODE=3
-export HSA_FORCE_FINE_GRAIN_PCIE=1
 export MIOPEN_COMPILE_PARALLEL_LEVEL=1
 export NCCL_PLUGIN_P2P=ucx
-export NCCL_SOCKET_IFNAME=ib0
 export NCCL_P2P_LEVEL=5
 echo "START TIME: $(date)"
-hostfile=./hostfile/$SLURM_JOB_ID
-nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST ))
-nodes_array=($nodes)
-head_node=${nodes_array[0]}
-head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
-echo Node IP: $head_node_ip
-echo headnode: $head_node
-NODE_RANK=$SLURM_NODEID
+hostfile=./hostfile/$SLURM_JOB_ID # get the node list for this job
 scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
 rm `pwd`/hostfile-dl -f
 for i in `cat $hostfile`
 do
-echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
+echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID # node name
 done
-np=$(cat $hostfile|sort|uniq |wc -l)
+np=$(cat $hostfile|sort|uniq |wc -l) # deduplicate nodes
 np=$(($np*4))
-nodename=$(cat $hostfile |sed -n "1p")
+nodename=$(cat $hostfile |sed -n "1p") # read the node list; the first entry is the master node
 dist_url=`echo $nodename | awk '{print $1}'`
-mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run-7b-sft-lora-single.sh $dist_url $np
+mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run-7b-single-lora.sh $dist_url
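For context, the sbatch script builds an OpenMPI hostfile with 4 slots per node (one per DCU) and passes the first hostname to the per-rank script as `$1`, which becomes MASTER_ADDR. A hedged sketch of how the exported variables are consumed on each rank; the HF Trainer performs an equivalent initialization internally:
```
import os
import torch
import torch.distributed as dist

def init_from_mpi_env() -> None:
    # Exported by run-7b-single-lora.sh from the OMPI_COMM_WORLD_* variables
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)  # one DCU per local rank
    # MASTER_ADDR ($1, the first hostname) and MASTER_PORT=29500 locate rank 0
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
```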
 import torch
 from typing import Any, Dict, Generator, List, Optional, Tuple
 from threading import Thread
-from transformers import TextIteratorStreamer
+from transformers import GenerationConfig, TextIteratorStreamer
 from llmtuner.extras.misc import dispatch_model, get_logits_processor
 from llmtuner.extras.template import get_template_and_fix_tokenizer
@@ -14,7 +14,6 @@ class ChatModel:
         model_args, data_args, finetuning_args, self.generating_args = get_infer_args(args)
         self.model, self.tokenizer = load_model_and_tokenizer(model_args, finetuning_args)
         self.model = dispatch_model(self.model)
-        self.model = self.model.eval() # enable evaluation mode
         self.template = get_template_and_fix_tokenizer(data_args.template, self.tokenizer)
         self.system_prompt = data_args.system_prompt
@@ -41,26 +40,30 @@ class ChatModel:
         max_length = input_kwargs.pop("max_length", None)
         max_new_tokens = input_kwargs.pop("max_new_tokens", None)
-        gen_kwargs = self.generating_args.to_dict()
-        gen_kwargs.update(dict(
-            input_ids=input_ids,
-            do_sample=do_sample if do_sample is not None else gen_kwargs["do_sample"],
-            temperature=temperature or gen_kwargs["temperature"],
-            top_p=top_p or gen_kwargs["top_p"],
-            top_k=top_k or gen_kwargs["top_k"],
-            repetition_penalty=repetition_penalty or gen_kwargs["repetition_penalty"],
-            eos_token_id=list(set([self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids)),
-            pad_token_id=self.tokenizer.pad_token_id,
-            logits_processor=get_logits_processor()
-        ))
+        generating_args = self.generating_args.to_dict()
+        generating_args.update(dict(
+            do_sample=do_sample if do_sample is not None else generating_args["do_sample"],
+            temperature=temperature or generating_args["temperature"],
+            top_p=top_p or generating_args["top_p"],
+            top_k=top_k or generating_args["top_k"],
+            repetition_penalty=repetition_penalty or generating_args["repetition_penalty"],
+            eos_token_id=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids,
+            pad_token_id=self.tokenizer.pad_token_id
+        ))
         if max_length:
-            gen_kwargs.pop("max_new_tokens", None)
-            gen_kwargs["max_length"] = max_length
+            generating_args.pop("max_new_tokens", None)
+            generating_args["max_length"] = max_length
         if max_new_tokens:
-            gen_kwargs.pop("max_length", None)
-            gen_kwargs["max_new_tokens"] = max_new_tokens
+            generating_args.pop("max_length", None)
+            generating_args["max_new_tokens"] = max_new_tokens
+        gen_kwargs = dict(
+            inputs=input_ids,
+            generation_config=GenerationConfig(**generating_args),
+            logits_processor=get_logits_processor()
+        )
         return gen_kwargs, prompt_length
...
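The refactor above packs all sampling options into a GenerationConfig, so only `inputs`, the config, and the logits processor travel in gen_kwargs. A hedged sketch of how such gen_kwargs are typically consumed for streaming output (the function name and streamer settings are assumptions, not code from this commit):
```
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(model, tokenizer, gen_kwargs):
    # Stream decoded text while model.generate runs in a background thread
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs["streamer"] = streamer
    Thread(target=model.generate, kwargs=gen_kwargs).start()
    for new_text in streamer:
        yield new_text
```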
@@ -31,11 +31,15 @@ def preprocess_dataset(
             yield query, response, history, system

     def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
-        # build grouped texts with format `X1 X2 X3 ...` (without <eos>)
-        if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding): # for tiktoken tokenizer (Qwen)
-            kwargs = dict(allowed_special="all")
+        # build grouped texts with format `X1 X2 X3 ...`
+        if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding):
+            kwargs = dict(allowed_special="all") # for tiktoken tokenizer (Qwen)
         else:
-            kwargs = dict(add_special_tokens=False)
+            kwargs = dict(add_special_tokens=True)
+        if hasattr(tokenizer, "add_bos_token") and hasattr(tokenizer, "add_eos_token"):
+            setattr(tokenizer, "add_bos_token", True) # for LLaMA tokenizer
+            setattr(tokenizer, "add_eos_token", True)
         tokenized_examples = tokenizer(examples["prompt"], **kwargs)
         concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
@@ -59,7 +63,9 @@ def preprocess_dataset(
         for query, response, history, system in construct_example(examples):
             input_ids, labels = [], []
-            for source_ids, target_ids in template.encode_multiturn(tokenizer, query, response, history, system):
+            for turn_idx, (source_ids, target_ids) in enumerate(template.encode_multiturn(
+                tokenizer, query, response, history, system
+            )):
                 if len(source_ids) > data_args.max_source_length:
                     source_ids = source_ids[:data_args.max_source_length]
                 if len(target_ids) > data_args.max_target_length:
@@ -68,8 +74,17 @@ def preprocess_dataset(
                 if len(input_ids) + len(source_ids) + len(target_ids) > max_length:
                     break
+                if turn_idx != 0 and template.efficient_eos:
+                    source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
+                else:
+                    source_mask = [IGNORE_INDEX] * len(source_ids)
                 input_ids += source_ids + target_ids
-                labels += [IGNORE_INDEX] * len(source_ids) + target_ids
+                labels += source_mask + target_ids
+            if template.efficient_eos:
+                input_ids += [tokenizer.eos_token_id]
+                labels += [tokenizer.eos_token_id]
             model_inputs["input_ids"].append(input_ids)
             model_inputs["attention_mask"].append([1] * len(input_ids))
@@ -89,6 +104,9 @@ def preprocess_dataset(
             if len(target_ids) > data_args.max_target_length:
                 target_ids = target_ids[:data_args.max_target_length]
+            if template.efficient_eos:
+                target_ids += [tokenizer.eos_token_id]
             model_inputs["input_ids"].append(source_ids)
             model_inputs["attention_mask"].append([1] * len(source_ids))
             model_inputs["labels"].append(target_ids)
@@ -109,6 +127,10 @@ def preprocess_dataset(
             if len(rejected_ids) > data_args.max_target_length:
                 rejected_ids = rejected_ids[:data_args.max_target_length]
+            if template.efficient_eos:
+                chosen_ids += [tokenizer.eos_token_id]
+                rejected_ids += [tokenizer.eos_token_id]
             model_inputs["prompt_ids"].append(prompt_ids)
             model_inputs["chosen_ids"].append(chosen_ids)
             model_inputs["rejected_ids"].append(rejected_ids)
...
@@ -5,7 +5,9 @@ from typing import TYPE_CHECKING
 from datetime import timedelta
 from transformers import TrainerCallback
-from transformers.trainer_utils import has_length
+from transformers.trainer_callback import TrainerControl, TrainerState
+from transformers.trainer_utils import has_length, PREFIX_CHECKPOINT_DIR
+from transformers.training_args import TrainingArguments
 from llmtuner.extras.constants import LOG_FILE_NAME
 from llmtuner.extras.logging import get_logger
@@ -17,6 +19,24 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
+class SavePeftModelCallback(TrainerCallback):
+
+    def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        r"""
+        Event called after a checkpoint save.
+        """
+        output_dir = os.path.join(args.output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, state.global_step))
+        getattr(kwargs.get("model"), "pretrained_model").save_pretrained(output_dir)
+        return control
+
+    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
+        r"""
+        Event called at the end of training.
+        """
+        getattr(kwargs.get("model"), "pretrained_model").save_pretrained(args.output_dir)
+        return control
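This callback is the substance of the commit: when the model is a wrapper (for example trl's AutoModelForCausalLMWithValueHead) around a PEFT model, the Trainer's default save serializes the wrapper's state, so the callback additionally calls save_pretrained on the inner pretrained_model to write adapter_model.bin into every checkpoint directory and into output_dir at the end of training. A hedged usage sketch; the trainer construction is assumed, not part of this diff:
```
from transformers import Trainer

def build_trainer(model, training_args, train_dataset):
    # model is assumed to expose .pretrained_model (a PEFT model), so the
    # callback's save_pretrained calls write adapter_model.bin
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        callbacks=[SavePeftModelCallback()]
    )
```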
 class LogCallback(TrainerCallback):

     def __init__(self, runner=None):
...
@@ -2,28 +2,16 @@ IGNORE_INDEX = -100
 LOG_FILE_NAME = "trainer_log.jsonl"
-VALUE_HEAD_FILE_NAME = "value_head.bin"
-FINETUNING_ARGS_NAME = "finetuning_args.json"
 LAYERNORM_NAMES = ["norm", "ln_f", "ln_attn", "ln_mlp"]
 METHODS = ["full", "freeze", "lora"]
-STAGES = [
-    "SFT",
-    "Reward Modeling",
-    "PPO",
-    "DPO",
-    "Pre-Training"
-]
-DATASET_STAGE_MAP = {
-    "SFT": "sft",
-    "Pre-Training": "pt",
+TRAINING_STAGES = {
+    "Supervised Fine-Tuning": "sft",
     "Reward Modeling": "rm",
-    "PPO": "sft",
-    "DPO": "rm"
+    "PPO": "ppo",
+    "DPO": "dpo",
+    "Pre-Training": "pt"
 }
 SUPPORTED_MODELS = {
@@ -54,11 +42,16 @@ SUPPORTED_MODELS = {
     "Baichuan-7B": "baichuan-inc/Baichuan-7B",
     "Baichuan-13B": "baichuan-inc/Baichuan-13B-Base",
     "Baichuan-13B-Chat": "baichuan-inc/Baichuan-13B-Chat",
+    "Baichuan2-7B": "baichuan-inc/Baichuan2-7B-Base",
+    "Baichuan2-13B": "baichuan-inc/Baichuan2-13B-Base",
+    "Baichuan2-7B-Chat": "baichuan-inc/Baichuan2-7B-Chat",
+    "Baichuan2-13B-Chat": "baichuan-inc/Baichuan2-13B-Chat",
     "InternLM-7B": "internlm/internlm-7b",
     "InternLM-7B-Chat": "internlm/internlm-chat-7b",
     "Qwen-7B": "Qwen/Qwen-7B",
     "Qwen-7B-Chat": "Qwen/Qwen-7B-Chat",
     "XVERSE-13B": "xverse/XVERSE-13B",
+    "XVERSE-13B-Chat": "xverse/XVERSE-13B-Chat",
     "ChatGLM2-6B-Chat": "THUDM/chatglm2-6b"
 }
@@ -70,6 +63,7 @@ DEFAULT_MODULE = {
     "BLOOMZ": "query_key_value",
     "Falcon": "query_key_value",
     "Baichuan": "W_pack",
+    "Baichuan2": "W_pack",
     "InternLM": "q_proj,v_proj",
     "Qwen": "c_attn",
     "XVERSE": "q_proj,v_proj",
@@ -80,7 +74,9 @@ DEFAULT_TEMPLATE = {
     "LLaMA2": "llama2",
     "ChineseLLaMA2": "llama2_zh",
     "Baichuan": "baichuan",
+    "Baichuan2": "baichuan2",
     "InternLM": "intern",
     "Qwen": "chatml",
+    "XVERSE": "xverse",
     "ChatGLM2": "chatglm2"
 }
-import gc
 import torch
-from typing import TYPE_CHECKING, List, Optional, Tuple
+from typing import TYPE_CHECKING, Tuple
 from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList
-from llmtuner.extras.constants import LAYERNORM_NAMES
 if TYPE_CHECKING:
     from transformers.modeling_utils import PreTrainedModel
@@ -28,12 +27,6 @@ class AverageMeter:
         self.avg = self.sum / self.count
-def get_logits_processor() -> LogitsProcessorList:
-    logits_processor = LogitsProcessorList()
-    logits_processor.append(InfNanRemoveLogitsProcessor())
-    return logits_processor
 def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
     r"""
     Returns the number of trainable parameters and number of all parameters in the model.
@@ -56,48 +49,17 @@ def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
     return trainable_params, all_param
-# Includes: (1) cast the layernorm in fp32 (2) make output embedding layer require grads (3) upcast the lm_head to fp32
-# Inspired by: https://github.com/huggingface/peft/blob/c0209c35abbf88c63aa267800d98a8e212ed0a42/src/peft/utils/other.py#L35
-def prepare_model_for_training(
-    model: "PreTrainedModel",
-    finetuning_type: str,
-    output_layer_name: Optional[str] = "lm_head",
-    use_gradient_checkpointing: Optional[bool] = True,
-    layer_norm_names: Optional[List[str]] = LAYERNORM_NAMES
-) -> "PreTrainedModel":
-    for name, param in model.named_parameters():
-        if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names):
-            param.data = param.data.to(torch.float32)
-    if use_gradient_checkpointing:
-        if hasattr(model, "enable_input_require_grads"):
-            model.enable_input_require_grads()
-        else:
-            def make_inputs_require_grad(module, input, output):
-                output.requires_grad_(True)
-            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
-        model.gradient_checkpointing_enable()
-        model.config.use_cache = False # turn off when gradient checkpointing is enabled
-    if finetuning_type != "full" and hasattr(model, output_layer_name):
-        output_layer: torch.nn.Linear = getattr(model, output_layer_name)
-        input_dtype = output_layer.weight.dtype
-        class CastOutputToFloat(torch.nn.Sequential):
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return super().forward(x.to(input_dtype)).to(torch.float32)
-        setattr(model, output_layer_name, CastOutputToFloat(output_layer))
-    return model
+def get_logits_processor() -> LogitsProcessorList:
+    logits_processor = LogitsProcessorList()
+    logits_processor.append(InfNanRemoveLogitsProcessor())
+    return logits_processor
 def torch_gc() -> None:
     r"""
     Collects GPU memory.
     """
-    gc.collect()
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
         torch.cuda.ipc_collect()
...
This diff is collapsed.
 import os
 import torch
-from typing import Dict
-from transformers.trainer import WEIGHTS_NAME, WEIGHTS_INDEX_NAME
-from transformers.modeling_utils import load_sharded_checkpoint
-from llmtuner.extras.constants import VALUE_HEAD_FILE_NAME
+from transformers.trainer import WEIGHTS_NAME
 from llmtuner.extras.logging import get_logger

 logger = get_logger(__name__)
-def get_state_dict(model: torch.nn.Module) -> Dict[str, torch.Tensor]:
-    state_dict: Dict[str, torch.Tensor] = model.state_dict()
-    filtered_state_dict = {}
-    for k, v in model.named_parameters():
-        if v.requires_grad:
-            filtered_state_dict[k] = state_dict[k].cpu().clone().detach()
-    return filtered_state_dict
-
-def load_trainable_params(model: torch.nn.Module, checkpoint_dir: os.PathLike) -> bool:
-    weights_file = os.path.join(checkpoint_dir, WEIGHTS_NAME)
-    if os.path.exists(weights_file):
-        model_state_dict = torch.load(weights_file, map_location="cpu")
-        model.load_state_dict(model_state_dict, strict=False) # skip missing keys
-    elif os.path.exists(os.path.join(checkpoint_dir, WEIGHTS_INDEX_NAME)):
-        load_sharded_checkpoint(model, checkpoint_dir, strict=False)
-    else:
-        logger.warning("Provided path ({}) does not contain pre-trained weights.".format(checkpoint_dir))
-        return False
-    return True

 def load_valuehead_params(model: torch.nn.Module, checkpoint_dir: os.PathLike) -> bool:
-    valuehead_file = os.path.join(checkpoint_dir, VALUE_HEAD_FILE_NAME)
-    if not os.path.exists(valuehead_file):
+    vhead_file = os.path.join(checkpoint_dir, WEIGHTS_NAME)
+    if not os.path.exists(vhead_file):
         logger.warning("Provided path ({}) does not contain valuehead weights.".format(checkpoint_dir))
         return False
-    valuehead_state_dict = torch.load(valuehead_file, map_location="cpu")
-    model.register_buffer("reward_head_weight", valuehead_state_dict["summary.weight"])
-    model.register_buffer("reward_head_bias", valuehead_state_dict["summary.bias"])
-    model.register_buffer("default_head_weight", torch.zeros_like(valuehead_state_dict["summary.weight"]))
-    model.register_buffer("default_head_bias", torch.zeros_like(valuehead_state_dict["summary.bias"]))
+    vhead_params = torch.load(vhead_file, map_location="cpu")
+    model.register_buffer("reward_head_weight", vhead_params["v_head.summary.weight"], persistent=False)
+    model.register_buffer("reward_head_bias", vhead_params["v_head.summary.bias"], persistent=False)
+    model.register_buffer("default_head_weight", torch.zeros_like(vhead_params["v_head.summary.weight"]), persistent=False)
+    model.register_buffer("default_head_bias", torch.zeros_like(vhead_params["v_head.summary.bias"]), persistent=False)
     return True
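The value head is now read from the regular checkpoint file (WEIGHTS_NAME, i.e. pytorch_model.bin) under `v_head.*` keys, and the buffers are registered with persistent=False so they are never re-serialized into later checkpoints. A hedged usage sketch; the wrapper function and path are illustrative:
```
import torch

def attach_reward_head(model: torch.nn.Module, reward_ckpt_dir: str) -> bool:
    # reward_ckpt_dir is a placeholder path to a trained reward-model checkpoint
    ok = load_valuehead_params(model, reward_ckpt_dir)
    if ok:
        # Buffers are usable for the reward pass but stay out of state_dict()
        print(model.reward_head_weight.shape, model.reward_head_bias.shape)
    return ok
```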
@@ -20,6 +20,7 @@ class Template:
     sep: List[Union[str, Dict[str, str]]]
     stop_words: List[str]
     use_history: bool
+    efficient_eos: bool

     def encode_oneturn(
         self,
@@ -74,19 +75,19 @@ class Template:
         self,
         tokenizer: "PreTrainedTokenizer"
     ) -> Tuple[List[int], List[int]]:
-        if (
-            tokenizer.bos_token_id is not None
-            and getattr(tokenizer, "add_bos_token", True)
-        ): # baichuan-13b has no bos token
+        if tokenizer.bos_token_id is not None and getattr(tokenizer, "add_bos_token", True):
             bos_ids = [tokenizer.bos_token_id]
-        else:
-            bos_ids = [] # bos token is optional
+        else: # baichuan, qwen and gpt2 models have no bos token
+            bos_ids = []
-        if tokenizer.eos_token_id is not None:
-            eos_ids = [tokenizer.eos_token_id]
-        else:
+        if tokenizer.eos_token_id is None:
             raise ValueError("EOS token is required.")
+        if self.efficient_eos: # used in baichuan, qwen, chatglm, etc.
+            eos_ids = []
+        else:
+            eos_ids = [tokenizer.eos_token_id]
         return bos_ids, eos_ids

     def _encode(
@@ -137,6 +138,8 @@ class Template:
         token_ids = []
         for elem in context:
             if isinstance(elem, str):
+                if len(elem) == 0:
+                    continue
                 elem = elem.replace("{{system}}", system, 1) if system is not None else elem
                 elem = elem.replace("{{query}}", query, 1) if query is not None else elem
                 elem = elem.replace("{{idx}}", idx, 1) if idx is not None else elem
@@ -184,7 +187,8 @@ def register_template(
     system: str,
     sep: List[Union[str, Dict[str, str]]],
     stop_words: Optional[List[str]] = [],
-    use_history: Optional[bool] = True
+    use_history: Optional[bool] = True,
+    efficient_eos: Optional[bool] = False
 ) -> None:
     template_class = Llama2Template if "llama2" in name else Template
     templates[name] = template_class(
@@ -193,7 +197,8 @@ def register_template(
         system=system,
         sep=sep,
         stop_words=stop_words,
-        use_history=use_history
+        use_history=use_history,
+        efficient_eos=efficient_eos
     )
@@ -201,31 +206,21 @@ def get_template_and_fix_tokenizer(
     name: str,
     tokenizer: "PreTrainedTokenizer"
 ) -> Template:
-    template = templates.get(name, None)
-    assert template is not None, "Template {} does not exist.".format(name)
-    additional_special_tokens = template.stop_words
-    if len(template.stop_words): # inplace method
-        if tokenizer.eos_token_id is not None:
-            additional_special_tokens.append(tokenizer.eos_token)
-        tokenizer.eos_token = additional_special_tokens[0] # use the first stop word as eos token
-        additional_special_tokens.pop(0)
-        logger.info("Replace eos token: {}".format(tokenizer.eos_token))
     if tokenizer.eos_token_id is None:
         tokenizer.eos_token = "<|endoftext|>"
         logger.info("Add eos token: {}".format(tokenizer.eos_token))
     if tokenizer.pad_token_id is None:
-        if tokenizer.unk_token_id is not None:
-            tokenizer.pad_token = tokenizer.unk_token
-        else:
-            tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.pad_token = tokenizer.eos_token
         logger.info("Add pad token: {}".format(tokenizer.pad_token))
+    if name is None:
+        return None
+    template = templates.get(name, None)
+    assert template is not None, "Template {} does not exist.".format(name)
     tokenizer.add_special_tokens(
-        dict(additional_special_tokens=additional_special_tokens),
+        dict(additional_special_tokens=template.stop_words),
         replace_additional_special_tokens=False
     )
     return template
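After this change the eos/pad fixes are applied before the template lookup, and passing `name=None` skips template resolution entirely. A hedged usage sketch; the model id is only an example, and the encode_oneturn call assumes the signature declared earlier in this file:
```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan2-13B-Chat", trust_remote_code=True)
template = get_template_and_fix_tokenizer("baichuan2", tokenizer)
# tokenizer now has eos/pad tokens set and the template's stop words
# registered as additional special tokens
prompt_ids, answer_ids = template.encode_oneturn(tokenizer, "Hello", "Hi there!")
```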
@@ -464,18 +459,18 @@ register_template(
     ],
     system="",
     sep=[
+        {"token": "<eoa>"},
         "\n"
     ],
     stop_words=[
-        "</s>", # internlm cannot replace eos token
         "<eoa>"
-    ]
+    ],
+    efficient_eos=True
 )
r""" r"""
Supports: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat Supports: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat
Used for training and inference of the fine-tuned models.
""" """
register_template( register_template(
name="baichuan", name="baichuan",
...@@ -485,33 +480,31 @@ register_template( ...@@ -485,33 +480,31 @@ register_template(
prompt=[ prompt=[
{"token": "<reserved_102>"}, # user token {"token": "<reserved_102>"}, # user token
"{{query}}", "{{query}}",
{"token": "<reserved_103>"} # assistant token {"token": "<reserved_103>"} # assistant token
], ],
system="", system="",
sep=[], sep=[],
stop_words=[] efficient_eos=True
) )
r""" r"""
Supports: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat Supports: https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat
Used for inference of the original model. https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat
""" """
register_template( register_template(
name="baichuan_eval", name="baichuan2",
prefix=[ prefix=[
"{{system}}", "{{system}}"
{"token": "<reserved_102>"} # user token
], ],
prompt=[ prompt=[
{"token": "<reserved_106>"}, # user token
"{{query}}", "{{query}}",
{"token": "<reserved_103>"} # assistant token {"token": "<reserved_107>"} # assistant token
], ],
system="", system="",
sep=[], sep=[],
stop_words=[ efficient_eos=True
"<reserved_102>" # user token
]
) )
@@ -524,7 +517,6 @@ register_template(
     prefix=[
         {"token": "<|system|>"},
         "\n{{system}}",
-        {"token": "<|end|>"}
     ],
     prompt=[
         {"token": "<|user|>"},
@@ -535,11 +527,13 @@ register_template(
     ],
     system="",
     sep=[
+        {"token": "<|end|>"},
         "\n"
     ],
     stop_words=[
         "<|end|>"
-    ]
+    ],
+    efficient_eos=True
 )
@@ -550,8 +544,7 @@ register_template(
     name="chatml",
     prefix=[
         {"token": "<|im_start|>"},
-        "system\n{{system}}",
-        {"token": "<|im_end|>"}
+        "system\n{{system}}"
     ],
     prompt=[
         {"token": "<|im_start|>"},
@@ -563,11 +556,13 @@ register_template(
     ],
     system="You are a helpful assistant.",
     sep=[
+        {"token": "<|im_end|>"},
         "\n"
     ],
     stop_words=[
         "<|im_end|>"
-    ]
+    ],
+    efficient_eos=True
 )
@@ -587,7 +582,8 @@ register_template(
     system="",
     sep=[
         "\n\n"
-    ]
+    ],
+    efficient_eos=True
 )
...
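Taken together, `efficient_eos` is how templates whose separators already carry an end-of-turn token (internlm, baichuan, chatml, and so on) avoid emitting a second eos after every turn. A hedged sketch of registering a made-up template with the new flag; "my_chat" and its tokens are illustrative and not part of this repository:
```
register_template(
    name="my_chat",
    prefix=[
        "{{system}}"
    ],
    prompt=[
        {"token": "<user>"},
        "{{query}}",
        {"token": "<assistant>"}
    ],
    system="You are a helpful assistant.",
    sep=[
        {"token": "<end_of_turn>"},
        "\n"
    ],
    stop_words=[
        "<end_of_turn>"
    ],
    efficient_eos=True  # the turn-end token replaces the trailing eos
)
```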
@@ -11,24 +11,23 @@ class DatasetAttr:
     dataset_name: Optional[str] = None
     dataset_sha1: Optional[str] = None
     system_prompt: Optional[str] = None
-    stage: Optional[str] = None
+    ranking: Optional[bool] = False
+    prompt: Optional[str] = "instruction"
+    query: Optional[str] = "input"
+    response: Optional[str] = "output"
+    history: Optional[str] = None

     def __repr__(self) -> str:
         return self.dataset_name

-    def __post_init__(self):
-        self.prompt = "instruction"
-        self.query = "input"
-        self.response = "output"
-        self.history = None

 @dataclass
 class DataArguments:
     r"""
     Arguments pertaining to what data we are going to input our model for training and evaluation.
     """
-    template: str = field(
+    template: Optional[str] = field(
+        default=None,
         metadata={"help": "Which template to use for constructing prompts in training and inference."}
     )
     dataset: Optional[str] = field(
@@ -36,7 +35,7 @@ class DataArguments:
         metadata={"help": "The name of provided dataset(s) to use. Use commas to separate multiple datasets."}
     )
     dataset_dir: Optional[str] = field(
-        default="data",
+        default="/public/home/zhaoying1/work/Baichuan-13B-main/LLaMA-Efficient-Tuning-remove-pe/data",
         metadata={"help": "The name of the folder containing datasets."}
     )
     split: Optional[str] = field(
@@ -48,7 +47,7 @@ class DataArguments:
         metadata={"help": "Enable streaming mode."}
     )
     buffer_size: Optional[int] = field(
-        default=16384,
+        default=1024,
         metadata={"help": "Size of the buffer to randomly sample examples from in streaming mode."}
     )
     mix_strategy: Optional[Literal["concat", "interleave_under", "interleave_over"]] = field(
@@ -114,21 +113,14 @@ class DataArguments:
                 raise ValueError("Undefined dataset {} in dataset_info.json.".format(name))

             if "hf_hub_url" in dataset_info[name]:
-                dataset_attr = DatasetAttr(
-                    "hf_hub",
-                    dataset_name=dataset_info[name]["hf_hub_url"],
-                    stage=dataset_info[name].get("stage", None))
+                dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
             elif "script_url" in dataset_info[name]:
-                dataset_attr = DatasetAttr(
-                    "script",
-                    dataset_name=dataset_info[name]["script_url"],
-                    stage=dataset_info[name].get("stage", None))
+                dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
             else:
                 dataset_attr = DatasetAttr(
                     "file",
                     dataset_name=dataset_info[name]["file_name"],
-                    dataset_sha1=dataset_info[name].get("file_sha1", None),
-                    stage=dataset_info[name].get("stage", None)
+                    dataset_sha1=dataset_info[name].get("file_sha1", None)
                 )

             if "columns" in dataset_info[name]:
@@ -137,5 +129,6 @@ class DataArguments:
                 dataset_attr.response = dataset_info[name]["columns"].get("response", None)
                 dataset_attr.history = dataset_info[name]["columns"].get("history", None)

+            dataset_attr.ranking = dataset_info[name].get("ranking", False)
             dataset_attr.system_prompt = prompt_list[i]
             self.dataset_list.append(dataset_attr)
@@ -16,7 +16,7 @@ class ModelArguments:
         metadata={"help": "Where to store the pretrained models downloaded from huggingface.co."}
     )
     use_fast_tokenizer: Optional[bool] = field(
-        default=False,
+        default=True,
         metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}
     )
     use_auth_token: Optional[bool] = field(
@@ -27,10 +27,6 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}
     )
-    padding_side: Optional[Literal["left", "right"]] = field(
-        default="left",
-        metadata={"help": "The side on which the model should have padding applied."}
-    )
     quantization_bit: Optional[int] = field(
         default=None,
         metadata={"help": "The number of bits to quantize the model."}
@@ -47,6 +43,10 @@ class ModelArguments:
         default=None,
         metadata={"help": "Adopt scaled rotary positional embeddings."}
     )
+    flash_attn: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enable flash attention for faster training."}
+    )
     checkpoint_dir: Optional[str] = field(
         default=None,
         metadata={"help": "Path to the directory(s) containing the delta model checkpoints as well as the configurations."}
...