Commit 0938ae70 authored by zhaoying1

fix save method of adapter_model.bin

parent 1b73554f
......@@ -3,4 +3,4 @@ COPY requirements.txt requirements.txt
RUN source /opt/dtk-23.04/env.sh
RUN cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
ENV LANG C.UTF-8
RUN pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
RUN pip install -r requirements.txt --no-deps -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
......@@ -23,9 +23,9 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk
```
Install the dependencies not included in the docker image:
```
pip install transformers==4.28.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip install datasets accelerate peft trl tiktoken jieba rouge-chinese nltk gradio matplotlib uvicorn fastapi sse-starlette
pip install transformers==4.31.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip install accelerate==0.22.0 --no-deps -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip install datasets peft trl tiktoken jieba rouge-chinese nltk gradio matplotlib uvicorn fastapi sse-starlette
```
......@@ -51,9 +51,11 @@ conda create -n chatglm python=3.8
3. Install the remaining dependencies according to requirements.txt:
```
pip install -r requirements.txt
pip install -r requirements.txt --no-deps -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
Note: if libraries such as accelerate or transformers require deepspeed 0.9.3, comment out the corresponding version-check code. deepspeed 0.9.3 has not been adapted yet; deepspeed 0.9.2 works as-is.
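For reference, a minimal sketch (not part of the repo) that fails fast when the installed deepspeed is not the validated 0.9.2:
```python
# Hypothetical guard; raises if the installed deepspeed is not the version
# validated on this platform (0.9.2 per the note above).
import importlib.metadata

def assert_deepspeed_version(expected: str = "0.9.2") -> None:
    installed = importlib.metadata.version("deepspeed")
    if installed != expected:
        raise RuntimeError(f"deepspeed {installed} installed, but only {expected} is validated")

assert_deepspeed_version()
```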
## Dataset
The input data are json files placed in the project's [data](data) directory and specified with the --dataset option (see the examples below); separate multiple input files with `,`. The example json format and field descriptions are as follows:
......@@ -79,6 +81,8 @@ The json file stores a list; each list element is one sample, where
```
For how to use the datasets, see the [data/README.md](data/README_zh.md) file.
Note: set the dataset_dir path at line 38 of [./src/llmtuner/hparams/data_args.py](src/llmtuner/hparams/data_args.py).
## Model Download
Hugging Face model download links:
......
{
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": "auto",
......
{
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": "auto",
......
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export RCCL_NCHANNELS=2
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
export GPU_MAX_HW_QUEUES=16
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
echo "LRANK===============================$lrank"
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export NCCL_IB_HCA=mlx5_0 # NIC 0
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
export LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
export RANK=$comm_rank
export WORLD_SIZE=$comm_size
export MASTER_ADDR=$1
export MASTER_PORT=29500
export NCCL_IB_HCA=mlx5
export NCCL_SOCKET_IFNAME=ib0
export HIP_DIRECT_DISPATCH=0
APP="python3 ../src/train_bash.py --stage sft \
--model_name_or_path ../../baichuan-7b \
--model_name_or_path ../../baichuan-13b-base \
--do_train \
--template default \
--dataset alpaca_gpt4_en,alpaca_gpt4_zh,codealpaca \
--dataset alpaca_gpt4_en \
--finetuning_type lora \
--lora_rank 16 \
--lora_target W_pack,o_proj,gate_proj,down_proj,up_proj \
--output_dir output/baichuan-7b-lora-2-3 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--output_dir out/baichuan-7b-lora-test7 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 1 \
--preprocessing_num_workers 16 \
--preprocessing_num_workers 8 \
--lr_scheduler_type cosine \
--logging_steps 10 \
--save_steps 2000 \
--save_steps 2 \
--eval_steps 2 \
--learning_rate 1e-4 \
--max_grad_norm 0.5 \
--num_train_epochs 1.0 \
--val_size 0.001 \
--evaluation_strategy steps \
--load_best_model_at_end \
--plot_loss \
--fp16 \
--deepspeed deepspeed.json
......
#!/bin/bash
#SBATCH -p kshdnormal
#SBATCH -N 32
#SBATCH -p kshdexclu11
#SBATCH -N 4
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --gres=dcu:4
#SBATCH -J baichuan
#SBATCH -o logs-7B/baichuan-lora-%j.out
#SBATCH -e logs-7B/baichuan-lora-%j.err
ulimit -u 200000
#SBATCH -o logs-13B/baichuan-lora-%j.out
#SBATCH -e logs-13B/baichuan-lora-%j.err
#SBATCH --exclusive
ulimit -s unlimited
export HIP_VISIBLE_DEVICES=0,1,2,3
export MIOPEN_FIND_MODE=3
export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
export MIOPEN_USER_DB_PATH=/tmp/miopen-udb
export MIOPEN_CUSTOM_CACHE_DIR=/tmp/miopen-cache
export NCCL_SOCKET_IFNAME=ib0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_IB_HCA=mlx5
export NCCL_DEBUG=INFO
export MIOPEN_FIND_MODE=3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_COMPILE_PARALLEL_LEVEL=1
export NCCL_PLUGIN_P2P=ucx
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
echo "START TIME: $(date)"
hostfile=./hostfile/$SLURM_JOB_ID
nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST ))
nodes_array=($nodes)
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo Node IP: $head_node_ip
echo headnode: $head_node
NODE_RANK=$SLURM_NODEID
hostfile=./hostfile/$SLURM_JOB_ID # get the node list
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
rm `pwd`/hostfile-dl -f
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID # one entry per node, 4 slots each
done
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(cat $hostfile|sort|uniq |wc -l) # deduplicate nodes
np=$(($np*4))
nodename=$(cat $hostfile |sed -n "1p")
nodename=$(cat $hostfile |sed -n "1p") #读取每行节点 第一个是主节点
dist_url=`echo $nodename | awk '{print $1}'`
mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run-7b-sft-lora-single.sh $dist_url $np
mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/run-7b-single-lora.sh $dist_url
import torch
from typing import Any, Dict, Generator, List, Optional, Tuple
from threading import Thread
from transformers import TextIteratorStreamer
from transformers import GenerationConfig, TextIteratorStreamer
from llmtuner.extras.misc import dispatch_model, get_logits_processor
from llmtuner.extras.template import get_template_and_fix_tokenizer
......@@ -14,7 +14,6 @@ class ChatModel:
model_args, data_args, finetuning_args, self.generating_args = get_infer_args(args)
self.model, self.tokenizer = load_model_and_tokenizer(model_args, finetuning_args)
self.model = dispatch_model(self.model)
self.model = self.model.eval() # enable evaluation mode
self.template = get_template_and_fix_tokenizer(data_args.template, self.tokenizer)
self.system_prompt = data_args.system_prompt
......@@ -41,26 +40,30 @@ class ChatModel:
max_length = input_kwargs.pop("max_length", None)
max_new_tokens = input_kwargs.pop("max_new_tokens", None)
gen_kwargs = self.generating_args.to_dict()
gen_kwargs.update(dict(
input_ids=input_ids,
do_sample=do_sample if do_sample is not None else gen_kwargs["do_sample"],
temperature=temperature or gen_kwargs["temperature"],
top_p=top_p or gen_kwargs["top_p"],
top_k=top_k or gen_kwargs["top_k"],
repetition_penalty=repetition_penalty or gen_kwargs["repetition_penalty"],
eos_token_id=list(set([self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids)),
pad_token_id=self.tokenizer.pad_token_id,
logits_processor=get_logits_processor()
generating_args = self.generating_args.to_dict()
generating_args.update(dict(
do_sample=do_sample if do_sample is not None else generating_args["do_sample"],
temperature=temperature or generating_args["temperature"],
top_p=top_p or generating_args["top_p"],
top_k=top_k or generating_args["top_k"],
repetition_penalty=repetition_penalty or generating_args["repetition_penalty"],
eos_token_id=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids,
pad_token_id=self.tokenizer.pad_token_id
))
if max_length:
gen_kwargs.pop("max_new_tokens", None)
gen_kwargs["max_length"] = max_length
generating_args.pop("max_new_tokens", None)
generating_args["max_length"] = max_length
if max_new_tokens:
gen_kwargs.pop("max_length", None)
gen_kwargs["max_new_tokens"] = max_new_tokens
generating_args.pop("max_length", None)
generating_args["max_new_tokens"] = max_new_tokens
gen_kwargs = dict(
inputs=input_ids,
generation_config=GenerationConfig(**generating_args),
logits_processor=get_logits_processor()
)
return gen_kwargs, prompt_length
......
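The refactor above moves all sampling parameters into a single GenerationConfig instead of loose gen_kwargs. A minimal, self-contained sketch of how such gen_kwargs feed model.generate (the model id is a stand-in, not the repo's target checkpoint):
```python
# Sketch only: sampling params now travel inside one GenerationConfig object.
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

name = "gpt2"  # illustrative; the repo targets Baichuan/LLaMA checkpoints
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name)

input_ids = tokenizer("Hello", return_tensors="pt").input_ids
gen_kwargs = dict(
    inputs=input_ids,
    generation_config=GenerationConfig(max_new_tokens=16, do_sample=True, top_p=0.7),
)
print(tokenizer.decode(model.generate(**gen_kwargs)[0]))
```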
......@@ -31,11 +31,15 @@ def preprocess_dataset(
yield query, response, history, system
def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, Any]:
# build grouped texts with format `X1 X2 X3 ...` (without <eos>)
if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding): # for tiktoken tokenizer (Qwen)
kwargs = dict(allowed_special="all")
# build grouped texts with format `X1 X2 X3 ...`
if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding):
kwargs = dict(allowed_special="all") # for tiktoken tokenizer (Qwen)
else:
kwargs = dict(add_special_tokens=False)
kwargs = dict(add_special_tokens=True)
if hasattr(tokenizer, "add_bos_token") and hasattr(tokenizer, "add_eos_token"):
setattr(tokenizer, "add_bos_token", True) # for LLaMA tokenizer
setattr(tokenizer, "add_eos_token", True)
tokenized_examples = tokenizer(examples["prompt"], **kwargs)
concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
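A toy check of the tokenizer flags toggled above, assuming a LLaMA-style tokenizer that exposes add_bos_token/add_eos_token (the model id is illustrative):
```python
# Illustration only: with add_bos_token/add_eos_token set, every tokenized
# chunk is wrapped as [bos, ..., eos] before the pre-training packing step.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
if hasattr(tok, "add_bos_token") and hasattr(tok, "add_eos_token"):
    tok.add_bos_token = True
    tok.add_eos_token = True
print(tok("hello world").input_ids)  # starts with the bos id, ends with the eos id
```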
......@@ -59,7 +63,9 @@ def preprocess_dataset(
for query, response, history, system in construct_example(examples):
input_ids, labels = [], []
for source_ids, target_ids in template.encode_multiturn(tokenizer, query, response, history, system):
for turn_idx, (source_ids, target_ids) in enumerate(template.encode_multiturn(
tokenizer, query, response, history, system
)):
if len(source_ids) > data_args.max_source_length:
source_ids = source_ids[:data_args.max_source_length]
if len(target_ids) > data_args.max_target_length:
......@@ -68,8 +74,17 @@ def preprocess_dataset(
if len(input_ids) + len(source_ids) + len(target_ids) > max_length:
break
if turn_idx != 0 and template.efficient_eos:
source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1)
else:
source_mask = [IGNORE_INDEX] * len(source_ids)
input_ids += source_ids + target_ids
labels += [IGNORE_INDEX] * len(source_ids) + target_ids
labels += source_mask + target_ids
if template.efficient_eos:
input_ids += [tokenizer.eos_token_id]
labels += [tokenizer.eos_token_id]
model_inputs["input_ids"].append(input_ids)
model_inputs["attention_mask"].append([1] * len(input_ids))
......@@ -89,6 +104,9 @@ def preprocess_dataset(
if len(target_ids) > data_args.max_target_length:
target_ids = target_ids[:data_args.max_target_length]
if template.efficient_eos:
target_ids += [tokenizer.eos_token_id]
model_inputs["input_ids"].append(source_ids)
model_inputs["attention_mask"].append([1] * len(source_ids))
model_inputs["labels"].append(target_ids)
......@@ -109,6 +127,10 @@ def preprocess_dataset(
if len(rejected_ids) > data_args.max_target_length:
rejected_ids = rejected_ids[:data_args.max_target_length]
if template.efficient_eos:
chosen_ids += [tokenizer.eos_token_id]
rejected_ids += [tokenizer.eos_token_id]
model_inputs["prompt_ids"].append(prompt_ids)
model_inputs["chosen_ids"].append(chosen_ids)
model_inputs["rejected_ids"].append(rejected_ids)
......
......@@ -5,7 +5,9 @@ from typing import TYPE_CHECKING
from datetime import timedelta
from transformers import TrainerCallback
from transformers.trainer_utils import has_length
from transformers.trainer_callback import TrainerControl, TrainerState
from transformers.trainer_utils import has_length, PREFIX_CHECKPOINT_DIR
from transformers.training_args import TrainingArguments
from llmtuner.extras.constants import LOG_FILE_NAME
from llmtuner.extras.logging import get_logger
......@@ -17,6 +19,24 @@ if TYPE_CHECKING:
logger = get_logger(__name__)
class SavePeftModelCallback(TrainerCallback):
def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
r"""
Event called after a checkpoint save.
"""
output_dir = os.path.join(args.output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, state.global_step))
getattr(kwargs.get("model"), "pretrained_model").save_pretrained(output_dir)
return control
def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
r"""
Event called at the end of training.
"""
getattr(kwargs.get("model"), "pretrained_model").save_pretrained(args.output_dir)
return control
class LogCallback(TrainerCallback):
def __init__(self, runner=None):
......
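The new SavePeftModelCallback is the core of this commit's fix: the Trainer's default checkpointing saves the wrapped model's state dict rather than a standalone adapter_model.bin, so save_pretrained is invoked explicitly on every save. A small sketch of the directory it targets (PREFIX_CHECKPOINT_DIR is "checkpoint" in transformers):
```python
import os
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

output_dir = "out/baichuan-7b-lora-test7"  # matches the run script above
global_step = 2                            # save_steps in the run script
ckpt_dir = os.path.join(output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, global_step))
print(ckpt_dir)  # out/baichuan-7b-lora-test7/checkpoint-2
# on_save then calls model.pretrained_model.save_pretrained(ckpt_dir), which
# writes adapter_config.json and adapter_model.bin into that folder.
```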
......@@ -2,28 +2,16 @@ IGNORE_INDEX = -100
LOG_FILE_NAME = "trainer_log.jsonl"
VALUE_HEAD_FILE_NAME = "value_head.bin"
FINETUNING_ARGS_NAME = "finetuning_args.json"
LAYERNORM_NAMES = ["norm", "ln_f", "ln_attn", "ln_mlp"]
METHODS = ["full", "freeze", "lora"]
STAGES = [
"SFT",
"Reward Modeling",
"PPO",
"DPO",
"Pre-Training"
]
DATASET_STAGE_MAP = {
"SFT": "sft",
"Pre-Training": "pt",
TRAINING_STAGES = {
"Supervised Fine-Tuning": "sft",
"Reward Modeling": "rm",
"PPO": "sft",
"DPO": "rm"
"PPO": "ppo",
"DPO": "dpo",
"Pre-Training": "pt"
}
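After the rename, TRAINING_STAGES maps each web-UI label directly to the value passed as --stage, for example:
```python
# Mapping copied from the constants above; the lookup result feeds --stage.
TRAINING_STAGES = {
    "Supervised Fine-Tuning": "sft",
    "Reward Modeling": "rm",
    "PPO": "ppo",
    "DPO": "dpo",
    "Pre-Training": "pt",
}
assert TRAINING_STAGES["Supervised Fine-Tuning"] == "sft"
```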
SUPPORTED_MODELS = {
......@@ -54,11 +42,16 @@ SUPPORTED_MODELS = {
"Baichuan-7B": "baichuan-inc/Baichuan-7B",
"Baichuan-13B": "baichuan-inc/Baichuan-13B-Base",
"Baichuan-13B-Chat": "baichuan-inc/Baichuan-13B-Chat",
"Baichuan2-7B": "baichuan-inc/Baichuan2-7B-Base",
"Baichuan2-13B": "baichuan-inc/Baichuan2-13B-Base",
"Baichuan2-7B-Chat": "baichuan-inc/Baichuan2-7B-Chat",
"Baichuan2-13B-Chat": "baichuan-inc/Baichuan2-13B-Chat",
"InternLM-7B": "internlm/internlm-7b",
"InternLM-7B-Chat": "internlm/internlm-chat-7b",
"Qwen-7B": "Qwen/Qwen-7B",
"Qwen-7B-Chat": "Qwen/Qwen-7B-Chat",
"XVERSE-13B": "xverse/XVERSE-13B",
"XVERSE-13B-Chat": "xverse/XVERSE-13B-Chat",
"ChatGLM2-6B-Chat": "THUDM/chatglm2-6b"
}
......@@ -70,6 +63,7 @@ DEFAULT_MODULE = {
"BLOOMZ": "query_key_value",
"Falcon": "query_key_value",
"Baichuan": "W_pack",
"Baichuan2": "W_pack",
"InternLM": "q_proj,v_proj",
"Qwen": "c_attn",
"XVERSE": "q_proj,v_proj",
......@@ -80,7 +74,9 @@ DEFAULT_TEMPLATE = {
"LLaMA2": "llama2",
"ChineseLLaMA2": "llama2_zh",
"Baichuan": "baichuan",
"Baichuan2": "baichuan2",
"InternLM": "intern",
"Qwen": "chatml",
"XVERSE": "xverse",
"ChatGLM2": "chatglm2"
}
import gc
import torch
from typing import TYPE_CHECKING, List, Optional, Tuple
from typing import TYPE_CHECKING, Tuple
from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList
from llmtuner.extras.constants import LAYERNORM_NAMES
if TYPE_CHECKING:
from transformers.modeling_utils import PreTrainedModel
......@@ -28,12 +27,6 @@ class AverageMeter:
self.avg = self.sum / self.count
def get_logits_processor() -> LogitsProcessorList:
logits_processor = LogitsProcessorList()
logits_processor.append(InfNanRemoveLogitsProcessor())
return logits_processor
def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
r"""
Returns the number of trainable parameters and number of all parameters in the model.
......@@ -56,48 +49,17 @@ def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
return trainable_params, all_param
# Includes: (1) cast the layernorm in fp32 (2) make output embedding layer require grads (3) upcast the lm_head to fp32
# Inspired by: https://github.com/huggingface/peft/blob/c0209c35abbf88c63aa267800d98a8e212ed0a42/src/peft/utils/other.py#L35
def prepare_model_for_training(
model: "PreTrainedModel",
finetuning_type: str,
output_layer_name: Optional[str] = "lm_head",
use_gradient_checkpointing: Optional[bool] = True,
layer_norm_names: Optional[List[str]] = LAYERNORM_NAMES
) -> "PreTrainedModel":
for name, param in model.named_parameters():
if param.ndim == 1 and any(layer_norm_name in name for layer_norm_name in layer_norm_names):
param.data = param.data.to(torch.float32)
if use_gradient_checkpointing:
if hasattr(model, "enable_input_require_grads"):
model.enable_input_require_grads()
else:
def make_inputs_require_grad(module, input, output):
output.requires_grad_(True)
model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
model.gradient_checkpointing_enable()
model.config.use_cache = False # turn off when gradient checkpointing is enabled
if finetuning_type != "full" and hasattr(model, output_layer_name):
output_layer: torch.nn.Linear = getattr(model, output_layer_name)
input_dtype = output_layer.weight.dtype
class CastOutputToFloat(torch.nn.Sequential):
def forward(self, x: torch.Tensor) -> torch.Tensor:
return super().forward(x.to(input_dtype)).to(torch.float32)
setattr(model, output_layer_name, CastOutputToFloat(output_layer))
return model
def get_logits_processor() -> LogitsProcessorList:
logits_processor = LogitsProcessorList()
logits_processor.append(InfNanRemoveLogitsProcessor())
return logits_processor
def torch_gc() -> None:
r"""
Collects GPU memory.
"""
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
......
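For reference, the relocated get_logits_processor only installs transformers' inf/nan guard; an equivalent standalone construction and its typical call site:
```python
from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList

def get_logits_processor() -> LogitsProcessorList:
    # removes inf/nan values from logits before sampling to avoid crashes
    logits_processor = LogitsProcessorList()
    logits_processor.append(InfNanRemoveLogitsProcessor())
    return logits_processor

# typical call site: model.generate(..., logits_processor=get_logits_processor())
```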
import os
import torch
from typing import Dict
from transformers.trainer import WEIGHTS_NAME
from transformers.trainer import WEIGHTS_NAME, WEIGHTS_INDEX_NAME
from transformers.modeling_utils import load_sharded_checkpoint
from llmtuner.extras.constants import VALUE_HEAD_FILE_NAME
from llmtuner.extras.logging import get_logger
logger = get_logger(__name__)
def get_state_dict(model: torch.nn.Module) -> Dict[str, torch.Tensor]:
state_dict: Dict[str, torch.Tensor] = model.state_dict()
filtered_state_dict = {}
for k, v in model.named_parameters():
if v.requires_grad:
filtered_state_dict[k] = state_dict[k].cpu().clone().detach()
return filtered_state_dict
def load_trainable_params(model: torch.nn.Module, checkpoint_dir: os.PathLike) -> bool:
weights_file = os.path.join(checkpoint_dir, WEIGHTS_NAME)
if os.path.exists(weights_file):
model_state_dict = torch.load(weights_file, map_location="cpu")
model.load_state_dict(model_state_dict, strict=False) # skip missing keys
elif os.path.exists(os.path.join(checkpoint_dir, WEIGHTS_INDEX_NAME)):
load_sharded_checkpoint(model, checkpoint_dir, strict=False)
else:
logger.warning("Provided path ({}) does not contain pre-trained weights.".format(checkpoint_dir))
return False
return True
def load_valuehead_params(model: torch.nn.Module, checkpoint_dir: os.PathLike) -> bool:
valuehead_file = os.path.join(checkpoint_dir, VALUE_HEAD_FILE_NAME)
if not os.path.exists(valuehead_file):
vhead_file = os.path.join(checkpoint_dir, WEIGHTS_NAME)
if not os.path.exists(vhead_file):
logger.warning("Provided path ({}) does not contain valuehead weights.".format(checkpoint_dir))
return False
valuehead_state_dict = torch.load(valuehead_file, map_location="cpu")
model.register_buffer("reward_head_weight", valuehead_state_dict["summary.weight"])
model.register_buffer("reward_head_bias", valuehead_state_dict["summary.bias"])
model.register_buffer("default_head_weight", torch.zeros_like(valuehead_state_dict["summary.weight"]))
model.register_buffer("default_head_bias", torch.zeros_like(valuehead_state_dict["summary.bias"]))
vhead_params = torch.load(vhead_file, map_location="cpu")
model.register_buffer("reward_head_weight", vhead_params["v_head.summary.weight"], persistent=False)
model.register_buffer("reward_head_bias", vhead_params["v_head.summary.bias"], persistent=False)
model.register_buffer("default_head_weight", torch.zeros_like(vhead_params["v_head.summary.weight"]), persistent=False)
model.register_buffer("default_head_bias", torch.zeros_like(vhead_params["v_head.summary.bias"]), persistent=False)
return True
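The buffers are now registered with persistent=False, so the value-head tensors no longer end up in every saved state dict. A minimal demonstration of that PyTorch behavior:
```python
import torch

m = torch.nn.Module()
m.register_buffer("reward_head_weight", torch.zeros(4), persistent=False)
m.register_buffer("legacy_weight", torch.zeros(4))  # persistent by default

print("reward_head_weight" in m.state_dict())  # False: excluded from checkpoints
print("legacy_weight" in m.state_dict())       # True
```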
......@@ -20,6 +20,7 @@ class Template:
sep: List[Union[str, Dict[str, str]]]
stop_words: List[str]
use_history: bool
efficient_eos: bool
def encode_oneturn(
self,
......@@ -74,19 +75,19 @@ class Template:
self,
tokenizer: "PreTrainedTokenizer"
) -> Tuple[List[int], List[int]]:
if (
tokenizer.bos_token_id is not None
and getattr(tokenizer, "add_bos_token", True)
): # baichuan-13b has no bos token
if tokenizer.bos_token_id is not None and getattr(tokenizer, "add_bos_token", True):
bos_ids = [tokenizer.bos_token_id]
else:
bos_ids = [] # bos token is optional
else: # baichuan, qwen and gpt2 models have no bos token
bos_ids = []
if tokenizer.eos_token_id is not None:
eos_ids = [tokenizer.eos_token_id]
else:
if tokenizer.eos_token_id is None:
raise ValueError("EOS token is required.")
if self.efficient_eos: # used in baichuan, qwen, chatglm, etc.
eos_ids = []
else:
eos_ids = [tokenizer.eos_token_id]
return bos_ids, eos_ids
def _encode(
......@@ -137,6 +138,8 @@ class Template:
token_ids = []
for elem in context:
if isinstance(elem, str):
if len(elem) == 0:
continue
elem = elem.replace("{{system}}", system, 1) if system is not None else elem
elem = elem.replace("{{query}}", query, 1) if query is not None else elem
elem = elem.replace("{{idx}}", idx, 1) if idx is not None else elem
......@@ -184,7 +187,8 @@ def register_template(
system: str,
sep: List[Union[str, Dict[str, str]]],
stop_words: Optional[List[str]] = [],
use_history: Optional[bool] = True
use_history: Optional[bool] = True,
efficient_eos: Optional[bool] = False
) -> None:
template_class = Llama2Template if "llama2" in name else Template
templates[name] = template_class(
......@@ -193,7 +197,8 @@ def register_template(
system=system,
sep=sep,
stop_words=stop_words,
use_history=use_history
use_history=use_history,
efficient_eos=efficient_eos
)
......@@ -201,31 +206,21 @@ def get_template_and_fix_tokenizer(
name: str,
tokenizer: "PreTrainedTokenizer"
) -> Template:
template = templates.get(name, None)
assert template is not None, "Template {} does not exist.".format(name)
additional_special_tokens = template.stop_words
if len(template.stop_words): # inplace method
if tokenizer.eos_token_id is not None:
additional_special_tokens.append(tokenizer.eos_token)
tokenizer.eos_token = additional_special_tokens[0] # use the first stop word as eos token
additional_special_tokens.pop(0)
logger.info("Replace eos token: {}".format(tokenizer.eos_token))
if tokenizer.eos_token_id is None:
tokenizer.eos_token = "<|endoftext|>"
logger.info("Add eos token: {}".format(tokenizer.eos_token))
if tokenizer.pad_token_id is None:
if tokenizer.unk_token_id is not None:
tokenizer.pad_token = tokenizer.unk_token
else:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token = tokenizer.eos_token
logger.info("Add pad token: {}".format(tokenizer.pad_token))
if name is None:
return None
template = templates.get(name, None)
assert template is not None, "Template {} does not exist.".format(name)
tokenizer.add_special_tokens(
dict(additional_special_tokens=additional_special_tokens),
dict(additional_special_tokens=template.stop_words),
replace_additional_special_tokens=False
)
return template
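A hedged usage sketch of the reworked function (assumes the repo is importable; the model id is illustrative):
```python
from transformers import AutoTokenizer
from llmtuner.extras.template import get_template_and_fix_tokenizer

tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan-13B-Chat", trust_remote_code=True)
template = get_template_and_fix_tokenizer("baichuan", tokenizer)
# eos/pad tokens are now guaranteed to exist, and the template's stop words
# have been registered as additional special tokens on the tokenizer.
```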
......@@ -464,18 +459,18 @@ register_template(
],
system="",
sep=[
{"token": "<eoa>"},
"\n"
],
stop_words=[
"</s>", # internlm cannot replace eos token
"<eoa>"
]
],
efficient_eos=True
)
r"""
Supports: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat
Used for training and inference of the fine-tuned models.
"""
register_template(
name="baichuan",
......@@ -485,33 +480,31 @@ register_template(
prompt=[
{"token": "<reserved_102>"}, # user token
"{{query}}",
{"token": "<reserved_103>"} # assistant token
{"token": "<reserved_103>"} # assistant token
],
system="",
sep=[],
stop_words=[]
efficient_eos=True
)
r"""
Supports: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat
Used for inference of the original model.
Supports: https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat
https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat
"""
register_template(
name="baichuan_eval",
name="baichuan2",
prefix=[
"{{system}}",
{"token": "<reserved_102>"} # user token
"{{system}}"
],
prompt=[
{"token": "<reserved_106>"}, # user token
"{{query}}",
{"token": "<reserved_103>"} # assistant token
{"token": "<reserved_107>"} # assistant token
],
system="",
sep=[],
stop_words=[
"<reserved_102>" # user token
]
efficient_eos=True
)
......@@ -524,7 +517,6 @@ register_template(
prefix=[
{"token": "<|system|>"},
"\n{{system}}",
{"token": "<|end|>"}
],
prompt=[
{"token": "<|user|>"},
......@@ -535,11 +527,13 @@ register_template(
],
system="",
sep=[
{"token": "<|end|>"},
"\n"
],
stop_words=[
"<|end|>"
]
],
efficient_eos=True
)
......@@ -550,8 +544,7 @@ register_template(
name="chatml",
prefix=[
{"token": "<|im_start|>"},
"system\n{{system}}",
{"token": "<|im_end|>"}
"system\n{{system}}"
],
prompt=[
{"token": "<|im_start|>"},
......@@ -563,11 +556,13 @@ register_template(
],
system="You are a helpful assistant.",
sep=[
{"token": "<|im_end|>"},
"\n"
],
stop_words=[
"<|im_end|>"
]
],
efficient_eos=True
)
......@@ -587,7 +582,8 @@ register_template(
system="",
sep=[
"\n\n"
]
],
efficient_eos=True
)
......
......@@ -11,24 +11,23 @@ class DatasetAttr:
dataset_name: Optional[str] = None
dataset_sha1: Optional[str] = None
system_prompt: Optional[str] = None
stage: Optional[str] = None
ranking: Optional[bool] = False
prompt: Optional[str] = "instruction"
query: Optional[str] = "input"
response: Optional[str] = "output"
history: Optional[str] = None
def __repr__(self) -> str:
return self.dataset_name
def __post_init__(self):
self.prompt = "instruction"
self.query = "input"
self.response = "output"
self.history = None
@dataclass
class DataArguments:
r"""
Arguments pertaining to what data we are going to input our model for training and evaluation.
"""
template: str = field(
template: Optional[str] = field(
default=None,
metadata={"help": "Which template to use for constructing prompts in training and inference."}
)
dataset: Optional[str] = field(
......@@ -36,7 +35,7 @@ class DataArguments:
metadata={"help": "The name of provided dataset(s) to use. Use commas to separate multiple datasets."}
)
dataset_dir: Optional[str] = field(
default="data",
default="/public/home/zhaoying1/work/Baichuan-13B-main/LLaMA-Efficient-Tuning-remove-pe/data",
metadata={"help": "The name of the folder containing datasets."}
)
split: Optional[str] = field(
......@@ -48,7 +47,7 @@ class DataArguments:
metadata={"help": "Enable streaming mode."}
)
buffer_size: Optional[int] = field(
default=16384,
default=1024,
metadata={"help": "Size of the buffer to randomly sample examples from in streaming mode."}
)
mix_strategy: Optional[Literal["concat", "interleave_under", "interleave_over"]] = field(
......@@ -114,21 +113,14 @@ class DataArguments:
raise ValueError("Undefined dataset {} in dataset_info.json.".format(name))
if "hf_hub_url" in dataset_info[name]:
dataset_attr = DatasetAttr(
"hf_hub",
dataset_name=dataset_info[name]["hf_hub_url"],
stage=dataset_info[name].get("stage", None))
dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
elif "script_url" in dataset_info[name]:
dataset_attr = DatasetAttr(
"script",
dataset_name=dataset_info[name]["script_url"],
stage=dataset_info[name].get("stage", None))
dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
else:
dataset_attr = DatasetAttr(
"file",
dataset_name=dataset_info[name]["file_name"],
dataset_sha1=dataset_info[name].get("file_sha1", None),
stage=dataset_info[name].get("stage", None)
dataset_sha1=dataset_info[name].get("file_sha1", None)
)
if "columns" in dataset_info[name]:
......@@ -137,5 +129,6 @@ class DataArguments:
dataset_attr.response = dataset_info[name]["columns"].get("response", None)
dataset_attr.history = dataset_info[name]["columns"].get("history", None)
dataset_attr.ranking = dataset_info[name].get("ranking", False)
dataset_attr.system_prompt = prompt_list[i]
self.dataset_list.append(dataset_attr)
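With the stage field dropped, a dataset_info.json entry reduces to the fields below; a hypothetical local-file entry and how the code above parses it:
```python
# Hypothetical entry; field names follow the parsing code above.
from llmtuner.hparams.data_args import DatasetAttr  # assumed import path

dataset_info = {
    "alpaca_gpt4_en": {
        "file_name": "alpaca_gpt4_data_en.json",
        "columns": {"prompt": "instruction", "query": "input", "response": "output"},
    }
}

name = "alpaca_gpt4_en"
dataset_attr = DatasetAttr(
    "file",
    dataset_name=dataset_info[name]["file_name"],
    dataset_sha1=dataset_info[name].get("file_sha1", None),
)
dataset_attr.prompt = dataset_info[name]["columns"].get("prompt", "instruction")
```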
......@@ -16,7 +16,7 @@ class ModelArguments:
metadata={"help": "Where to store the pretrained models downloaded from huggingface.co."}
)
use_fast_tokenizer: Optional[bool] = field(
default=False,
default=True,
metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}
)
use_auth_token: Optional[bool] = field(
......@@ -27,10 +27,6 @@ class ModelArguments:
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}
)
padding_side: Optional[Literal["left", "right"]] = field(
default="left",
metadata={"help": "The side on which the model should have padding applied."}
)
quantization_bit: Optional[int] = field(
default=None,
metadata={"help": "The number of bits to quantize the model."}
......@@ -47,6 +43,10 @@ class ModelArguments:
default=None,
metadata={"help": "Adopt scaled rotary positional embeddings."}
)
flash_attn: Optional[bool] = field(
default=False,
metadata={"help": "Enable flash attention for faster training."}
)
checkpoint_dir: Optional[str] = field(
default=None,
metadata={"help": "Path to the directory(s) containing the delta model checkpoints as well as the configurations."}
......