Commit d74a64c4 authored by chenzk

v1.0
# Model code
modelCode=840
# Model name
modelName=firefly-llama3_unsloth
# Model description
modelDescription=QLoRA training of Llama3-8B with Unsloth needs as little as 7.75 GB of GPU memory, which means Llama3-8B can be trained on a single 1080Ti-class card.
# Application scenarios
appScenario=Training, dialogue Q&A, manufacturing, broadcast media, finance, energy, healthcare, home, education
# Framework type
frameType=unsloth
# accelerate==0.21.0
# transformers==4.37.2
peft==0.10.0
# bitsandbytes==0.39.0
loguru==0.7.0
numpy==1.26.4
pandas==2.2.2
# tqdm==4.62.3
# deepspeed==0.9.5
tensorboard
sentencepiece
transformers_stream_generator
tiktoken
einops
httpx
scipy
# torch==1.13.1
mmengine
# xformers
astunparse==1.6.2
# flash_attn
datasets
trl==0.7.11
typing_extensions==4.9.0
mpi4py
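The commented-out entries above (accelerate, transformers, bitsandbytes, deepspeed, torch, xformers, flash_attn, tqdm) are presumably installed separately so their versions can match the local CUDA build. A minimal install sketch, assuming the list above is saved as requirements.txt:
# pip install -r requirements.txt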
from transformers import AutoTokenizer, AutoConfig, AddedToken
import torch
from loguru import logger
import copy
import sys
sys.path.append("../../")
from component.utils import ModelUtils
from component.template import template_dict
def build_prompt_chatglm3(tokenizer, query, history, system=None):
history.append({"role": 'user', 'message': query})
# system
input_ids = tokenizer.get_prefix_tokens() + \
[tokenizer.get_command(f"<|system|>")] + \
tokenizer.encode(system, add_special_tokens=False)
# convs
for item in history:
role, message = item['role'], item['message']
if role == 'user':
tokens = [tokenizer.get_command(f"<|user|>")] + \
tokenizer.encode(message, add_special_tokens=False) + \
[tokenizer.get_command(f"<|assistant|>")]
else:
tokens = tokenizer.encode(message, add_special_tokens=False) + [tokenizer.eos_token_id]
input_ids += tokens
return input_ids
def build_prompt(tokenizer, template, query, history, system=None):
template_name = template.template_name
system_format = template.system_format
user_format = template.user_format
assistant_format = template.assistant_format
system = system if system is not None else template.system
if template_name == 'chatglm2':
prompt = tokenizer.build_prompt(query, history)
input_ids = tokenizer.encode(prompt)
elif template_name == 'chatglm3':
input_ids = build_prompt_chatglm3(tokenizer, query, history, system)
else:
history.append({"role": 'user', 'message': query})
input_ids = []
# setting system information
if system_format is not None:
# the system message is not empty
if system is not None:
system_text = system_format.format(content=system)
input_ids = tokenizer.encode(system_text, add_special_tokens=False)
# concat conversation
for item in history:
role, message = item['role'], item['message']
if role == 'user':
message = user_format.format(content=message, stop_token=tokenizer.eos_token)
else:
message = assistant_format.format(content=message, stop_token=tokenizer.eos_token)
tokens = tokenizer.encode(message, add_special_tokens=False)
input_ids += tokens
input_ids = torch.tensor([input_ids], dtype=torch.long)
return input_ids
def load_tokenizer(model_name_or_path):
# config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
model_name_or_path,
trust_remote_code=True,
use_fast=False
# llama does not support the fast tokenizer
# use_fast=False if config.model_type == 'llama' else True
)
if tokenizer.__class__.__name__ == 'QWenTokenizer':
tokenizer.pad_token_id = tokenizer.eod_id
tokenizer.bos_token_id = tokenizer.eod_id
tokenizer.eos_token_id = tokenizer.eod_id
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# assert tokenizer.pad_token_id is not None, "pad_token_id should not be None"
return tokenizer
def main():
# run inference with the merged model
# model_name_or_path = 'Qwen/Qwen-7B-Chat'
# template_name = 'qwen'
# adapter_name_or_path = None
model_name_or_path = '01-ai/Yi-6B-Chat'
template_name = 'yi'
adapter_name_or_path = None
template = template_dict[template_name]
# whether to run inference in 4-bit: saves a lot of GPU memory, but quality may drop slightly
load_in_4bit = False
# generation hyperparameters
max_new_tokens = 500
top_p = 0.9
temperature = 0.35
repetition_penalty = 1.0
# load the model
logger.info(f'Loading model from: {model_name_or_path}')
logger.info(f'adapter_name_or_path: {adapter_name_or_path}')
model = ModelUtils.load_model(
model_name_or_path,
load_in_4bit=load_in_4bit,
adapter_name_or_path=adapter_name_or_path
).eval()
tokenizer = load_tokenizer(model_name_or_path if adapter_name_or_path is None else adapter_name_or_path)
if template_name == 'chatglm2':
stop_token_id = tokenizer.eos_token_id
elif template_name == 'chatglm3':
stop_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), tokenizer.get_command("<|observation|>")]
else:
if template.stop_word is None:
template.stop_word = tokenizer.eos_token
stop_token_id = tokenizer.convert_tokens_to_ids(template.stop_word)
history = []
query = input('User:')
while True:
query = query.strip()
input_ids = build_prompt(tokenizer, template, query, copy.deepcopy(history), system=None).to(model.device)
outputs = model.generate(
input_ids=input_ids, max_new_tokens=max_new_tokens, do_sample=True,
top_p=top_p, temperature=temperature, repetition_penalty=repetition_penalty,
eos_token_id=stop_token_id
)
outputs = outputs.tolist()[0][len(input_ids[0]):]
response = tokenizer.decode(outputs)
response = response.strip().replace(template.stop_word, "").strip()
# update history
history.append({"role": 'user', 'message': query})
history.append({"role": 'assistant', 'message': response})
print("Firefly:{}".format(response))
query = input('User:')
if __name__ == '__main__':
main()
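main() above is hard-coded to chat with 01-ai/Yi-6B-Chat using the 'yi' template. A minimal sketch of pointing it at a checkpoint trained with this repo instead; the path and template name below are assumptions, not part of the original script:
# model_name_or_path = 'checkpoint/firefly-llama3-8b-qlora-sft-merge'  # hypothetical merged checkpoint path
# template_name = 'llama3'  # assumes a 'llama3' entry exists in template_dict
# adapter_name_or_path = None  # or point this at the un-merged LoRA adapter instead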
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
batch_size = 20
# models to evaluate
model_name_or_paths = [
'internlm/internlm-chat-7b',
'baichuan-inc/Baichuan-13B-Chat',
'THUDM/chatglm2-6b',
'YeungNLP/firefly-baichuan-7b',
'YeungNLP/firefly-baichuan-13b',
'YeungNLP/firefly-internlm-7b',
'YeungNLP/firefly-chatglm2-6b',
'YeungNLP/firefly-ziya-13b',
'YeungNLP/firefly-bloom-1b4',
'YeungNLP/firefly-bloom-2b6-v2',
'YeungNLP/firefly-qwen-7b',
'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16',
'OpenBuddy/openbuddy-llama2-13b-v11.1-bf16',
]
models = []
for model_name_or_path in model_name_or_paths:
# baichuan-7b and qwen have pad_token_id=None, which would break evaluation, so set it explicitly
if 'baichuan-7b' in model_name_or_path.lower():
pad_token = '</s>'
elif 'qwen' in model_name_or_path.lower():
pad_token = '<|endoftext|>'
else:
pad_token = None
abbr = model_name_or_path.split('/')[-1]
model = dict(
type=HuggingFaceCausalLM,
abbr=abbr,
path=model_name_or_path,
tokenizer_path=model_name_or_path,
tokenizer_kwargs=dict(padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
pad_token=pad_token
),
max_out_len=100,
max_seq_len=2048,
batch_size=batch_size,
model_kwargs=dict(device_map='auto', trust_remote_code=True),
batch_padding=False, # if false, inference with for-loop without batch padding
run_cfg=dict(num_gpus=2, num_procs=2),
)
models.append(model)
# datasets to evaluate on
with read_base():
from .datasets.ceval.ceval_ppl import ceval_datasets
from .summarizers.example import summarizer
datasets = [*ceval_datasets]
# python run.py configs/eval_demo.py -w outputs/firefly
import json
import httpx
def main():
url = 'http://127.0.0.1:8877/firefly'
timeout = 60 # request timeout in seconds
# generation hyperparameters
max_new_tokens = 500
top_p = 0.85
temperature = 0.35
repetition_penalty = 1.0
do_sample = True
inputs = '背诵李白的将进酒' # request content: "Recite Li Bai's Bring in the Wine"
inputs = inputs.strip()
params = {
"inputs": inputs,
"max_new_tokens": max_new_tokens,
"top_p": top_p,
"temperature": temperature,
"repetition_penalty": repetition_penalty,
"do_sample": do_sample
}
timeout = httpx.Timeout(timeout)
headers = {"Content-Type": "application/json", "Connection": "close"}
session = httpx.Client(base_url="", headers=headers)
response = session.request("POST", url, json=params, timeout=timeout)
result = json.loads(response.text)['output']
print(result)
if __name__ == '__main__':
main()
from flask import Flask, request
import json
import torch
from loguru import logger
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False # 防止返回中文乱码
@app.route('/firefly', methods=['POST'])
def ds_llm():
params = request.get_json()
inputs = params.pop('inputs').strip()
# chatglm uses its official prompt format
if model.config.model_type == 'chatglm':
text = '[Round 1]\n\n问:{}\n\n答:'.format(inputs)
input_ids = tokenizer(text, return_tensors="pt", add_special_tokens=False).input_ids.to(device)
# for compatibility with qwen-7b: its eos_token gets tokenized, so the corresponding eos_token_id cannot be obtained from the text; add bos/eos ids explicitly
else:
input_ids = tokenizer(inputs, return_tensors="pt", add_special_tokens=False).input_ids.to(device)
bos_token_id = torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long).to(device)
eos_token_id = torch.tensor([[tokenizer.eos_token_id]], dtype=torch.long).to(device)
input_ids = torch.concat([bos_token_id, input_ids, eos_token_id], dim=1)
logger.info(params)
input_ids = input_ids.to(device)
with torch.no_grad():
outputs = model.generate(input_ids=input_ids, eos_token_id=tokenizer.eos_token_id, **params)
outputs = outputs.tolist()[0][len(input_ids[0]):]
# response = tokenizer.batch_decode(outputs)
response = tokenizer.decode(outputs)
response = response.strip().replace(tokenizer.eos_token, "").strip()
result = {
'input': inputs,
'output': response
}
with open(log_file, 'a', encoding='utf8') as f:
data = json.dumps(result, ensure_ascii=False)
f.write('{}\n'.format(data))
return result
if __name__ == '__main__':
# settings
model_name_or_path = 'YeungNLP/firefly-baichuan-13b'
log_file = 'service_history.txt'
port = 8877
device = 'cuda'
logger.info(f"Starting to load the model {model_name_or_path} into memory")
# load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
trust_remote_code=True,
low_cpu_mem_usage=True,
torch_dtype=torch.float16,
device_map='auto'
).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(
model_name_or_path,
trust_remote_code=True,
# llama does not support the fast tokenizer
use_fast=False if model.config.model_type == 'llama' else True
)
# QWenTokenizer is special: pad_token_id, bos_token_id and eos_token_id are all None; eod_id corresponds to the <|endoftext|> token
if tokenizer.__class__.__name__ == 'QWenTokenizer':
tokenizer.pad_token_id = tokenizer.eod_id
tokenizer.bos_token_id = tokenizer.eod_id
tokenizer.eos_token_id = tokenizer.eod_id
logger.info(f"Successfully loaded the model {model_name_or_path} into memory")
# count model parameters
total = sum(p.numel() for p in model.parameters())
print("Total model params: %.2fM" % (total / 1e6))
model.eval()
app.run(port=port)
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch
"""
Use this script to merge LoRA weights into the base model.
"""
def merge_lora_to_base_model():
model_name_or_path = 'baichuan-inc/baichuan-7B'
adapter_name_or_path = 'YeungNLP/firefly-baichuan-7b-qlora-sft'
save_path = 'checkpoint/firefly-baichuan-7b-qlora-sft-merge'
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(
adapter_name_or_path,
trust_remote_code=True,
# llama does not support the fast tokenizer
use_fast=False if config.model_type == 'llama' else True
)
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
trust_remote_code=True,
low_cpu_mem_usage=True,
torch_dtype=torch.float16,
# device_map='auto',
device_map={'': 'cpu'}
)
model = PeftModel.from_pretrained(model, adapter_name_or_path, device_map={'': 'cpu'})
model = model.merge_and_unload()
tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)
if __name__ == '__main__':
merge_lora_to_base_model()
import argparse
from loguru import logger
import os
from os.path import join
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import bitsandbytes as bnb
from component.collator import PretrainCollator, SFTDataCollator
from component.argument import CustomizedArguments
from component.template import template_dict
from component.dataset import (
UnifiedSFTDataset,
ChatGLM2SFTDataset,
ChatGLM3SFTDataset,
UnifiedDPODataset
)
from transformers import (
set_seed,
HfArgumentParser,
TrainingArguments,
AutoTokenizer,
AutoModelForCausalLM,
AutoConfig,
BitsAndBytesConfig,
Trainer,
AddedToken
)
import importlib.util
if importlib.util.find_spec('unsloth') is not None:
from unsloth import FastLanguageModel
from datasets import load_dataset, concatenate_datasets
import datasets
from itertools import chain
from tqdm import tqdm
import json
from trl import DPOTrainer, get_kbit_device_map
import torch.nn as nn
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
def setup_everything():
parser = argparse.ArgumentParser()
# parser.add_argument("--train_args_file", type=str, default='train_args/pretrain/full/bloom-1b1-pretrain-full.json', help="")
parser.add_argument("--train_args_file", type=str, default='train_args/sft/qlora/qwen-7b-sft-qlora.json', help="")
parser.add_argument("--local_rank", type=int, help="")
args = parser.parse_args()
train_args_file = args.train_args_file
# read the training argument configuration
parser = HfArgumentParser((CustomizedArguments, TrainingArguments))
# parse custom arguments as well as the built-in TrainingArguments
args, training_args = parser.parse_json_file(json_file=train_args_file)
# create the output directory
if not os.path.exists(training_args.output_dir):
os.makedirs(training_args.output_dir)
logger.add(join(training_args.output_dir, 'train.log'))
logger.info("train_args:{}".format(training_args))
# load the training config file
with open(train_args_file, "r") as f:
train_args = json.load(f)
# save the training arguments to the output directory
with open(join(training_args.output_dir, 'train_args.json'), "w") as f:
json.dump(train_args, f, indent=4)
# set the random seed
set_seed(training_args.seed)
# check some setting
assert args.task_type in ['pretrain', 'sft', 'dpo'], "task_type should be in ['pretrain', 'sft', 'dpo']"
assert args.train_mode in ['full', 'lora', 'qlora'], "train_mode should be in ['full', 'lora', 'qlora']"
assert sum([training_args.fp16, training_args.bf16]) == 1, "exactly one of fp16 and bf16 must be True"
# assert not (args.task_type == 'dpo' and args.use_unsloth), 'We have not tested Unsloth during DPO yet. Please set use_unsloth=False when task_type=dpo'
return args, training_args
def find_all_linear_names(model, train_mode):
"""
Find every linear layer so that an adapter can be attached to each of them.
"""
assert train_mode in ['lora', 'qlora']
cls = bnb.nn.Linear4bit if train_mode == 'qlora' else nn.Linear
lora_module_names = set()
for name, module in model.named_modules():
if isinstance(module, cls):
names = name.split('.')
lora_module_names.add(names[0] if len(names) == 1 else names[-1])
if 'lm_head' in lora_module_names: # needed for 16-bit
lora_module_names.remove('lm_head')
lora_module_names = list(lora_module_names)
logger.info(f'LoRA target module names: {lora_module_names}')
return lora_module_names
def load_pretrain_dataset(training_args, args, tokenizer):
"""
Preprocess the pretraining data with multiple worker processes.
"""
def tokenize_function(examples):
output = tokenizer(examples["text"])
output = {'input_ids': output.input_ids}
return output
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
if total_length >= max_seq_length:
total_length = (total_length // max_seq_length) * max_seq_length
# Split by chunks of max_len.
result = {
k: [t[i: i + max_seq_length] for i in range(0, total_length, max_seq_length)]
for k, t in concatenated_examples.items()
}
return result
data_path = args.train_file
max_seq_length = args.max_seq_length
# create the cache directory
cache_dir = join(data_path, 'cache')
os.makedirs(cache_dir, exist_ok=True)
logger.info('Pretraining data path: {}'.format(data_path))
# scan all jsonl files
logger.info('Scanning all training files...')
files = []
for root, dir_names, file_names in os.walk(data_path):
for file_name in file_names:
file = join(root, file_name)
if file_name.endswith('.jsonl'):
files.append(file)
logger.info(f'Total number of training files: {len(files)}')
# tokenize all texts and pack them into fixed-length chunks
with training_args.main_process_first(desc="dataset map tokenization and grouping"):
pretrain_dataset = [] # accumulates every dataset
for idx, file in enumerate(tqdm(files)):
logger.info(f'Loading file: {file}')
file_name = os.path.basename(file)
file_name = file_name.replace('.jsonl', '')
cache_path = os.path.join(cache_dir, file_name)
os.makedirs(cache_path, exist_ok=True)
try:
processed_dataset = datasets.load_from_disk(cache_path, keep_in_memory=False)
logger.info(f'Finished loading datasets-{file_name} from cache')
except Exception:
tmp_cache_path = join(cache_path, 'tmp') # temporary cache directory, intended to be cleaned up afterwards
logger.info(f'There is no cache of file {file_name}, start preprocessing...')
raw_dataset = load_dataset("json", data_files=file, cache_dir=tmp_cache_path, keep_in_memory=False)
tokenized_dataset = raw_dataset.map(
tokenize_function,
batched=True,
num_proc=args.tokenize_num_workers,
remove_columns="text",
load_from_cache_file=True,
keep_in_memory=False,
cache_file_names={k: os.path.join(tmp_cache_path, 'tokenized.arrow') for k in raw_dataset},
desc="Running tokenizer on dataset",
)
grouped_datasets = tokenized_dataset.map(
group_texts,
batched=True,
num_proc=args.tokenize_num_workers,
load_from_cache_file=True,
keep_in_memory=False,
cache_file_names={k: os.path.join(tmp_cache_path, 'grouped.arrow') for k in tokenized_dataset},
desc=f"Grouping texts in chunks of {max_seq_length}",
)
processed_dataset = grouped_datasets
processed_dataset.save_to_disk(cache_path)
# delete the temporary directory
# shutil.rmtree(tmp_cache_path)
logger.info(f"Training number of {file_name}: {len(processed_dataset['train'])}")
if idx == 0:
pretrain_dataset = processed_dataset['train']
else:
assert pretrain_dataset.features.type == processed_dataset["train"].features.type
pretrain_dataset = concatenate_datasets([pretrain_dataset, processed_dataset["train"]])
logger.info(f"Total training number: {len(pretrain_dataset)}")
return pretrain_dataset
def load_tokenizer(args):
config = AutoConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True)
# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
args.model_name_or_path,
trust_remote_code=True,
# llama and internlm2 do not support the fast tokenizer
use_fast=False if config.model_type == 'llama' or config.model_type == 'internlm2' else True
)
# for some models, the base and chat tokenizers differ
if 'internlm2' in args.model_name_or_path.lower():
tokenizer._added_tokens_encoder.update({'<|im_start|>': 92543})
tokenizer._added_tokens_encoder.update({'<|im_end|>': 92542})
tokenizer._added_tokens_decoder.update({92543: AddedToken('<|im_start|>')})
tokenizer._added_tokens_decoder.update({92542: AddedToken('<|im_end|>')})
tokenizer.add_special_tokens({'additional_special_tokens': ['<|im_start|>', '<|im_end|>']})
elif 'orion' in args.model_name_or_path.lower():
tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>'})
elif 'gemma' in args.model_name_or_path.lower():
tokenizer.add_special_tokens({'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']})
if tokenizer.__class__.__name__ == 'QWenTokenizer':
tokenizer.pad_token_id = tokenizer.eod_id
tokenizer.bos_token_id = tokenizer.eod_id
tokenizer.eos_token_id = tokenizer.eod_id
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "pad_token_id should not be None"
assert tokenizer.eos_token_id is not None, "eos_token_id should not be None"
logger.info(f'vocab_size of tokenizer: {tokenizer.vocab_size}')
return tokenizer
def load_unsloth_model(args, training_args):
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=args.model_name_or_path,
max_seq_length=args.max_seq_length,
dtype=None,
trust_remote_code=True,
load_in_4bit=True if args.train_mode == 'qlora' else False,
)
if args.train_mode in ['lora', 'qlora']:
logger.info('Initializing PEFT Model...')
target_modules = find_all_linear_names(model, args.train_mode)
model = FastLanguageModel.get_peft_model(
model,
r=args.lora_rank,
target_modules=target_modules,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout,
bias="none",
use_gradient_checkpointing=True,
random_state=training_args.seed,
max_seq_length=args.max_seq_length,
)
logger.info(f'target_modules: {target_modules}')
return {
'model': model,
'ref_model': None,
'peft_config': None
}
def load_model(args, training_args):
"""
Load the model.
"""
assert training_args.bf16 or training_args.fp16, 'bf16 or fp16 should be True'
logger.info(f'Loading model from base model: {args.model_name_or_path}')
logger.info(f'Train model with {args.train_mode}')
# init model kwargs
# todo add flash attention
# attn_implementation = None
torch_dtype = torch.float16 if training_args.fp16 else torch.bfloat16
if args.train_mode == 'qlora':
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16 if training_args.fp16 else torch.bfloat16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
llm_int8_threshold=6.0,
llm_int8_has_fp16_weight=False,
)
else:
quantization_config = None
model_kwargs = dict(
trust_remote_code=True,
# attn_implementation=attn_implementation,
torch_dtype=torch_dtype,
use_cache=False if training_args.gradient_checkpointing else True,
device_map=get_kbit_device_map() if quantization_config is not None else None,
quantization_config=quantization_config,
)
model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **model_kwargs)
# MoE models need the load-balancing (router) loss
if 'output_router_logits' in model.config.to_dict():
logger.info('set output_router_logits as True')
model.config.output_router_logits = True
# QLoRA: casts all the non int8 modules to full precision (fp32) for stability
if args.train_mode == 'qlora' and args.task_type in ['pretrain', 'sft']:
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
# LoRA: Enables the gradients for the input embeddings
if args.train_mode == 'lora' and args.task_type in ['pretrain', 'sft']:
# For backward compatibility
if hasattr(model, "enable_input_require_grads"):
model.enable_input_require_grads()
else:
def make_inputs_require_grad(module, input, output):
output.requires_grad_(True)
model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
# init peft_config
if args.train_mode == 'full':
peft_config = None
else:
# find all linear layers that need an adapter
target_modules = find_all_linear_names(model, args.train_mode)
peft_config = LoraConfig(
r=args.lora_rank,
lora_alpha=args.lora_alpha,
target_modules=target_modules,
lora_dropout=args.lora_dropout,
bias="none",
task_type="CAUSAL_LM",
)
# init peft model
if args.train_mode in ['lora', 'qlora'] and args.task_type in ['pretrain', 'sft']:
model = get_peft_model(model, peft_config)
logger.info(f'memory footprint of model: {model.get_memory_footprint() / (1024 * 1024 * 1024)} GB')
model.print_trainable_parameters()
# init ref_model
if args.task_type == 'dpo':
ref_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **model_kwargs) if args.train_mode == 'full' else None
# pretrain and sft do not need a ref_model
else:
ref_model = None
# count model parameters
total = sum(p.numel() for p in model.parameters())
logger.info("Total model params: %.2fM" % (total / 1e6))
return {
'model': model,
'ref_model': ref_model,
'peft_config': peft_config
}
def load_sft_dataset(args, tokenizer):
if args.template_name not in template_dict.keys():
raise Exception(f"template_name doesn't exist, all template_name: {template_dict.keys()}")
template = template_dict[args.template_name]
if 'chatglm2' in args.model_name_or_path.lower():
logger.info('Loading data with ChatGLM2SFTDataset')
train_dataset = ChatGLM2SFTDataset(args.train_file, tokenizer, args.max_seq_length, template)
elif 'chatglm3' in args.model_name_or_path.lower():
logger.info('Loading data with ChatGLM3SFTDataset')
train_dataset = ChatGLM3SFTDataset(args.train_file, tokenizer, args.max_seq_length, template)
else:
logger.info('Loading data with UnifiedSFTDataset')
train_dataset = UnifiedSFTDataset(args.train_file, tokenizer, args.max_seq_length, template)
return train_dataset
def load_dpo_dataset(args, tokenizer):
if args.template_name not in template_dict.keys():
raise Exception(f"template_name doesn't exist, all template_name: {template_dict.keys()}")
template = template_dict[args.template_name]
train_dataset = UnifiedDPODataset(args.train_file, tokenizer, args.max_seq_length, args.max_prompt_length, template)
return train_dataset
def init_components(args, training_args):
"""
Initialize all components.
"""
training_args.ddp_find_unused_parameters = False
logger.info('Initializing components...')
# load the tokenizer
tokenizer = load_tokenizer(args)
# load the model
if args.use_unsloth:
components = load_unsloth_model(args, training_args)
else:
components = load_model(args, training_args)
model = components['model']
ref_model = components['ref_model']
peft_config = components['peft_config']
# initialize the dataset and collator
if args.task_type == 'pretrain':
logger.info('Train model with pretrain task')
train_dataset = load_pretrain_dataset(training_args, args, tokenizer)
data_collator = PretrainCollator(tokenizer, args.max_seq_length)
elif args.task_type == 'sft':
logger.info('Train model with sft task')
train_dataset = load_sft_dataset(args, tokenizer)
data_collator = SFTDataCollator(tokenizer, args.max_seq_length)
else:
logger.info('Train model with dpo task')
train_dataset = load_dpo_dataset(args, tokenizer)
data_collator = None
# dpo
if args.task_type == 'dpo':
trainer = DPOTrainer(
model,
ref_model,
args=training_args,
beta=args.beta,
train_dataset=train_dataset,
data_collator=data_collator,
tokenizer=tokenizer,
peft_config=peft_config
)
# pretrain or sft
else:
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
tokenizer=tokenizer,
data_collator=data_collator,
)
return trainer
def main():
# configuration and sanity checks
args, training_args = setup_everything()
# load all components
trainer = init_components(args, training_args)
# start training
logger.info("*** starting training ***")
train_result = trainer.train()
# save the final checkpoint
final_save_path = join(training_args.output_dir)
trainer.save_model(final_save_path) # Saves the tokenizer too
# save training metrics
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
if __name__ == "__main__":
main()
{
"output_dir": "output/firefly-minicpm-2b-dpo-full",
"model_name_or_path": "openbmb/MiniCPM-2B-dpo-fp16",
"train_file": "./data/dummy_dpo.jsonl",
"template_name": "minicpm",
"train_mode": "full",
"task_type": "dpo",
"beta": 0.1,
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 5e-7,
"max_seq_length": 1024,
"max_prompt_length": 300,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
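The JSON above is the full-parameter DPO configuration for MiniCPM-2B; the lora_* fields are ignored when train_mode is "full". A minimal single-GPU launch sketch, assuming the training entry script shown earlier is saved as train.py and this config as train_args/dpo/full/minicpm-2b-dpo-full.json (both paths are assumptions):
# python train.py --train_args_file train_args/dpo/full/minicpm-2b-dpo-full.json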