Commit d0572507 authored by lvzhen

Deleted basic_demo/cli_demo.py, basic_demo/cli_demo_bad_word_ids.py, basic_demo/infer_test.py, basic_demo/utils.py, basic_demo/vocab.txt, basic_demo/web_demo.py, basic_demo/web_demo2.py, composite_demo/.streamlit/config.toml, composite_demo/assets/demo.png, composite_demo/assets/emojis.png, composite_demo/assets/heart.png, composite_demo/assets/tool.png, composite_demo/README.md, composite_demo/README_en.md, composite_demo/client.py, composite_demo/conversation.py, composite_demo/demo_chat.py, composite_demo/demo_ci.py, composite_demo/demo_tool.py, composite_demo/main.py, composite_demo/requirements.txt, composite_demo/tool_registry.py, cookbook/data/toutiao_cat_data_example.txt, cookbook/accurate_prompt.ipynb, cookbook/finetune_muti_classfication.ipynb, finetune_basemodel_demo/scripts/finetune_lora.sh, finetune_basemodel_demo/scripts/formate_alpaca2jsonl.py, finetune_basemodel_demo/README.md, finetune_basemodel_demo/arguments.py, finetune_basemodel_demo/finetune.py, finetune_basemodel_demo/inference.py, finetune_basemodel_demo/preprocess_utils.py, finetune_basemodel_demo/requirements.txt, finetune_basemodel_demo/trainer.py, finetune_chatmodel_demo/AdvertiseGen/dev.json, finetune_chatmodel_demo/AdvertiseGen/train.json, finetune_chatmodel_demo/configs/deepspeed.json, finetune_chatmodel_demo/formatted_data/advertise_gen.jsonl, finetune_chatmodel_demo/formatted_data/tool_alpaca.jsonl, finetune_chatmodel_demo/scripts/finetune_ds.sh, finetune_chatmodel_demo/scripts/finetune_ds_multiturn.sh, finetune_chatmodel_demo/scripts/finetune_pt.sh, finetune_chatmodel_demo/scripts/finetune_pt_multiturn.sh, finetune_chatmodel_demo/scripts/format_advertise_gen.py, finetune_chatmodel_demo/scripts/format_tool_alpaca.py, finetune_chatmodel_demo/README.md, finetune_chatmodel_demo/arguments.py, finetune_chatmodel_demo/finetune.py, finetune_chatmodel_demo/inference.py, finetune_chatmodel_demo/preprocess_utils.py, finetune_chatmodel_demo/requirements.txt, finetune_chatmodel_demo/train_data.json, finetune_chatmodel_demo/trainer.py, langchain_demo/Tool/Calculator.py, langchain_demo/Tool/Calculator.yaml, langchain_demo/Tool/Weather.py, langchain_demo/Tool/arxiv_example.yaml, langchain_demo/Tool/weather.yaml, langchain_demo/ChatGLM3.py, langchain_demo/README.md, langchain_demo/main.py, langchain_demo/requirements.txt, langchain_demo/utils.py, media/GLM.png, media/cli.png, media/transformers.jpg, openai_api_demo/openai_api.py, openai_api_demo/openai_api_request.py, openai_api_demo/requirements.txt, openai_api_demo/utils.py, resources/WECHAT.md, resources/cli-demo.png, resources/code_en.gif, resources/heart.png, resources/tool.png, resources/tool_en.png, resources/web-demo.gif, resources/web-demo2.gif, resources/web-demo2.png, resources/wechat.jpg, tool_using/README.md, tool_using/README_en.md, tool_using/cli_demo_tool.py, tool_using/openai_api_demo.py, tool_using/requirements.txt, tool_using/test.py, tool_using/tool_register.py, DEPLOYMENT.md, DEPLOYMENT_en.md, Dockerfile, MODEL_LICENSE, PROMPT.md, PROMPT_en.md, README.md, README_en.md, README_old.md, lvzhen.log, model.properties, requirements.txt files
parent d7be7b1c
#! /usr/bin/env bash
set -ex
LR=1e-4
NUM_GPUS=4
LORA_RANK=8
LORA_ALPHA=32
LORA_DROUPOUT=0.1
MAX_SOURCE_LEN=512
MAX_TARGET_LEN=128
DEV_BATCH_SIZE=1
GRAD_ACCUMULARION_STEPS=1
MAX_STEP=500
SAVE_INTERVAL=50
MAX_SEQ_LEN=512
RUN_NAME=text
BASE_MODEL_PATH=THUDM/chatglm3-6b-base
DATASET_PATH=data/alpaca_data.jsonl
DATESTR=`date +%Y%m%d-%H%M%S`
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${LR}
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
mkdir -p $OUTPUT_DIR
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS finetune.py \
--train_format input-output \
--train_file $DATASET_PATH \
--lora_rank $LORA_RANK \
--lora_alpha $LORA_ALPHA \
--lora_dropout $LORA_DROUPOUT \
--max_seq_length $MAX_SEQ_LEN \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--per_device_train_batch_size $DEV_BATCH_SIZE \
--gradient_accumulation_steps $GRAD_ACCUMULARION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--learning_rate $LR 2>&1 | tee ${OUTPUT_DIR}/train.log
import argparse
import json
import tqdm
def format_example(example: dict) -> dict:
context = f"Instruction: {example['instruction']}\n"
if example.get("input"):
context += f"Input: {example['input']}\n"
context += "Answer: "
target = example["output"]
return {"context": context, "target": target}
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--data_path", type=str, default="data/alpaca_data.json")
parser.add_argument("--save_path", type=str, default="data/alpaca_data.jsonl")
args = parser.parse_args()
print("args:", args)
with open(args.data_path) as f:
examples = json.load(f)
with open(args.save_path, 'w') as f:
for example in tqdm.tqdm(examples, desc="formatting.."):
f.write(json.dumps(format_example(example), ensure_ascii=False) + '\n')
if __name__ == "__main__":
main()
# coding=utf-8
# Copyright 2020-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The Trainer class, to easily train a 🤗 Transformers model from scratch or finetune it on a new task.
"""
import os
from typing import Optional
from transformers import Trainer
import torch
from transformers.modeling_utils import PreTrainedModel, unwrap_model
from transformers.utils import logging
logger = logging.get_logger(__name__)
WEIGHTS_NAME = "pytorch_model.pt"
TRAINING_ARGS_NAME = "training_args.bin"
class LoRATrainer(Trainer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def compute_loss(self, model, inputs, return_outputs=False):
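        # The model computes the loss internally from the `labels` in `inputs`; just return it.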
return model(**inputs).loss
def save_model(self, output_dir=None, _internal_call=False):
output_dir = output_dir if output_dir is not None else self.args.output_dir
os.makedirs(output_dir, exist_ok=True)
logger.info(f"Saving model checkpoint to {output_dir}")
model_to_save = unwrap_model(self.model)
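        # Collect only the trainable parameters (i.e. the LoRA weights) rather than the full state dict.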
saved_params = {
k: v.to("cuda") for k, v in model_to_save.named_parameters() if v.requires_grad
}
torch.save(saved_params, os.path.join(output_dir, WEIGHTS_NAME))
if self.tokenizer is not None:
self.tokenizer.save_pretrained(output_dir)
torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
# ChatGLM3-6B Fine-tuning Examples
This directory provides fine-tuning examples for the ChatGLM3-6B model, covering both full-parameter fine-tuning and P-Tuning v2. In terms of data format, it includes a multi-turn conversation example and an input-output format example.
If you have downloaded the model to a local path, replace every `THUDM/chatglm3-6b` reference in this document and in the code with that path so the model is loaded from disk.
Running the examples requires `python>=3.10`. Besides the base `torch` dependency, the example code also needs:
```bash
pip install -r requirements.txt
```
## Multi-turn Conversation Format
The multi-turn fine-tuning example follows the ChatGLM3 conversation format convention and applies a different `loss_mask` to each role, so the `loss` of every assistant reply in a conversation is computed in a single pass.
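In the sample implementation (see `MultiTurnDataset` in `preprocess_utils.py` further below), the `loss_mask` is turned into `labels` roughly as follows; the wrapper function `build_labels` here is illustrative, and `-100` is the standard ignore index for the loss:

```python
from typing import List

def build_labels(tokens: List[int], loss_masks: List[int]) -> List[int]:
    # Shift the mask by one position and replace masked positions with -100
    # so that they do not contribute to the loss.
    target_based_loss_mask = [False] + [bool(m) for m in loss_masks[:-1]]
    return [(t if m else -100) for t, m in zip(tokens, target_based_loss_mask)]
```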
### Data Format and Preprocessing
The sample data files use the following format.
If you only want to fine-tune the model's conversational ability, not its tool-calling ability, organize your data as follows.
```json
[
{
"conversations": [
{
"role": "system",
"content": "<system prompt text>"
},
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant response text>"
},
// ... Multi Turn
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant response text>"
}
]
}
// ...
]
```
**Note: when fine-tuning runs for many steps, this approach can degrade the model's tool-calling ability.**
If you want to fine-tune both the conversational and the tool-calling abilities of the model, organize your data as follows.
```json
[
{
"tools": [
// available tools, format is not restricted
],
"conversations": [
{
"role": "system",
"content": "<system prompt text>"
},
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant thought to text>"
},
{
"role": "tool",
"name": "<name of the tool to be called>",
"parameters": {
"<parameter_name>": "<parameter_value>"
},
"observation": "<observation>"
// does not have to be a string
},
{
"role": "assistant",
"content": "<assistant response to observation>"
},
// ... Multi Turn
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant response text>"
}
]
}
// ...
]
```
- The system prompt describing the tools does not need to be inserted manually; during preprocessing, the `tools` field is formatted with `json.dumps(..., ensure_ascii=False)` and inserted as the first system prompt.
- Each role may carry a boolean `loss` field indicating whether the content predicted for that turn participates in the `loss` computation. If the field is absent, the sample implementation by default does not compute `loss` for `system` and `user`, and computes it for all other roles (see the sketch after this list).
- `tool` is not a native ChatGLM3 role. During preprocessing, a `tool` turn is automatically converted into an `assistant` turn carrying the tool-call `metadata` (computes `loss` by default) plus an `observation` turn carrying the tool's return value (does not compute `loss`).
- Fine-tuning for the `Code interpreter` task is not implemented yet.
- The `system` role is optional, but if present it must appear before the `user` role, and a complete conversation sample (whether single-turn or multi-turn) may contain only one `system` turn.
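For reference, here is a minimal sketch of these rules, mirroring `format_conversation` in `preprocess_utils.py`; the helper names and the simplified rendering of the tool call are illustrative only:

```python
def takes_loss(turn: dict) -> bool:
    # Optional per-turn `loss` flag; `system` and `user` turns never take loss.
    loss = turn.get("loss", True)
    if turn["role"] in {"system", "user"}:
        loss = False
    return loss

def expand_tool_turn(turn: dict) -> list:
    # A `tool` turn becomes an `assistant` turn carrying the tool call (takes
    # loss by default) plus an `observation` turn carrying the tool's return
    # value (never takes loss). The real code renders the call as a Python
    # code snippet via `format_function_call`; the rendering below is simplified.
    call_code = f"tool_call(**{turn['parameters']!r})"
    return [
        {"role": "assistant", "metadata": turn["name"], "content": call_code,
         "loss": turn.get("loss", True)},
        {"role": "observation", "content": turn.get("observation"), "loss": False},
    ]
```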
As an example, we use the ToolAlpaca dataset for fine-tuning. First, clone the [ToolAlpaca dataset](https://github.com/tangqiaoyu/ToolAlpaca) and run
```bash
./scripts/format_tool_alpaca.py --path "ToolAlpaca/data/train_data.json"
```
to convert the dataset into the format above. Here the tools are deliberately rendered in a natural-language `list[str]` form, so we can observe how well the model understands tool definitions before and after fine-tuning.
### Fine-tuning the Model
The following scripts provide a reference way to fine-tune the model.
```bash
./scripts/finetune_ds_multiturn.sh  # full-parameter fine-tuning
./scripts/finetune_pt_multiturn.sh  # P-Tuning v2 fine-tuning
```
### Deployment
We have updated the ChatGLM3 composite demo so that it can serve fine-tuned model checkpoints.
For full-parameter fine-tuning, deploy as follows:
```bash
cd ../composite_demo
MODEL_PATH="path to finetuned model checkpoint" TOKENIZER_PATH="THUDM/chatglm3-6b" streamlit run main.py
```
For P-Tuning v2 fine-tuning, deploy as follows:
```bash
cd ../composite_demo
MODEL_PATH="THUDM/chatglm3-6b" PT_PATH="path to p-tuning checkpoint" streamlit run main.py
```
## Input-Output Format
For the input-output format, the example expects data in the following form:
```json
[
{
"prompt": "<prompt text>",
"response": "<response text>"
}
// ...
]
```
During preprocessing, no role tokens are concatenated.
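Concretely, the preprocessing done by `InputOutputDataset` (see `preprocess_utils.py` below) boils down to the following simplified sketch; the wrapper `build_features` is illustrative and padding is omitted:

```python
from typing import Dict, List
from transformers import PreTrainedTokenizer

def build_features(example: Dict[str, str], tokenizer: PreTrainedTokenizer,
                   max_source_length: int, max_target_length: int) -> Dict[str, List[int]]:
    # Tokenize the prompt and response separately; only the response (plus the
    # EOS token) is supervised, while prompt positions are masked with -100.
    prompt_ids = tokenizer.encode(example["prompt"], add_special_tokens=True,
                                  truncation=True, max_length=max_source_length)
    response_ids = tokenizer.encode(example["response"], add_special_tokens=False,
                                    truncation=True, max_length=max_target_length)
    input_ids = prompt_ids + response_ids + [tokenizer.eos_token_id]
    labels = [-100] * len(prompt_ids) + response_ids + [tokenizer.eos_token_id]
    return {"input_ids": input_ids, "labels": labels}
```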
As an example, we use the AdvertiseGen dataset for fine-tuning. Download the preprocessed AdvertiseGen dataset from [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing) or [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1), place the extracted `AdvertiseGen` directory under this directory, and run
```bash
./scripts/format_advertise_gen.py --path "AdvertiseGen/train.json"
```
to convert the dataset into the format above.
### Fine-tuning the Model
The following scripts provide a reference way to fine-tune the model.
```bash
./scripts/finetune_ds.sh  # full-parameter fine-tuning
./scripts/finetune_pt.sh  # P-Tuning v2 fine-tuning
```
### Inference Validation
For input-output fine-tuning, `inference.py` provides a basic inference check; the first invocation below loads a P-Tuning v2 checkpoint, the second a fully fine-tuned checkpoint.
```bash
python inference.py \
--pt-checkpoint "path to p-tuning checkpoint" \
--model THUDM/chatglm3-6b
```
```bash
python inference.py \
--tokenizer THUDM/chatglm3-6b \
--model "path to finetuned model checkpoint"
```
### Tips
1. Before training starts, the fine-tuning code prints the preprocessing result of the first training example, which looks like:
```log
Sanity Check >>>>>>>>>>>>>
'[gMASK]': 64790 -> -100
'sop': 64792 -> -100
'<|system|>': 64794 -> -100
'': 30910 -> -100
'\n': 13 -> -100
'Answer': 20115 -> -100
'the': 267 -> -100
'following': 1762 -> -100
...
'know': 683 -> -100
'the': 267 -> -100
'response': 3010 -> -100
'details': 3296 -> -100
'.': 30930 -> -100
'<|assistant|>': 64796 -> -100
'': 30910 -> 30910
'\n': 13 -> 13
'I': 307 -> 307
'need': 720 -> 720
'to': 289 -> 289
'use': 792 -> 792
...
<<<<<<<<<<<<< Sanity Check
```
Each line shows a detokenized string, its token_id, and its target_id. You can check this part of the log to verify that the `loss_mask` matches your expectation; if it does not, the code or the data may need to be adjusted.
2. Reference GPU memory usage
   - P-Tuning v2 with `PRE_SEQ_LEN=128`, `DEV_BATCH_SIZE=1`, `GRAD_ACCUMULARION_STEPS=16`, `MAX_SEQ_LEN=2048` needs about 21 GB of GPU memory.
   - For full-parameter fine-tuning, the configuration in `./scripts/finetune_ds_multiturn.sh` (`MAX_SEQ_LEN=2048`, `DEV_BATCH_SIZE=16`, `GRAD_ACCUMULARION_STEPS=1`) just about fills 4 * 80 GB of GPU memory.
3. If you run out of GPU memory, consider:
   - lowering `DEV_BATCH_SIZE` and raising `GRAD_ACCUMULARION_STEPS`
   - adding `--quantization_bit 8` or `--quantization_bit 4`
   - with `PRE_SEQ_LEN=128`, `DEV_BATCH_SIZE=1`, `GRAD_ACCUMULARION_STEPS=16` and `MAX_SEQ_LEN=1024`, `--quantization_bit 8` needs about 12 GB and `--quantization_bit 4` about 7.6 GB of GPU memory.
## References
```
@inproceedings{liu2022p,
title={P-tuning: Prompt tuning can be comparable to fine-tuning across scales and tasks},
author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
pages={61--68},
year={2022}
}
@misc{tang2023toolalpaca,
title={ToolAlpaca: Generalized Tool Learning for Language Models with 3000 Simulated Cases},
author={Qiaoyu Tang and Ziliang Deng and Hongyu Lin and Xianpei Han and Qiao Liang and Le Sun},
year={2023},
eprint={2306.05301},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
model_name_or_path: str = field(
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
ptuning_checkpoint: str = field(
default=None, metadata={"help": "Path to p-tuning v2 checkpoints"}
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
default=None,
metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
default=True,
metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
)
model_revision: str = field(
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
use_auth_token: bool = field(
default=False,
metadata={
"help": (
"Will use the token generated when running `huggingface-cli login` (necessary to use this script "
"with private models)."
)
},
)
resize_position_embeddings: Optional[bool] = field(
default=None,
metadata={
"help": (
"Whether to automatically resize the position embeddings if `max_source_length` exceeds "
"the model's position embeddings."
)
},
)
quantization_bit: Optional[int] = field(
default=None
)
pre_seq_len: Optional[int] = field(
default=None
)
prefix_projection: bool = field(
default=False
)
@dataclass
class DataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
"""
train_file: Optional[str] = field(
default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
)
max_source_length: Optional[int] = field(
default=1024,
metadata={
"help": (
"The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
)
},
)
max_target_length: Optional[int] = field(
default=128,
metadata={
"help": (
"The maximum total sequence length for target text after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
)
},
)
train_format: str = field(
        default=None, metadata={"help": "The format of the training data file (multi-turn or input-output)"},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: Optional[int] = field(
default=1024,
metadata={
"help": (
"The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
)
},
)
pad_to_max_length: bool = field(
default=False,
metadata={
"help": (
"Whether to pad all samples to model maximum sentence length. "
"If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
"efficient on GPU but very bad for TPU."
)
},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": (
"For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
)
},
)
def __post_init__(self):
extension = self.train_file.split(".")[-1]
assert extension in {"jsonl", "json"}, "`train_file` should be a jsonl or a json file."
assert self.train_format in {"multi-turn", "input-output"}
{
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu" :"auto",
"gradient_accumulation_steps": "auto",
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 3,
"stage3_gather_16bit_weights_on_model_save": true,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": false,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients" : true
}
}
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for sequence to sequence.
"""
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
# Adapted from
import logging
import os
import sys
import torch
import json
import transformers
from transformers import (
AutoConfig,
AutoModel,
AutoTokenizer,
DataCollatorForSeq2Seq,
HfArgumentParser,
Seq2SeqTrainingArguments,
set_seed,
)
from trainer import PrefixTrainer
from arguments import ModelArguments, DataTrainingArguments
from preprocess_utils import sanity_check, MultiTurnDataset, InputOutputDataset
logger = logging.getLogger(__name__)
# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
def main():
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
if training_args.should_log:
# The default of training_args.log_level is passive, so we set log level at info here to have that default.
transformers.utils.logging.set_verbosity_info()
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
# datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")
# Set seed before initializing model.
set_seed(training_args.seed)
# Load pretrained model and tokenizer
config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
config.pre_seq_len = model_args.pre_seq_len
config.prefix_projection = model_args.prefix_projection
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
if model_args.ptuning_checkpoint is not None:
model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True)
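        # Load the p-tuning v2 checkpoint and keep only the prefix encoder weights,
        # stripping the "transformer.prefix_encoder." prefix from the parameter names.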
prefix_state_dict = torch.load(os.path.join(model_args.ptuning_checkpoint, "pytorch_model.bin"))
new_prefix_state_dict = {}
for k, v in prefix_state_dict.items():
if k.startswith("transformer.prefix_encoder."):
new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
elif model_args.pre_seq_len is not None:
model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True)#,empty_init=False)
else:
model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True,empty_init=False)
if model_args.quantization_bit is not None:
print(f"Quantized to {model_args.quantization_bit} bit")
model = model.quantize(model_args.quantization_bit)
if model_args.pre_seq_len is not None:
# P-tuning v2
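        # The backbone runs in fp16 while the prefix encoder is kept in fp32 for training.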
model = model.half()
model.transformer.prefix_encoder.float()
else:
# Finetune
model = model.float()
with open(data_args.train_file, "r", encoding="utf-8") as f:
if data_args.train_file.endswith(".json"):
train_data = json.load(f)
elif data_args.train_file.endswith(".jsonl"):
train_data = [json.loads(line) for line in f]
if data_args.train_format == "multi-turn":
train_dataset = MultiTurnDataset(
train_data,
tokenizer,
data_args.max_seq_length,
)
elif data_args.train_format == "input-output":
train_dataset = InputOutputDataset(
train_data,
tokenizer,
data_args.max_source_length,
data_args.max_target_length,
)
else:
raise ValueError(f"Unknown train format: {data_args.train_format}")
if training_args.local_rank < 1:
sanity_check(train_dataset[0]['input_ids'], train_dataset[0]['labels'], tokenizer)
# Data collator
data_collator = DataCollatorForSeq2Seq(
tokenizer,
model=model,
label_pad_token_id=-100,
pad_to_multiple_of=None,
padding=False
)
# Initialize our Trainer
trainer = PrefixTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
tokenizer=tokenizer,
data_collator=data_collator,
save_changed=model_args.pre_seq_len is not None
)
checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload
trainer.save_state()
if __name__ == "__main__":
main()
import argparse
from transformers import AutoConfig, AutoModel, AutoTokenizer
import torch
import os
parser = argparse.ArgumentParser()
parser.add_argument("--pt-checkpoint", type=str, default=None, help="The checkpoint path")
parser.add_argument("--model", type=str, default=None, help="main model weights")
parser.add_argument("--tokenizer", type=str, default=None, help="tokenizer path (defaults to --model)")
parser.add_argument("--pt-pre-seq-len", type=int, default=128, help="The pre-seq-len used in p-tuning")
parser.add_argument("--device", type=str, default="cuda")
parser.add_argument("--max-new-tokens", type=int, default=128)
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.pt_checkpoint:
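    # A p-tuning v2 checkpoint was given: load the base model, then load the prefix encoder weights from it.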
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
config = AutoConfig.from_pretrained(args.model, trust_remote_code=True, pre_seq_len=args.pt_pre_seq_len)
model = AutoModel.from_pretrained(args.model, config=config, trust_remote_code=True).cuda()
prefix_state_dict = torch.load(os.path.join(args.pt_checkpoint, "pytorch_model.bin"))
new_prefix_state_dict = {}
for k, v in prefix_state_dict.items():
if k.startswith("transformer.prefix_encoder."):
new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
else:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
model = AutoModel.from_pretrained(args.model, trust_remote_code=True)
model = model.to(args.device)
while True:
prompt = input("Prompt:")
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(args.device)
response = model.generate(input_ids=inputs["input_ids"], max_length=inputs["input_ids"].shape[-1] + args.max_new_tokens)
response = response[0, inputs["input_ids"].shape[-1]:]
print("Response:", tokenizer.decode(response, skip_special_tokens=True))
import json
import ast
import astunparse
from transformers import PreTrainedTokenizer
from torch.utils.data import Dataset
from copy import deepcopy
from typing import Dict, List
# text constants
FUNCTION_CALL_NAME = 'tool_call'
FUNCTION_CALL_PREFIX = '```python\n'
FUNCTION_CALL_POSTFIX = '\n```'
TOOL_DEFINITION_PREFIX = 'Answer the following questions as best as you can. You have access to the following tools:\n'
CONVERSATOIN_KEY = 'conversations'
TOOL_DESC_KEY = 'tools'
def format_function_call(function_name: str, parameters: Dict[str, str]):
function_name = ast.Name(id=function_name)
keywords = [
ast.keyword(arg=arg_name, value=ast.Constant(arg_value))
for arg_name, arg_value in parameters.items()
]
func_call = ast.Call(func=function_name, args=[], keywords=keywords)
return astunparse.unparse(func_call).strip()
def format_conversation(item, tokenizer, conversation_key: str, tool_key: str):
conversations = deepcopy(item[conversation_key])
# Note: `loss_mask` here means whether *the prediction* of the token should take loss
tokens, loss_masks = [tokenizer.get_command("[gMASK]"), tokenizer.get_command("sop")], [0, 0]
def _update(_tokens: List[int], value: int = 1):
value = int(value)
tokens.extend(_tokens)
loss_masks.extend([value] * len(_tokens))
# insert system prompt for tools
if tool_key in item:
conversations.insert(0,
{
"role": "system",
"content": TOOL_DEFINITION_PREFIX + json.dumps(item[tool_key], indent=4, ensure_ascii=False)
}
)
for idx, conv in enumerate(conversations):
loss = conv.get("loss", True)
if conv['role'] in {'system', 'user'}:
loss = False
if conv['role'] == 'tool':
# function call python code
value = FUNCTION_CALL_PREFIX + format_function_call(FUNCTION_CALL_NAME, conv["parameters"]) + FUNCTION_CALL_POSTFIX
text = tokenizer.build_single_message("assistant", conv["name"], value)
_update(text, loss)
# function call result
value = conv.get('observation', None)
if not isinstance(value, str):
value = json.dumps(value, ensure_ascii=False)
text = tokenizer.build_single_message("observation", "", value)
_update(text, False)
else:
text = tokenizer.build_single_message(conv['role'], "", conv["content"])
_update(text, loss)
_update([tokenizer.eos_token_id], False)
assert len(tokens) == len(loss_masks), f"length mismatch: {len(tokens)} vs {len(loss_masks)}"
return tokens, loss_masks
def sanity_check(tokens: List[int], target: List[int], tokenizer: PreTrainedTokenizer):
print("Sanity Check >>>>>>>>>>>>>")
for t, m in zip(tokens, target):
decoded = tokenizer.tokenizer.index_special_tokens[t] \
if t in tokenizer.tokenizer.index_special_tokens \
else tokenizer.decode([t])
print("%20s: %6d -> %6d" % (repr(decoded), t, m))
print("<<<<<<<<<<<<< Sanity Check")
assert len(tokens) == len(target), f"length mismatch: {len(tokens)} vs {len(target)}"
class MultiTurnDataset(Dataset):
def __init__(self, data: List[dict], tokenizer: PreTrainedTokenizer, max_seq_length: int):
super(MultiTurnDataset, self).__init__()
self.tokenizer = tokenizer
self.max_seq_length = max_seq_length
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, i) -> dict:
data_item = self.data[i]
tokens, loss_masks = format_conversation(data_item, self.tokenizer, CONVERSATOIN_KEY, TOOL_DESC_KEY)
# labels are used inside the model
target_based_loss_mask = [False] + loss_masks[:-1]
labels = [(t if m else -100) for t, m in zip(tokens, target_based_loss_mask)]
tokens = tokens[:self.max_seq_length]
labels = labels[:self.max_seq_length]
tokens += [self.tokenizer.pad_token_id] * (self.max_seq_length - len(tokens))
labels += [-100] * (self.max_seq_length - len(labels))
assert len(tokens) == len(labels), f"length mismatch: {len(tokens)} vs {len(labels)}"
return {
"input_ids": tokens,
"labels": labels
}
class InputOutputDataset(Dataset):
def __init__(self, data: List[dict], tokenizer: PreTrainedTokenizer, max_source_length: int, max_target_length: int):
super(InputOutputDataset, self).__init__()
self.tokenizer = tokenizer
self.max_source_length = max_source_length
self.max_target_length = max_target_length
self.max_seq_length = max_source_length + max_target_length + 1
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, i) -> dict:
data_item = self.data[i]
a_ids = self.tokenizer.encode(text=data_item['prompt'], add_special_tokens=True, truncation=True,
max_length=self.max_source_length)
b_ids = self.tokenizer.encode(text=data_item['response'], add_special_tokens=False, truncation=True,
max_length=self.max_target_length)
context_length = len(a_ids)
input_ids = a_ids + b_ids + [self.tokenizer.eos_token_id]
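        # The prompt part of the labels is filled with pad tokens and converted to -100
        # below, so only the response and the EOS token contribute to the loss.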
labels = [self.tokenizer.pad_token_id] * context_length + b_ids + [self.tokenizer.eos_token_id]
pad_len = self.max_seq_length - len(input_ids)
input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
labels = labels + [self.tokenizer.pad_token_id] * pad_len
labels = [(l if l != self.tokenizer.pad_token_id else -100) for l in labels]
assert len(input_ids) == len(labels), f"length mismatch: {len(input_ids)} vs {len(labels)}"
return {
"input_ids": input_ids,
"labels": labels
}
transformers==4.30.2
accelerate
sentencepiece
astunparse
deepspeed
#! /usr/bin/env bash
set -ex
LR=1e-4
NUM_GPUS=8
MAX_SOURCE_LEN=1024
MAX_TARGET_LEN=128
DEV_BATCH_SIZE=4
GRAD_ACCUMULARION_STEPS=1
MAX_STEP=20
SAVE_INTERVAL=500
RUN_NAME=advertise_gen_ft
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/advertise_gen.jsonl
DATESTR=`date +%Y%m%d-%H%M%S`
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${LR}
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
mkdir -p $OUTPUT_DIR
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format input-output \
--train_file $DATASET_PATH \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--max_source_length $MAX_SOURCE_LEN \
--max_target_length $MAX_TARGET_LEN \
--per_device_train_batch_size $DEV_BATCH_SIZE \
--gradient_accumulation_steps $GRAD_ACCUMULARION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--learning_rate $LR \
--fp16 \
--deepspeed ../configs/deepspeed.json 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env bash
set -ex
LR=1e-4
NUM_GPUS=8
MAX_SEQ_LEN=2048
DEV_BATCH_SIZE=2
GRAD_ACCUMULARION_STEPS=1
MAX_STEP=200
SAVE_INTERVAL=50
DATESTR=`date +%Y%m%d-%H%M%S`
RUN_NAME=tool_alpaca_ft
DATASET_PATH=../formatted_data/tool_alpaca.jsonl
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${LR}
mkdir -p $OUTPUT_DIR
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format multi-turn \
--train_file $DATASET_PATH \
--max_seq_length $MAX_SEQ_LEN \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--per_device_train_batch_size $DEV_BATCH_SIZE \
--gradient_accumulation_steps $GRAD_ACCUMULARION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--fp16 \
--deepspeed ../configs/deepspeed.json 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env bash
set -ex
PRE_SEQ_LEN=128
LR=2e-2
NUM_GPUS=1
MAX_SOURCE_LEN=1024
MAX_TARGET_LEN=128
DEV_BATCH_SIZE=1
GRAD_ACCUMULARION_STEPS=1
MAX_STEP=20
SAVE_INTERVAL=500
DATESTR=`date +%Y%m%d-%H%M%S`
RUN_NAME=advertise_gen_pt
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/advertise_gen.jsonl
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${PRE_SEQ_LEN}-${LR}
mkdir -p $OUTPUT_DIR
export HIP_VISIBLE_DEVICES=4,5,6,7
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format input-output \
--train_file $DATASET_PATH \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--max_source_length $MAX_SOURCE_LEN \
--max_target_length $MAX_TARGET_LEN \
--per_device_train_batch_size $DEV_BATCH_SIZE \
--gradient_accumulation_steps $GRAD_ACCUMULARION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--learning_rate $LR \
--pre_seq_len $PRE_SEQ_LEN 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env bash
set -ex
PRE_SEQ_LEN=128
LR=2e-2
NUM_GPUS=1
MAX_SEQ_LEN=2048
DEV_BATCH_SIZE=1
GRAD_ACCUMULARION_STEPS=16
MAX_STEP=1000
SAVE_INTERVAL=500
DATESTR=`date +%Y%m%d-%H%M%S`
RUN_NAME=tool_alpaca_pt
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/tool_alpaca.jsonl
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${PRE_SEQ_LEN}-${LR}
mkdir -p $OUTPUT_DIR
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format multi-turn \
--train_file $DATASET_PATH \
--max_seq_length $MAX_SEQ_LEN \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--per_device_train_batch_size $DEV_BATCH_SIZE \
--gradient_accumulation_steps $GRAD_ACCUMULARION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--learning_rate $LR \
--pre_seq_len $PRE_SEQ_LEN 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env python
import json
from collections import Counter
from argparse import ArgumentParser
import os
parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
args = parser.parse_args()
with open(args.path) as f:
data = [json.loads(line) for line in f]
train_examples = [{
"prompt": x['content'],
"response": x['summary'],
} for x in data]
os.makedirs("formatted_data", exist_ok=True)
with open("formatted_data/advertise_gen.jsonl", "w") as f:
for e in train_examples:
f.write(json.dumps(e, ensure_ascii=False) + "\n")