"lm_eval/tasks/tinyBenchmarks/utils_winogrande.py" did not exist on "1980a13c9d7bcdc6e2a19228c203f9f7834ac9b8"
Commit 467ec853 authored by lvzhen

Merge branch 'master' into 'master'

ChatGLM3-6B fine-tuning

See merge request !2
parents 971c0aee 0006ad16
import json
import ast
import astunparse
from transformers import PreTrainedTokenizer
from torch.utils.data import Dataset
from copy import deepcopy
from typing import Dict, List
# text constants
FUNCTION_CALL_NAME = 'tool_call'
FUNCTION_CALL_PREFIX = '```python\n'
FUNCTION_CALL_POSTFIX = '\n```'
TOOL_DEFINITION_PREFIX = 'Answer the following questions as best as you can. You have access to the following tools:\n'
CONVERSATION_KEY = 'conversations'
TOOL_DESC_KEY = 'tools'
def format_function_call(function_name: str, parameters: Dict[str, str]):
function_name = ast.Name(id=function_name)
keywords = [
ast.keyword(arg=arg_name, value=ast.Constant(arg_value))
for arg_name, arg_value in parameters.items()
]
func_call = ast.Call(func=function_name, args=[], keywords=keywords)
return astunparse.unparse(func_call).strip()
def format_conversation(item, tokenizer, conversation_key: str, tool_key: str):
conversations = deepcopy(item[conversation_key])
# Note: `loss_mask` here means whether *the prediction* of the token should take loss
tokens, loss_masks = [tokenizer.get_command("[gMASK]"), tokenizer.get_command("sop")], [0, 0]
def _update(_tokens: List[int], value: int = 1):
value = int(value)
tokens.extend(_tokens)
loss_masks.extend([value] * len(_tokens))
# insert system prompt for tools
if tool_key in item:
conversations.insert(0,
{
"role": "system",
"content": TOOL_DEFINITION_PREFIX + json.dumps(item[tool_key], indent=4, ensure_ascii=False)
}
)
for idx, conv in enumerate(conversations):
loss = conv.get("loss", True)
if conv['role'] in {'system', 'user'}:
loss = False
if conv['role'] == 'tool':
# function call python code
value = FUNCTION_CALL_PREFIX + format_function_call(FUNCTION_CALL_NAME, conv["parameters"]) + FUNCTION_CALL_POSTFIX
text = tokenizer.build_single_message("assistant", conv["name"], value)
_update(text, loss)
# function call result
value = conv.get('observation', None)
if not isinstance(value, str):
value = json.dumps(value, ensure_ascii=False)
text = tokenizer.build_single_message("observation", "", value)
_update(text, False)
else:
text = tokenizer.build_single_message(conv['role'], "", conv["content"])
_update(text, loss)
_update([tokenizer.eos_token_id], False)
assert len(tokens) == len(loss_masks), f"length mismatch: {len(tokens)} vs {len(loss_masks)}"
return tokens, loss_masks
def sanity_check(tokens: List[int], target: List[int], tokenizer: PreTrainedTokenizer):
print("Sanity Check >>>>>>>>>>>>>")
for t, m in zip(tokens, target):
decoded = tokenizer.tokenizer.index_special_tokens[t] \
if t in tokenizer.tokenizer.index_special_tokens \
else tokenizer.decode([t])
print("%20s: %6d -> %6d" % (repr(decoded), t, m))
print("<<<<<<<<<<<<< Sanity Check")
assert len(tokens) == len(target), f"length mismatch: {len(tokens)} vs {len(target)}"
class MultiTurnDataset(Dataset):
def __init__(self, data: List[dict], tokenizer: PreTrainedTokenizer, max_seq_length: int):
super(MultiTurnDataset, self).__init__()
self.tokenizer = tokenizer
self.max_seq_length = max_seq_length
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, i) -> dict:
data_item = self.data[i]
        tokens, loss_masks = format_conversation(data_item, self.tokenizer, CONVERSATION_KEY, TOOL_DESC_KEY)
# labels are used inside the model
target_based_loss_mask = [False] + loss_masks[:-1]
labels = [(t if m else -100) for t, m in zip(tokens, target_based_loss_mask)]
tokens = tokens[:self.max_seq_length]
labels = labels[:self.max_seq_length]
tokens += [self.tokenizer.pad_token_id] * (self.max_seq_length - len(tokens))
labels += [-100] * (self.max_seq_length - len(labels))
assert len(tokens) == len(labels), f"length mismatch: {len(tokens)} vs {len(labels)}"
return {
"input_ids": tokens,
"labels": labels
}
class InputOutputDataset(Dataset):
def __init__(self, data: List[dict], tokenizer: PreTrainedTokenizer, max_source_length: int, max_target_length: int):
super(InputOutputDataset, self).__init__()
self.tokenizer = tokenizer
self.max_source_length = max_source_length
self.max_target_length = max_target_length
self.max_seq_length = max_source_length + max_target_length + 1
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, i) -> dict:
data_item = self.data[i]
a_ids = self.tokenizer.encode(text=data_item['prompt'], add_special_tokens=True, truncation=True,
max_length=self.max_source_length)
b_ids = self.tokenizer.encode(text=data_item['response'], add_special_tokens=False, truncation=True,
max_length=self.max_target_length)
context_length = len(a_ids)
input_ids = a_ids + b_ids + [self.tokenizer.eos_token_id]
labels = [self.tokenizer.pad_token_id] * context_length + b_ids + [self.tokenizer.eos_token_id]
pad_len = self.max_seq_length - len(input_ids)
input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
labels = labels + [self.tokenizer.pad_token_id] * pad_len
labels = [(l if l != self.tokenizer.pad_token_id else -100) for l in labels]
assert len(input_ids) == len(labels), f"length mismatch: {len(input_ids)} vs {len(labels)}"
return {
"input_ids": input_ids,
"labels": labels
}
transformers==4.30.2
accelerate
sentencepiece
astunparse
deepspeed
#! /usr/bin/env bash
set -ex
LR=1e-4
NUM_GPUS=8
MAX_SOURCE_LEN=1024
MAX_TARGET_LEN=128
DEV_BATCH_SIZE=4
GRAD_ACCUMULATION_STEPS=1
MAX_STEP=20
SAVE_INTERVAL=500
RUN_NAME=advertise_gen_ft
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/advertise_gen.jsonl
DATESTR=`date +%Y%m%d-%H%M%S`
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${LR}
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
mkdir -p $OUTPUT_DIR
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format input-output \
--train_file $DATASET_PATH \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--max_source_length $MAX_SOURCE_LEN \
--max_target_length $MAX_TARGET_LEN \
--per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--learning_rate $LR \
--fp16 \
--deepspeed ../configs/deepspeed.json 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env bash
set -ex
LR=1e-4
NUM_GPUS=8
MAX_SEQ_LEN=2048
DEV_BATCH_SIZE=2
GRAD_ACCUMULATION_STEPS=1
MAX_STEP=200
SAVE_INTERVAL=50
DATESTR=`date +%Y%m%d-%H%M%S`
RUN_NAME=tool_alpaca_ft
DATASET_PATH=../formatted_data/tool_alpaca.jsonl
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${LR}
mkdir -p $OUTPUT_DIR
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format multi-turn \
--train_file $DATASET_PATH \
--max_seq_length $MAX_SEQ_LEN \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--fp16 \
--deepspeed ../configs/deepspeed.json 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env bash
set -ex
PRE_SEQ_LEN=128
LR=2e-2
NUM_GPUS=1
MAX_SOURCE_LEN=1024
MAX_TARGET_LEN=128
DEV_BATCH_SIZE=1
GRAD_ACCUMULATION_STEPS=1
MAX_STEP=20
SAVE_INTERVAL=500
DATESTR=`date +%Y%m%d-%H%M%S`
RUN_NAME=advertise_gen_pt
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/advertise_gen.jsonl
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${PRE_SEQ_LEN}-${LR}
mkdir -p $OUTPUT_DIR
export HIP_VISIBLE_DEVICES=4,5,6,7
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format input-output \
--train_file $DATASET_PATH \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--max_source_length $MAX_SOURCE_LEN \
--max_target_length $MAX_TARGET_LEN \
--per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--learning_rate $LR \
--pre_seq_len $PRE_SEQ_LEN 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env bash
set -ex
PRE_SEQ_LEN=128
LR=2e-2
NUM_GPUS=1
MAX_SEQ_LEN=2048
DEV_BATCH_SIZE=1
GRAD_ACCUMULATION_STEPS=16
MAX_STEP=1000
SAVE_INTERVAL=500
DATESTR=`date +%Y%m%d-%H%M%S`
RUN_NAME=tool_alpaca_pt
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/tool_alpaca.jsonl
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${PRE_SEQ_LEN}-${LR}
mkdir -p $OUTPUT_DIR
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format multi-turn \
--train_file $DATASET_PATH \
--max_seq_length $MAX_SEQ_LEN \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--learning_rate $LR \
--pre_seq_len $PRE_SEQ_LEN 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env python
import json
from argparse import ArgumentParser
import os
parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
args = parser.parse_args()
with open(args.path) as f:
data = [json.loads(line) for line in f]
train_examples = [{
"prompt": x['content'],
"response": x['summary'],
} for x in data]
os.makedirs("formatted_data", exist_ok=True)
with open("formatted_data/advertise_gen.jsonl", "w") as f:
for e in train_examples:
f.write(json.dumps(e, ensure_ascii=False) + "\n")
#! /usr/bin/env python
import json
from collections import Counter
from argparse import ArgumentParser
import os
parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
args = parser.parse_args()
with open(args.path) as f:
data = json.load(f)
train_examples = []
err_count = 0
for setting in data:
api_desc = [setting["NLDocumentation"]]
for instance in setting["Instances"]:
try:
conv = [{
"role": "user",
"content": instance['input'],
}]
for step in instance['intermediate_steps']:
tool_name, params, react = step[0]
step_thought = react.split("Action:")[0].strip()
observation = step[1]
conv.append({
"role": "assistant",
"content": step_thought,
})
conv.append({
"role": "tool",
"name": tool_name,
"parameters": json.loads(params),
"observation": observation,
})
conv.append({
"role": "assistant",
"content": instance['Final Thought'] + "\n" + instance['output'],
})
        except Exception:
err_count += 1
else:
train_examples.append({
"tools": api_desc,
"conversations": conv
})
print("err_count:", err_count)
print("train_examples:", len(train_examples))
print("conversation distribution:", Counter([len(e["conversations"]) for e in train_examples]))
os.makedirs("formatted_data", exist_ok=True)
with open("formatted_data/tool_alpaca.jsonl", "w") as f:
for e in train_examples:
f.write(json.dumps(e, ensure_ascii=False) + "\n")
# coding=utf-8
# Copyright 2020-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The Trainer class, to easily train a 🤗 Transformers model from scratch or finetune it on a new task.
"""
import os
from typing import Optional
from transformers import Trainer
import torch
from transformers.modeling_utils import PreTrainedModel, unwrap_model
from transformers.utils import logging
logger = logging.get_logger(__name__)
WEIGHTS_NAME = "pytorch_model.bin"
TRAINING_ARGS_NAME = "training_args.bin"
class PrefixTrainer(Trainer):
def __init__(self, *args, save_changed=False, **kwargs):
self.save_changed = save_changed
super().__init__(*args, **kwargs)
def _save(self, output_dir: Optional[str] = None, state_dict=None):
# If we are executing this function, we are the process zero, so we don't check for that.
output_dir = output_dir if output_dir is not None else self.args.output_dir
os.makedirs(output_dir, exist_ok=True)
logger.info(f"Saving model checkpoint to {output_dir}")
# Save a trained model and configuration using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
if not isinstance(self.model, PreTrainedModel):
if isinstance(unwrap_model(self.model), PreTrainedModel):
if state_dict is None:
state_dict = self.model.state_dict()
unwrap_model(self.model).save_pretrained(output_dir, state_dict=state_dict)
else:
logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
if state_dict is None:
state_dict = self.model.state_dict()
torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
else:
if self.save_changed:
print("Saving PrefixEncoder")
state_dict = self.model.state_dict()
filtered_state_dict = {}
for k, v in self.model.named_parameters():
if v.requires_grad:
filtered_state_dict[k] = state_dict[k]
self.model.save_pretrained(output_dir, state_dict=filtered_state_dict)
else:
print("Saving the whole model")
self.model.save_pretrained(output_dir, state_dict=state_dict)
if self.tokenizer is not None:
self.tokenizer.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
# ChatGLM3-6B finetune
This directory provides fine-tuning examples for the ChatGLM3-6B model, including full fine-tuning and P-Tuning v2. In terms of data format, it provides samples for multi-turn dialogue fine-tuning and for input/output (prompt-response) fine-tuning.
If you have downloaded the model locally, replace the `THUDM/chatglm3-6b` field in this document and in the code with the corresponding local path so that the model is loaded from disk.
Running the examples requires `python>=3.10`. In addition to the basic `torch` dependency, the example code also needs the dependencies listed below.
**We provide a [sample notebook](lora_finetune.ipynb) that demonstrates how to use our fine-tuning code.**
```bash
pip install -r requirements.txt
```
## Tested hardware
We only provide single-machine multi-GPU / multi-machine multi-GPU examples, so you will need at least one machine with multiple GPUs. With the **default configuration files** in this repository, the observed GPU memory usage is:
+ SFT full fine-tuning: spread evenly across 4 GPUs, each GPU using `48346MiB` of memory.
+ P-Tuning v2 fine-tuning: 1 GPU, using `18426MiB` of memory.
+ LoRA fine-tuning: 1 GPU, using `14082MiB` of memory.
> Please note that these numbers are for reference only; memory usage may differ with different parameters.
> Adjust them to your hardware.
## Multi-turn dialogue format
The multi-turn dialogue fine-tuning example follows the ChatGLM3 dialogue format convention and assigns a different `loss_mask` to each role, so that the `loss` over all responses in a conversation is computed in a single pass.
If you only want to fine-tune the model's conversational ability, and not its tool-calling ability, organize your data in the following format.
```json
[
{
"conversations": [
{
"role": "system",
"content": "<system prompt text>"
},
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant response text>"
},
// ... multi-turn
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant response text>"
}
]
}
// ...
]
```
**Please note that with many fine-tuning steps, this method will degrade the model's tool-calling ability.**
If you want to fine-tune both the model's dialogue and tool-calling capabilities, organize your data in the following format.
```json
[
{
"tools": [
// available tools, format is not restricted
],
"conversations": [
{
"role": "system",
"content": "<system prompt text>"
},
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant thought to text>"
},
{
"role": "tool",
"name": "<name of the tool to be called",
"parameters": {
"<parameter_name>": "<parameter_value>"
},
"observation": "<observation>"
// does not have to be a string
},
{
"role": "assistant",
"content": "<assistant response to observation>"
},
// ... multi-turn
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant response text>"
}
]
}
// ...
]
```
- There is no need to insert the system prompt describing the tools manually: during preprocessing the `tools` field is formatted with `json.dumps(..., ensure_ascii=False)` and inserted as the first system prompt.
- Each role can carry a boolean `loss` field indicating whether the content predicted for that turn contributes to the `loss`. If the field is absent, the sample implementation skips `loss` for `system` and `user` turns by default and computes `loss` for all other roles (a minimal sketch of this masking follows the list).
- `tool` is not a native role in ChatGLM3. During preprocessing, a `tool` turn is automatically converted into an `assistant` turn carrying tool-call `metadata` (for which `loss` is computed by default) and an `observation` turn holding the tool's return value (for which `loss` is not computed).
- Fine-tuning for the `Code interpreter` task has not been implemented yet.
- The `system` role is optional, but if present it must appear before the first `user` turn, and it may appear only once in a complete conversation (whether single-turn or multi-turn).
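As a reference, here is a minimal sketch of how one conversation is turned into `input_ids` and `labels` under this masking scheme. It mirrors `process_batch` in `finetune_hf.py`; `get_command` and `build_single_message` come from the ChatGLM3 tokenizer loaded with `trust_remote_code=True`, and the `loss`-field handling shown here is a simplification.
```python
# Simplified sketch of per-role loss masking (see process_batch in finetune_hf.py).
# `tokenizer` is assumed to be the ChatGLM3 tokenizer, `conv` one "conversations" list.
def build_labels(conv, tokenizer, max_length):
    input_ids = [tokenizer.get_command('[gMASK]'), tokenizer.get_command('sop')]
    loss_masks = [False, False]
    for message in conv:
        # loss defaults to False for system/user turns and True for all other roles;
        # an explicit boolean `loss` field overrides the default.
        take_loss = message.get('loss', message['role'] not in ('system', 'user'))
        new_ids = tokenizer.build_single_message(message['role'], '', message['content'])
        input_ids += new_ids
        loss_masks += [take_loss] * len(new_ids)
    input_ids.append(tokenizer.eos_token_id)
    # prepend False so the mask lines up with input_ids (which just gained the eos token);
    # the eos position inherits the mask of the last content token.
    loss_masks = [False, *loss_masks]
    labels = [tok if mask else -100 for tok, mask in zip(input_ids, loss_masks)]
    return input_ids[:max_length], labels[:max_length]
```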
## Dataset format example
Here we take the AdvertiseGen dataset as an example. You can download it from [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing) or from [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1). Place the extracted AdvertiseGen directory under the `data` directory and convert it yourself into the format shown below (a conversion sketch follows the example).
> Please note that the current fine-tuning code uses a validation set. A complete fine-tuning dataset must therefore
> contain a training set and a validation set; the test set is optional, or the validation set can simply be reused
> as the test set.
```
{"conversations": [{"role": "user", "content": "Type#skirt*skirt length#skirt"}, {"role": "assistant", "content": "This is versatile Fashionable fairy skirt, the overall design is very elegant and casual. Every girl can instantly turn into a fairy after wearing it. The material is very light and breathable, making it very comfortable to wear in summer."} ]}
```
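For illustration, a minimal conversion sketch follows. Paths and file names are assumptions; the raw AdvertiseGen files are jsonl with `content`/`summary` fields, as in the formatting scripts above.
```python
# Hedged sketch: convert raw AdvertiseGen lines ({"content": ..., "summary": ...})
# into the {"conversations": [...]} format shown above. Paths are illustrative.
import json
from pathlib import Path

def convert(src: str, dst: str) -> None:
    with open(src, encoding='utf-8') as fin, open(dst, 'w', encoding='utf-8') as fout:
        for line in fin:
            raw = json.loads(line)
            example = {
                'conversations': [
                    {'role': 'user', 'content': raw['content']},
                    {'role': 'assistant', 'content': raw['summary']},
                ]
            }
            fout.write(json.dumps(example, ensure_ascii=False) + '\n')

if __name__ == '__main__':
    Path('data/AdvertiseGen').mkdir(parents=True, exist_ok=True)
    convert('AdvertiseGen/train.json', 'data/AdvertiseGen/train.json')  # training set
    convert('AdvertiseGen/dev.json', 'data/AdvertiseGen/dev.json')      # validation set
```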
## Configuration files
The fine-tuning configuration files are located in the `configs` directory and include the following files:
1. `ds_zero_2.json / ds_zero_3.json`: DeepSpeed configuration files.
2. `lora.yaml / ptuning.yaml / sft.yaml`: configuration files for the different fine-tuning methods, covering model parameters,
   optimizer parameters, training parameters, etc. Some important parameters are explained as follows:
+ data_config section
+ train_file: The file path of the training data set.
+ val_file: The file path of the verification data set.
+ test_file: The file path of the test data set.
+ num_proc: Number of processes used when loading data.
+ max_input_length: The maximum length of the input sequence.
+ max_output_length: The maximum length of the output sequence.
+ training_args section
+ output_dir: Directory for saving models and other outputs.
+ max_steps: The maximum number of steps for training.
+ per_device_train_batch_size: training batch size per device (e.g. GPU).
+ dataloader_num_workers: The number of worker threads used when loading data.
+ remove_unused_columns: Whether to remove unused columns in the data.
        + save_strategy: the checkpoint saving strategy (e.g. save every `save_steps` steps).
        + save_steps: save a checkpoint every this many steps.
+ log_level: log level (such as info).
+ logging_strategy: logging strategy.
        + logging_steps: log every this many steps.
+ per_device_eval_batch_size: Evaluation batch size per device.
        + evaluation_strategy: the evaluation strategy (e.g. evaluate every `eval_steps` steps).
        + eval_steps: run evaluation every this many steps.
+ predict_with_generate: Whether to use generate mode for prediction.
+ generation_config section
+ max_new_tokens: The maximum number of new tokens generated.
    + peft_config section (a minimal sketch of how this section becomes a `peft` config object follows this list)
        + peft_type: the parameter-efficient fine-tuning method to use (e.g. LORA).
        + task_type: the task type, here causal language modeling (CAUSAL_LM).
        + LoRA parameters:
            + r: the rank of LoRA.
            + lora_alpha: the scaling factor of LoRA.
            + lora_dropout: the dropout probability used in the LoRA layers.
+ P-TuningV2 parameters:
+ num_virtual_tokens: The number of virtual tokens.
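For orientation, here is a rough sketch of how the `peft_config` section becomes a `peft` configuration object. It mirrors `FinetuningConfig.from_dict` in `finetune_hf.py`, which passes the raw mapping to `peft.get_peft_config`; the values below are the ones from `configs/lora.yaml`.
```python
# Sketch only: build the LoRA config the same way the fine-tuning code does.
from peft import get_peft_config

peft_section = {
    'peft_type': 'LORA',       # parameter-efficient fine-tuning method
    'task_type': 'CAUSAL_LM',  # causal language modeling
    'r': 8,                    # LoRA rank
    'lora_alpha': 32,          # LoRA scaling factor
    'lora_dropout': 0.1,       # dropout inside the LoRA layers
}
peft_config = get_peft_config(peft_section)  # returns a peft LoraConfig
print(type(peft_config).__name__, peft_config.r, peft_config.lora_alpha)
```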
## Start fine-tuning
Use the following command for **single-machine multi-GPU / multi-machine multi-GPU** training.
```bash
cd finetune_demo
OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc_per_node=8 finetune_hf.py data/AdvertiseGen/ THUDM/chatglm3-6b configs/lora.yaml configs/ds_zero_2.json
```
Use the following command for **single-machine single-GPU** training.
```bash
cd finetune_demo
python finetune_hf.py data/AdvertiseGen/ THUDM/chatglm3-6b configs/lora.yaml
```
## Fine-tuning from a checkpoint
If you train with the commands above, each fine-tuning run starts from scratch. To resume from a partially trained model, add a fourth parameter, which can be passed in two ways:
1. `yes`: automatically resume training from the last saved checkpoint.
2. `XX`: a checkpoint number, e.g. `600` resumes training from checkpoint 600.
For example, to continue fine-tuning from the last saved checkpoint:
```bash
cd finetune_demo
python finetune_hf.py data/AdvertiseGen/ THUDM/chatglm3-6b configs/lora.yaml yes
```
## Use the fine-tuned model
### Verify the fine-tuned model in inference_hf.py
You can verify the fine-tuned model with `finetune_demo/inference_hf.py`; a single command is enough:
```bash
python inference_hf.py your_finetune_path --prompt your prompt
```
The answer you get this way comes from the fine-tuned model.
### Use the fine-tuned model in other demos in this repo or in external repos
You can use our `lora` and fully fine-tuned models in any demo, as follows:
1. Replace the model-loading code in the demo with the model-loading code from `finetune_demo/inference_hf.py`.
> Please note that for LoRA and P-Tuning v2 we do not merge the trained weights into the base model; instead, the
> adapter path is recorded in `adapter_config.json`. If the location of your base model changes, update
> `base_model_name_or_path` in `adapter_config.json`.
> Please also note that we have only tested NVIDIA Hopper (representative GPU: H100) and Ampere (representative GPU:
> A100) architectures. If you use a GPU of another architecture, you may run into:
> 1. unknown training problems, or GPU memory usage that differs from the numbers above;
> 2. features that are unsupported because the architecture is too old;
> 3. degraded inference quality.
> These three situations have been reported by the community before. Although the probability is very low, if you
> encounter any of them, you can try to resolve them with the community.
```python
def load_model_and_tokenizer(
model_dir: Union[str, Path], trust_remote_code: bool = True
) -> tuple[ModelType, TokenizerType]:
model_dir = _resolve_path(model_dir)
if (model_dir / 'adapter_config.json').exists():
model = AutoPeftModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=trust_remote_code, device_map='auto'
)
tokenizer_dir = model.peft_config['default'].base_model_name_or_path
else:
model = AutoModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=trust_remote_code, device_map='auto'
)
tokenizer_dir = model_dir
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, trust_remote_code=trust_remote_code
)
return model, tokenizer
```
2. Load the fine-tuned model using the location of the fine-tuned weights. For example, if your fine-tuned model is at `/path/to/finetune_adapter_model` and the base model is at `path/to/base_model`, then pass `/path/to/finetune_adapter_model` as `model_dir`.
3. After completing the steps above, you can use the fine-tuned model normally; all other calling conventions remain unchanged (a minimal usage sketch follows).
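A minimal usage sketch of the loader above. The adapter path is a placeholder, and `model.chat` assumes the ChatGLM3 remote-code chat interface; adapt the call if your demo drives `generate` directly.
```python
# Hypothetical usage of load_model_and_tokenizer from inference_hf.py.
# '/path/to/finetune_adapter_model' is a placeholder for your own output directory.
model, tokenizer = load_model_and_tokenizer('/path/to/finetune_adapter_model')
model = model.eval()

# ChatGLM3's remote-code model exposes a chat() helper; the interface may vary by revision.
response, history = model.chat(tokenizer, "Hello", history=[])
print(response)
```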
### Hints
1. Before training starts, the fine-tuning code can print the preprocessing result of the first training example (the call is commented out by default and can be uncommented), which looks like this:
```log
Sanity Check >>>>>>>>>>>>>
'[gMASK]': 64790 -> -100
'sop': 64792 -> -100
'<|system|>': 64794 -> -100
'': 30910 -> -100
'\n': 13 -> -100
'Answer': 20115 -> -100
'the': 267 -> -100
'following': 1762 -> -100
...
'know': 683 -> -100
'the': 267 -> -100
'response': 3010 -> -100
'details': 3296 -> -100
'.': 30930 -> -100
'<|assistant|>': 64796 -> -100
'': 30910 -> 30910
'\n': 13 -> 13
'I': 307 -> 307
'need': 720 -> 720
'to': 289 -> 289
'use': 792 -> 792
...
<<<<<<<<<<<<< Sanity Check
```
Each line shows, in order, the detokenized string, the `token_id`, and the `target_id`. The `target_id` equals the `token_id` (i.e. the token's index in the model vocabulary) when the token is used as a training target, and `-100` means the token does not participate in the `loss` calculation.
2. `_prepare_model_for_training` iterates over all trainable parameters of the model and makes sure their data type is `torch.float32`. This is necessary in some cases, because mixed-precision training or other operations may change the data type of the model parameters. The call is enabled by default and can be commented out, but if training in `half` precision runs into problems, you can switch it back on; GPU memory usage may increase.
3. In our [Huggingface model code](https://huggingface.co/THUDM/chatglm3-6b/blob/main/modeling_chatglm.py), there is the
following content:
```python
if self.gradient_checkpointing and self.training:
layer_ret = torch.utils.checkpoint.checkpoint(
layer,
hidden_states,
attention_mask,
rotary_pos_emb,
kv_caches[index],
use_cache,
use_reentrant=False
)
```
This may increase GPU memory usage during training, so if you run out of GPU memory you can try changing `use_reentrant` to `True`.
4. The fine-tuned model can be loaded with any model-acceleration framework that supports `peft`; we do not provide a demo here.
5. The fine-tuning dataset format of this repository differs slightly from the ZhipuAI API fine-tuning dataset format (a conversion sketch follows):
+ The `messages` field in a ZhipuAI API fine-tuning dataset corresponds to the `conversations` field in this repository.
+ The ZhipuAI API fine-tuning file is `jsonl`; in this repository, you simply need to change the file extension to `json`.
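A hedged sketch of that conversion (file names are assumptions): rename the `messages` field to `conversations` and write the lines to a file with a `.json` extension.
```python
# Sketch: adapt a ZhipuAI-API-style fine-tuning file to this repository's format.
import json

with open('zhipuai_finetune.jsonl', encoding='utf-8') as fin, \
        open('train.json', 'w', encoding='utf-8') as fout:
    for line in fin:
        record = json.loads(line)
        record['conversations'] = record.pop('messages')  # rename the field
        fout.write(json.dumps(record, ensure_ascii=False) + '\n')
```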
## Citation
```
@inproceedings{liu2022p,
title={P-tuning: Prompt tuning can be comparable to fine-tuning across scales and tasks},
author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short
Papers)},
pages={61--68},
year={2022}
}
@misc{tang2023toolalpaca,
title={ToolAlpaca: Generalized Tool Learning for Language Models with 3000 Simulated Cases},
author={Qiaoyu Tang and Ziliang Deng and Hongyu Lin and Xianpei Han and Qiao Liang and Le Sun},
year={2023},
eprint={2306.05301},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"train_micro_batch_size_per_gpu": "auto",
"zero_allow_untested_optimizer": true,
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"zero_optimization": {
"stage": 3,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"reduce_scatter": true,
"contiguous_gradients": true,
"overlap_comm": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
data_config:
train_file: train.json
val_file: dev.json
test_file: dev.json
num_proc: 16
max_input_length: 256
max_output_length: 512
training_args:
# see `transformers.Seq2SeqTrainingArguments`
output_dir: ./output
max_steps: 3000
# needed to be fit for the dataset
learning_rate: 5e-5
# settings for data loading
per_device_train_batch_size: 4
dataloader_num_workers: 16
remove_unused_columns: false
# settings for saving checkpoints
save_strategy: steps
save_steps: 500
# settings for logging
log_level: info
logging_strategy: steps
logging_steps: 10
# settings for evaluation
per_device_eval_batch_size: 16
evaluation_strategy: steps
eval_steps: 500
# settings for optimizer
# adam_epsilon: 1e-6
# uncomment the following line to detect nan or inf values
# debug: underflow_overflow
predict_with_generate: true
# see `transformers.GenerationConfig`
generation_config:
max_new_tokens: 512
# set your absolute deepspeed path here
#deepspeed: ds_zero_2.json
# set to true if train with cpu.
use_cpu: false
peft_config:
peft_type: LORA
task_type: CAUSAL_LM
r: 8
lora_alpha: 32
lora_dropout: 0.1
data_config:
train_file: train.json
val_file: dev.json
test_file: dev.json
num_proc: 16
max_input_length: 256
max_output_length: 512
training_args:
# see `transformers.Seq2SeqTrainingArguments`
output_dir: ./output
max_steps: 3000
# needed to be fit for the dataset
learning_rate: 5e-5
# settings for data loading
per_device_train_batch_size: 4
dataloader_num_workers: 16
remove_unused_columns: false
# settings for saving checkpoints
save_strategy: steps
save_steps: 500
# settings for logging
log_level: info
logging_strategy: steps
logging_steps: 10
# settings for evaluation
per_device_eval_batch_size: 16
evaluation_strategy: steps
eval_steps: 500
# settings for optimizer
# adam_epsilon: 1e-6
# uncomment the following line to detect nan or inf values
# debug: underflow_overflow
predict_with_generate: true
# see `transformers.GenerationConfig`
generation_config:
max_new_tokens: 512
# set your absolute deepspeed path here
#deepspeed: ds_zero_3.json
use_cpu: false
peft_config:
peft_type: PREFIX_TUNING
task_type: CAUSAL_LM
num_virtual_tokens: 128
data_config:
train_file: train.json
val_file: dev.json
test_file: dev.json
num_proc: 16
max_input_length: 256
max_output_length: 512
training_args:
# see `transformers.Seq2SeqTrainingArguments`
output_dir: ./output
max_steps: 3000
# needed to be fit for the dataset
learning_rate: 5e-5
# settings for data loading
per_device_train_batch_size: 4
dataloader_num_workers: 16
remove_unused_columns: false
# settings for saving checkpoints
save_strategy: steps
save_steps: 500
# settings for logging
log_level: info
logging_strategy: steps
logging_steps: 10
# settings for evaluation
per_device_eval_batch_size: 16
evaluation_strategy: steps
eval_steps: 500
# settings for optimizer
# adam_epsilon: 1e-6
# uncomment the following line to detect nan or inf values
# debug: underflow_overflow
predict_with_generate: true
generation_config:
max_new_tokens: 512
# set your absolute deepspeed path here
deepspeed: ds_zero_3.json
# -*- coding: utf-8 -*-
import os
import jieba
import dataclasses as dc
import functools
from collections.abc import Callable, Mapping, Sequence
from pathlib import Path
from typing import Annotated, Any, Optional, Union
import numpy as np
import ruamel.yaml as yaml
import torch
import typer
from datasets import Dataset, DatasetDict, NamedSplit, Split, load_dataset
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from peft import (
PeftConfig,
PeftModelForCausalLM,
get_peft_config,
get_peft_model
)
from rouge_chinese import Rouge
from torch import nn
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
EvalPrediction,
GenerationConfig,
PreTrainedModel,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
Seq2SeqTrainingArguments, AutoConfig,
)
from transformers import DataCollatorForSeq2Seq as _DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer as _Seq2SeqTrainer
ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
app = typer.Typer(pretty_exceptions_show_locals=False)
class DataCollatorForSeq2Seq(_DataCollatorForSeq2Seq):
def __call__(self, features, return_tensors=None):
output_ids = (
[feature['output_ids'] for feature in features]
if 'output_ids' in features[0].keys()
else None
)
if output_ids is not None:
max_output_length = max(len(out) for out in output_ids)
if self.pad_to_multiple_of is not None:
max_output_length = (
(
max_output_length + self.pad_to_multiple_of - 1) //
self.pad_to_multiple_of * self.pad_to_multiple_of
)
for feature in features:
remainder = [self.tokenizer.pad_token_id] * (
max_output_length - len(feature['output_ids'])
)
if isinstance(feature['output_ids'], list):
feature['output_ids'] = feature['output_ids'] + remainder
else:
feature['output_ids'] = np.concatenate(
[feature['output_ids'], remainder]
).astype(np.int64)
return super().__call__(features, return_tensors)
class Seq2SeqTrainer(_Seq2SeqTrainer):
def prediction_step(
self,
model: nn.Module,
inputs: dict[str, Any],
prediction_loss_only: bool,
ignore_keys=None,
**gen_kwargs,
) -> tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
if self.args.predict_with_generate:
output_ids = inputs.pop('output_ids')
input_ids = inputs['input_ids']
loss, generated_tokens, labels = super().prediction_step(
model, inputs, prediction_loss_only, ignore_keys, **gen_kwargs
)
generated_tokens = generated_tokens[:, input_ids.size()[1]:]
if self.args.predict_with_generate:
labels = output_ids
return loss, generated_tokens, labels
# For P-Tuning a new save_model function is fine for the prefix_encoder model
    # but may cause problems when loading the whole model
# def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
# if output_dir is None:
# output_dir = self.args.output_dir
# os.makedirs(output_dir, exist_ok=True)
# ptuning_params = {k: v for k, v in self.model.transformer.prefix_encoder.state_dict().items()}
#
# torch.save(ptuning_params, os.path.join(output_dir, 'pytorch_model.bin'))
#
# print(f"P-Tuning model weights saved in {output_dir}")
#
# if self.tokenizer is not None:
# self.tokenizer.save_pretrained(output_dir)
def _resolve_path(path: Union[str, Path]) -> Path:
return Path(path).expanduser().resolve()
def _sanity_check(
input_ids: Sequence[int],
output_ids: Sequence[int],
tokenizer: PreTrainedTokenizer,
):
print('--> Sanity check')
for in_id, out_id in zip(input_ids, output_ids):
if in_id == 0:
continue
if in_id in tokenizer.tokenizer.index_special_tokens:
in_text = tokenizer.tokenizer.index_special_tokens[in_id]
else:
in_text = tokenizer.decode([in_id])
print(f'{repr(in_text):>20}: {in_id} -> {out_id}')
@functools.cache
def _get_yaml_parser() -> yaml.YAML:
parser = yaml.YAML(typ='safe', pure=True)
parser.indent(mapping=2, offset=2, sequence=4)
parser.default_flow_style = False
return parser
@dc.dataclass
class DataConfig(object):
train_file: str
val_file: Optional[str] = None
test_file: Optional[str] = None
num_proc: Optional[int] = None
@property
def data_format(self) -> str:
return Path(self.train_file).suffix
@property
def data_files(self) -> dict[NamedSplit, str]:
return {
split: data_file
for split, data_file in zip(
[Split.TRAIN, Split.VALIDATION, Split.TEST],
[self.train_file, self.val_file, self.test_file],
)
if data_file is not None
}
@dc.dataclass
class FinetuningConfig(object):
data_config: DataConfig
max_input_length: int
max_output_length: int
training_args: Seq2SeqTrainingArguments = dc.field(
default_factory=lambda: Seq2SeqTrainingArguments(output_dir='./output')
)
peft_config: Optional[PeftConfig] = None
def __post_init__(self):
if not self.training_args.do_eval or self.data_config.val_file is None:
# skips the evaluation stage when `do_eval` or `eval_file` is not provided
self.training_args.do_eval = False
self.training_args.evaluation_strategy = 'no'
self.data_config.val_file = None
else:
self.training_args.per_device_eval_batch_size = (
self.training_args.per_device_eval_batch_size
or self.training_args.per_device_train_batch_size
)
@classmethod
def from_dict(cls, **kwargs) -> 'FinetuningConfig':
training_args = kwargs.get('training_args', None)
if training_args is not None and not isinstance(
training_args, Seq2SeqTrainingArguments
):
gen_config = training_args.get('generation_config')
# TODO: a bit hacky
if not isinstance(gen_config, GenerationConfig):
training_args['generation_config'] = GenerationConfig(
**gen_config
)
kwargs['training_args'] = Seq2SeqTrainingArguments(**training_args)
data_config = kwargs.get('data_config')
if not isinstance(data_config, DataConfig):
kwargs['data_config'] = DataConfig(**data_config)
peft_config = kwargs.get('peft_config', None)
if peft_config is not None and not isinstance(peft_config, PeftConfig):
kwargs['peft_config'] = get_peft_config(peft_config)
return cls(**kwargs)
@classmethod
def from_file(cls, path: Union[str, Path]) -> 'FinetuningConfig':
path = _resolve_path(path)
kwargs = _get_yaml_parser().load(path)
return cls.from_dict(**kwargs)
def _load_datasets(
data_dir: Path,
data_format: str,
data_files: dict[NamedSplit, str],
num_proc: Optional[int],
) -> DatasetDict:
if data_format in ('.csv', '.json', '.jsonl'):
dataset_dct = load_dataset(
data_format[1:],
data_dir=data_dir,
data_files=data_files,
num_proc=num_proc,
)
else:
err_msg = f"Cannot load dataset in the '{data_format}' format."
raise NotImplementedError(err_msg)
return dataset_dct
class DataManager(object):
def __init__(self, data_dir: str, data_config: DataConfig):
self._num_proc = data_config.num_proc
self._dataset_dct = _load_datasets(
_resolve_path(data_dir),
data_config.data_format,
data_config.data_files,
self._num_proc,
)
def _get_dataset(self, split: NamedSplit) -> Optional[Dataset]:
return self._dataset_dct.get(split, None)
def get_dataset(
self,
split: NamedSplit,
process_fn: Callable[[dict[str, Any]], dict[str, Any]],
batched: bool = True,
remove_orig_columns: bool = True,
) -> Optional[Dataset]:
orig_dataset = self._get_dataset(split)
if orig_dataset is None:
return
if remove_orig_columns:
remove_columns = orig_dataset.column_names
else:
remove_columns = None
return orig_dataset.map(
process_fn,
batched=batched,
remove_columns=remove_columns,
num_proc=self._num_proc,
)
def print_model_size(model: PreTrainedModel):
print("--> Model")
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n--> model has {total_params / 1e6}M params\n")
def process_batch(
batch: Mapping[str, Sequence],
tokenizer: PreTrainedTokenizer,
max_input_length: int,
max_output_length: int,
) -> dict[str, list]:
batched_tools = batch.get('tools', None)
batched_conv = batch['conversations']
batched_input_ids = []
batched_labels = []
if batched_tools is None:
batched_tools = [None] * len(batched_conv)
for tools, conv in zip(batched_tools, batched_conv):
input_ids, loss_masks = [
tokenizer.get_command('[gMASK]'),
tokenizer.get_command('sop'),
], [False, False]
if tools is not None:
raise NotImplementedError()
for message in conv:
if message['role'] in ('system', 'user'):
loss_mask_val = False
else:
loss_mask_val = True
if message['role'] == 'tool':
raise NotImplementedError()
else:
new_input_ids = tokenizer.build_single_message(
message['role'], '', message['content']
)
new_loss_masks = [loss_mask_val] * len(new_input_ids)
input_ids += new_input_ids
loss_masks += new_loss_masks
input_ids.append(tokenizer.eos_token_id)
loss_masks = [False, *loss_masks]
labels = []
for input_id, mask in zip(input_ids, loss_masks):
if mask:
labels.append(input_id)
else:
labels.append(-100)
max_length = max_input_length + max_output_length + 1
batched_input_ids.append(input_ids[:max_length])
batched_labels.append(labels[:max_length])
return {'input_ids': batched_input_ids, 'labels': batched_labels}
def process_batch_eval(
batch: Mapping[str, Sequence],
tokenizer: PreTrainedTokenizer,
max_input_length: int,
max_output_length: int,
) -> dict[str, list]:
batched_tools = batch.get('tools', None)
batched_conv = batch['conversations']
batched_input_ids = []
# To avoid computing loss, we do not provide the `labels` field in the input dictionary.
batched_output_ids = []
if batched_tools is None:
batched_tools = [None] * len(batched_conv)
for tools, conv in zip(batched_tools, batched_conv):
input_ids = [
tokenizer.get_command('[gMASK]'),
tokenizer.get_command('sop'),
]
if tools is not None:
raise NotImplementedError()
for message in conv:
if len(input_ids) >= max_input_length:
break
if message['role'] == 'tool':
raise NotImplementedError()
else:
new_input_ids = tokenizer.build_single_message(
message['role'], '', message['content']
)
if message['role'] == 'assistant':
output_prompt, output_ids = (
new_input_ids[:1],
new_input_ids[1:],
)
output_ids.append(tokenizer.eos_token_id)
batched_input_ids.append(
input_ids[:max_input_length] + output_prompt[:1]
)
batched_output_ids.append(output_ids[:max_output_length])
input_ids += new_input_ids
return {'input_ids': batched_input_ids, 'output_ids': batched_output_ids}
# Not sure if this is necessary; the model could arguably stay in half precision.
# When training on CPU, cast all params to fp32 instead of only the trainable ones.
def _prepare_model_for_training(model: nn.Module, use_cpu: bool):
for param in model.parameters():
if param.requires_grad or use_cpu:
param.data = param.data.to(torch.float32)
def load_tokenizer_and_model(
model_dir: str,
peft_config: Optional[PeftConfig] = None,
) -> tuple[PreTrainedTokenizer, nn.Module]:
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
if peft_config is not None:
if peft_config.peft_type.name == "PREFIX_TUNING":
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
config.pre_seq_len = peft_config.num_virtual_tokens
config.use_cache = False
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
config=config,
)
if peft_config.peft_type.name == "LORA":
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
empty_init=False,
use_cache=False
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
else:
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
empty_init=False,
use_cache=False
)
print_model_size(model)
return tokenizer, model
def compute_metrics(eval_preds: EvalPrediction, tokenizer: PreTrainedTokenizer):
batched_pred_ids, batched_label_ids = eval_preds
metrics_dct = {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 'bleu-4': []}
for pred_ids, label_ids in zip(batched_pred_ids, batched_label_ids):
pred_txt = tokenizer.decode(pred_ids).strip()
label_txt = tokenizer.decode(label_ids).strip()
pred_tokens = list(jieba.cut(pred_txt))
label_tokens = list(jieba.cut(label_txt))
rouge = Rouge()
scores = rouge.get_scores(' '.join(pred_tokens), ' '.join(label_tokens))
for k, v in scores[0].items():
metrics_dct[k].append(round(v['f'] * 100, 4))
metrics_dct['bleu-4'].append(
sentence_bleu(
[label_tokens],
pred_tokens,
smoothing_function=SmoothingFunction().method3,
)
)
return {k: np.mean(v) for k, v in metrics_dct.items()}
@app.command()
def main(
data_dir: Annotated[str, typer.Argument(help='')],
model_dir: Annotated[
str,
typer.Argument(
help='A string that specifies the model id of a pretrained model configuration hosted on huggingface.co, or a path to a directory containing a model configuration file.'
),
],
config_file: Annotated[str, typer.Argument(help='')],
auto_resume_from_checkpoint: str = typer.Argument(
default='',
            help='If "yes", automatically resume from the latest saved checkpoint. If a number, e.g. 12 or 15, resume from the corresponding saved checkpoint. If "no", restart training from scratch.'
),
):
ft_config = FinetuningConfig.from_file(config_file)
tokenizer, model = load_tokenizer_and_model(model_dir, peft_config=ft_config.peft_config)
data_manager = DataManager(data_dir, ft_config.data_config)
train_dataset = data_manager.get_dataset(
Split.TRAIN,
functools.partial(
process_batch,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
print('train_dataset:', train_dataset)
val_dataset = data_manager.get_dataset(
Split.VALIDATION,
functools.partial(
process_batch_eval,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
if val_dataset is not None:
print('val_dataset:', val_dataset)
test_dataset = data_manager.get_dataset(
Split.TEST,
functools.partial(
process_batch_eval,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
if test_dataset is not None:
print('test_dataset:', test_dataset)
# checks encoded dataset
_sanity_check(
train_dataset[0]["input_ids"], train_dataset[0]["labels"], tokenizer
)
# turn model to fp32
_prepare_model_for_training(model, ft_config.training_args.use_cpu)
ft_config.training_args.generation_config.pad_token_id = (
tokenizer.pad_token_id
)
ft_config.training_args.generation_config.eos_token_id = [
tokenizer.eos_token_id,
tokenizer.get_command('<|user|>'),
tokenizer.get_command('<|observation|>'),
]
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
trainer = Seq2SeqTrainer(
model=model,
args=ft_config.training_args,
data_collator=DataCollatorForSeq2Seq(
tokenizer=tokenizer,
padding='longest',
return_tensors='pt',
),
train_dataset=train_dataset,
eval_dataset=val_dataset.select(list(range(50))),
tokenizer=tokenizer if ft_config.peft_config.peft_type != "LORA" else None, # LORA does not need tokenizer
compute_metrics=functools.partial(compute_metrics, tokenizer=tokenizer),
)
    if auto_resume_from_checkpoint is None or auto_resume_from_checkpoint.upper() == "":
trainer.train()
else:
output_dir = ft_config.training_args.output_dir
dirlist = os.listdir(output_dir)
checkpoint_sn = 0
for checkpoint_str in dirlist:
if checkpoint_str.find("eckpoint") > 0 and checkpoint_str.find("tmp") == -1:
checkpoint = int(checkpoint_str.replace("checkpoint-", ""))
if checkpoint > checkpoint_sn:
checkpoint_sn = checkpoint
if auto_resume_from_checkpoint.upper() == "YES":
if checkpoint_sn > 0:
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
checkpoint_directory = os.path.join(output_dir, "checkpoint-" + str(checkpoint_sn))
print("resume checkpoint from checkpoint-" + str(checkpoint_sn))
trainer.train(resume_from_checkpoint=checkpoint_directory)
else:
trainer.train()
else:
if auto_resume_from_checkpoint.isdigit():
if int(auto_resume_from_checkpoint) > 0:
checkpoint_sn = int(auto_resume_from_checkpoint)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
checkpoint_directory = os.path.join(output_dir, "checkpoint-" + str(checkpoint_sn))
print("resume checkpoint from checkpoint-" + str(checkpoint_sn))
trainer.train(resume_from_checkpoint=checkpoint_directory)
else:
print(auto_resume_from_checkpoint,
"The specified checkpoint sn(" + auto_resume_from_checkpoint + ") has not been saved. Please search for the correct chkeckpoint in the model output directory")
# test stage
if test_dataset is not None:
trainer.predict(test_dataset)
if __name__ == '__main__':
app()
# -*- coding: utf-8 -*-
import os
import jieba
import dataclasses as dc
import functools
from collections.abc import Callable, Mapping, Sequence
from pathlib import Path
from typing import Annotated, Any, Optional, Union
import numpy as np
import ruamel.yaml as yaml
import torch
import typer
from datasets import Dataset, DatasetDict, NamedSplit, Split, load_dataset
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from peft import (
PeftConfig,
PeftModelForCausalLM,
get_peft_config,
get_peft_model
)
from rouge_chinese import Rouge
from torch import nn
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
EvalPrediction,
GenerationConfig,
PreTrainedModel,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
Seq2SeqTrainingArguments, AutoConfig,
)
from transformers import DataCollatorForSeq2Seq as _DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer as _Seq2SeqTrainer
ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
app = typer.Typer(pretty_exceptions_show_locals=False)
class DataCollatorForSeq2Seq(_DataCollatorForSeq2Seq):
def __call__(self, features, return_tensors=None):
output_ids = (
[feature['output_ids'] for feature in features]
if 'output_ids' in features[0].keys()
else None
)
if output_ids is not None:
max_output_length = max(len(out) for out in output_ids)
if self.pad_to_multiple_of is not None:
max_output_length = (
(
max_output_length + self.pad_to_multiple_of - 1) //
self.pad_to_multiple_of * self.pad_to_multiple_of
)
for feature in features:
remainder = [self.tokenizer.pad_token_id] * (
max_output_length - len(feature['output_ids'])
)
if isinstance(feature['output_ids'], list):
feature['output_ids'] = feature['output_ids'] + remainder
else:
feature['output_ids'] = np.concatenate(
[feature['output_ids'], remainder]
).astype(np.int64)
return super().__call__(features, return_tensors)
class Seq2SeqTrainer(_Seq2SeqTrainer):
def prediction_step(
self,
model: nn.Module,
inputs: dict[str, Any],
prediction_loss_only: bool,
ignore_keys=None,
**gen_kwargs,
) -> tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
if self.args.predict_with_generate:
output_ids = inputs.pop('output_ids')
input_ids = inputs['input_ids']
loss, generated_tokens, labels = super().prediction_step(
model, inputs, prediction_loss_only, ignore_keys, **gen_kwargs
)
generated_tokens = generated_tokens[:, input_ids.size()[1]:]
if self.args.predict_with_generate:
labels = output_ids
return loss, generated_tokens, labels
# For P-Tuning a new save_model function is fine for the prefix_encoder model
# but may cost problems for the whole model loading
# def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
# if output_dir is None:
# output_dir = self.args.output_dir
# os.makedirs(output_dir, exist_ok=True)
# ptuning_params = {k: v for k, v in self.model.transformer.prefix_encoder.state_dict().items()}
#
# torch.save(ptuning_params, os.path.join(output_dir, 'pytorch_model.bin'))
#
# print(f"P-Tuning model weights saved in {output_dir}")
#
# if self.tokenizer is not None:
# self.tokenizer.save_pretrained(output_dir)
def _resolve_path(path: Union[str, Path]) -> Path:
return Path(path).expanduser().resolve()
def _sanity_check(
input_ids: Sequence[int],
output_ids: Sequence[int],
tokenizer: PreTrainedTokenizer,
):
print('--> Sanity check')
for in_id, out_id in zip(input_ids, output_ids):
if in_id == 0:
continue
if in_id in tokenizer.tokenizer.index_special_tokens:
in_text = tokenizer.tokenizer.index_special_tokens[in_id]
else:
in_text = tokenizer.decode([in_id])
print(f'{repr(in_text):>20}: {in_id} -> {out_id}')
@functools.cache
def _get_yaml_parser() -> yaml.YAML:
parser = yaml.YAML(typ='safe', pure=True)
parser.indent(mapping=2, offset=2, sequence=4)
parser.default_flow_style = False
return parser
@dc.dataclass
class DataConfig(object):
train_file: str
val_file: Optional[str] = None
test_file: Optional[str] = None
num_proc: Optional[int] = None
@property
def data_format(self) -> str:
return Path(self.train_file).suffix
@property
def data_files(self) -> dict[NamedSplit, str]:
return {
split: data_file
for split, data_file in zip(
[Split.TRAIN, Split.VALIDATION, Split.TEST],
[self.train_file, self.val_file, self.test_file],
)
if data_file is not None
}
@dc.dataclass
class FinetuningConfig(object):
data_config: DataConfig
max_input_length: int
max_output_length: int
training_args: Seq2SeqTrainingArguments = dc.field(
default_factory=lambda: Seq2SeqTrainingArguments(output_dir='./output')
)
peft_config: Optional[PeftConfig] = None
def __post_init__(self):
if not self.training_args.do_eval or self.data_config.val_file is None:
# skips the evaluation stage when `do_eval` or `eval_file` is not provided
self.training_args.do_eval = False
self.training_args.evaluation_strategy = 'no'
self.data_config.val_file = None
else:
self.training_args.per_device_eval_batch_size = (
self.training_args.per_device_eval_batch_size
or self.training_args.per_device_train_batch_size
)
@classmethod
def from_dict(cls, **kwargs) -> 'FinetuningConfig':
training_args = kwargs.get('training_args', None)
if training_args is not None and not isinstance(
training_args, Seq2SeqTrainingArguments
):
gen_config = training_args.get('generation_config')
# TODO: a bit hacky
if not isinstance(gen_config, GenerationConfig):
training_args['generation_config'] = GenerationConfig(
**gen_config
)
kwargs['training_args'] = Seq2SeqTrainingArguments(**training_args)
data_config = kwargs.get('data_config')
if not isinstance(data_config, DataConfig):
kwargs['data_config'] = DataConfig(**data_config)
peft_config = kwargs.get('peft_config', None)
if peft_config is not None and not isinstance(peft_config, PeftConfig):
kwargs['peft_config'] = get_peft_config(peft_config)
return cls(**kwargs)
@classmethod
def from_file(cls, path: Union[str, Path]) -> 'FinetuningConfig':
path = _resolve_path(path)
kwargs = _get_yaml_parser().load(path)
return cls.from_dict(**kwargs)
def _load_datasets(
data_dir: Path,
data_format: str,
data_files: dict[NamedSplit, str],
num_proc: Optional[int],
) -> DatasetDict:
if data_format in ('.csv', '.json', '.jsonl'):
dataset_dct = load_dataset(
data_format[1:],
data_dir=data_dir,
data_files=data_files,
num_proc=num_proc,
)
else:
err_msg = f"Cannot load dataset in the '{data_format}' format."
raise NotImplementedError(err_msg)
return dataset_dct
class DataManager(object):
def __init__(self, data_dir: str, data_config: DataConfig):
self._num_proc = data_config.num_proc
self._dataset_dct = _load_datasets(
_resolve_path(data_dir),
data_config.data_format,
data_config.data_files,
self._num_proc,
)
def _get_dataset(self, split: NamedSplit) -> Optional[Dataset]:
return self._dataset_dct.get(split, None)
def get_dataset(
self,
split: NamedSplit,
process_fn: Callable[[dict[str, Any]], dict[str, Any]],
batched: bool = True,
remove_orig_columns: bool = True,
) -> Optional[Dataset]:
orig_dataset = self._get_dataset(split)
if orig_dataset is None:
return
if remove_orig_columns:
remove_columns = orig_dataset.column_names
else:
remove_columns = None
return orig_dataset.map(
process_fn,
batched=batched,
remove_columns=remove_columns,
num_proc=self._num_proc,
)
def print_model_size(model: PreTrainedModel):
print("--> Model")
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n--> model has {total_params / 1e6}M params\n")
def process_batch(
batch: Mapping[str, Sequence],
tokenizer: PreTrainedTokenizer,
max_input_length: int,
max_output_length: int,
) -> dict[str, list]:
batched_tools = batch.get('tools', None)
batched_conv = batch['conversations']
batched_input_ids = []
batched_labels = []
if batched_tools is None:
batched_tools = [None] * len(batched_conv)
for tools, conv in zip(batched_tools, batched_conv):
input_ids, loss_masks = [
tokenizer.get_command('[gMASK]'),
tokenizer.get_command('sop'),
], [False, False]
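        # Every sample starts with the [gMASK] and sop special tokens, which never take loss.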
if tools is not None:
raise NotImplementedError()
for message in conv:
if message['role'] in ('system', 'user'):
loss_mask_val = False
else:
loss_mask_val = True
if message['role'] == 'tool':
raise NotImplementedError()
else:
new_input_ids = tokenizer.build_single_message(
message['role'], '', message['content']
)
new_loss_masks = [loss_mask_val] * len(new_input_ids)
input_ids += new_input_ids
loss_masks += new_loss_masks
input_ids.append(tokenizer.eos_token_id)
loss_masks = [False, *loss_masks]
labels = []
for input_id, mask in zip(input_ids, loss_masks):
if mask:
labels.append(input_id)
else:
labels.append(-100)
max_length = max_input_length + max_output_length + 1
batched_input_ids.append(input_ids[:max_length])
batched_labels.append(labels[:max_length])
return {'input_ids': batched_input_ids, 'labels': batched_labels}
def process_batch_eval(
batch: Mapping[str, Sequence],
tokenizer: PreTrainedTokenizer,
max_input_length: int,
max_output_length: int,
) -> dict[str, list]:
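    # Build evaluation features: for each assistant turn, pair the conversation prefix
    # (`input_ids`) with the reference reply (`output_ids`) for generation-based metrics.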
batched_tools = batch.get('tools', None)
batched_conv = batch['conversations']
batched_input_ids = []
# To avoid computing loss, we do not provide the `labels` field in the input dictionary.
batched_output_ids = []
if batched_tools is None:
batched_tools = [None] * len(batched_conv)
for tools, conv in zip(batched_tools, batched_conv):
input_ids = [
tokenizer.get_command('[gMASK]'),
tokenizer.get_command('sop'),
]
if tools is not None:
raise NotImplementedError()
for message in conv:
if len(input_ids) >= max_input_length:
break
if message['role'] == 'tool':
raise NotImplementedError()
else:
new_input_ids = tokenizer.build_single_message(
message['role'], '', message['content']
)
if message['role'] == 'assistant':
output_prompt, output_ids = (
new_input_ids[:1],
new_input_ids[1:],
)
output_ids.append(tokenizer.eos_token_id)
batched_input_ids.append(
input_ids[:max_input_length] + output_prompt[:1]
)
batched_output_ids.append(output_ids[:max_output_length])
input_ids += new_input_ids
return {'input_ids': batched_input_ids, 'output_ids': batched_output_ids}
# Cast trainable parameters to fp32 for stable training (it is unclear whether this is
# strictly necessary; half precision may also work). When training on CPU, cast all
# parameters to fp32, not just the trainable ones.
def _prepare_model_for_training(model: nn.Module, use_cpu: bool):
for param in model.parameters():
if param.requires_grad or use_cpu:
param.data = param.data.to(torch.float32)
def load_tokenizer_and_model(
model_dir: str,
peft_config: Optional[PeftConfig] = None,
) -> tuple[PreTrainedTokenizer, nn.Module]:
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
if peft_config is not None:
if peft_config.peft_type.name == "PREFIX_TUNING":
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
config.pre_seq_len = peft_config.num_virtual_tokens
config.use_cache = False
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
config=config,
)
if peft_config.peft_type.name == "LORA":
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
empty_init=False,
use_cache=False
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
else:
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
empty_init=False,
use_cache=False
)
print_model_size(model)
return tokenizer, model
def compute_metrics(eval_preds: EvalPrediction, tokenizer: PreTrainedTokenizer):
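    # Decode predictions and references, segment them with jieba, then compute
    # ROUGE-1/2/L F1 scores and sentence-level BLEU-4.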
batched_pred_ids, batched_label_ids = eval_preds
metrics_dct = {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 'bleu-4': []}
for pred_ids, label_ids in zip(batched_pred_ids, batched_label_ids):
pred_txt = tokenizer.decode(pred_ids).strip()
label_txt = tokenizer.decode(label_ids).strip()
pred_tokens = list(jieba.cut(pred_txt))
label_tokens = list(jieba.cut(label_txt))
rouge = Rouge()
scores = rouge.get_scores(' '.join(pred_tokens), ' '.join(label_tokens))
for k, v in scores[0].items():
metrics_dct[k].append(round(v['f'] * 100, 4))
metrics_dct['bleu-4'].append(
sentence_bleu(
[label_tokens],
pred_tokens,
smoothing_function=SmoothingFunction().method3,
)
)
return {k: np.mean(v) for k, v in metrics_dct.items()}
@app.command()
def main(
    data_dir: Annotated[str, typer.Argument(help='Directory containing the train/validation/test data files.')],
model_dir: Annotated[
str,
typer.Argument(
help='A string that specifies the model id of a pretrained model configuration hosted on huggingface.co, or a path to a directory containing a model configuration file.'
),
],
    config_file: Annotated[str, typer.Argument(help='Path to the fine-tuning YAML configuration file.')],
auto_resume_from_checkpoint: str = typer.Argument(
default='',
        help='"yes": automatically resume from the latest saved checkpoint; a number such as 12: resume from that specific checkpoint; empty: start training from scratch.'
),
):
ft_config = FinetuningConfig.from_file(config_file)
tokenizer, model = load_tokenizer_and_model(model_dir, peft_config=ft_config.peft_config)
data_manager = DataManager(data_dir, ft_config.data_config)
train_dataset = data_manager.get_dataset(
Split.TRAIN,
functools.partial(
process_batch,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
print('train_dataset:', train_dataset)
val_dataset = data_manager.get_dataset(
Split.VALIDATION,
functools.partial(
process_batch_eval,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
if val_dataset is not None:
print('val_dataset:', val_dataset)
test_dataset = data_manager.get_dataset(
Split.TEST,
functools.partial(
process_batch_eval,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
if test_dataset is not None:
print('test_dataset:', test_dataset)
# checks encoded dataset
_sanity_check(
train_dataset[0]["input_ids"], train_dataset[0]["labels"], tokenizer
)
    # cast trainable parameters (all parameters when training on CPU) to fp32
_prepare_model_for_training(model, ft_config.training_args.use_cpu)
ft_config.training_args.generation_config.pad_token_id = (
tokenizer.pad_token_id
)
ft_config.training_args.generation_config.eos_token_id = [
tokenizer.eos_token_id,
tokenizer.get_command('<|user|>'),
tokenizer.get_command('<|observation|>'),
]
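    # Gradient checkpointing trades compute for memory; enable_input_require_grads() is
    # needed for it to work when the input embeddings are frozen (e.g. under PEFT).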
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
trainer = Seq2SeqTrainer(
model=model,
args=ft_config.training_args,
data_collator=DataCollatorForSeq2Seq(
tokenizer=tokenizer,
padding='longest',
return_tensors='pt',
),
train_dataset=train_dataset,
        eval_dataset=(
            val_dataset.select(list(range(min(50, len(val_dataset)))))
            if val_dataset is not None
            else None
        ),
tokenizer=tokenizer, # LORA does not need tokenizer
compute_metrics=functools.partial(compute_metrics, tokenizer=tokenizer),
)
    if auto_resume_from_checkpoint is None or auto_resume_from_checkpoint.upper() == "":
trainer.train()
else:
output_dir = ft_config.training_args.output_dir
dirlist = os.listdir(output_dir)
checkpoint_sn = 0
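        # Scan the output directory for "checkpoint-<step>" folders and remember the latest step.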
for checkpoint_str in dirlist:
if checkpoint_str.find("eckpoint") > 0 and checkpoint_str.find("tmp") == -1:
checkpoint = int(checkpoint_str.replace("checkpoint-", ""))
if checkpoint > checkpoint_sn:
checkpoint_sn = checkpoint
if auto_resume_from_checkpoint.upper() == "YES":
if checkpoint_sn > 0:
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
checkpoint_directory = os.path.join(output_dir, "checkpoint-" + str(checkpoint_sn))
print("resume checkpoint from checkpoint-" + str(checkpoint_sn))
trainer.train(resume_from_checkpoint=checkpoint_directory)
else:
trainer.train()
else:
if auto_resume_from_checkpoint.isdigit():
if int(auto_resume_from_checkpoint) > 0:
checkpoint_sn = int(auto_resume_from_checkpoint)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
checkpoint_directory = os.path.join(output_dir, "checkpoint-" + str(checkpoint_sn))
print("resume checkpoint from checkpoint-" + str(checkpoint_sn))
trainer.train(resume_from_checkpoint=checkpoint_directory)
else:
                print(auto_resume_from_checkpoint,
                      "The specified checkpoint sn (" + auto_resume_from_checkpoint + ") has not been saved. Please look for the correct checkpoint in the model output directory.")
# test stage
if test_dataset is not None:
trainer.predict(test_dataset)
if __name__ == '__main__':
app()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pathlib import Path
from typing import Annotated, Union
import typer
from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
PreTrainedModel,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
)
ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
app = typer.Typer(pretty_exceptions_show_locals=False)
def _resolve_path(path: Union[str, Path]) -> Path:
return Path(path).expanduser().resolve()
def load_model_and_tokenizer(model_dir: Union[str, Path]) -> tuple[ModelType, TokenizerType]:
model_dir = _resolve_path(model_dir)
if (model_dir / 'adapter_config.json').exists():
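        # A PEFT adapter directory: load the adapter and read the base model path from its config.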
model = AutoPeftModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=True, device_map='auto'
)
tokenizer_dir = model.peft_config['default'].base_model_name_or_path
else:
model = AutoModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=True, device_map='auto'
)
tokenizer_dir = model_dir
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, trust_remote_code=True
)
return model, tokenizer
@app.command()
def main(
    model_dir: Annotated[str, typer.Argument(help='Path to the fine-tuned model directory (a full model or a PEFT adapter output).')],
    prompt: Annotated[str, typer.Option(help='Prompt to send to the model.')],
):
model, tokenizer = load_model_and_tokenizer(model_dir)
response, _ = model.chat(tokenizer, prompt)
print(response)
if __name__ == '__main__':
app()
export HIP_VISIBLE_DEVICES=7
python finetune_hf.py data/AdvertiseGen_fix /path/to/chatglm3-6b configs/lora.yaml
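# Usage: finetune_hf.py <data_dir> <model_dir> <config_file>; HIP_VISIBLE_DEVICES selects the ROCm (AMD) GPU, analogous to CUDA_VISIBLE_DEVICES.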