Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ChatGLM3-6B_pytorch
Commits
3a3f5683
Commit
3a3f5683
authored
Dec 03, 2023
by
zhaoying1
Browse files
added chatglm3
parents
Pipeline
#657
failed with stages
in 0 seconds
Changes
98
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1156 additions
and
0 deletions
+1156
-0
finetune_basemodel_demo/scripts/finetune_lora.sh
finetune_basemodel_demo/scripts/finetune_lora.sh
+43
-0
finetune_basemodel_demo/scripts/formate_alpaca2jsonl.py
finetune_basemodel_demo/scripts/formate_alpaca2jsonl.py
+28
-0
finetune_basemodel_demo/trainer.py
finetune_basemodel_demo/trainer.py
+52
-0
finetune_chatmodel_demo/AdvertiseGen/dev.json
finetune_chatmodel_demo/AdvertiseGen/dev.json
+0
-0
finetune_chatmodel_demo/AdvertiseGen/train.json
finetune_chatmodel_demo/AdvertiseGen/train.json
+0
-0
finetune_chatmodel_demo/README.md
finetune_chatmodel_demo/README.md
+261
-0
finetune_chatmodel_demo/arguments.py
finetune_chatmodel_demo/arguments.py
+151
-0
finetune_chatmodel_demo/configs/deepspeed.json
finetune_chatmodel_demo/configs/deepspeed.json
+24
-0
finetune_chatmodel_demo/finetune.py
finetune_chatmodel_demo/finetune.py
+169
-0
finetune_chatmodel_demo/formatted_data/advertise_gen.jsonl
finetune_chatmodel_demo/formatted_data/advertise_gen.jsonl
+0
-0
finetune_chatmodel_demo/formatted_data/tool_alpaca.jsonl
finetune_chatmodel_demo/formatted_data/tool_alpaca.jsonl
+0
-0
finetune_chatmodel_demo/inference.py
finetune_chatmodel_demo/inference.py
+42
-0
finetune_chatmodel_demo/preprocess_utils.py
finetune_chatmodel_demo/preprocess_utils.py
+146
-0
finetune_chatmodel_demo/requirements.txt
finetune_chatmodel_demo/requirements.txt
+6
-0
finetune_chatmodel_demo/scripts/finetune_ds.sh
finetune_chatmodel_demo/scripts/finetune_ds.sh
+39
-0
finetune_chatmodel_demo/scripts/finetune_ds_multiturn.sh
finetune_chatmodel_demo/scripts/finetune_ds_multiturn.sh
+35
-0
finetune_chatmodel_demo/scripts/finetune_pt.sh
finetune_chatmodel_demo/scripts/finetune_pt.sh
+38
-0
finetune_chatmodel_demo/scripts/finetune_pt_multiturn.sh
finetune_chatmodel_demo/scripts/finetune_pt_multiturn.sh
+36
-0
finetune_chatmodel_demo/scripts/format_advertise_gen.py
finetune_chatmodel_demo/scripts/format_advertise_gen.py
+25
-0
finetune_chatmodel_demo/scripts/format_tool_alpaca.py
finetune_chatmodel_demo/scripts/format_tool_alpaca.py
+61
-0
No files found.
finetune_basemodel_demo/scripts/finetune_lora.sh
0 → 100644
View file @
3a3f5683
#! /usr/bin/env bash
# LoRA fine-tuning launcher for ChatGLM3-6B-Base on Alpaca-format data.
# Fixes vs. original: variable-name typos corrected (LORA_DROPOUT,
# GRAD_ACCUMULATION_STEPS — both purely internal to this script) and all
# expansions quoted so paths with spaces don't break the command line.
set -ex

# Hyper-parameters
LR=1e-4
NUM_GPUS=4
LORA_RANK=8
LORA_ALPHA=32
LORA_DROPOUT=0.1
MAX_SOURCE_LEN=512          # NOTE(review): defined but not passed to finetune.py
MAX_TARGET_LEN=128          # NOTE(review): defined but not passed to finetune.py
DEV_BATCH_SIZE=1
GRAD_ACCUMULATION_STEPS=1
MAX_STEP=500
SAVE_INTERVAL=50
MAX_SEQ_LEN=512

# Run identity and data locations
RUN_NAME=text
BASE_MODEL_PATH=THUDM/chatglm3-6b-base
DATASET_PATH=data/alpaca_data.jsonl

DATESTR=$(date +%Y%m%d-%H%M%S)
OUTPUT_DIR="output/${RUN_NAME}-${DATESTR}-${LR}"
# NOTE(review): torchrun --standalone chooses its own rendezvous port, so
# MASTER_PORT is currently unused; kept for parity with sibling scripts.
MASTER_PORT=$(shuf -n 1 -i 10000-65535)

mkdir -p "$OUTPUT_DIR"

torchrun --standalone --nnodes=1 --nproc_per_node="$NUM_GPUS" finetune.py \
    --train_format input-output \
    --train_file "$DATASET_PATH" \
    --lora_rank "$LORA_RANK" \
    --lora_alpha "$LORA_ALPHA" \
    --lora_dropout "$LORA_DROPOUT" \
    --max_seq_length "$MAX_SEQ_LEN" \
    --preprocessing_num_workers 1 \
    --model_name_or_path "$BASE_MODEL_PATH" \
    --output_dir "$OUTPUT_DIR" \
    --per_device_train_batch_size "$DEV_BATCH_SIZE" \
    --gradient_accumulation_steps "$GRAD_ACCUMULATION_STEPS" \
    --max_steps "$MAX_STEP" \
    --logging_steps 1 \
    --save_steps "$SAVE_INTERVAL" \
    --learning_rate "$LR" 2>&1 | tee "${OUTPUT_DIR}/train.log"
finetune_basemodel_demo/scripts/formate_alpaca2jsonl.py
0 → 100644
View file @
3a3f5683
import
argparse
import
json
import
tqdm
def format_example(example: dict) -> dict:
    """Convert one Alpaca record into a ``{context, target}`` pair.

    The context is the instruction (plus the optional, non-empty ``input``
    field) followed by an ``"Answer: "`` cue; the target is the reference
    ``output`` text.
    """
    pieces = [f"Instruction: {example['instruction']}\n"]
    extra = example.get("input")
    if extra:  # skip missing or empty input fields
        pieces.append(f"Input: {extra}\n")
    pieces.append("Answer: ")
    return {"context": "".join(pieces), "target": example["output"]}
def main():
    """Convert an Alpaca-style JSON array into a JSONL file of
    ``{context, target}`` records understood by the fine-tuning scripts."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--data_path", type=str, default="data/alpaca_data.json")
    arg_parser.add_argument("--save_path", type=str, default="data/alpaca_data.jsonl")
    args = arg_parser.parse_args()
    print("args:", args)

    with open(args.data_path) as source:
        examples = json.load(source)

    with open(args.save_path, 'w') as sink:
        for example in tqdm.tqdm(examples, desc="formatting.."):
            record = json.dumps(format_example(example), ensure_ascii=False)
            sink.write(record + '\n')


if __name__ == "__main__":
    main()
finetune_basemodel_demo/trainer.py
0 → 100644
View file @
3a3f5683
# coding=utf-8
# Copyright 2020-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The Trainer class, to easily train a 🤗 Transformers from scratch or finetune it on a new task.
"""
import
os
from
typing
import
Optional
from
transformers
import
Trainer
import
torch
from
transformers.modeling_utils
import
PreTrainedModel
,
unwrap_model
from
transformers.utils
import
logging
logger
=
logging
.
get_logger
(
__name__
)
WEIGHTS_NAME
=
"pytorch_model.pt"
TRAINING_ARGS_NAME
=
"training_args.bin"
class LoRATrainer(Trainer):
    """Trainer variant for LoRA fine-tuning.

    ``save_model`` writes out only the parameters that require gradients
    (the LoRA weights), keeping checkpoints small compared to a full dump.
    The redundant pass-through ``__init__`` of the original was removed;
    the inherited constructor is identical.
    """

    def compute_loss(self, model, inputs, return_outputs=False):
        # Delegate loss computation entirely to the model.
        outputs = model(**inputs)
        # Honor the Trainer contract: callers such as prediction_step may
        # request the raw outputs alongside the loss (the original always
        # returned only the loss, breaking that path).
        return (outputs.loss, outputs) if return_outputs else outputs.loss

    def save_model(self, output_dir=None, _internal_call=False):
        """Save trainable (LoRA) parameters, the tokenizer, and the
        training arguments to *output_dir* (defaults to ``args.output_dir``).
        """
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Saving model checkpoint to {output_dir}")
        model_to_save = unwrap_model(self.model)
        # Serialize on CPU so the checkpoint loads on GPU-less machines;
        # the original `.to("cuda")` would also crash CPU-only training.
        saved_params = {
            k: v.to("cpu")
            for k, v in model_to_save.named_parameters()
            if v.requires_grad
        }
        torch.save(saved_params, os.path.join(output_dir, WEIGHTS_NAME))
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(output_dir)
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
finetune_chatmodel_demo/AdvertiseGen/dev.json
0 → 100644
View file @
3a3f5683
This diff is collapsed.
Click to expand it.
finetune_chatmodel_demo/AdvertiseGen/train.json
0 → 100644
View file @
3a3f5683
This diff is collapsed.
Click to expand it.
finetune_chatmodel_demo/README.md
0 → 100644
View file @
3a3f5683
# ChatGLM3-6B 微调示例
本目录提供 ChatGLM3-6B 模型的微调示例,包括全量微调和 P-Tuning v2。格式上,提供多轮对话微调样例和输入输出格式微调样例。
如果将模型下载到了本地,本文和代码中的
`THUDM/chatglm3-6b`
字段均应替换为相应地址以从本地加载模型。
运行示例需要
`python>=3.10`
,除基础的
`torch`
依赖外,示例代码运行还需要依赖
```
bash
pip install -r requirements.txt
```
## 多轮对话格式
多轮对话微调示例采用 ChatGLM3 对话格式约定,对不同角色添加不同
`loss_mask`
从而在一遍计算中为多轮回复计算
`loss`
。
### 数据格式和预处理
对于数据文件,样例采用如下格式
如果您仅希望微调模型的对话能力,而非工具能力,您应该按照以下格式整理数据。
```
json
[
{
"conversations"
:
[
{
"role"
:
"system"
,
"content"
:
"<system prompt text>"
},
{
"role"
:
"user"
,
"content"
:
"<user prompt text>"
},
{
"role"
:
"assistant"
,
"content"
:
"<assistant response text>"
},
//
...
Multi Turn
{
"role"
:
"user"
,
"content"
:
"<user prompt text>"
},
{
"role"
:
"assistant"
,
"content"
:
"<assistant response text>"
}
]
}
//
...
]
```
**请注意,这种方法在微调的step较多的情况下会影响到模型的工具调用功能**
如果您希望微调模型的对话和工具能力,您应该按照以下格式整理数据。
```
json
[
{
"tools"
:
[
//
available
tools
,
format
is
not
restricted
],
"conversations"
:
[
{
"role"
:
"system"
,
"content"
:
"<system prompt text>"
},
{
"role"
:
"user"
,
"content"
:
"<user prompt text>"
},
{
"role"
:
"assistant"
,
"content"
:
"<assistant thought to text>"
},
{
"role"
:
"tool"
,
"name"
:
"<name of the tool to be called"
,
"parameters"
:
{
"<parameter_name>"
:
"<parameter_value>"
},
"observation"
:
"<observation>"
//
don't
have
to
be
string
},
{
"role"
:
"assistant"
,
"content"
:
"<assistant response to observation>"
},
//
...
Multi Turn
{
"role"
:
"user"
,
"content"
:
"<user prompt text>"
},
{
"role"
:
"assistant"
,
"content"
:
"<assistant response text>"
}
]
}
//
...
]
```
-
关于工具描述的 system prompt 无需手动插入,预处理时会将
`tools`
字段使用
`json.dumps(..., ensure_ascii=False)`
格式化后插入为首条 system prompt。
-
每种角色可以附带一个
`bool`
类型的
`loss`
字段,表示该字段所预测的内容是否参与
`loss`
计算。若没有该字段,样例实现中默认对
`system`
,
`user`
不计算
`loss`
,其余角色则计算
`loss`
。
-
`tool`
并不是 ChatGLM3 中的原生角色,这里的
`tool`
在预处理阶段将被自动转化为一个具有工具调用
`metadata`
的
`assistant`
角色(默认计算
`loss`
)和一个表示工具返回值的
`observation`
角色(不计算
`loss`
)。
-
目前暂未实现
`Code interpreter`
的微调任务。
-
`system`
角色为可选角色,但若存在
`system`
角色,其必须出现在
`user`
角色之前,且一个完整的对话数据(无论单轮或者多轮对话)只能出现一次
`system`
角色。
作为示例,我们使用 ToolAlpaca 数据集来进行微调。首先,克隆
[
ToolAlpaca 数据集
](
https://github.com/tangqiaoyu/ToolAlpaca
)
,并使用
```
bash
./scripts/format_tool_alpaca.py
--path
"ToolAlpaca/data/train_data.json"
```
将数据集处理成上述格式。在这里,我们有意将工具处理成了
`list[str]`
这样的自然语言形式,以观察模型在微调前后对工具定义的理解能力。
### 微调模型
以下脚本提供了微调模型的参考方式。
```
bash
./scripts/finetune_ds_multiturn.sh
# 全量微调
./scripts/finetune_pt_multiturn.sh
# P-Tuning v2 微调
```
### 部署
我们更新了 ChatGLM3 的综合 Demo,使其可以部署微调后的模型 checkpoint。
对于全量微调,可以使用以下方式进行部署
```
bash
cd
../composite_demo
MODEL_PATH
=
"path to finetuned model checkpoint"
TOKENIZER_PATH
=
"THUDM/chatglm3-6b"
streamlit run main.py
```
对于 P-Tuning v2 微调,可以使用以下方式进行部署
```
bash
cd
../composite_demo
MODEL_PATH
=
"THUDM/chatglm3-6b"
PT_PATH
=
"path to p-tuning checkpoint"
streamlit run main.py
```
## 输入输出格式
对于输入-输出格式,样例采用如下输入格式
```
json
[
{
"prompt"
:
"<prompt text>"
,
"response"
:
"<response text>"
}
//
...
]
```
预处理时,不会拼接任何角色标识符。
作为示例,我们使用 AdvertiseGen 数据集来进行微调。从
[
Google Drive
](
https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing
)
或者
[
Tsinghua Cloud
](
https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1
)
下载处理好的 AdvertiseGen 数据集,将解压后的
`AdvertiseGen`
目录放到本目录下。
```
bash
./scripts/format_advertise_gen.py
--path
"AdvertiseGen/train.json"
```
来下载和将数据集处理成上述格式。
### 微调模型
以下脚本提供了微调模型的参考方式。
```
bash
./scripts/finetune_ds.sh
# 全量微调
./scripts/finetune_pt.sh
# P-Tuning v2 微调
```
### 推理验证
对于输入输出格式的微调,可使用
`inference.py`
进行基本的推理验证。
```
bash
python inference.py
\
--pt-checkpoint
"path to p-tuning checkpoint"
\
--model
THUDM/chatglm3-6b
```
```
bash
python inference.py
\
--tokenizer
THUDM/chatglm3-6b
\
--model
"path to finetuned model checkpoint"
```
### 提示
1.
微调代码在开始训练前,会先打印首条训练数据的预处理信息,显示为
```log
Sanity Check >>>>>>>>>>>>>
'[gMASK]': 64790 -> -100
'sop': 64792 -> -100
'<|system|>': 64794 -> -100
'': 30910 -> -100
'\n': 13 -> -100
'Answer': 20115 -> -100
'the': 267 -> -100
'following': 1762 -> -100
...
'know': 683 -> -100
'the': 267 -> -100
'response': 3010 -> -100
'details': 3296 -> -100
'.': 30930 -> -100
'<|assistant|>': 64796 -> -100
'': 30910 -> 30910
'\n': 13 -> 13
'I': 307 -> 307
'need': 720 -> 720
'to': 289 -> 289
'use': 792 -> 792
...
<<<<<<<<<<<<< Sanity Check
```
字样,每行依次表示一个 detokenized string, token_id 和 target_id。可在日志中查看这部分的 `loss_mask` 是否符合预期。若不符合,可能需要调整代码或数据。
2.
参考显存用量
- P-Tuning V2 `PRE_SEQ_LEN=128`, `DEV_BATCH_SIZE=1`, `GRAD_ACCUMULARION_STEPS=16`, `MAX_SEQ_LEN=2048` 配置下约需要 21GB 显存。
- 全量微调时,`./scripts/finetune_ds_multiturn.sh` 中的配置(`MAX_SEQ_LEN=2048`, `DEV_BATCH_SIZE=16`, `GRAD_ACCUMULARION_STEPS=1`)恰好用满 4 * 80GB 显存。
3.
若尝试后发现显存不足,可以考虑
-
尝试降低
`DEV_BATCH_SIZE`
并提升
`GRAD_ACCUMULARION_STEPS`
-
尝试添加
`--quantization_bit 8`
或
`--quantization_bit 4`
。
-
`PRE_SEQ_LEN=128`
,
`DEV_BATCH_SIZE=1`
,
`GRAD_ACCUMULARION_STEPS=16`
,
`MAX_SEQ_LEN=1024`
配置下,
`--quantization_bit 8`
约需 12GB 显存,
`--quantization_bit 4`
约需 7.6GB 显存。
## 参考文献
```
@inproceedings{liu2022p,
title={P-tuning: Prompt tuning can be comparable to fine-tuning across scales and tasks},
author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
pages={61--68},
year={2022}
}
@misc{tang2023toolalpaca,
title={ToolAlpaca: Generalized Tool Learning for Language Models with 3000 Simulated Cases},
author={Qiaoyu Tang and Ziliang Deng and Hongyu Lin and Xianpei Han and Qiao Liang and Le Sun},
year={2023},
eprint={2306.05301},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
finetune_chatmodel_demo/arguments.py
0 → 100644
View file @
3a3f5683
from
dataclasses
import
dataclass
,
field
from
typing
import
Optional
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    # Annotation fixed from `str` to `Optional[str]`: the default is None.
    ptuning_checkpoint: Optional[str] = field(
        default=None, metadata={"help": "Path to p-tuning v2 checkpoints"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    resize_position_embeddings: Optional[bool] = field(
        default=None,
        metadata={
            "help": (
                "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
                "the model's position embeddings."
            )
        },
    )
    # Quantization / P-tuning v2 knobs consumed by finetune.py.
    quantization_bit: Optional[int] = field(default=None)
    pre_seq_len: Optional[int] = field(default=None)
    prefix_projection: bool = field(default=False)
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
    )
    # The original declared `max_seq_length` twice (defaults 2048 then 1024);
    # the second declaration silently won.  Deduplicated here, keeping the
    # effective default of 1024.
    max_seq_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    # Annotation fixed from `str` to `Optional[str]`: the default is None.
    train_format: Optional[str] = field(
        default=None,
        metadata={"help": "The format of the training data file (mulit-turn or input-output)"},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to pad all samples to model maximum sentence length. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
                "efficient on GPU but very bad for TPU."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )

    def __post_init__(self):
        # Fail early with a clear message instead of an AttributeError when
        # no training file was supplied.
        assert self.train_file is not None, "`train_file` must be specified."
        extension = self.train_file.split(".")[-1]
        assert extension in {"jsonl", "json"}, "`train_file` should be a jsonl or a json file."
        assert self.train_format in {"multi-turn", "input-output"}
\ No newline at end of file
finetune_chatmodel_demo/configs/deepspeed.json
0 → 100644
View file @
3a3f5683
{
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"gradient_accumulation_steps"
:
"auto"
,
"zero_allow_untested_optimizer"
:
true
,
"fp16"
:
{
"enabled"
:
"auto"
,
"loss_scale"
:
0
,
"initial_scale_power"
:
16
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"zero_optimization"
:
{
"stage"
:
3
,
"allgather_partitions"
:
true
,
"allgather_bucket_size"
:
5e8
,
"overlap_comm"
:
false
,
"reduce_scatter"
:
true
,
"reduce_bucket_size"
:
5e8
,
"contiguous_gradients"
:
true
}
}
\ No newline at end of file
finetune_chatmodel_demo/finetune.py
0 → 100644
View file @
3a3f5683
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for sequence to sequence.
"""
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
# Adapted from
import
logging
import
os
import
sys
import
torch
import
json
import
transformers
from
transformers
import
(
AutoConfig
,
AutoModel
,
AutoTokenizer
,
DataCollatorForSeq2Seq
,
HfArgumentParser
,
Seq2SeqTrainingArguments
,
set_seed
,
)
from
trainer
import
PrefixTrainer
from
arguments
import
ModelArguments
,
DataTrainingArguments
from
preprocess_utils
import
sanity_check
,
MultiTurnDataset
,
InputOutputDataset
logger
=
logging
.
getLogger
(
__name__
)
# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
def main():
    """Entry point: parse arguments, build the ChatGLM3 model/tokenizer,
    construct the training dataset, and run the fine-tuning loop."""
    # Arguments can come either from a single JSON file or from the CLI.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    if training_args.should_log:
        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
        transformers.utils.logging.set_verbosity_info()

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    # datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
    # P-tuning v2 settings are injected through the model config.
    config.pre_seq_len = model_args.pre_seq_len
    config.prefix_projection = model_args.prefix_projection

    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)

    if model_args.ptuning_checkpoint is not None:
        # Resume from an existing p-tuning checkpoint: load the base model,
        # then overwrite only the prefix-encoder weights from the checkpoint.
        model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True)
        prefix_state_dict = torch.load(os.path.join(model_args.ptuning_checkpoint, "pytorch_model.bin"))
        new_prefix_state_dict = {}
        for k, v in prefix_state_dict.items():
            if k.startswith("transformer.prefix_encoder."):
                # Strip the module prefix so keys match prefix_encoder's own state dict.
                new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
        model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
    elif model_args.pre_seq_len is not None:
        # Fresh P-tuning v2 run.
        model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True)
        #,empty_init=False)
    else:
        # Full fine-tuning; empty_init=False materializes weights eagerly.
        model = AutoModel.from_pretrained(model_args.model_name_or_path, config=config, trust_remote_code=True, empty_init=False)

    if model_args.quantization_bit is not None:
        print(f"Quantized to {model_args.quantization_bit} bit")
        model = model.quantize(model_args.quantization_bit)
    if model_args.pre_seq_len is not None:
        # P-tuning v2: frozen backbone in fp16, trainable prefix encoder in fp32.
        model = model.half()
        model.transformer.prefix_encoder.float()
    else:
        # Finetune: train the whole model in fp32.
        model = model.float()

    # Training data: a JSON array or a JSONL file of records.
    with open(data_args.train_file, "r", encoding="utf-8") as f:
        if data_args.train_file.endswith(".json"):
            train_data = json.load(f)
        elif data_args.train_file.endswith(".jsonl"):
            train_data = [json.loads(line) for line in f]

    if data_args.train_format == "multi-turn":
        train_dataset = MultiTurnDataset(
            train_data,
            tokenizer,
            data_args.max_seq_length,
        )
    elif data_args.train_format == "input-output":
        train_dataset = InputOutputDataset(
            train_data,
            tokenizer,
            data_args.max_source_length,
            data_args.max_target_length,
        )
    else:
        raise ValueError(f"Unknown train format: {data_args.train_format}")

    if training_args.local_rank < 1:
        # Print the first example's token/label alignment on the main process only.
        sanity_check(train_dataset[0]['input_ids'], train_dataset[0]['labels'], tokenizer)

    # Data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=-100,
        pad_to_multiple_of=None,
        padding=False
    )

    # Initialize our Trainer
    trainer = PrefixTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        # Save only the changed (prefix) weights when running P-tuning v2.
        save_changed=model_args.pre_seq_len is not None
    )

    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    # Trade compute for memory, and make checkpointed inputs require grads.
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    trainer.train(resume_from_checkpoint=checkpoint)
    trainer.save_model()  # Saves the tokenizer too for easy upload
    trainer.save_state()


if __name__ == "__main__":
    main()
finetune_chatmodel_demo/formatted_data/advertise_gen.jsonl
0 → 100644
View file @
3a3f5683
This diff is collapsed.
Click to expand it.
finetune_chatmodel_demo/formatted_data/tool_alpaca.jsonl
0 → 100644
View file @
3a3f5683
This diff is collapsed.
Click to expand it.
finetune_chatmodel_demo/inference.py
0 → 100644
View file @
3a3f5683
# Minimal interactive inference script for a fine-tuned ChatGLM3 model:
# reads prompts from stdin in a loop and prints the model's continuation.
import argparse
from transformers import AutoConfig, AutoModel, AutoTokenizer
import torch
import os

parser = argparse.ArgumentParser()
parser.add_argument("--pt-checkpoint", type=str, default=None, help="The checkpoint path")
parser.add_argument("--model", type=str, default=None, help="main model weights")
parser.add_argument("--tokenizer", type=str, default=None, help="main model weights")
parser.add_argument("--pt-pre-seq-len", type=int, default=128, help="The pre-seq-len used in p-tuning")
parser.add_argument("--device", type=str, default="cuda")
parser.add_argument("--max-new-tokens", type=int, default=128)
args = parser.parse_args()

# Default the tokenizer path to the model path when not given explicitly.
if args.tokenizer is None:
    args.tokenizer = args.model

if args.pt_checkpoint:
    # P-tuning v2 path: base model plus prefix-encoder weights from the checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
    config = AutoConfig.from_pretrained(args.model, trust_remote_code=True, pre_seq_len=args.pt_pre_seq_len)
    # NOTE(review): `.cuda()` here ignores --device and requires a GPU in
    # this branch — confirm whether CPU inference should be supported.
    model = AutoModel.from_pretrained(args.model, config=config, trust_remote_code=True).cuda()
    prefix_state_dict = torch.load(os.path.join(args.pt_checkpoint, "pytorch_model.bin"))
    new_prefix_state_dict = {}
    for k, v in prefix_state_dict.items():
        if k.startswith("transformer.prefix_encoder."):
            # Strip the module prefix so keys match prefix_encoder's state dict.
            new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
    model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
else:
    # Plain (fully fine-tuned or base) model path.
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
    model = AutoModel.from_pretrained(args.model, trust_remote_code=True)

model = model.to(args.device)

while True:
    prompt = input("Prompt:")
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(args.device)
    # Generate at most max_new_tokens beyond the prompt length.
    response = model.generate(input_ids=inputs["input_ids"], max_length=inputs["input_ids"].shape[-1] + args.max_new_tokens)
    # Drop the echoed prompt tokens; keep only the newly generated tail.
    response = response[0, inputs["input_ids"].shape[-1]:]
    print("Response:", tokenizer.decode(response, skip_special_tokens=True))
\ No newline at end of file
finetune_chatmodel_demo/preprocess_utils.py
0 → 100644
View file @
3a3f5683
import
json
import
ast
import
astunparse
from
transformers
import
PreTrainedTokenizer
from
torch.utils.data
import
Dataset
from
copy
import
deepcopy
from
typing
import
Dict
,
List
# text constants
FUNCTION_CALL_NAME = 'tool_call'          # name used for every emitted tool call
FUNCTION_CALL_PREFIX = '```python\n'      # tool calls are rendered as python code blocks
FUNCTION_CALL_POSTFIX = '\n```'
TOOL_DEFINITION_PREFIX = 'Answer the following questions as best as you can. You have access to the following tools:\n'
CONVERSATOIN_KEY = 'conversations'        # key of the turn list in a training record
TOOL_DESC_KEY = 'tools'                   # key of the (optional) tool description list


def format_function_call(function_name: str, parameters: Dict[str, str]):
    """Render a tool invocation as python source, e.g. ``tool_call(a='x')``.

    Builds an ``ast.Call`` node with the given keyword parameters and
    unparses it via ``astunparse``.
    """
    function_name = ast.Name(id=function_name)
    keywords = [
        ast.keyword(arg=arg_name, value=ast.Constant(arg_value))
        for arg_name, arg_value in parameters.items()
    ]
    func_call = ast.Call(func=function_name, args=[], keywords=keywords)
    # strip() removes the trailing newline astunparse appends.
    return astunparse.unparse(func_call).strip()
def format_conversation(item, tokenizer, conversation_key: str, tool_key: str):
    """Tokenize one multi-turn conversation record into (tokens, loss_masks).

    Each turn is encoded with the ChatGLM3 chat template via
    ``tokenizer.build_single_message``; ``tool`` turns expand into an
    assistant turn carrying the rendered function call plus an observation
    turn carrying the tool's result.
    """
    conversations = deepcopy(item[conversation_key])

    # Note: `loss_mask` here means whether *the prediction* of the token should take loss
    tokens, loss_masks = [tokenizer.get_command("[gMASK]"), tokenizer.get_command("sop")], [0, 0]

    def _update(_tokens: List[int], value: int = 1):
        # Append tokens and extend the mask with the same flag for each one.
        value = int(value)
        tokens.extend(_tokens)
        loss_masks.extend([value] * len(_tokens))

    # insert system prompt for tools
    if tool_key in item:
        conversations.insert(
            0,
            {
                "role": "system",
                "content": TOOL_DEFINITION_PREFIX + json.dumps(item[tool_key], indent=4, ensure_ascii=False)
            }
        )

    for idx, conv in enumerate(conversations):
        # Per-turn override; defaults to taking loss unless the role says otherwise.
        loss = conv.get("loss", True)
        if conv['role'] in {'system', 'user'}:
            loss = False
        if conv['role'] == 'tool':
            # function call python code
            value = FUNCTION_CALL_PREFIX + format_function_call(FUNCTION_CALL_NAME, conv["parameters"]) + FUNCTION_CALL_POSTFIX
            text = tokenizer.build_single_message("assistant", conv["name"], value)
            _update(text, loss)

            # function call result
            value = conv.get('observation', None)
            if not isinstance(value, str):
                # Observations don't have to be strings; serialize anything else.
                value = json.dumps(value, ensure_ascii=False)
            text = tokenizer.build_single_message("observation", "", value)
            _update(text, False)
        else:
            text = tokenizer.build_single_message(conv['role'], "", conv["content"])
            _update(text, loss)
    # Terminate the sample with EOS; never take loss on it being predicted.
    _update([tokenizer.eos_token_id], False)

    assert len(tokens) == len(loss_masks), f"length mismatch: {len(tokens)} vs {len(loss_masks)}"
    return tokens, loss_masks
def sanity_check(tokens: List[int], target: List[int], tokenizer: PreTrainedTokenizer):
    """Print each token next to its training target so the loss mask can be
    inspected by eye (targets of -100 are ignored by the loss)."""
    print("Sanity Check >>>>>>>>>>>>>")
    for t, m in zip(tokens, target):
        # Special tokens are not decodable; look them up in the special-token index.
        decoded = tokenizer.tokenizer.index_special_tokens[t] \
            if t in tokenizer.tokenizer.index_special_tokens \
            else tokenizer.decode([t])
        print("%20s: %6d -> %6d" % (repr(decoded), t, m))
    print("<<<<<<<<<<<<< Sanity Check")

    assert len(tokens) == len(target), f"length mismatch: {len(tokens)} vs {len(target)}"
class MultiTurnDataset(Dataset):
    """Dataset of multi-turn conversations, tokenized lazily per item and
    padded/truncated to a fixed ``max_seq_length``."""

    def __init__(self, data: List[dict], tokenizer: PreTrainedTokenizer, max_seq_length: int):
        super(MultiTurnDataset, self).__init__()
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i) -> dict:
        """Return ``{"input_ids", "labels"}`` for sample *i*, both of length
        ``max_seq_length``; non-loss positions are labeled -100."""
        data_item = self.data[i]
        tokens, loss_masks = format_conversation(data_item, self.tokenizer, CONVERSATOIN_KEY, TOOL_DESC_KEY)

        # labels are used inside the model
        # Shift the mask right by one: position k predicts token k, so the
        # mask of token k-1's prediction applies to label k.
        target_based_loss_mask = [False] + loss_masks[:-1]
        labels = [(t if m else -100) for t, m in zip(tokens, target_based_loss_mask)]

        # Truncate then right-pad both sequences to max_seq_length.
        tokens = tokens[:self.max_seq_length]
        labels = labels[:self.max_seq_length]
        tokens += [self.tokenizer.pad_token_id] * (self.max_seq_length - len(tokens))
        labels += [-100] * (self.max_seq_length - len(labels))

        assert len(tokens) == len(labels), f"length mismatch: {len(tokens)} vs {len(labels)}"

        return {
            "input_ids": tokens,
            "labels": labels
        }
class InputOutputDataset(Dataset):
    """Dataset of plain prompt/response pairs for input-output fine-tuning.

    Each item is encoded as ``prompt + response + EOS``; loss is taken only
    on the response and the EOS token.
    """

    def __init__(self, data: List[dict], tokenizer: PreTrainedTokenizer, max_source_length: int, max_target_length: int):
        super(InputOutputDataset, self).__init__()
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        # +1 leaves room for the EOS token appended after the response.
        self.max_seq_length = max_source_length + max_target_length + 1
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i) -> dict:
        """Return ``{"input_ids", "labels"}`` of length ``max_seq_length``;
        prompt and padding positions are labeled -100."""
        data_item = self.data[i]
        # Prompt keeps special tokens (BOS-style prefix); response does not.
        a_ids = self.tokenizer.encode(text=data_item['prompt'], add_special_tokens=True, truncation=True,
                                      max_length=self.max_source_length)
        b_ids = self.tokenizer.encode(text=data_item['response'], add_special_tokens=False, truncation=True,
                                      max_length=self.max_target_length)

        context_length = len(a_ids)
        input_ids = a_ids + b_ids + [self.tokenizer.eos_token_id]
        # Prompt positions get pad_token_id here and are rewritten to -100 below.
        labels = [self.tokenizer.pad_token_id] * context_length + b_ids + [self.tokenizer.eos_token_id]

        pad_len = self.max_seq_length - len(input_ids)
        input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
        labels = labels + [self.tokenizer.pad_token_id] * pad_len
        # Every pad_token_id label (prompt + padding) is masked from the loss.
        labels = [(l if l != self.tokenizer.pad_token_id else -100) for l in labels]

        assert len(input_ids) == len(labels), f"length mismatch: {len(input_ids)} vs {len(labels)}"

        return {
            "input_ids": input_ids,
            "labels": labels
        }
finetune_chatmodel_demo/requirements.txt
0 → 100644
View file @
3a3f5683
transformers==4.30.2
accelerate
sentencepiece
astunparse
deepspeed
\ No newline at end of file
finetune_chatmodel_demo/scripts/finetune_ds.sh
0 → 100644
View file @
3a3f5683
#! /usr/bin/env bash
# Full-parameter fine-tuning of ChatGLM3-6B on AdvertiseGen with DeepSpeed.
set -ex

LR=1e-4
NUM_GPUS=8
MAX_SOURCE_LEN=1024
MAX_TARGET_LEN=128
DEV_BATCH_SIZE=4
# Fixed typo: was GRAD_ACCUMULARION_STEPS.
GRAD_ACCUMULATION_STEPS=1
MAX_STEP=20
SAVE_INTERVAL=500

RUN_NAME=advertise_gen_ft
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/advertise_gen.jsonl

DATESTR=$(date +%Y%m%d-%H%M%S)
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${LR}
# NOTE(review): MASTER_PORT is computed but never passed to torchrun;
# --standalone chooses its own rendezvous port. Confirm whether it can be removed.
MASTER_PORT=$(shuf -n 1 -i 10000-65535)

mkdir -p $OUTPUT_DIR

export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
    --train_format input-output \
    --train_file $DATASET_PATH \
    --preprocessing_num_workers 1 \
    --model_name_or_path $BASE_MODEL_PATH \
    --output_dir $OUTPUT_DIR \
    --max_source_length $MAX_SOURCE_LEN \
    --max_target_length $MAX_TARGET_LEN \
    --per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
    --max_steps $MAX_STEP \
    --logging_steps 1 \
    --save_steps $SAVE_INTERVAL \
    --learning_rate $LR \
    --fp16 \
    --deepspeed ../configs/deepspeed.json 2>&1 | tee ${OUTPUT_DIR}/train.log
finetune_chatmodel_demo/scripts/finetune_ds_multiturn.sh
0 → 100644
View file @
3a3f5683
#! /usr/bin/env bash
# Full-parameter multi-turn (tool-call) fine-tuning of ChatGLM3-6B with DeepSpeed.
set -ex

# NOTE(review): LR only appears in the output directory name; unlike
# finetune_ds.sh this script does not pass --learning_rate $LR to the
# trainer. Confirm whether that omission is intentional.
LR=1e-4
NUM_GPUS=8
MAX_SEQ_LEN=2048
DEV_BATCH_SIZE=2
# Fixed typo: was GRAD_ACCUMULARION_STEPS.
GRAD_ACCUMULATION_STEPS=1
MAX_STEP=200
SAVE_INTERVAL=50

DATESTR=$(date +%Y%m%d-%H%M%S)
RUN_NAME=tool_alpaca_ft
DATASET_PATH=../formatted_data/tool_alpaca.jsonl
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${LR}

mkdir -p $OUTPUT_DIR

torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
    --train_format multi-turn \
    --train_file $DATASET_PATH \
    --max_seq_length $MAX_SEQ_LEN \
    --preprocessing_num_workers 1 \
    --model_name_or_path $BASE_MODEL_PATH \
    --output_dir $OUTPUT_DIR \
    --per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
    --max_steps $MAX_STEP \
    --logging_steps 1 \
    --save_steps $SAVE_INTERVAL \
    --fp16 \
    --deepspeed ../configs/deepspeed.json 2>&1 | tee ${OUTPUT_DIR}/train.log
finetune_chatmodel_demo/scripts/finetune_pt.sh
0 → 100644
View file @
3a3f5683
#! /usr/bin/env bash
# P-Tuning v2 fine-tuning of ChatGLM3-6B on AdvertiseGen (single process).
set -ex

PRE_SEQ_LEN=128
LR=2e-2
NUM_GPUS=1
MAX_SOURCE_LEN=1024
MAX_TARGET_LEN=128
DEV_BATCH_SIZE=1
# Fixed typo: was GRAD_ACCUMULARION_STEPS.
GRAD_ACCUMULATION_STEPS=1
MAX_STEP=20
SAVE_INTERVAL=500

DATESTR=$(date +%Y%m%d-%H%M%S)
RUN_NAME=advertise_gen_pt
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/advertise_gen.jsonl
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${PRE_SEQ_LEN}-${LR}

mkdir -p $OUTPUT_DIR

# NOTE(review): four devices are exposed but NUM_GPUS=1, so only the first
# listed device is used by torchrun. Confirm this is intentional.
export HIP_VISIBLE_DEVICES=4,5,6,7

torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
    --train_format input-output \
    --train_file $DATASET_PATH \
    --preprocessing_num_workers 1 \
    --model_name_or_path $BASE_MODEL_PATH \
    --output_dir $OUTPUT_DIR \
    --max_source_length $MAX_SOURCE_LEN \
    --max_target_length $MAX_TARGET_LEN \
    --per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
    --max_steps $MAX_STEP \
    --logging_steps 1 \
    --save_steps $SAVE_INTERVAL \
    --learning_rate $LR \
    --pre_seq_len $PRE_SEQ_LEN 2>&1 | tee ${OUTPUT_DIR}/train.log
finetune_chatmodel_demo/scripts/finetune_pt_multiturn.sh
0 → 100644
View file @
3a3f5683
#! /usr/bin/env bash
# P-Tuning v2 multi-turn (tool-call) fine-tuning of ChatGLM3-6B (single process).
set -ex

PRE_SEQ_LEN=128
LR=2e-2
NUM_GPUS=1
MAX_SEQ_LEN=2048
DEV_BATCH_SIZE=1
# Fixed typo: was GRAD_ACCUMULARION_STEPS.
GRAD_ACCUMULATION_STEPS=16
MAX_STEP=1000
SAVE_INTERVAL=500

DATESTR=$(date +%Y%m%d-%H%M%S)
RUN_NAME=tool_alpaca_pt
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/tool_alpaca.jsonl
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${PRE_SEQ_LEN}-${LR}

mkdir -p $OUTPUT_DIR

torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
    --train_format multi-turn \
    --train_file $DATASET_PATH \
    --max_seq_length $MAX_SEQ_LEN \
    --preprocessing_num_workers 1 \
    --model_name_or_path $BASE_MODEL_PATH \
    --output_dir $OUTPUT_DIR \
    --per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
    --max_steps $MAX_STEP \
    --logging_steps 1 \
    --save_steps $SAVE_INTERVAL \
    --learning_rate $LR \
    --pre_seq_len $PRE_SEQ_LEN 2>&1 | tee ${OUTPUT_DIR}/train.log
finetune_chatmodel_demo/scripts/format_advertise_gen.py
0 → 100644
View file @
3a3f5683
#! /usr/bin/env python
"""Convert AdvertiseGen JSON-lines ({"content", "summary"}) into the
{"prompt", "response"} JSONL format expected by finetune.py."""
import json
from collections import Counter
from argparse import ArgumentParser
import os

parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
args = parser.parse_args()

# Explicit UTF-8: AdvertiseGen is Chinese text, and relying on the platform
# default encoding (e.g. gbk on Windows) would corrupt it.
with open(args.path, encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

train_examples = [{
    "prompt": x['content'],
    "response": x['summary'],
} for x in data]

os.makedirs("formatted_data", exist_ok=True)
with open("formatted_data/advertise_gen.jsonl", "w", encoding="utf-8") as f:
    for e in train_examples:
        # ensure_ascii=False keeps the Chinese text human-readable in the file.
        f.write(json.dumps(e, ensure_ascii=False) + "\n")
finetune_chatmodel_demo/scripts/format_tool_alpaca.py
0 → 100644
View file @
3a3f5683
#! /usr/bin/env python
"""Convert the ToolAlpaca dataset into the multi-turn conversation JSONL
format (tools + user/assistant/tool turns) expected by finetune.py."""
import json
from collections import Counter
from argparse import ArgumentParser
import os

parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
args = parser.parse_args()

# Explicit UTF-8 so the platform default encoding cannot corrupt the text.
with open(args.path, encoding="utf-8") as f:
    data = json.load(f)

train_examples = []
err_count = 0

for setting in data:
    api_desc = [setting["NLDocumentation"]]
    for instance in setting["Instances"]:
        try:
            conv = [{
                "role": "user",
                "content": instance['input'],
            }]
            for step in instance['intermediate_steps']:
                tool_name, params, react = step[0]
                # Everything before "Action:" is the model's thought for this step.
                step_thought = react.split("Action:")[0].strip()
                observation = step[1]
                conv.append({
                    "role": "assistant",
                    "content": step_thought,
                })
                conv.append({
                    "role": "tool",
                    "name": tool_name,
                    "parameters": json.loads(params),
                    "observation": observation,
                })
            conv.append({
                "role": "assistant",
                "content": instance['Final Thought'] + "\n" + instance['output'],
            })
        except Exception:
            # Malformed instance (missing field, bad step shape, unparseable
            # params JSON) -- skip it and count it. Was a bare `except:`,
            # which also swallowed KeyboardInterrupt/SystemExit.
            err_count += 1
        else:
            train_examples.append({"tools": api_desc, "conversations": conv})

print("err_count:", err_count)
print("train_examples:", len(train_examples))
print("conversation distribution:", Counter([len(e["conversations"]) for e in train_examples]))

os.makedirs("formatted_data", exist_ok=True)
with open("formatted_data/tool_alpaca.jsonl", "w", encoding="utf-8") as f:
    for e in train_examples:
        f.write(json.dumps(e, ensure_ascii=False) + "\n")
\ No newline at end of file
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment