Commit 6f97a5dc authored by zhougaofeng's avatar zhougaofeng
Browse files

Update doc/inf_result.png, result/result.png, doc/training_loss.png,...

Update doc/inf_result.png, result/result.png, doc/training_loss.png, doc/2403.04652v1.pdf, inference/6B_single_dcu.py, result/training_loss.png, finetune/single_node.sh, finetune/multi_node.sh, finetune/data/dataset_info.json, finetune/data/identity.json, finetune/data/mllm_demo.json, finetune/data/README.md, finetune/data/README_zh.md, finetune/data/alpaca_zh_demo.json, finetune/data/c4_demo.json, finetune/data/glaive_toolcall_en_demo.json, finetune/data/glaive_toolcall_zh_demo.json, finetune/data/alpaca_en_demo.json, finetune/data/dpo_zh_demo.json, finetune/data/kto_en_demo.json, finetune/data/wiki_demo.txt, finetune/data/dpo_en_demo.json, finetune/scripts/cal_flops.py, finetune/scripts/cal_lr.py, finetune/scripts/cal_ppl.py, finetune/scripts/length_cdf.py, finetune/scripts/llamafy_baichuan2.py, finetune/scripts/loftq_init.py, finetune/scripts/llamafy_qwen.py, finetune/scripts/llama_pro.py, finetune/src/api.py, finetune/src/train.py, finetune/src/webui.py, finetune/src/llamafactory/__init__.py, finetune/src/llamafactory/cli.py, finetune/src/llamafactory/api/__init__.py, finetune/src/llamafactory/api/common.py, finetune/src/llamafactory/api/chat.py, finetune/src/llamafactory/api/protocol.py, finetune/src/llamafactory/api/app.py, finetune/src/llamafactory/chat/__init__.py, finetune/src/llamafactory/chat/base_engine.py, finetune/src/llamafactory/chat/hf_engine.py, finetune/src/llamafactory/chat/vllm_engine.py, finetune/src/llamafactory/chat/chat_model.py, finetune/src/llamafactory/data/__init__.py, finetune/src/llamafactory/data/collator.py, finetune/src/llamafactory/data/utils.py, finetune/src/llamafactory/data/aligner.py, finetune/src/llamafactory/data/formatter.py, finetune/src/llamafactory/data/preprocess.py, finetune/src/llamafactory/data/parser.py, finetune/src/llamafactory/data/loader.py, finetune/src/llamafactory/data/template.py, finetune/src/llamafactory/eval/__init__.py, finetune/src/llamafactory/eval/evaluator.py, finetune/src/llamafactory/eval/template.py, finetune/src/llamafactory/extras/__init__.py, finetune/src/llamafactory/extras/ploting.py, finetune/src/llamafactory/extras/logging.py, finetune/src/llamafactory/extras/constants.py, finetune/src/llamafactory/extras/misc.py, finetune/src/llamafactory/extras/packages.py, finetune/src/llamafactory/extras/callbacks.py, finetune/src/llamafactory/hparams/__init__.py, finetune/src/llamafactory/hparams/data_args.py, finetune/src/llamafactory/hparams/evaluation_args.py, finetune/src/llamafactory/hparams/generating_args.py, finetune/src/llamafactory/hparams/finetuning_args.py, finetune/src/llamafactory/hparams/parser.py, finetune/src/llamafactory/hparams/model_args.py, finetune/src/llamafactory/model/__init__.py, finetune/src/llamafactory/model/adapter.py, finetune/src/llamafactory/model/loader.py, finetune/src/llamafactory/model/patcher.py, finetune/src/llamafactory/model/utils/__init__.py, finetune/src/llamafactory/model/utils/checkpointing.py, finetune/src/llamafactory/model/utils/embedding.py, finetune/src/llamafactory/model/utils/moe.py, finetune/src/llamafactory/model/utils/attention.py, finetune/src/llamafactory/model/utils/quantization.py, finetune/src/llamafactory/model/utils/valuehead.py, finetune/src/llamafactory/model/utils/longlora.py, finetune/src/llamafactory/model/utils/visual.py, finetune/src/llamafactory/model/utils/misc.py, finetune/src/llamafactory/model/utils/mod.py, finetune/src/llamafactory/model/utils/unsloth.py, finetune/src/llamafactory/model/utils/rope.py, finetune/src/llamafactory/train/__init__.py, finetune/src/llamafactory/train/utils.py, finetune/src/llamafactory/train/tuner.py, finetune/src/llamafactory/train/dpo/__init__.py, finetune/src/llamafactory/train/dpo/trainer.py, finetune/src/llamafactory/train/dpo/workflow.py, finetune/src/llamafactory/train/kto/__init__.py, finetune/src/llamafactory/train/kto/workflow.py, finetune/src/llamafactory/train/kto/trainer.py, finetune/src/llamafactory/train/orpo/__init__.py, finetune/src/llamafactory/train/orpo/workflow.py, finetune/src/llamafactory/train/orpo/trainer.py, finetune/src/llamafactory/train/ppo/__init__.py, finetune/src/llamafactory/train/ppo/workflow.py, finetune/src/llamafactory/train/ppo/utils.py, finetune/src/llamafactory/train/ppo/trainer.py, finetune/src/llamafactory/train/pt/__init__.py, finetune/src/llamafactory/train/pt/trainer.py, finetune/src/llamafactory/train/pt/workflow.py, finetune/src/llamafactory/train/rm/__init__.py, finetune/src/llamafactory/train/rm/metric.py, finetune/src/llamafactory/train/rm/trainer.py, finetune/src/llamafactory/train/rm/workflow.py, finetune/src/llamafactory/train/sft/__init__.py, finetune/src/llamafactory/train/sft/workflow.py, finetune/src/llamafactory/train/sft/trainer.py, finetune/src/llamafactory/train/sft/metric.py, finetune/src/llamafactory/webui/__init__.py, finetune/src/llamafactory/webui/chatter.py, finetune/src/llamafactory/webui/common.py, finetune/src/llamafactory/webui/interface.py, finetune/src/llamafactory/webui/runner.py, finetune/src/llamafactory/webui/css.py, finetune/src/llamafactory/webui/engine.py, finetune/src/llamafactory/webui/utils.py, finetune/src/llamafactory/webui/manager.py, finetune/src/llamafactory/webui/locales.py, finetune/src/llamafactory/webui/components/__init__.py, finetune/src/llamafactory/webui/components/data.py, finetune/src/llamafactory/webui/components/chatbot.py, finetune/src/llamafactory/webui/components/train.py, finetune/src/llamafactory/webui/components/top.py, finetune/src/llamafactory/webui/components/infer.py, finetune/src/llamafactory/webui/components/eval.py, finetune/src/llamafactory/webui/components/export.py files
parent 2d76eb38
Pipeline #1290 canceled with stages
The [dataset_info.json](dataset_info.json) contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it.
Currently we support datasets in **alpaca** and **sharegpt** format.
```json
"dataset_name": {
"hf_hub_url": "the name of the dataset repository on the Hugging Face hub. (if specified, ignore script_url and file_name)",
"ms_hub_url": "the name of the dataset repository on the Model Scope hub. (if specified, ignore script_url and file_name)",
"script_url": "the name of the directory containing a dataset loading script. (if specified, ignore file_name)",
"file_name": "the name of the dataset folder or dataset file in this directory. (required if above are not specified)",
"formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
"ranking": "whether the dataset is a preference dataset or not. (default: False)",
"subset": "the name of the subset. (optional, default: None)",
"folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
"columns (optional)": {
"prompt": "the column name in the dataset containing the prompts. (default: instruction)",
"query": "the column name in the dataset containing the queries. (default: input)",
"response": "the column name in the dataset containing the responses. (default: output)",
"history": "the column name in the dataset containing the histories. (default: None)",
"messages": "the column name in the dataset containing the messages. (default: conversations)",
"system": "the column name in the dataset containing the system prompts. (default: None)",
"tools": "the column name in the dataset containing the tool description. (default: None)",
"images": "the column name in the dataset containing the image inputs. (default: None)",
"chosen": "the column name in the dataset containing the chosen answers. (default: None)",
"rejected": "the column name in the dataset containing the rejected answers. (default: None)",
"kto_tag": "the column name in the dataset containing the kto tags. (default: None)"
},
"tags (optional, used for the sharegpt format)": {
"role_tag": "the key in the message represents the identity. (default: from)",
"content_tag": "the key in the message represents the content. (default: value)",
"user_tag": "the value of the role_tag represents the user. (default: human)",
"assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)",
"observation_tag": "the value of the role_tag represents the tool results. (default: observation)",
"function_tag": "the value of the role_tag represents the function call. (default: function_call)",
"system_tag": "the value of the role_tag represents the system prompt. (default: system, can override system column)"
}
}
```
## Alpaca Format
### Supervised Fine-Tuning Dataset
* [Example dataset](alpaca_en_demo.json)
In supervised fine-tuning, the `instruction` column will be concatenated with the `input` column and used as the human prompt, then the human prompt would be `instruction\ninput`. The `output` column represents the model response.
The `system` column will be used as the system prompt if specified.
The `history` column is a list consisting of string tuples representing prompt-response pairs in the history messages. Note that the responses in the history **will also be learned by the model** in supervised fine-tuning.
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"output": "model response (required)",
"system": "system prompt (optional)",
"history": [
["human instruction in the first round (optional)", "model response in the first round (optional)"],
["human instruction in the second round (optional)", "model response in the second round (optional)"]
]
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"system": "system",
"history": "history"
}
}
```
### Pre-training Dataset
- [Example dataset](c4_demo.json)
In pre-training, only the `text` column will be used for model learning.
```json
[
{"text": "document"},
{"text": "document"}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "text"
}
}
```
### Preference Dataset
Preference datasets are used for reward modeling, DPO training and ORPO training.
It requires a better response in `chosen` column and a worse response in `rejected` column.
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"chosen": "chosen answer (required)",
"rejected": "rejected answer (required)"
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"ranking": true,
"columns": {
"prompt": "instruction",
"query": "input",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
### KTO Dataset
- [Example dataset](kto_en_demo.json)
KTO datasets require a extra `kto_tag` column containing the boolean human feedback.
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"output": "model response (required)",
"kto_tag": "human feedback [true/false] (required)"
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"kto_tag": "kto_tag"
}
}
```
### Multimodal Dataset
- [Example dataset](mllm_demo.json)
Multimodal datasets require a `images` column containing the paths to the input images. Currently we only support one image.
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"output": "model response (required)",
"images": [
"image path (required)"
]
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"images": "images"
}
}
```
## Sharegpt Format
### Supervised Fine-Tuning Dataset
- [Example dataset](glaive_toolcall_en_demo.json)
Compared to the alpaca format, the sharegpt format allows the datasets have **more roles**, such as human, gpt, observation and function. They are presented in a list of objects in the `conversations` column.
Note that the human and observation should appear in odd positions, while gpt and function should appear in even positions.
```json
[
{
"conversations": [
{
"from": "human",
"value": "human instruction"
},
{
"from": "function_call",
"value": "tool arguments"
},
{
"from": "observation",
"value": "tool result"
},
{
"from": "gpt",
"value": "model response"
}
],
"system": "system prompt (optional)",
"tools": "tool description (optional)"
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"system": "system",
"tools": "tools"
}
}
```
### Preference Dataset
- [Example dataset](dpo_en_demo.json)
Preference datasets in sharegpt format also require a better message in `chosen` column and a worse message in `rejected` column.
```json
[
{
"conversations": [
{
"from": "human",
"value": "human instruction"
},
{
"from": "gpt",
"value": "model response"
},
{
"from": "human",
"value": "human instruction"
}
],
"chosen": {
"from": "gpt",
"value": "chosen answer (required)"
},
"rejected": {
"from": "gpt",
"value": "rejected answer (required)"
}
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"ranking": true,
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
### OpenAI Format
The openai format is simply a special case of the sharegpt format, where the first message may be a system prompt.
```json
[
{
"messages": [
{
"role": "system",
"content": "system prompt (optional)"
},
{
"role": "user",
"content": "human instruction"
},
{
"role": "assistant",
"content": "model response"
}
]
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant",
"system_tag": "system"
}
}
```
The KTO datasets and multimodal datasets in sharegpt format are similar to the alpaca format.
Pre-training datasets are **incompatible** with the sharegpt format.
[dataset_info.json](dataset_info.json) 包含了所有可用的数据集。如果您希望使用自定义数据集,请**务必**`dataset_info.json` 文件中添加*数据集描述*,并通过修改 `dataset: 数据集名称` 配置来使用数据集。
目前我们支持 **alpaca** 格式和 **sharegpt** 格式的数据集。
```json
"数据集名称": {
"hf_hub_url": "Hugging Face 的数据集仓库地址(若指定,则忽略 script_url 和 file_name)",
"ms_hub_url": "ModelScope 的数据集仓库地址(若指定,则忽略 script_url 和 file_name)",
"script_url": "包含数据加载脚本的本地文件夹名称(若指定,则忽略 file_name)",
"file_name": "该目录下数据集文件的名称(若上述参数未指定,则此项必需)",
"formatting": "数据集格式(可选,默认:alpaca,可以为 alpaca 或 sharegpt)",
"ranking": "是否为偏好数据集(可选,默认:False)",
"subset": "数据集子集的名称(可选,默认:None)",
"folder": "Hugging Face 仓库的文件夹名称(可选,默认:None)",
"columns(可选)": {
"prompt": "数据集代表提示词的表头名称(默认:instruction)",
"query": "数据集代表请求的表头名称(默认:input)",
"response": "数据集代表回答的表头名称(默认:output)",
"history": "数据集代表历史对话的表头名称(默认:None)",
"messages": "数据集代表消息列表的表头名称(默认:conversations)",
"system": "数据集代表系统提示的表头名称(默认:None)",
"tools": "数据集代表工具描述的表头名称(默认:None)",
"images": "数据集代表图像输入的表头名称(默认:None)",
"chosen": "数据集代表更优回答的表头名称(默认:None)",
"rejected": "数据集代表更差回答的表头名称(默认:None)",
"kto_tag": "数据集代表 KTO 标签的表头名称(默认:None)"
},
"tags(可选,用于 sharegpt 格式)": {
"role_tag": "消息中代表发送者身份的键名(默认:from)",
"content_tag": "消息中代表文本内容的键名(默认:value)",
"user_tag": "消息中代表用户的 role_tag(默认:human)",
"assistant_tag": "消息中代表助手的 role_tag(默认:gpt)",
"observation_tag": "消息中代表工具返回结果的 role_tag(默认:observation)",
"function_tag": "消息中代表工具调用的 role_tag(默认:function_call)",
"system_tag": "消息中代表系统提示的 role_tag(默认:system,会覆盖 system column)"
}
}
```
## Alpaca 格式
### 指令监督微调数据集
- [样例数据集](alpaca_zh_demo.json)
在指令监督微调时,`instruction` 列对应的内容会与 `input` 列对应的内容拼接后作为人类指令,即人类指令为 `instruction\ninput`。而 `output` 列对应的内容为模型回答。
如果指定,`system` 列对应的内容将被作为系统提示词。
`history` 列是由多个字符串二元组构成的列表,分别代表历史消息中每轮对话的指令和回答。注意在指令监督微调时,历史消息中的回答内容**也会被用于模型学习**
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"output": "模型回答(必填)",
"system": "系统提示词(选填)",
"history": [
["第一轮指令(选填)", "第一轮回答(选填)"],
["第二轮指令(选填)", "第二轮回答(选填)"]
]
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"system": "system",
"history": "history"
}
}
```
### 预训练数据集
- [样例数据集](c4_demo.json)
在预训练时,只有 `text` 列中的内容会用于模型学习。
```json
[
{"text": "document"},
{"text": "document"}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"columns": {
"prompt": "text"
}
}
```
### 偏好数据集
偏好数据集用于奖励模型训练、DPO 训练和 ORPO 训练。
它需要在 `chosen` 列中提供更优的回答,并在 `rejected` 列中提供更差的回答。
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"chosen": "优质回答(必填)",
"rejected": "劣质回答(必填)"
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"ranking": true,
"columns": {
"prompt": "instruction",
"query": "input",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
### KTO 数据集
- [样例数据集](kto_en_demo.json)
KTO 数据集需要额外添加一个 `kto_tag` 列,包含 bool 类型的人类反馈。
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"output": "模型回答(必填)",
"kto_tag": "人类反馈 [true/false](必填)"
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"kto_tag": "kto_tag"
}
}
```
### 多模态数据集
- [样例数据集](mllm_demo.json)
多模态数据集需要额外添加一个 `images` 列,包含输入图像的路径。目前我们仅支持单张图像输入。
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"output": "模型回答(必填)",
"images": [
"图像路径(必填)"
]
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"images": "images"
}
}
```
## Sharegpt 格式
### 指令监督微调数据集
- [样例数据集](glaive_toolcall_zh_demo.json)
相比 alpaca 格式的数据集,sharegpt 格式支持**更多的角色种类**,例如 human、gpt、observation、function 等等。它们构成一个对象列表呈现在 `conversations` 列中。
注意其中 human 和 observation 必须出现在奇数位置,gpt 和 function 必须出现在偶数位置。
```json
[
{
"conversations": [
{
"from": "human",
"value": "人类指令"
},
{
"from": "function_call",
"value": "工具参数"
},
{
"from": "observation",
"value": "工具结果"
},
{
"from": "gpt",
"value": "模型回答"
}
],
"system": "系统提示词(选填)",
"tools": "工具描述(选填)"
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"system": "system",
"tools": "tools"
}
}
```
### 偏好数据集
- [样例数据集](dpo_zh_demo.json)
Sharegpt 格式的偏好数据集同样需要在 `chosen` 列中提供更优的消息,并在 `rejected` 列中提供更差的消息。
```json
[
{
"conversations": [
{
"from": "human",
"value": "人类指令"
},
{
"from": "gpt",
"value": "模型回答"
},
{
"from": "human",
"value": "人类指令"
}
],
"chosen": {
"from": "gpt",
"value": "优质回答"
},
"rejected": {
"from": "gpt",
"value": "劣质回答"
}
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"formatting": "sharegpt",
"ranking": true,
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
### OpenAI 格式
OpenAI 格式仅仅是 sharegpt 格式的一种特殊情况,其中第一条消息可能是系统提示词。
```json
[
{
"messages": [
{
"role": "system",
"content": "系统提示词(选填)"
},
{
"role": "user",
"content": "人类指令"
},
{
"role": "assistant",
"content": "模型回答"
}
]
}
]
```
对于上述格式的数据,`dataset_info.json` 中的*数据集描述*应为:
```json
"数据集名称": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant",
"system_tag": "system"
}
}
```
Sharegpt 格式中的 KTO 数据集和多模态数据集与 alpaca 格式的类似。
预训练数据集**不支持** sharegpt 格式。
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
{
"identity": {
"file_name": "identity.json"
},
"alpaca_en_demo": {
"file_name": "alpaca_en_demo.json"
},
"alpaca_zh_demo": {
"file_name": "alpaca_zh_demo.json"
},
"glaive_toolcall_en_demo": {
"file_name": "glaive_toolcall_en_demo.json",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"tools": "tools"
}
},
"glaive_toolcall_zh_demo": {
"file_name": "glaive_toolcall_zh_demo.json",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"tools": "tools"
}
},
"mllm_demo": {
"file_name": "mllm_demo.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages",
"images": "images"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"alpaca_en": {
"hf_hub_url": "llamafactory/alpaca_en",
"ms_hub_url": "llamafactory/alpaca_en"
},
"alpaca_zh": {
"hf_hub_url": "llamafactory/alpaca_zh",
"ms_hub_url": "llamafactory/alpaca_zh"
},
"alpaca_gpt4_en": {
"hf_hub_url": "llamafactory/alpaca_gpt4_en",
"ms_hub_url": "llamafactory/alpaca_gpt4_en"
},
"alpaca_gpt4_zh": {
"hf_hub_url": "llamafactory/alpaca_gpt4_zh",
"ms_hub_url": "llamafactory/alpaca_gpt4_zh"
},
"glaive_toolcall_en": {
"hf_hub_url": "llamafactory/glaive_toolcall_en",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"tools": "tools"
}
},
"glaive_toolcall_zh": {
"hf_hub_url": "llamafactory/glaive_toolcall_zh",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"tools": "tools"
}
},
"lima": {
"hf_hub_url": "llamafactory/lima",
"formatting": "sharegpt"
},
"guanaco": {
"hf_hub_url": "JosephusCheung/GuanacoDataset",
"ms_hub_url": "AI-ModelScope/GuanacoDataset"
},
"belle_2m": {
"hf_hub_url": "BelleGroup/train_2M_CN",
"ms_hub_url": "AI-ModelScope/train_2M_CN"
},
"belle_1m": {
"hf_hub_url": "BelleGroup/train_1M_CN",
"ms_hub_url": "AI-ModelScope/train_1M_CN"
},
"belle_0.5m": {
"hf_hub_url": "BelleGroup/train_0.5M_CN",
"ms_hub_url": "AI-ModelScope/train_0.5M_CN"
},
"belle_dialog": {
"hf_hub_url": "BelleGroup/generated_chat_0.4M",
"ms_hub_url": "AI-ModelScope/generated_chat_0.4M"
},
"belle_math": {
"hf_hub_url": "BelleGroup/school_math_0.25M",
"ms_hub_url": "AI-ModelScope/school_math_0.25M"
},
"belle_multiturn": {
"script_url": "belle_multiturn",
"formatting": "sharegpt"
},
"ultra_chat": {
"script_url": "ultra_chat",
"formatting": "sharegpt"
},
"open_platypus": {
"hf_hub_url": "garage-bAInd/Open-Platypus",
"ms_hub_url": "AI-ModelScope/Open-Platypus"
},
"codealpaca": {
"hf_hub_url": "sahil2801/CodeAlpaca-20k",
"ms_hub_url": "AI-ModelScope/CodeAlpaca-20k"
},
"alpaca_cot": {
"hf_hub_url": "QingyiSi/Alpaca-CoT",
"ms_hub_url": "AI-ModelScope/Alpaca-CoT"
},
"openorca": {
"hf_hub_url": "Open-Orca/OpenOrca",
"ms_hub_url": "AI-ModelScope/OpenOrca",
"columns": {
"prompt": "question",
"response": "response",
"system": "system_prompt"
}
},
"slimorca": {
"hf_hub_url": "Open-Orca/SlimOrca",
"formatting": "sharegpt"
},
"mathinstruct": {
"hf_hub_url": "TIGER-Lab/MathInstruct",
"ms_hub_url": "AI-ModelScope/MathInstruct",
"columns": {
"prompt": "instruction",
"response": "output"
}
},
"firefly": {
"hf_hub_url": "YeungNLP/firefly-train-1.1M",
"columns": {
"prompt": "input",
"response": "target"
}
},
"wikiqa": {
"hf_hub_url": "wiki_qa",
"columns": {
"prompt": "question",
"response": "answer"
}
},
"webqa": {
"hf_hub_url": "suolyer/webqa",
"ms_hub_url": "AI-ModelScope/webqa",
"columns": {
"prompt": "input",
"response": "output"
}
},
"webnovel": {
"hf_hub_url": "zxbsmk/webnovel_cn",
"ms_hub_url": "AI-ModelScope/webnovel_cn"
},
"nectar_sft": {
"hf_hub_url": "AstraMindAI/SFT-Nectar",
"ms_hub_url": "AI-ModelScope/SFT-Nectar"
},
"deepctrl": {
"ms_hub_url": "deepctrl/deepctrl-sft-data"
},
"adgen": {
"hf_hub_url": "HasturOfficial/adgen",
"ms_hub_url": "AI-ModelScope/adgen",
"columns": {
"prompt": "content",
"response": "summary"
}
},
"sharegpt_hyper": {
"hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k",
"formatting": "sharegpt"
},
"sharegpt4": {
"hf_hub_url": "shibing624/sharegpt_gpt4",
"ms_hub_url": "AI-ModelScope/sharegpt_gpt4",
"formatting": "sharegpt"
},
"ultrachat_200k": {
"hf_hub_url": "HuggingFaceH4/ultrachat_200k",
"ms_hub_url": "AI-ModelScope/ultrachat_200k",
"formatting": "sharegpt",
"columns": {
"messages": "messages"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"agent_instruct": {
"hf_hub_url": "THUDM/AgentInstruct",
"ms_hub_url": "ZhipuAI/AgentInstruct",
"formatting": "sharegpt"
},
"lmsys_chat": {
"hf_hub_url": "lmsys/lmsys-chat-1m",
"ms_hub_url": "AI-ModelScope/lmsys-chat-1m",
"formatting": "sharegpt",
"columns": {
"messages": "conversation"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "human",
"assistant_tag": "assistant"
}
},
"evol_instruct": {
"hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k",
"ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k",
"formatting": "sharegpt"
},
"glaive_toolcall_100k": {
"hf_hub_url": "hiyouga/glaive-function-calling-v2-sharegpt",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"tools": "tools"
}
},
"cosmopedia": {
"hf_hub_url": "HuggingFaceTB/cosmopedia",
"columns": {
"prompt": "prompt",
"response": "text"
}
},
"stem_zh": {
"hf_hub_url": "hfl/stem_zh_instruction"
},
"ruozhiba_gpt4": {
"hf_hub_url": "hfl/ruozhiba_gpt4_turbo"
},
"llava_150k_en": {
"hf_hub_url": "BUAADreamer/llava-en-zh-300k",
"subset": "en",
"formatting": "sharegpt",
"columns": {
"messages": "messages",
"images": "images"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"llava_150k_zh": {
"hf_hub_url": "BUAADreamer/llava-en-zh-300k",
"subset": "zh",
"formatting": "sharegpt",
"columns": {
"messages": "messages",
"images": "images"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"oasst_de": {
"hf_hub_url": "mayflowergmbh/oasst_de"
},
"dolly_15k_de": {
"hf_hub_url": "mayflowergmbh/dolly-15k_de"
},
"alpaca-gpt4_de": {
"hf_hub_url": "mayflowergmbh/alpaca-gpt4_de"
},
"openschnabeltier_de": {
"hf_hub_url": "mayflowergmbh/openschnabeltier_de"
},
"evol_instruct_de": {
"hf_hub_url": "mayflowergmbh/evol-instruct_de"
},
"dolphin_de": {
"hf_hub_url": "mayflowergmbh/dolphin_de"
},
"booksum_de": {
"hf_hub_url": "mayflowergmbh/booksum_de"
},
"airoboros_de": {
"hf_hub_url": "mayflowergmbh/airoboros-3.0_de"
},
"ultrachat_de": {
"hf_hub_url": "mayflowergmbh/ultra-chat_de"
},
"dpo_en_demo": {
"file_name": "dpo_en_demo.json",
"ranking": true,
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
},
"dpo_zh_demo": {
"file_name": "dpo_zh_demo.json",
"ranking": true,
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
},
"dpo_mix_en": {
"hf_hub_url": "hiyouga/DPO-En-Zh-20k",
"subset": "en",
"ranking": true,
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
},
"dpo_mix_zh": {
"hf_hub_url": "hiyouga/DPO-En-Zh-20k",
"subset": "zh",
"ranking": true,
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
},
"orca_pairs": {
"hf_hub_url": "Intel/orca_dpo_pairs",
"ranking": true,
"columns": {
"prompt": "question",
"chosen": "chosen",
"rejected": "rejected",
"system": "system"
}
},
"hh_rlhf_en": {
"script_url": "hh_rlhf_en",
"ranking": true,
"columns": {
"prompt": "instruction",
"chosen": "chosen",
"rejected": "rejected",
"history": "history"
}
},
"nectar_rm": {
"hf_hub_url": "AstraMindAI/RLAIF-Nectar",
"ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
"ranking": true
},
"orca_dpo_de": {
"hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
"ranking": true
},
"kto_en_demo": {
"file_name": "kto_en_demo.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages",
"kto_tag": "label"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"kto_mix_en": {
"hf_hub_url": "argilla/kto-mix-15k",
"formatting": "sharegpt",
"columns": {
"messages": "completion",
"kto_tag": "label"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"wiki_demo": {
"file_name": "wiki_demo.txt",
"columns": {
"prompt": "text"
}
},
"c4_demo": {
"file_name": "c4_demo.json",
"columns": {
"prompt": "text"
}
},
"refinedweb": {
"hf_hub_url": "tiiuae/falcon-refinedweb",
"columns": {
"prompt": "content"
}
},
"redpajama_v2": {
"hf_hub_url": "togethercomputer/RedPajama-Data-V2",
"columns": {
"prompt": "raw_content"
},
"subset": "default"
},
"wikipedia_en": {
"hf_hub_url": "olm/olm-wikipedia-20221220",
"ms_hub_url": "AI-ModelScope/olm-wikipedia-20221220",
"columns": {
"prompt": "text"
}
},
"wikipedia_zh": {
"hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered",
"ms_hub_url": "AI-ModelScope/wikipedia-cn-20230720-filtered",
"columns": {
"prompt": "completion"
}
},
"pile": {
"hf_hub_url": "monology/pile-uncopyrighted",
"ms_hub_url": "AI-ModelScope/pile",
"columns": {
"prompt": "text"
}
},
"skypile": {
"hf_hub_url": "Skywork/SkyPile-150B",
"ms_hub_url": "AI-ModelScope/SkyPile-150B",
"columns": {
"prompt": "text"
}
},
"the_stack": {
"hf_hub_url": "bigcode/the-stack",
"ms_hub_url": "AI-ModelScope/the-stack",
"columns": {
"prompt": "content"
}
},
"starcoder_python": {
"hf_hub_url": "bigcode/starcoderdata",
"ms_hub_url": "AI-ModelScope/starcoderdata",
"columns": {
"prompt": "content"
},
"folder": "python"
}
}
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
[
{
"messages": [
{
"content": "Who are they?",
"role": "user"
},
{
"content": "They're Kane and Gretzka from Bayern Munich.",
"role": "assistant"
},
{
"content": "What are they doing?",
"role": "user"
},
{
"content": "They are celebrating on the soccer field.",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/1.jpg"
]
},
{
"messages": [
{
"content": "Who is he?",
"role": "user"
},
{
"content": "He's Thomas Muller from Bayern Munich.",
"role": "assistant"
},
{
"content": "Why is he on the ground?",
"role": "user"
},
{
"content": "Because he's sliding on his knees to celebrate.",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/2.jpg"
]
},
{
"messages": [
{
"content": "Please describe this image",
"role": "user"
},
{
"content": "Chinese astronaut Gui Haichao is giving a speech.",
"role": "assistant"
},
{
"content": "What has he accomplished?",
"role": "user"
},
{
"content": "He was appointed to be a payload specialist on Shenzhou 16 mission in June 2022, thus becoming the first Chinese civilian of Group 3 in space on 30 May 2023. He is responsible for the on-orbit operation of space science experimental payloads.",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/3.jpg"
]
},
{
"messages": [
{
"content": "他们是谁?",
"role": "user"
},
{
"content": "他们是拜仁慕尼黑的凯恩和格雷茨卡。",
"role": "assistant"
},
{
"content": "他们在做什么?",
"role": "user"
},
{
"content": "他们在足球场上庆祝。",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/1.jpg"
]
},
{
"messages": [
{
"content": "他是谁?",
"role": "user"
},
{
"content": "他是来自拜仁慕尼黑的托马斯·穆勒。",
"role": "assistant"
},
{
"content": "他为什么在地上?",
"role": "user"
},
{
"content": "因为他正在双膝跪地滑行庆祝。",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/2.jpg"
]
},
{
"messages": [
{
"content": "请描述这张图片",
"role": "user"
},
{
"content": "中国宇航员桂海潮正在讲话。",
"role": "assistant"
},
{
"content": "他取得过哪些成就?",
"role": "user"
},
{
"content": "他于2022年6月被任命为神舟十六号任务的有效载荷专家,从而成为2023年5月30日进入太空的首位平民宇航员。他负责在轨操作空间科学实验有效载荷。",
"role": "assistant"
}
],
"images": [
"mllm_demo_data/3.jpg"
]
}
]
\ No newline at end of file
This diff is collapsed.
#!/bin/bash
# also launch it on slave machine using slave_config.yaml
NPROC_PER_NODE=2
NNODES=1
RANK=0
MASTER_ADDR=127.0.0.1
MASTER_PORT=17170
HIP_VISIBLE_DEVICES=6,7 torchrun \
--nproc_per_node $NPROC_PER_NODE \
--nnodes $NNODES \
--node_rank $RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
src/train.py \
--stage sft --do_train \
--model_name_or_path /home/practice/Yi-1.5-6B-Chat <Your model path>\
--dataset alpaca_en_demo --template yi --finetuning_type lora --lora_target q_proj,v_proj \
--output_dir saves/yi-6b/lora/sft \
--overwrite_output_dir \
--overwrite_cache --per_device_train_batch_size 2 --gradient_accumulation_steps 32 --lr_scheduler_type cosine \
--logging_steps 10 --save_steps 1000 --learning_rate 1e-4 --num_train_epochs 3.0 --plot_loss --fp16
# coding=utf-8
# Calculates the flops of pre-trained models.
# Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512
# Inspired by: https://www.deepspeed.ai/tutorials/flops-profiler/
import fire
import torch
from deepspeed.accelerator import get_accelerator # type: ignore
from deepspeed.profiling.flops_profiler import get_model_profile # type: ignore
from llamafactory.chat import ChatModel
def calculate_flops(
model_name_or_path: str,
batch_size: int = 1,
seq_length: int = 256,
flash_attn: str = "auto",
):
with get_accelerator().device(0):
chat_model = ChatModel(dict(model_name_or_path=model_name_or_path, template="empty", flash_attn=flash_attn))
fake_input = torch.ones((batch_size, seq_length), dtype=torch.long, device=chat_model.model.device)
input_dict = {"input_ids": fake_input, "labels": fake_input.clone()}
flops, macs, params = get_model_profile(chat_model.model, kwargs=input_dict, print_profile=True, detailed=True)
print("FLOPs:", flops)
print("MACs:", macs)
print("Params:", params)
if __name__ == "__main__":
fire.Fire(calculate_flops)
# coding=utf-8
# Calculates the optimal learning rate for 7B/13B models using LLaMA's hyper-parameters.
# Usage: python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en --cutoff_len 1024 --batch_size 16
# Inspired by: https://github.com/imoneoi/openchat/blob/master/ochat/training_deepspeed/train.py
import math
from typing import Literal
import fire
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
from llamafactory.data import get_dataset
from llamafactory.extras.constants import IGNORE_INDEX
from llamafactory.hparams import get_train_args
from llamafactory.model import load_tokenizer
BASE_LR = 3e-4 # 1.5e-4 for 30B-70B models
BASE_BS = 4_000_000 # from llama paper
def calculate_lr(
model_name_or_path: str,
batch_size: int, # total batch size, namely (batch size * gradient accumulation * world size)
stage: Literal["pt", "sft"] = "sft",
dataset: str = "alpaca_en",
dataset_dir: str = "data",
template: str = "default",
cutoff_len: int = 1024, # i.e. maximum input length during training
is_mistral: bool = False, # mistral model uses a smaller learning rate,
):
model_args, data_args, training_args, _, _ = get_train_args(
dict(
stage=stage,
model_name_or_path=model_name_or_path,
dataset=dataset,
dataset_dir=dataset_dir,
template=template,
cutoff_len=cutoff_len,
output_dir="dummy_dir",
overwrite_cache=True,
)
)
tokenizer_module = load_tokenizer(model_args)
tokenizer = tokenizer_module["tokenizer"]
trainset = get_dataset(model_args, data_args, training_args, stage, **tokenizer_module)
if stage == "pt":
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
elif stage == "sft":
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX)
else:
raise NotImplementedError
dataloader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=data_collator, pin_memory=True)
valid_tokens, total_tokens = 0, 0
for batch in tqdm(dataloader):
valid_tokens += torch.sum(batch["labels"] != IGNORE_INDEX).item()
total_tokens += torch.numel(batch["labels"])
batch_max_len = cutoff_len * batch_size # max tokens in a batch
valid_ratio = valid_tokens / total_tokens
batch_valid_len = batch_max_len * valid_ratio
lr = BASE_LR * math.sqrt(batch_valid_len / BASE_BS) # lr ~ sqrt(batch_size)
lr = lr / 6.0 if is_mistral else lr
print(
"Optimal learning rate is {:.2e} for valid ratio% {:.2f} and effective batch size {:.2f}".format(
lr, valid_ratio * 100, batch_valid_len
)
)
if __name__ == "__main__":
fire.Fire(calculate_lr)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment