#!/bin/bash
NPROC_PER_NODE=8
NNODES=1
RANK=0
MASTER_ADDR=127.0.0.1
MASTER_PORT=29500
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun \
--nproc_per_node $NPROC_PER_NODE \
--nnodes $NNODES \
--node_rank $RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
src/train.py /root/ld/ld_project/LLaMA-Factory/examples/minicpm/minicpm_sft.yaml
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MiniCPM-2B 参数高效微调(LoRA)A100 80G 单卡示例\n",
"\n",
"显存更小的显卡可用 batch size 和 grad_accum 间时间换空间\n",
"\n",
"本 notebook 是一个使用 `AdvertiseGen` 数据集对 MiniCPM-2B 进行 LoRA 微调,使其具备专业的广告生成能力的代码示例。\n",
"\n",
"## 最低硬件需求\n",
"- 显存:12GB\n",
"- 显卡架构:安培架构(推荐)\n",
"- 内存:16GB"
]
},
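{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is an added illustrative sketch, not part of the original training setup: it only shows the arithmetic behind the batch-size / gradient-accumulation trade-off. Shrinking `per_device_train_batch_size` while growing `gradient_accumulation_steps` keeps the effective batch size unchanged and lowers peak GPU memory, at the cost of more optimizer steps and wall-clock time. All numbers are assumptions for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: the effective batch size is unchanged when the per-device\n",
"# batch size is traded against gradient accumulation (values are assumptions).\n",
"def effective_batch_size(per_device_batch, grad_accum_steps, num_gpus=1):\n",
"    return per_device_batch * grad_accum_steps * num_gpus\n",
"\n",
"# A100 80G style setting vs. a smaller-VRAM setting (hypothetical values)\n",
"print(effective_batch_size(32, 1))  # 32\n",
"print(effective_batch_size(4, 8))   # 32: same effective batch, less VRAM, more steps\n"
]
},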
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. 准备数据集\n",
"\n",
"将数据集转换为更通用的格式\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 转换为 ChatML 格式\n",
"import os\n",
"import shutil\n",
"import json\n",
"\n",
"input_dir = \"data/AdvertiseGen\"\n",
"output_dir = \"data/AdvertiseGenChatML\"\n",
"if os.path.exists(output_dir):\n",
" shutil.rmtree(output_dir)\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"for fn in [\"train.json\", \"dev.json\"]:\n",
" data_out_list = []\n",
" with open(os.path.join(input_dir, fn), \"r\") as f, open(os.path.join(output_dir, fn), \"w\") as fo:\n",
" for line in f:\n",
" if len(line.strip()) > 0:\n",
" data = json.loads(line)\n",
" data_out = {\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": data[\"content\"],\n",
" },\n",
" {\n",
" \"role\": \"assistant\",\n",
" \"content\": data[\"summary\"],\n",
" },\n",
" ]\n",
" }\n",
" data_out_list.append(data_out)\n",
" json.dump(data_out_list, fo, ensure_ascii=False, indent=4)\n"
]
},
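{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (an added sketch, not required by the rest of the notebook): reload the file written by the cell above, reusing its `os`, `json`, and `output_dir` names, and inspect one record to confirm the ChatML structure."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check: reload the converted file and print one ChatML record\n",
"with open(os.path.join(output_dir, \"train.json\"), \"r\") as f:\n",
"    converted = json.load(f)\n",
"print(len(converted))\n",
"print(json.dumps(converted[0], ensure_ascii=False, indent=2))\n"
]
},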
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. 使用 LoRA 进行微调\n",
"\n",
"命令行一键运行"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!bash lora_finetune.sh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 推理验证"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from tqdm import tqdm\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"path = \"output/AdvertiseGenLoRA/20240315224356/checkpoint-3000\"\n",
"tokenizer = AutoTokenizer.from_pretrained(path)\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" path, torch_dtype=torch.bfloat16, device_map=\"cuda\", trust_remote_code=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res, history = model.chat(tokenizer, query=\"<用户>类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞<AI>\", max_length=80, top_p=0.5)\n",
"res, history"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
formatted_time=$(date +"%Y%m%d%H%M%S")
echo $formatted_time
deepspeed --include localhost:1 finetune.py \
--model_name_or_path MiniCPM-2B-sft-bf16 \
--output_dir output/AdvertiseGenLoRA/$formatted_time/ \
--train_data_path data/AdvertiseGenChatML/train.json \
--eval_data_path data/AdvertiseGenChatML/dev.json \
--learning_rate 5e-5 --per_device_train_batch_size 32 \
--per_device_eval_batch_size 64 --model_max_length 384 --bf16 --use_lora \
--gradient_accumulation_steps 1 --warmup_steps 100 \
--max_steps 3000 --weight_decay 0.01 \
--evaluation_strategy steps --eval_steps 500 \
--save_strategy steps --save_steps 500 --seed 42 \
--log_level info --logging_strategy steps --logging_steps 10 \
--deepspeed configs/ds_config_zero3_offload.json
formatted_time=$(date +"%Y%m%d%H%M%S")
echo $formatted_time
export HIP_VISIBLE_DEVICES=0,1,2,3
deepspeed --include localhost:0,1,2,3 --master_port 19888 finetune.py \
--model_name_or_path ../openbmb/MiniCPM4-8B \
--output_dir output/OCNLILoRA/$formatted_time/ \
--train_data_path data/ocnli_public_chatml/train.json \
--eval_data_path data/ocnli_public_chatml/dev.json \
--learning_rate 5e-5 --per_device_train_batch_size 40 \
--per_device_eval_batch_size 128 --model_max_length 128 --bf16 --use_lora \
--gradient_accumulation_steps 1 --warmup_steps 100 \
--max_steps 1000 --weight_decay 0.01 \
--evaluation_strategy steps --eval_steps 500 \
--save_strategy steps --save_steps 500 --seed 42 \
--log_level info --logging_strategy steps --logging_steps 10 \
--deepspeed configs/ds_config_zero3_offload.json
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MiniCPM-2B 参数高效微调(LoRA)A100 80G 单卡示例\n",
"\n",
"显存更小的显卡可用 batch size 和 grad_accum 间时间换空间\n",
"\n",
"本 notebook 是一个使用 `OCNLI` 数据集对 MiniCPM-2B 进行 LoRA 微调,使其具备专业的广告生成能力的代码示例。\n",
"\n",
"## 最低硬件需求\n",
"- 显存:12GB\n",
"- 显卡架构:安培架构(推荐)\n",
"- 内存:16GB"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. 准备数据集\n",
"\n",
"将数据转换为更通用的格式"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# 转换为 ChatML 格式\n",
"import os\n",
"import shutil\n",
"import json\n",
"\n",
"input_dir = \"data/ocnli_public\"\n",
"output_dir = \"data/ocnli_public_chatml\"\n",
"if os.path.exists(output_dir):\n",
" shutil.rmtree(output_dir)\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"for fn in [\"train.json\", \"dev.json\"]:\n",
" data_out_list = []\n",
" with open(os.path.join(input_dir, fn), \"r\") as f, open(os.path.join(output_dir, fn), \"w\") as fo:\n",
" for line in f:\n",
" if len(line.strip()) > 0:\n",
" data = json.loads(line)\n",
" data_out = {\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": f\"请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种?\\n句子1: {data['sentence1']}\\n句子2:{data['sentence2']}\\n\"\n",
" },\n",
" {\n",
" \"role\": \"assistant\",\n",
" \"content\": data[\"label\"],\n",
" },\n",
" ]\n",
" }\n",
" data_out_list.append(data_out)\n",
" json.dump(data_out_list, fo, ensure_ascii=False, indent=4)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. 使用 LoRA 进行微调\n",
"\n",
"命令行一键运行"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20240315212836\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-03-15 21:28:38,758] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
"[2024-03-15 21:28:45,799] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.\n",
"[2024-03-15 21:28:45,799] [INFO] [runner.py:568:main] cmd = /usr/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=19888 --enable_each_rank_log=None finetune.py --model_name_or_path MiniCPM-2B-sft-bf16 --output_dir output/ocnli_public_chatml/20240315212836/ --train_data_path data/ocnli_public_chatml/train.json --eval_data_path data/ocnli_public_chatml/dev.json --learning_rate 5e-5 --per_device_train_batch_size 64 --per_device_eval_batch_size 128 --model_max_length 128 --bf16 --use_lora --gradient_accumulation_steps 1 --warmup_steps 100 --max_steps 1000 --weight_decay 0.01 --evaluation_strategy steps --eval_steps 500 --save_strategy steps --save_steps 500 --seed 42 --log_level info --logging_strategy steps --logging_steps 10 --deepspeed configs/ds_config_zero3_offload.json\n",
"[2024-03-15 21:28:47,849] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
"[2024-03-15 21:28:54,904] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0]}\n",
"[2024-03-15 21:28:54,905] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=1, node_rank=0\n",
"[2024-03-15 21:28:54,905] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]})\n",
"[2024-03-15 21:28:54,905] [INFO] [launch.py:163:main] dist_world_size=1\n",
"[2024-03-15 21:28:54,905] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0\n",
"[2024-03-15 21:28:54,905] [INFO] [launch.py:253:main] process 86577 spawned with command: ['/usr/bin/python3', '-u', 'finetune.py', '--local_rank=0', '--model_name_or_path', 'MiniCPM-2B-sft-bf16', '--output_dir', 'output/ocnli_public_chatml/20240315212836/', '--train_data_path', 'data/ocnli_public_chatml/train.json', '--eval_data_path', 'data/ocnli_public_chatml/dev.json', '--learning_rate', '5e-5', '--per_device_train_batch_size', '64', '--per_device_eval_batch_size', '128', '--model_max_length', '128', '--bf16', '--use_lora', '--gradient_accumulation_steps', '1', '--warmup_steps', '100', '--max_steps', '1000', '--weight_decay', '0.01', '--evaluation_strategy', 'steps', '--eval_steps', '500', '--save_strategy', 'steps', '--save_steps', '500', '--seed', '42', '--log_level', 'info', '--logging_strategy', 'steps', '--logging_steps', '10', '--deepspeed', 'configs/ds_config_zero3_offload.json']\n",
"[2024-03-15 21:29:03,964] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
"[2024-03-15 21:29:04,250] [INFO] [comm.py:637:init_distributed] cdb=None\n",
"[2024-03-15 21:29:04,250] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n",
"/usr/local/lib/python3.10/dist-packages/torch/_utils.py:836: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
" return self.fget.__get__(instance, owner)()\n",
"Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in MiniCPMForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n",
"You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n",
"Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in MiniCPMModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n",
"[2024-03-15 21:29:08,998] [INFO] [partition_parameters.py:343:__exit__] finished initializing model - num_params = 363, num_elems = 3.01B\n",
"trainable params: 2,949,120 || all params: 2,727,830,016 || trainable%: 0.10811230841738784\n",
"input: <s> <用户> 请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种?\n",
"句子1: 一月份跟二月份肯定有一个月份有.\n",
"句子2:肯定有一个月份有\n",
" <AI> entailment</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>\n",
"label: entailment\n",
"input: <s> <用户> 请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种?\n",
"句子1: 身上裹一件工厂发的棉大衣,手插在袖筒里\n",
"句子2:身上至少一件衣服\n",
" <AI> entailment</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>\n",
"label: entailment\n",
"Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n",
"max_steps is given, it will override any value given in num_train_epochs\n",
"Using auto half precision backend\n",
"Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)\n",
"Using /home/jeeves/.cache/torch_extensions/py310_cu123 as PyTorch extensions root...\n",
"Detected CUDA files, patching ldflags\n",
"Emitting ninja build file /home/jeeves/.cache/torch_extensions/py310_cu123/cpu_adam/build.ninja...\n",
"Building extension module cpu_adam...\n",
"Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
"ninja: no work to do.\n",
"Loading extension module cpu_adam...\n",
"Time to load cpu_adam op: 2.3341457843780518 seconds\n",
"Adam Optimizer #0 is created with AVX512 arithmetic capability.\n",
"Config: alpha=0.000050, betas=(0.900000, 0.999000), weight_decay=0.010000, adam_w=1\n",
"[2024-03-15 21:29:15,864] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.0, git-hash=unknown, git-branch=unknown\n",
"[2024-03-15 21:29:15,884] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n",
"[2024-03-15 21:29:15,886] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer\n",
"[2024-03-15 21:29:15,886] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer\n",
"[2024-03-15 21:29:15,895] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam\n",
"[2024-03-15 21:29:15,896] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>\n",
"[2024-03-15 21:29:15,896] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False\n",
"[2024-03-15 21:29:15,896] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer\n",
"[2024-03-15 21:29:16,049] [INFO] [utils.py:800:see_memory_usage] Stage 3 initialize beginning\n",
"[2024-03-15 21:29:16,049] [INFO] [utils.py:801:see_memory_usage] MA 0.03 GB Max_MA 1.62 GB CA 0.04 GB Max_CA 2 GB \n",
"[2024-03-15 21:29:16,049] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.03 GB, percent = 13.7%\n",
"[2024-03-15 21:29:16,053] [INFO] [stage3.py:130:__init__] Reduce bucket size 5308416\n",
"[2024-03-15 21:29:16,053] [INFO] [stage3.py:131:__init__] Prefetch bucket size 4777574\n",
"[2024-03-15 21:29:16,201] [INFO] [utils.py:800:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]\n",
"[2024-03-15 21:29:16,201] [INFO] [utils.py:801:see_memory_usage] MA 0.03 GB Max_MA 0.03 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:16,201] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.03 GB, percent = 13.7%\n",
"Parameter Offload: Total persistent parameters: 3135744 in 241 params\n",
"[2024-03-15 21:29:16,449] [INFO] [utils.py:800:see_memory_usage] DeepSpeedZeRoOffload initialize [end]\n",
"[2024-03-15 21:29:16,450] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.03 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:16,450] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.04 GB, percent = 13.7%\n",
"[2024-03-15 21:29:16,608] [INFO] [utils.py:800:see_memory_usage] Before creating fp16 partitions\n",
"[2024-03-15 21:29:16,609] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:16,609] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.05 GB, percent = 13.7%\n",
"[2024-03-15 21:29:16,776] [INFO] [utils.py:800:see_memory_usage] After creating fp16 partitions: 1\n",
"[2024-03-15 21:29:16,777] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:16,777] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.05 GB, percent = 13.7%\n",
"[2024-03-15 21:29:16,931] [INFO] [utils.py:800:see_memory_usage] Before creating fp32 partitions\n",
"[2024-03-15 21:29:16,932] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:16,932] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.05 GB, percent = 13.7%\n",
"[2024-03-15 21:29:17,099] [INFO] [utils.py:800:see_memory_usage] After creating fp32 partitions\n",
"[2024-03-15 21:29:17,100] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:17,100] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.04 GB, percent = 13.7%\n",
"[2024-03-15 21:29:17,254] [INFO] [utils.py:800:see_memory_usage] Before initializing optimizer states\n",
"[2024-03-15 21:29:17,254] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:17,254] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.04 GB, percent = 13.7%\n",
"[2024-03-15 21:29:17,425] [INFO] [utils.py:800:see_memory_usage] After initializing optimizer states\n",
"[2024-03-15 21:29:17,425] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:17,425] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.04 GB, percent = 13.7%\n",
"[2024-03-15 21:29:17,426] [INFO] [stage3.py:486:_setup_for_real_optimizer] optimizer state initialized\n",
"[2024-03-15 21:29:17,633] [INFO] [utils.py:800:see_memory_usage] After initializing ZeRO optimizer\n",
"[2024-03-15 21:29:17,633] [INFO] [utils.py:801:see_memory_usage] MA 0.03 GB Max_MA 0.03 GB CA 0.06 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:17,634] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.05 GB, percent = 13.7%\n",
"[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedCPUAdam\n",
"[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler\n",
"[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None\n",
"[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[(0.9, 0.999)]\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:996:print] DeepSpeedEngine configuration:\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] activation_checkpointing_config {\n",
" \"partition_activations\": false, \n",
" \"contiguous_memory_optimization\": false, \n",
" \"cpu_checkpointing\": false, \n",
" \"number_checkpoints\": null, \n",
" \"synchronize_checkpoint_boundary\": false, \n",
" \"profile\": false\n",
"}\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] amp_enabled .................. False\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] amp_params ................... False\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] autotuning_config ............ {\n",
" \"enabled\": false, \n",
" \"start_step\": null, \n",
" \"end_step\": null, \n",
" \"metric_path\": null, \n",
" \"arg_mappings\": null, \n",
" \"metric\": \"throughput\", \n",
" \"model_info\": null, \n",
" \"results_dir\": \"autotuning_results\", \n",
" \"exps_dir\": \"autotuning_exps\", \n",
" \"overwrite\": true, \n",
" \"fast\": true, \n",
" \"start_profile_step\": 3, \n",
" \"end_profile_step\": 5, \n",
" \"tuner_type\": \"gridsearch\", \n",
" \"tuner_early_stopping\": 5, \n",
" \"tuner_num_trials\": 50, \n",
" \"model_info_path\": null, \n",
" \"mp_size\": 1, \n",
" \"max_train_batch_size\": null, \n",
" \"min_train_batch_size\": 1, \n",
" \"max_train_micro_batch_size_per_gpu\": 1.024000e+03, \n",
" \"min_train_micro_batch_size_per_gpu\": 1, \n",
" \"num_tuning_micro_batch_sizes\": 3\n",
"}\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] bfloat16_enabled ............. True\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] bfloat16_immediate_grad_update False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] checkpoint_parallel_write_pipeline False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] checkpoint_tag_validation_enabled True\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] checkpoint_tag_validation_fail False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f095baedab0>\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] communication_data_type ...... None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] compile_config ............... enabled=False backend='inductor' kwargs={}\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] curriculum_enabled_legacy .... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] curriculum_params_legacy ..... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] data_efficiency_enabled ...... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] dataloader_drop_last ......... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] disable_allgather ............ False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] dump_state ................... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] dynamic_loss_scale_args ...... None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_enabled ........... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_gas_boundary_resolution 1\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_layer_name ........ bert.encoder.layer\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_layer_num ......... 0\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_max_iter .......... 100\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_stability ......... 1e-06\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_tol ............... 0.01\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_verbose ........... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] elasticity_enabled ........... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] flops_profiler_config ........ {\n",
" \"enabled\": false, \n",
" \"recompute_fwd_factor\": 0.0, \n",
" \"profile_step\": 1, \n",
" \"module_depth\": -1, \n",
" \"top_modules\": 1, \n",
" \"detailed\": true, \n",
" \"output_file\": null\n",
"}\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] fp16_auto_cast ............... None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] fp16_enabled ................. False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] fp16_master_weights_and_gradients False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] global_rank .................. 0\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] grad_accum_dtype ............. None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] gradient_accumulation_steps .. 1\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] gradient_clipping ............ 1.0\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] gradient_predivide_factor .... 1.0\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] graph_harvesting ............. False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] initial_dynamic_scale ........ 1\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] load_universal_checkpoint .... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] loss_scale ................... 1.0\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] memory_breakdown ............. False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] mics_hierarchial_params_gather False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] mics_shard_size .............. -1\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] nebula_config ................ {\n",
" \"enabled\": false, \n",
" \"persistent_storage_path\": null, \n",
" \"persistent_time_interval\": 100, \n",
" \"num_of_version_in_retention\": 2, \n",
" \"enable_nebula_load\": true, \n",
" \"load_path\": null\n",
"}\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] optimizer_legacy_fusion ...... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] optimizer_name ............... None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] optimizer_params ............. None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] pld_enabled .................. False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] pld_params ................... False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] prescale_gradients ........... False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] scheduler_name ............... None\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] scheduler_params ............. None\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] seq_parallel_communication_data_type torch.float32\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] sparse_attention ............. None\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] sparse_gradients_enabled ..... False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] steps_per_print .............. inf\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] train_batch_size ............. 64\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] train_micro_batch_size_per_gpu 64\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] use_data_before_expert_parallel_ False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] use_node_local_storage ....... False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] wall_clock_breakdown ......... False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] weight_quantization_config ... None\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] world_size ................... 1\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] zero_allow_untested_optimizer True\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=5308416 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='cpu', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=True) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=4777574 param_persistence_threshold=23040 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] zero_enabled ................. True\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] zero_force_ds_cpu_optimizer .. True\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] zero_optimization_stage ...... 3\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:986:print_user_config] json = {\n",
" \"fp16\": {\n",
" \"enabled\": false, \n",
" \"loss_scale\": 0, \n",
" \"loss_scale_window\": 1000, \n",
" \"initial_scale_power\": 16, \n",
" \"hysteresis\": 2, \n",
" \"min_loss_scale\": 1\n",
" }, \n",
" \"bf16\": {\n",
" \"enabled\": true\n",
" }, \n",
" \"zero_optimization\": {\n",
" \"stage\": 3, \n",
" \"allgather_partitions\": true, \n",
" \"allgather_bucket_size\": 5.000000e+08, \n",
" \"reduce_scatter\": true, \n",
" \"contiguous_gradients\": true, \n",
" \"overlap_comm\": true, \n",
" \"reduce_bucket_size\": 5.308416e+06, \n",
" \"stage3_prefetch_bucket_size\": 4.777574e+06, \n",
" \"stage3_param_persistence_threshold\": 2.304000e+04, \n",
" \"stage3_gather_16bit_weights_on_model_save\": true, \n",
" \"offload_optimizer\": {\n",
" \"device\": \"cpu\", \n",
" \"pin_memory\": true\n",
" }, \n",
" \"offload_param\": {\n",
" \"device\": \"cpu\", \n",
" \"pin_memory\": true\n",
" }\n",
" }, \n",
" \"train_batch_size\": 64, \n",
" \"train_micro_batch_size_per_gpu\": 64, \n",
" \"gradient_accumulation_steps\": 1, \n",
" \"gradient_clipping\": 1.0, \n",
" \"wall_clock_breakdown\": false, \n",
" \"flops_profiler\": {\n",
" \"enabled\": false, \n",
" \"profile_step\": 1, \n",
" \"module_depth\": -1, \n",
" \"top_modules\": 1, \n",
" \"detailed\": true, \n",
" \"output_file\": null\n",
" }, \n",
" \"steps_per_print\": inf, \n",
" \"zero_allow_untested_optimizer\": true\n",
"}\n",
"***** Running training *****\n",
" Num examples = 50,486\n",
" Num Epochs = 2\n",
" Instantaneous batch size per device = 64\n",
" Total train batch size (w. parallel, distributed & accumulation) = 64\n",
" Gradient Accumulation steps = 1\n",
" Total optimization steps = 1,000\n",
" Number of trainable parameters = 2,949,120\n",
"{'loss': 2.2004, 'grad_norm': 44.037304409869364, 'learning_rate': 5e-06, 'epoch': 0.01}\n",
"{'loss': 1.4786, 'grad_norm': 39.531078618699645, 'learning_rate': 1e-05, 'epoch': 0.03}\n",
"{'loss': 0.9955, 'grad_norm': 16.66467873479667, 'learning_rate': 1.5e-05, 'epoch': 0.04}\n",
"{'loss': 0.7026, 'grad_norm': 7.417151045965821, 'learning_rate': 2e-05, 'epoch': 0.05}\n",
"{'loss': 0.6713, 'grad_norm': 7.608669365784156, 'learning_rate': 2.5e-05, 'epoch': 0.06}\n",
"{'loss': 0.5867, 'grad_norm': 12.552373192106195, 'learning_rate': 3e-05, 'epoch': 0.08}\n",
"{'loss': 0.6067, 'grad_norm': 10.342863016044076, 'learning_rate': 3.5e-05, 'epoch': 0.09}\n",
"{'loss': 0.5857, 'grad_norm': 10.985433470517048, 'learning_rate': 4e-05, 'epoch': 0.1}\n",
"{'loss': 0.5306, 'grad_norm': 5.22097493330033, 'learning_rate': 4.5e-05, 'epoch': 0.11}\n",
"{'loss': 0.5517, 'grad_norm': 3.9679057507396682, 'learning_rate': 5e-05, 'epoch': 0.13}\n",
"{'loss': 0.4573, 'grad_norm': 4.77643976524929, 'learning_rate': 4.9444444444444446e-05, 'epoch': 0.14}\n",
"{'loss': 0.469, 'grad_norm': 7.6144285869051345, 'learning_rate': 4.888888888888889e-05, 'epoch': 0.15}\n",
"{'loss': 0.4748, 'grad_norm': 4.787471338888486, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.16}\n",
"{'loss': 0.433, 'grad_norm': 3.3189167275368225, 'learning_rate': 4.7777777777777784e-05, 'epoch': 0.18}\n",
"{'loss': 0.4282, 'grad_norm': 7.248232922110331, 'learning_rate': 4.722222222222222e-05, 'epoch': 0.19}\n",
"{'loss': 0.409, 'grad_norm': 6.293684915700438, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.2}\n",
"{'loss': 0.4451, 'grad_norm': 3.8753855113566833, 'learning_rate': 4.6111111111111115e-05, 'epoch': 0.22}\n",
"{'loss': 0.4288, 'grad_norm': 3.625475227512274, 'learning_rate': 4.555555555555556e-05, 'epoch': 0.23}\n",
"{'loss': 0.4506, 'grad_norm': 4.2449874489534665, 'learning_rate': 4.5e-05, 'epoch': 0.24}\n",
"{'loss': 0.4484, 'grad_norm': 6.084320127673726, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.25}\n",
"{'loss': 0.4487, 'grad_norm': 8.363684454316004, 'learning_rate': 4.388888888888889e-05, 'epoch': 0.27}\n",
"{'loss': 0.4878, 'grad_norm': 3.747181659840593, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.28}\n",
"{'loss': 0.412, 'grad_norm': 8.645140642353612, 'learning_rate': 4.277777777777778e-05, 'epoch': 0.29}\n",
"{'loss': 0.4558, 'grad_norm': 4.5260457637696625, 'learning_rate': 4.222222222222222e-05, 'epoch': 0.3}\n",
"{'loss': 0.4108, 'grad_norm': 4.781991938451388, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.32}\n",
"{'loss': 0.4407, 'grad_norm': 5.893275628361186, 'learning_rate': 4.111111111111111e-05, 'epoch': 0.33}\n",
"{'loss': 0.4475, 'grad_norm': 4.100649312404707, 'learning_rate': 4.055555555555556e-05, 'epoch': 0.34}\n",
"{'loss': 0.4041, 'grad_norm': 7.0290388233232255, 'learning_rate': 4e-05, 'epoch': 0.35}\n",
"{'loss': 0.3599, 'grad_norm': 3.511374655086493, 'learning_rate': 3.944444444444445e-05, 'epoch': 0.37}\n",
"{'loss': 0.4706, 'grad_norm': 5.813953833114259, 'learning_rate': 3.888888888888889e-05, 'epoch': 0.38}\n",
"{'loss': 0.3911, 'grad_norm': 4.0524183329331604, 'learning_rate': 3.8333333333333334e-05, 'epoch': 0.39}\n",
"{'loss': 0.4033, 'grad_norm': 3.875046268309963, 'learning_rate': 3.777777777777778e-05, 'epoch': 0.41}\n",
"{'loss': 0.4199, 'grad_norm': 5.059711960144461, 'learning_rate': 3.722222222222222e-05, 'epoch': 0.42}\n",
"{'loss': 0.4216, 'grad_norm': 3.959248018825387, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.43}\n",
"{'loss': 0.367, 'grad_norm': 4.493383842056094, 'learning_rate': 3.611111111111111e-05, 'epoch': 0.44}\n",
"{'loss': 0.3686, 'grad_norm': 6.826580929267439, 'learning_rate': 3.555555555555556e-05, 'epoch': 0.46}\n",
"{'loss': 0.3566, 'grad_norm': 6.61801729550354, 'learning_rate': 3.5e-05, 'epoch': 0.47}\n",
"{'loss': 0.3932, 'grad_norm': 4.124116051492338, 'learning_rate': 3.444444444444445e-05, 'epoch': 0.48}\n",
"{'loss': 0.3514, 'grad_norm': 4.545406773056064, 'learning_rate': 3.388888888888889e-05, 'epoch': 0.49}\n",
"{'loss': 0.4364, 'grad_norm': 5.868492580695467, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.51}\n",
"{'loss': 0.346, 'grad_norm': 5.245615445258653, 'learning_rate': 3.277777777777778e-05, 'epoch': 0.52}\n",
"{'loss': 0.335, 'grad_norm': 3.6031965739940257, 'learning_rate': 3.222222222222223e-05, 'epoch': 0.53}\n",
"{'loss': 0.37, 'grad_norm': 5.240535743057915, 'learning_rate': 3.1666666666666666e-05, 'epoch': 0.54}\n",
"{'loss': 0.3732, 'grad_norm': 7.290964612314844, 'learning_rate': 3.111111111111111e-05, 'epoch': 0.56}\n",
"{'loss': 0.378, 'grad_norm': 5.352972449129333, 'learning_rate': 3.055555555555556e-05, 'epoch': 0.57}\n",
"{'loss': 0.3512, 'grad_norm': 3.2834858860521705, 'learning_rate': 3e-05, 'epoch': 0.58}\n",
"{'loss': 0.3963, 'grad_norm': 5.047726585891225, 'learning_rate': 2.9444444444444448e-05, 'epoch': 0.6}\n",
"{'loss': 0.3825, 'grad_norm': 3.6864211233732562, 'learning_rate': 2.8888888888888888e-05, 'epoch': 0.61}\n",
"{'loss': 0.3715, 'grad_norm': 4.97593217867295, 'learning_rate': 2.8333333333333335e-05, 'epoch': 0.62}\n",
"{'loss': 0.4358, 'grad_norm': 5.702141663942072, 'learning_rate': 2.777777777777778e-05, 'epoch': 0.63}\n",
" 50%|████████████████████ | 500/1000 [10:03<09:58, 1.20s/it]***** Running Evaluation *****\n",
" Num examples = 3000\n",
" Batch size = 128\n",
"\n",
" 0%| | 0/24 [00:00<?, ?it/s]\u001b[A\n",
" 8%|███▋ | 2/24 [00:01<00:19, 1.11it/s]\u001b[A\n",
" 12%|█████▌ | 3/24 [00:02<00:18, 1.11it/s]\u001b[A\n",
" 17%|███████▎ | 4/24 [00:03<00:17, 1.11it/s]\u001b[A\n",
" 21%|█████████▏ | 5/24 [00:04<00:17, 1.11it/s]\u001b[A\n",
" 25%|███████████ | 6/24 [00:05<00:16, 1.11it/s]\u001b[A\n",
" 29%|████████████▊ | 7/24 [00:06<00:15, 1.11it/s]\u001b[A\n",
" 33%|██████████████▋ | 8/24 [00:07<00:14, 1.11it/s]\u001b[A\n",
" 38%|████████████████▌ | 9/24 [00:08<00:13, 1.11it/s]\u001b[A\n",
" 42%|█████████████████▉ | 10/24 [00:08<00:12, 1.11it/s]\u001b[A\n",
" 46%|███████████████████▋ | 11/24 [00:09<00:11, 1.11it/s]\u001b[A\n",
" 50%|█████████████████████▌ | 12/24 [00:10<00:10, 1.11it/s]\u001b[A\n",
" 54%|███████████████████████▎ | 13/24 [00:11<00:09, 1.11it/s]\u001b[A\n",
" 58%|█████████████████████████ | 14/24 [00:12<00:08, 1.11it/s]\u001b[A\n",
" 62%|██████████████████████████▉ | 15/24 [00:13<00:08, 1.11it/s]\u001b[A\n",
" 67%|████████████████████████████▋ | 16/24 [00:14<00:07, 1.11it/s]\u001b[A\n",
" 71%|██████████████████████████████▍ | 17/24 [00:15<00:06, 1.11it/s]\u001b[A\n",
" 75%|████████████████████████████████▎ | 18/24 [00:16<00:05, 1.11it/s]\u001b[A\n",
" 79%|██████████████████████████████████ | 19/24 [00:17<00:04, 1.11it/s]\u001b[A\n",
" 83%|███████████████████████████████████▊ | 20/24 [00:17<00:03, 1.11it/s]\u001b[A\n",
" 88%|█████████████████████████████████████▋ | 21/24 [00:18<00:02, 1.11it/s]\u001b[A\n",
" 92%|███████████████████████████████████████▍ | 22/24 [00:19<00:01, 1.11it/s]\u001b[A\n",
" 96%|█████████████████████████████████████████▏ | 23/24 [00:20<00:00, 1.12it/s]\u001b[A\n",
" \u001b[A\n",
"\u001b[A{'eval_loss': 0.4814399480819702, 'eval_runtime': 23.5015, 'eval_samples_per_second': 127.651, 'eval_steps_per_second': 1.021, 'epoch': 0.63}\n",
" 50%|████████████████████ | 500/1000 [10:26<09:58, 1.20s/it]\n",
"100%|███████████████████████████████████████████| 24/24 [00:21<00:00, 1.22it/s]\u001b[A\n",
" \u001b[ASaving model checkpoint to output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500\n",
"tokenizer config file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/tokenizer_config.json\n",
"Special tokens file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/special_tokens_map.json\n",
"[2024-03-15 21:39:48,407] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step500 is about to be saved!\n",
"/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1876: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
" warnings.warn(\n",
"[2024-03-15 21:39:48,447] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt\n",
"[2024-03-15 21:39:48,447] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt...\n",
"[2024-03-15 21:39:48,455] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt.\n",
"[2024-03-15 21:39:48,455] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...\n",
"[2024-03-15 21:39:48,493] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.\n",
"[2024-03-15 21:39:48,493] [INFO] [engine.py:3488:_save_zero_checkpoint] zero checkpoint saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt\n",
"[2024-03-15 21:39:48,498] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step500 is ready now!\n",
"[2024-03-15 21:39:49,718] [WARNING] [stage3.py:2069:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time\n",
"{'loss': 0.4598, 'grad_norm': 6.5312558406821974, 'learning_rate': 2.7222222222222223e-05, 'epoch': 0.65}\n",
"{'loss': 0.355, 'grad_norm': 3.9302654106847914, 'learning_rate': 2.6666666666666667e-05, 'epoch': 0.66}\n",
"{'loss': 0.3781, 'grad_norm': 4.25997203692361, 'learning_rate': 2.6111111111111114e-05, 'epoch': 0.67}\n",
"{'loss': 0.3668, 'grad_norm': 3.5989513406349776, 'learning_rate': 2.5555555555555554e-05, 'epoch': 0.68}\n",
"{'loss': 0.3585, 'grad_norm': 3.6575850959103717, 'learning_rate': 2.5e-05, 'epoch': 0.7}\n",
"{'loss': 0.3674, 'grad_norm': 4.911812708486751, 'learning_rate': 2.4444444444444445e-05, 'epoch': 0.71}\n",
"{'loss': 0.368, 'grad_norm': 4.194735979358348, 'learning_rate': 2.3888888888888892e-05, 'epoch': 0.72}\n",
"{'loss': 0.3891, 'grad_norm': 3.5460606114800868, 'learning_rate': 2.3333333333333336e-05, 'epoch': 0.74}\n",
"{'loss': 0.3977, 'grad_norm': 3.150838310468473, 'learning_rate': 2.277777777777778e-05, 'epoch': 0.75}\n",
"{'loss': 0.3533, 'grad_norm': 3.9069432978502756, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.76}\n",
"{'loss': 0.3811, 'grad_norm': 5.105086367004499, 'learning_rate': 2.1666666666666667e-05, 'epoch': 0.77}\n",
"{'loss': 0.325, 'grad_norm': 4.369369589510735, 'learning_rate': 2.111111111111111e-05, 'epoch': 0.79}\n",
"{'loss': 0.3641, 'grad_norm': 6.171511559710524, 'learning_rate': 2.0555555555555555e-05, 'epoch': 0.8}\n",
"{'loss': 0.3316, 'grad_norm': 3.7044215769355313, 'learning_rate': 2e-05, 'epoch': 0.81}\n",
"{'loss': 0.3898, 'grad_norm': 3.788686076864363, 'learning_rate': 1.9444444444444445e-05, 'epoch': 0.82}\n",
"{'loss': 0.3732, 'grad_norm': 6.75853923792821, 'learning_rate': 1.888888888888889e-05, 'epoch': 0.84}\n",
"{'loss': 0.3827, 'grad_norm': 5.165864430975117, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.85}\n",
"{'loss': 0.3565, 'grad_norm': 3.535604172460323, 'learning_rate': 1.777777777777778e-05, 'epoch': 0.86}\n",
"{'loss': 0.3345, 'grad_norm': 3.633280931030727, 'learning_rate': 1.7222222222222224e-05, 'epoch': 0.87}\n",
"{'loss': 0.3639, 'grad_norm': 4.485584268777012, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.89}\n",
"{'loss': 0.402, 'grad_norm': 3.7925929660253317, 'learning_rate': 1.6111111111111115e-05, 'epoch': 0.9}\n",
"{'loss': 0.3452, 'grad_norm': 5.183220810399684, 'learning_rate': 1.5555555555555555e-05, 'epoch': 0.91}\n",
"{'loss': 0.3936, 'grad_norm': 9.733180087550997, 'learning_rate': 1.5e-05, 'epoch': 0.93}\n",
"{'loss': 0.3367, 'grad_norm': 5.1834921923924755, 'learning_rate': 1.4444444444444444e-05, 'epoch': 0.94}\n",
"{'loss': 0.3681, 'grad_norm': 4.409917292781669, 'learning_rate': 1.388888888888889e-05, 'epoch': 0.95}\n",
"{'loss': 0.3348, 'grad_norm': 3.335369553115092, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.96}\n",
"{'loss': 0.3972, 'grad_norm': 5.322684365694768, 'learning_rate': 1.2777777777777777e-05, 'epoch': 0.98}\n",
"{'loss': 0.3835, 'grad_norm': 6.105565593241867, 'learning_rate': 1.2222222222222222e-05, 'epoch': 0.99}\n",
"{'loss': 0.3916, 'grad_norm': 3.49133044485143, 'learning_rate': 1.1666666666666668e-05, 'epoch': 1.0}\n",
"{'loss': 0.3597, 'grad_norm': 2.888336925676786, 'learning_rate': 1.1111111111111112e-05, 'epoch': 1.01}\n",
"{'loss': 0.3304, 'grad_norm': 2.9537925974792714, 'learning_rate': 1.0555555555555555e-05, 'epoch': 1.03}\n",
"{'loss': 0.3392, 'grad_norm': 5.712451906231322, 'learning_rate': 1e-05, 'epoch': 1.04}\n",
"{'loss': 0.3393, 'grad_norm': 5.12273971212701, 'learning_rate': 9.444444444444445e-06, 'epoch': 1.05}\n",
"{'loss': 0.3018, 'grad_norm': 3.2845513584107033, 'learning_rate': 8.88888888888889e-06, 'epoch': 1.06}\n",
"{'loss': 0.3384, 'grad_norm': 3.2604963558968145, 'learning_rate': 8.333333333333334e-06, 'epoch': 1.08}\n",
"{'loss': 0.3252, 'grad_norm': 6.04878965518926, 'learning_rate': 7.777777777777777e-06, 'epoch': 1.09}\n",
"{'loss': 0.384, 'grad_norm': 5.226938733071884, 'learning_rate': 7.222222222222222e-06, 'epoch': 1.1}\n",
"{'loss': 0.2914, 'grad_norm': 3.8905566106093925, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.12}\n",
"{'loss': 0.2984, 'grad_norm': 3.3599598929872525, 'learning_rate': 6.111111111111111e-06, 'epoch': 1.13}\n",
"{'loss': 0.3459, 'grad_norm': 5.669365782344921, 'learning_rate': 5.555555555555556e-06, 'epoch': 1.14}\n",
"{'loss': 0.3393, 'grad_norm': 3.078993311756746, 'learning_rate': 5e-06, 'epoch': 1.15}\n",
"{'loss': 0.3314, 'grad_norm': 5.3827552737002495, 'learning_rate': 4.444444444444445e-06, 'epoch': 1.17}\n",
"{'loss': 0.3345, 'grad_norm': 3.2322873367016665, 'learning_rate': 3.888888888888889e-06, 'epoch': 1.18}\n",
"{'loss': 0.3363, 'grad_norm': 3.3300669560846425, 'learning_rate': 3.3333333333333333e-06, 'epoch': 1.19}\n",
"{'loss': 0.344, 'grad_norm': 3.7589742724407653, 'learning_rate': 2.777777777777778e-06, 'epoch': 1.2}\n",
"{'loss': 0.3195, 'grad_norm': 2.8061902793867626, 'learning_rate': 2.2222222222222225e-06, 'epoch': 1.22}\n",
"{'loss': 0.3128, 'grad_norm': 3.3215568095822516, 'learning_rate': 1.6666666666666667e-06, 'epoch': 1.23}\n",
"{'loss': 0.3035, 'grad_norm': 4.30331459929754, 'learning_rate': 1.1111111111111112e-06, 'epoch': 1.24}\n",
"{'loss': 0.3374, 'grad_norm': 3.9324447635716995, 'learning_rate': 5.555555555555556e-07, 'epoch': 1.25}\n",
"{'loss': 0.3254, 'grad_norm': 4.112509804571923, 'learning_rate': 0.0, 'epoch': 1.27}\n",
"100%|███████████████████████████████████████| 1000/1000 [20:30<00:00, 1.19s/it]***** Running Evaluation *****\n",
" Num examples = 3000\n",
" Batch size = 128\n",
"\n",
" 0%| | 0/24 [00:00<?, ?it/s]\u001b[A\n",
" 8%|███▋ | 2/24 [00:00<00:09, 2.23it/s]\u001b[A\n",
" 12%|█████▌ | 3/24 [00:01<00:13, 1.58it/s]\u001b[A\n",
" 17%|███████▎ | 4/24 [00:02<00:14, 1.37it/s]\u001b[A\n",
" 21%|█████████▏ | 5/24 [00:03<00:14, 1.27it/s]\u001b[A\n",
" 25%|███████████ | 6/24 [00:04<00:14, 1.21it/s]\u001b[A\n",
" 29%|████████████▊ | 7/24 [00:05<00:14, 1.18it/s]\u001b[A\n",
" 33%|██████████████▋ | 8/24 [00:06<00:13, 1.16it/s]\u001b[A\n",
" 38%|████████████████▌ | 9/24 [00:07<00:13, 1.14it/s]\u001b[A\n",
" 42%|█████████████████▉ | 10/24 [00:08<00:12, 1.13it/s]\u001b[A\n",
" 46%|███████████████████▋ | 11/24 [00:08<00:11, 1.13it/s]\u001b[A\n",
" 50%|█████████████████████▌ | 12/24 [00:09<00:10, 1.12it/s]\u001b[A\n",
" 54%|███████████████████████▎ | 13/24 [00:10<00:09, 1.12it/s]\u001b[A\n",
" 58%|█████████████████████████ | 14/24 [00:11<00:08, 1.12it/s]\u001b[A\n",
" 62%|██████████████████████████▉ | 15/24 [00:12<00:08, 1.12it/s]\u001b[A\n",
" 67%|████████████████████████████▋ | 16/24 [00:13<00:07, 1.12it/s]\u001b[A\n",
" 71%|██████████████████████████████▍ | 17/24 [00:14<00:06, 1.11it/s]\u001b[A\n",
" 75%|████████████████████████████████▎ | 18/24 [00:15<00:05, 1.12it/s]\u001b[A\n",
" 79%|██████████████████████████████████ | 19/24 [00:16<00:04, 1.12it/s]\u001b[A\n",
" 83%|███████████████████████████████████▊ | 20/24 [00:17<00:03, 1.12it/s]\u001b[A\n",
" 88%|█████████████████████████████████████▋ | 21/24 [00:17<00:02, 1.12it/s]\u001b[A\n",
" 92%|███████████████████████████████████████▍ | 22/24 [00:18<00:01, 1.12it/s]\u001b[A\n",
" 96%|█████████████████████████████████████████▏ | 23/24 [00:19<00:00, 1.12it/s]\u001b[A\n",
" \u001b[A\n",
"\u001b[A{'eval_loss': 0.414621502161026, 'eval_runtime': 21.2011, 'eval_samples_per_second': 141.502, 'eval_steps_per_second': 1.132, 'epoch': 1.27}\n",
"100%|███████████████████████████████████████| 1000/1000 [20:52<00:00, 1.19s/it]\n",
"100%|███████████████████████████████████████████| 24/24 [00:20<00:00, 1.22it/s]\u001b[A\n",
" \u001b[ASaving model checkpoint to output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000\n",
"tokenizer config file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/tokenizer_config.json\n",
"Special tokens file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/special_tokens_map.json\n",
"[2024-03-15 21:50:12,793] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is about to be saved!\n",
"/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1876: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
" warnings.warn(\n",
"[2024-03-15 21:50:12,809] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt\n",
"[2024-03-15 21:50:12,809] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt...\n",
"[2024-03-15 21:50:12,817] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt.\n",
"[2024-03-15 21:50:12,818] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...\n",
"[2024-03-15 21:50:12,851] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.\n",
"[2024-03-15 21:50:12,852] [INFO] [engine.py:3488:_save_zero_checkpoint] zero checkpoint saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt\n",
"[2024-03-15 21:50:12,856] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1000 is ready now!\n",
"\n",
"\n",
"Training completed. Do not forget to share your model on huggingface.co/models =)\n",
"\n",
"\n",
"{'train_runtime': 1255.2202, 'train_samples_per_second': 50.987, 'train_steps_per_second': 0.797, 'train_loss': 0.43027476024627687, 'epoch': 1.27}\n",
"100%|███████████████████████████████████████| 1000/1000 [20:55<00:00, 1.26s/it]\n",
"[2024-03-15 21:50:18,203] [INFO] [launch.py:348:main] Process 86577 exits successfully.\n"
]
}
],
"source": [
"!bash lora_finetune_ocnli.sh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. 推理验证"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import torch\n",
"from tqdm import tqdm\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"path = \"output/ocnli_public_chatml/20240316002856/checkpoint-1500\"\n",
"tokenizer = AutoTokenizer.from_pretrained(path)\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" path, torch_dtype=torch.bfloat16, device_map=\"cuda\", trust_remote_code=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
]
},
{
"data": {
"text/plain": [
"('entailment',\n",
" [{'role': 'user',\n",
" 'content': '<用户>请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种?\\n句子1: 身上裹一件工厂发的棉大衣,手插在袖筒里\\n句子2:身上至少一件衣服\\n<AI>'},\n",
" {'role': 'assistant', 'content': 'entailment'}])"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res, history = model.chat(tokenizer, query=\"<用户>请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种?\\n句子1: 身上裹一件工厂发的棉大衣,手插在袖筒里\\n句子2:身上至少一件衣服\\n<AI>\")\n",
"res, history"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"with open(\"data/ocnli_public_chatml/dev.json\", 'r') as f:\n",
" dev_sample_list = json.load(f)\n"
]
},
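{
"cell_type": "markdown",
"metadata": {},
"source": [
"Judging from the progress output below, the next cell scores roughly 500 dev samples with `model.chat`. For reference, here is a minimal sketch of such an accuracy loop; it assumes `model.chat` returns the predicted label string, and the helper name `evaluate_accuracy` is hypothetical rather than part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch: score dev samples and report label accuracy.\n",
"# Assumes model, tokenizer and dev_sample_list from the cells above.\n",
"def evaluate_accuracy(samples):\n",
"    correct = 0\n",
"    for sample in tqdm(samples):\n",
"        query = \"<用户>\" + sample[\"messages\"][0][\"content\"] + \"<AI>\"\n",
"        pred, _ = model.chat(tokenizer, query=query)\n",
"        correct += int(pred.strip() == sample[\"messages\"][1][\"content\"])\n",
"    return correct / len(samples)\n",
"\n",
"# evaluate_accuracy(dev_sample_list[:500])\n"
]
},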
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/500 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 0%| | 1/500 [00:00<00:54, 9.12it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 0%| | 2/500 [00:00<00:54, 9.09it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 1%| | 3/500 [00:00<00:55, 8.98it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 1%| | 5/500 [00:00<00:49, 9.99it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 1%| | 6/500 [00:00<00:51, 9.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 2%|▏ | 8/500 [00:00<00:44, 11.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 2%|▏ | 10/500 [00:00<00:48, 10.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 2%|▏ | 12/500 [00:01<00:47, 10.30it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 3%|▎ | 14/500 [00:01<00:46, 10.51it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 3%|▎ | 16/500 [00:01<00:42, 11.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 4%|▎ | 18/500 [00:01<00:40, 11.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 4%|▍ | 20/500 [00:01<00:38, 12.42it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 4%|▍ | 22/500 [00:02<00:42, 11.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 5%|▍ | 24/500 [00:02<00:40, 11.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 5%|▌ | 26/500 [00:02<00:38, 12.32it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 6%|▌ | 28/500 [00:02<00:39, 11.87it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 6%|▌ | 30/500 [00:02<00:40, 11.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 6%|▋ | 32/500 [00:02<00:41, 11.34it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 7%|▋ | 34/500 [00:03<00:39, 11.94it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 7%|▋ | 36/500 [00:03<00:37, 12.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 8%|▊ | 38/500 [00:03<00:38, 11.92it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 8%|▊ | 40/500 [00:03<00:37, 12.37it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 8%|▊ | 42/500 [00:03<00:36, 12.71it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 9%|▉ | 44/500 [00:03<00:40, 11.39it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 9%|▉ | 46/500 [00:04<00:37, 11.98it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 10%|▉ | 48/500 [00:04<00:38, 11.66it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 10%|█ | 50/500 [00:04<00:36, 12.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 10%|█ | 52/500 [00:04<00:35, 12.58it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 11%|█ | 54/500 [00:04<00:34, 12.87it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 11%|█ | 56/500 [00:04<00:34, 13.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 12%|█▏ | 58/500 [00:05<00:38, 11.58it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 12%|█▏ | 60/500 [00:05<00:38, 11.39it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 12%|█▏ | 62/500 [00:05<00:36, 11.97it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 13%|█▎ | 64/500 [00:05<00:35, 12.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 13%|█▎ | 66/500 [00:05<00:36, 11.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 14%|█▎ | 68/500 [00:05<00:37, 11.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 14%|█▍ | 70/500 [00:06<00:39, 10.75it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 14%|█▍ | 72/500 [00:06<00:41, 10.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 15%|█▍ | 74/500 [00:06<00:40, 10.44it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 15%|█▌ | 76/500 [00:06<00:38, 11.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 16%|█▌ | 78/500 [00:06<00:36, 11.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 16%|█▌ | 80/500 [00:06<00:37, 11.35it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 16%|█▋ | 82/500 [00:07<00:37, 11.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 17%|█▋ | 84/500 [00:07<00:37, 11.15it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 17%|█▋ | 86/500 [00:07<00:37, 11.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 18%|█▊ | 88/500 [00:07<00:37, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 18%|█▊ | 90/500 [00:07<00:34, 11.72it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 18%|█▊ | 92/500 [00:08<00:35, 11.50it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 19%|█▉ | 94/500 [00:08<00:33, 12.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 19%|█▉ | 96/500 [00:08<00:34, 11.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 20%|█▉ | 98/500 [00:08<00:32, 12.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 20%|██ | 100/500 [00:08<00:33, 11.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 20%|██ | 102/500 [00:08<00:34, 11.57it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 21%|██ | 104/500 [00:08<00:32, 12.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 21%|██ | 106/500 [00:09<00:31, 12.53it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 22%|██▏ | 108/500 [00:09<00:30, 12.85it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 22%|██▏ | 110/500 [00:09<00:29, 13.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 22%|██▏ | 112/500 [00:09<00:29, 13.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 23%|██▎ | 114/500 [00:09<00:28, 13.35it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 23%|██▎ | 116/500 [00:09<00:28, 13.43it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 24%|██▎ | 118/500 [00:10<00:31, 12.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 24%|██▍ | 120/500 [00:10<00:33, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 24%|██▍ | 122/500 [00:10<00:33, 11.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 25%|██▍ | 124/500 [00:10<00:31, 11.81it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 25%|██▌ | 126/500 [00:10<00:30, 12.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 26%|██▌ | 128/500 [00:10<00:31, 11.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 26%|██▌ | 130/500 [00:11<00:30, 12.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 26%|██▋ | 132/500 [00:11<00:29, 12.63it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 27%|██▋ | 134/500 [00:11<00:32, 11.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 27%|██▋ | 136/500 [00:11<00:34, 10.62it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 28%|██▊ | 138/500 [00:11<00:33, 10.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 28%|██▊ | 140/500 [00:12<00:33, 10.80it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 28%|██▊ | 142/500 [00:12<00:31, 11.52it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 29%|██▉ | 144/500 [00:12<00:31, 11.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 29%|██▉ | 146/500 [00:12<00:29, 11.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 30%|██▉ | 148/500 [00:12<00:30, 11.65it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 30%|███ | 150/500 [00:12<00:28, 12.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 30%|███ | 152/500 [00:12<00:27, 12.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 31%|███ | 154/500 [00:13<00:28, 12.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 31%|███ | 156/500 [00:13<00:27, 12.49it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 32%|███▏ | 158/500 [00:13<00:26, 12.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 32%|███▏ | 160/500 [00:13<00:27, 12.21it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 32%|███▏ | 162/500 [00:13<00:30, 11.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 33%|███▎ | 164/500 [00:14<00:30, 11.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 33%|███▎ | 166/500 [00:14<00:28, 11.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 34%|███▎ | 168/500 [00:14<00:27, 12.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 34%|███▍ | 170/500 [00:14<00:29, 11.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 34%|███▍ | 172/500 [00:14<00:29, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 35%|███▍ | 174/500 [00:14<00:27, 11.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 35%|███▌ | 176/500 [00:15<00:26, 12.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 36%|███▌ | 178/500 [00:15<00:25, 12.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 36%|███▌ | 180/500 [00:15<00:24, 12.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 36%|███▋ | 182/500 [00:15<00:26, 12.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 37%|███▋ | 184/500 [00:15<00:26, 11.79it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 37%|███▋ | 186/500 [00:15<00:25, 12.26it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 38%|███▊ | 188/500 [00:16<00:26, 11.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 38%|███▊ | 190/500 [00:16<00:28, 10.88it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 38%|███▊ | 192/500 [00:16<00:26, 11.57it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 39%|███▉ | 194/500 [00:16<00:26, 11.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 39%|███▉ | 196/500 [00:16<00:27, 11.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 40%|███▉ | 198/500 [00:16<00:25, 11.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 40%|████ | 200/500 [00:17<00:24, 12.21it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 40%|████ | 202/500 [00:17<00:23, 12.59it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 41%|████ | 204/500 [00:17<00:24, 12.04it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 41%|████ | 206/500 [00:17<00:25, 11.69it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 42%|████▏ | 208/500 [00:17<00:25, 11.45it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 42%|████▏ | 210/500 [00:17<00:25, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 42%|████▏ | 212/500 [00:18<00:24, 11.86it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 43%|████▎ | 214/500 [00:18<00:24, 11.52it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 43%|████▎ | 216/500 [00:18<00:23, 12.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 44%|████▎ | 218/500 [00:18<00:25, 11.02it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 44%|████▍ | 220/500 [00:18<00:23, 11.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 44%|████▍ | 222/500 [00:18<00:24, 11.45it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 45%|████▍ | 224/500 [00:19<00:25, 10.64it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 45%|████▌ | 226/500 [00:19<00:25, 10.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 46%|████▌ | 228/500 [00:19<00:25, 10.79it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 46%|████▌ | 230/500 [00:19<00:24, 10.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 46%|████▋ | 232/500 [00:19<00:24, 10.86it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 47%|████▋ | 234/500 [00:20<00:26, 10.16it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 47%|████▋ | 236/500 [00:20<00:25, 10.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 48%|████▊ | 238/500 [00:20<00:23, 11.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 48%|████▊ | 240/500 [00:20<00:22, 11.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 48%|████▊ | 242/500 [00:20<00:23, 10.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 49%|████▉ | 244/500 [00:21<00:22, 11.53it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 49%|████▉ | 246/500 [00:21<00:23, 10.71it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 50%|████▉ | 248/500 [00:21<00:24, 10.22it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 50%|█████ | 250/500 [00:21<00:22, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 50%|█████ | 252/500 [00:21<00:21, 11.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 51%|█████ | 254/500 [00:21<00:20, 12.27it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 51%|█████ | 256/500 [00:22<00:20, 11.81it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 52%|█████▏ | 258/500 [00:22<00:19, 12.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 52%|█████▏ | 260/500 [00:22<00:19, 12.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 52%|█████▏ | 262/500 [00:22<00:18, 12.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 53%|█████▎ | 264/500 [00:22<00:19, 12.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 53%|█████▎ | 266/500 [00:22<00:18, 12.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 54%|█████▎ | 268/500 [00:22<00:18, 12.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 54%|█████▍ | 270/500 [00:23<00:17, 13.04it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 54%|█████▍ | 272/500 [00:23<00:17, 13.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 55%|█████▍ | 274/500 [00:23<00:17, 13.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 55%|█████▌ | 276/500 [00:23<00:17, 12.48it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 56%|█████▌ | 278/500 [00:23<00:17, 12.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 56%|█████▌ | 280/500 [00:23<00:17, 12.94it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 56%|█████▋ | 282/500 [00:24<00:17, 12.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 57%|█████▋ | 284/500 [00:24<00:17, 12.51it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 57%|█████▋ | 286/500 [00:24<00:18, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 58%|█████▊ | 288/500 [00:24<00:18, 11.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 58%|█████▊ | 290/500 [00:24<00:20, 10.43it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 58%|█████▊ | 292/500 [00:25<00:19, 10.57it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 59%|█████▉ | 294/500 [00:25<00:19, 10.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 59%|█████▉ | 296/500 [00:25<00:17, 11.39it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 60%|█████▉ | 298/500 [00:25<00:17, 11.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 60%|██████ | 300/500 [00:25<00:17, 11.15it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 60%|██████ | 302/500 [00:25<00:17, 11.09it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 61%|██████ | 304/500 [00:26<00:16, 11.72it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 61%|██████ | 306/500 [00:26<00:16, 11.47it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 62%|██████▏ | 308/500 [00:26<00:16, 11.31it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 62%|██████▏ | 310/500 [00:26<00:16, 11.85it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 62%|██████▏ | 312/500 [00:26<00:16, 11.47it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 63%|██████▎ | 314/500 [00:26<00:16, 11.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 63%|██████▎ | 316/500 [00:27<00:16, 11.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 64%|██████▎ | 318/500 [00:27<00:16, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 64%|██████▍ | 320/500 [00:27<00:15, 11.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 64%|██████▍ | 322/500 [00:27<00:14, 12.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 65%|██████▍ | 324/500 [00:27<00:13, 12.58it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 65%|██████▌ | 326/500 [00:27<00:14, 12.04it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 66%|██████▌ | 328/500 [00:28<00:13, 12.46it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 66%|██████▌ | 330/500 [00:28<00:14, 11.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 66%|██████▋ | 332/500 [00:28<00:13, 12.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 67%|██████▋ | 334/500 [00:28<00:14, 11.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 67%|██████▋ | 336/500 [00:28<00:13, 12.30it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 68%|██████▊ | 338/500 [00:28<00:13, 11.78it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 68%|██████▊ | 340/500 [00:29<00:13, 12.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 68%|██████▊ | 342/500 [00:29<00:12, 12.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 69%|██████▉ | 344/500 [00:29<00:12, 12.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 69%|██████▉ | 346/500 [00:29<00:13, 11.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 70%|██████▉ | 348/500 [00:29<00:14, 10.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 70%|███████ | 350/500 [00:30<00:14, 10.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 70%|███████ | 352/500 [00:30<00:14, 10.43it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 71%|███████ | 354/500 [00:30<00:13, 10.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 71%|███████ | 356/500 [00:30<00:12, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 72%|███████▏ | 358/500 [00:30<00:13, 10.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 72%|███████▏ | 360/500 [00:31<00:13, 10.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 72%|███████▏ | 362/500 [00:31<00:12, 10.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 73%|███████▎ | 364/500 [00:31<00:12, 11.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 73%|███████▎ | 366/500 [00:31<00:12, 10.81it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 74%|███████▎ | 368/500 [00:31<00:13, 9.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 74%|███████▍ | 370/500 [00:31<00:12, 10.21it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 74%|███████▍ | 372/500 [00:32<00:12, 9.95it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 75%|███████▍ | 374/500 [00:32<00:12, 10.08it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 75%|███████▌ | 376/500 [00:32<00:11, 10.46it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 76%|███████▌ | 378/500 [00:32<00:11, 10.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 76%|███████▌ | 380/500 [00:32<00:11, 10.26it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 76%|███████▋ | 382/500 [00:33<00:11, 10.05it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 77%|███████▋ | 384/500 [00:33<00:11, 10.47it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 77%|███████▋ | 386/500 [00:33<00:10, 10.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 78%|███████▊ | 388/500 [00:33<00:10, 10.85it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 78%|███████▊ | 390/500 [00:33<00:10, 10.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 78%|███████▊ | 392/500 [00:34<00:10, 10.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 79%|███████▉ | 394/500 [00:34<00:09, 11.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 79%|███████▉ | 396/500 [00:34<00:08, 11.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 80%|███████▉ | 398/500 [00:34<00:08, 12.15it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 80%|████████ | 400/500 [00:34<00:09, 11.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 80%|████████ | 402/500 [00:34<00:08, 11.54it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 81%|████████ | 404/500 [00:35<00:08, 11.31it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 81%|████████ | 406/500 [00:35<00:07, 11.87it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 82%|████████▏ | 408/500 [00:35<00:07, 12.30it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 82%|████████▏ | 410/500 [00:35<00:07, 11.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 82%|████████▏ | 412/500 [00:35<00:08, 10.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 83%|████████▎ | 414/500 [00:35<00:07, 11.50it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 83%|████████▎ | 416/500 [00:36<00:06, 12.02it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 84%|████████▎ | 418/500 [00:36<00:07, 11.65it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 84%|████████▍ | 420/500 [00:36<00:06, 12.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 84%|████████▍ | 422/500 [00:36<00:06, 11.69it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 85%|████████▍ | 424/500 [00:36<00:06, 12.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 85%|████████▌ | 426/500 [00:36<00:06, 11.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 86%|████████▌ | 428/500 [00:37<00:05, 12.09it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 86%|████████▌ | 430/500 [00:37<00:05, 12.46it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 86%|████████▋ | 432/500 [00:37<00:06, 11.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 87%|████████▋ | 434/500 [00:37<00:05, 11.79it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 87%|████████▋ | 436/500 [00:37<00:05, 12.26it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 88%|████████▊ | 438/500 [00:37<00:04, 12.62it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 88%|████████▊ | 440/500 [00:38<00:04, 12.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 88%|████████▊ | 442/500 [00:38<00:04, 12.42it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 89%|████████▉ | 444/500 [00:38<00:04, 12.72it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 89%|████████▉ | 446/500 [00:38<00:04, 12.94it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 90%|████████▉ | 448/500 [00:38<00:03, 13.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 90%|█████████ | 450/500 [00:38<00:04, 12.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 90%|█████████ | 452/500 [00:39<00:04, 11.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 91%|█████████ | 454/500 [00:39<00:03, 12.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 91%|█████████ | 456/500 [00:39<00:03, 12.66it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 92%|█████████▏| 458/500 [00:39<00:03, 12.08it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 92%|█████████▏| 460/500 [00:39<00:03, 12.49it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 92%|█████████▏| 462/500 [00:39<00:03, 11.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 93%|█████████▎| 464/500 [00:40<00:03, 11.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 93%|█████████▎| 466/500 [00:40<00:03, 10.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 94%|█████████▎| 468/500 [00:40<00:03, 10.50it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 94%|█████████▍| 470/500 [00:40<00:02, 11.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 94%|█████████▍| 472/500 [00:40<00:02, 11.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 95%|█████████▍| 474/500 [00:40<00:02, 11.64it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 95%|█████████▌| 476/500 [00:41<00:02, 11.37it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 96%|█████████▌| 478/500 [00:41<00:01, 11.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 96%|█████████▌| 480/500 [00:41<00:01, 12.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 96%|█████████▋| 482/500 [00:41<00:01, 12.63it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 97%|█████████▋| 484/500 [00:41<00:01, 12.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 97%|█████████▋| 486/500 [00:41<00:01, 12.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 98%|█████████▊| 488/500 [00:42<00:01, 11.80it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 98%|█████████▊| 490/500 [00:42<00:00, 12.28it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 98%|█████████▊| 492/500 [00:42<00:00, 11.12it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 99%|█████████▉| 494/500 [00:42<00:00, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 99%|█████████▉| 496/500 [00:42<00:00, 10.40it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"100%|█████████▉| 498/500 [00:43<00:00, 10.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"100%|██████████| 500/500 [00:43<00:00, 11.56it/s]\n"
]
}
],
"source": [
"pos = 0\n",
"neg = 0\n",
"for sample in tqdm(dev_sample_list[:500]):\n",
" res, history = model.chat(tokenizer, query=\"<用户>{}<AI>\".format(sample[\"messages\"][0][\"content\"]), max_length=128, top_p=0.5, temperature=0.8)\n",
" if sample[\"messages\"][1][\"content\"] in res.strip().lower():\n",
" pos += 1\n",
" else:\n",
" neg += 1"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.81, 405, 95)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pos / (pos+neg), pos, neg"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
formatted_time=$(date +"%Y%m%d%H%M%S")
echo $formatted_time
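# Single-GPU (GPU 1) LoRA fine-tuning of MiniCPM-2B on the OCNLI ChatML data with DeepSpeed ZeRO-3 offload;
# a timestamped run directory is created under output/OCNLILoRA/ and evaluation/checkpointing happen every 500 steps.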
deepspeed --include localhost:1 --master_port 19888 finetune.py \
--model_name_or_path MiniCPM-2B-sft-bf16 \
--output_dir output/OCNLILoRA/$formatted_time/ \
--train_data_path data/ocnli_public_chatml/train.json \
--eval_data_path data/ocnli_public_chatml/dev.json \
--learning_rate 5e-5 --per_device_train_batch_size 80 \
--per_device_eval_batch_size 128 --model_max_length 128 --bf16 --use_lora \
--gradient_accumulation_steps 1 --warmup_steps 100 \
--max_steps 1000 --weight_decay 0.01 \
--evaluation_strategy steps --eval_steps 500 \
--save_strategy steps --save_steps 500 --seed 42 \
--log_level info --logging_strategy steps --logging_steps 10 \
--deepspeed configs/ds_config_zero3_offload.json
# Copyright © 2023-2024 Apple Inc.
"""
This script demonstrates how to fine-tune MiniCPM-2B with LoRA on the AdvertiseGen dataset using MLX.
The code is adapted from https://github.com/ml-explore/mlx-examples.
The model used is https://huggingface.co/mlx-community/MiniCPM-2B-sft-bf16-llama-format-mlx.
Run this script with the following commands:
train:
First, preprocess the data by running data_processing.ipynb, then run:
python mlx_finetune.py --model MiniCPM-2B-sft-bf16-llama-format-mlx --data data/mlx_AdvertiseGen --train --seed 2024 --iters 500
The output looks like:
Training
Iter 1: Val loss 4.015, Val took 1067.669s
Iter 2: Val loss 4.001, Val took 1061.649s
...
After training finishes, an adapters.npz file is written to the folder and is used for the subsequent test. Then run the test command:
test:
python mlx_finetune.py --model MiniCPM-2B-sft-bf16-llama-format-mlx --data data/mlx_AdvertiseGen --test --seed 2024
The output looks like:
Testing
Test loss 3.977, Test ppl 53.350.
"""
import argparse
import json
import time
from pathlib import Path
from typing import Generator
import transformers
import numpy as np
from huggingface_hub import snapshot_download
import glob
import inspect
import math
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
from mlx.utils import tree_flatten, tree_unflatten
import mlx.optimizers as optim
import mlx.core as mx
import mlx.nn as nn
@dataclass
class ModelArgs:
hidden_size: int
num_hidden_layers: int
intermediate_size: int
num_attention_heads: int
rms_norm_eps: float
vocab_size: int
num_key_value_heads: int = None
rope_theta: float = 10000
rope_traditional: bool = False
model_type: str = None
rope_scaling: Optional[Dict[str, Union[float, str]]] = None
def __post_init__(self):
if self.num_key_value_heads is None:
self.num_key_value_heads = self.num_attention_heads
if self.rope_scaling:
required_keys = {"factor", "type"}
if not all(key in self.rope_scaling for key in required_keys):
raise ValueError(f"rope_scaling must contain keys {required_keys}")
if self.rope_scaling["type"] != "linear":
raise ValueError("rope_scaling 'type' currently only supports 'linear'")
@classmethod
def from_dict(cls, params):
return cls(
**{
k: v
for k, v in params.items()
if k in inspect.signature(cls).parameters
}
)
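# LoRA adapter around an nn.Linear: the forward pass computes
# y = linear(x) + scale * (x @ lora_a) @ lora_b, so the low-rank factors carry the adaptation.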
class LoRALinear(nn.Module):
@staticmethod
def from_linear(linear: nn.Linear, rank: int = 8):
# TODO remove when input_dims and output_dims are attributes
# on linear and quantized linear
output_dims, input_dims = linear.weight.shape
if isinstance(linear, nn.QuantizedLinear):
input_dims *= 32 // linear.bits
lora_lin = LoRALinear(input_dims, output_dims, rank)
lora_lin.linear = linear
return lora_lin
def to_linear(self):
linear = self.linear
bias = "bias" in linear
weight = linear.weight
is_quantized = isinstance(linear, nn.QuantizedLinear)
# Use the same type as the linear weight if not quantized
dtype = weight.dtype
if is_quantized:
dtype = mx.float16
weight = mx.dequantize(
weight,
linear.scales,
linear.biases,
linear.group_size,
linear.bits,
)
output_dims, input_dims = weight.shape
fused_linear = nn.Linear(input_dims, output_dims, bias=bias)
lora_b = (self.scale * self.lora_b.T).astype(dtype)
lora_a = self.lora_a.T.astype(dtype)
fused_linear.weight = weight + lora_b @ lora_a
if bias:
fused_linear.bias = linear.bias
if is_quantized:
fused_linear = nn.QuantizedLinear.from_linear(
fused_linear,
linear.group_size,
linear.bits,
)
return fused_linear
def __init__(
self,
input_dims: int,
output_dims: int,
lora_rank: int = 8,
bias: bool = False,
scale: float = 20.0,
):
super().__init__()
# Regular linear layer weights
self.linear = nn.Linear(input_dims, output_dims, bias=bias)
# Scale for low-rank update
self.scale = scale
# Low rank lora weights
scale = 1 / math.sqrt(input_dims)
self.lora_a = mx.random.uniform(
low=-scale,
high=scale,
shape=(input_dims, lora_rank),
)
self.lora_b = mx.zeros(shape=(lora_rank, output_dims))
def __call__(self, x):
dtype = self.linear.weight.dtype
if isinstance(self.linear, nn.QuantizedLinear):
dtype = self.linear.scales.dtype
y = self.linear(x.astype(dtype))
z = (x @ self.lora_a) @ self.lora_b
return y + self.scale * z
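# Grouped-query attention with rotary position embeddings: n_heads query heads
# attend over n_kv_heads key/value heads, with an optional KV cache for generation.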
class Attention(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
dim = args.hidden_size
self.n_heads = n_heads = args.num_attention_heads
self.n_kv_heads = n_kv_heads = args.num_key_value_heads
self.repeats = n_heads // n_kv_heads
head_dim = args.hidden_size // n_heads
self.scale = head_dim ** -0.5
self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=False)
self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=False)
rope_scale = (
1 / args.rope_scaling["factor"]
if args.rope_scaling is not None and args.rope_scaling["type"] == "linear"
else 1
)
self.rope = nn.RoPE(
head_dim,
traditional=args.rope_traditional,
base=args.rope_theta,
scale=rope_scale,
)
def __call__(
self,
x: mx.array,
mask: Optional[mx.array] = None,
cache: Optional[Tuple[mx.array, mx.array]] = None,
) -> mx.array:
B, L, D = x.shape
queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)
# Prepare the queries, keys and values for the attention computation
queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)
keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = mx.concatenate([key_cache, keys], axis=2)
values = mx.concatenate([value_cache, values], axis=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
output = mx.fast.scaled_dot_product_attention(
queries, keys, values, scale=self.scale, mask=mask
)
output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
return self.o_proj(output), (keys, values)
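# SwiGLU-style feed-forward: down_proj(silu(gate_proj(x)) * up_proj(x)).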
class MLP(nn.Module):
def __init__(self, dim, hidden_dim):
super().__init__()
self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
def __call__(self, x) -> mx.array:
return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))
class TransformerBlock(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.num_attention_heads = args.num_attention_heads
self.hidden_size = args.hidden_size
self.self_attn = Attention(args)
self.mlp = MLP(args.hidden_size, args.intermediate_size)
self.input_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
self.post_attention_layernorm = nn.RMSNorm(
args.hidden_size, eps=args.rms_norm_eps
)
self.args = args
def __call__(
self,
x: mx.array,
mask: Optional[mx.array] = None,
cache: Optional[Tuple[mx.array, mx.array]] = None,
) -> mx.array:
r, cache = self.self_attn(self.input_layernorm(x), mask, cache)
h = x + r
r = self.mlp(self.post_attention_layernorm(h))
out = h + r
return out, cache
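# Decoder-only backbone: token embedding, a stack of TransformerBlocks with a causal mask, and a final RMSNorm.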
class LlamaModel(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.args = args
self.vocab_size = args.vocab_size
self.num_hidden_layers = args.num_hidden_layers
assert self.vocab_size > 0
self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
self.layers = [
TransformerBlock(args=args) for _ in range(args.num_hidden_layers)
]
self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
def __call__(
self,
inputs: mx.array,
cache=None,
):
h = self.embed_tokens(inputs)
mask = None
if h.shape[1] > 1:
mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
mask = mask.astype(h.dtype)
if cache is None:
cache = [None] * len(self.layers)
for e, layer in enumerate(self.layers):
h, cache[e] = layer(h, mask, cache[e])
return self.norm(h), cache
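# Causal LM head: the Llama-format backbone followed by a separate lm_head projection to vocabulary logits.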
class Model(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.model = LlamaModel(args)
self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
def __call__(
self,
inputs: mx.array,
cache=None,
):
out, cache = self.model(inputs, cache)
return self.lm_head(out), cache
def build_parser():
parser = argparse.ArgumentParser(description="LoRA or QLoRA finetuning.")
parser.add_argument(
"--model",
default="/Users/liudan/Downloads/模型/llamaformat_minicpm",
help="The path to the local model directory or Hugging Face repo.",
)
# Generation args
parser.add_argument(
"--max-tokens",
"-m",
type=int,
default=100,
help="The maximum number of tokens to generate",
)
parser.add_argument(
"--temp", type=float, default=0.8, help="The sampling temperature"
)
parser.add_argument(
"--prompt",
"-p",
type=str,
help="The prompt for generation"
)
# Training args
parser.add_argument(
"--train",
action="store_true",
help="Do training",
)
parser.add_argument(
"--data",
type=str,
default="data/mlx_AdvertiseGen",
help="Directory with {train, valid, test}.json files",
)
parser.add_argument(
"--lora-layers",
type=int,
default=16,
help="Number of layers to fine-tune",
)
parser.add_argument("--batch-size", type=int, default=4, help="Minibatch size.")
parser.add_argument(
"--iters", type=int, default=1000, help="Iterations to train for."
)
parser.add_argument(
"--val-batches",
type=int,
default=25,
help="Number of validation batches, -1 uses the entire validation set.",
)
parser.add_argument(
"--learning-rate", type=float, default=1e-5, help="Adam learning rate."
)
parser.add_argument(
"--steps-per-report",
type=int,
default=10,
help="Number of training steps between loss reporting.",
)
parser.add_argument(
"--steps-per-eval",
type=int,
default=200,
help="Number of training steps between validations.",
)
parser.add_argument(
"--resume-adapter-file",
type=str,
default=None,
help="Load path to resume training with the given adapter weights.",
)
parser.add_argument(
"--adapter-file",
type=str,
default="adapters.npz",
help="Save/load path for the trained adapter weights.",
)
parser.add_argument(
"--save-every",
type=int,
default=100,
help="Save the model every N iterations.",
)
parser.add_argument(
"--test",
action="store_true",
help="Evaluate on the test set after training",
)
parser.add_argument(
"--test-batches",
type=int,
default=500,
help="Number of test set batches, -1 uses the entire test set.",
)
parser.add_argument("--seed", type=int, default=0, help="The PRNG seed")
return parser
class ConversationDataset:
def __init__(self, path: Path):
with open(path, "r") as fid:
self._data = [json.loads(l) for l in fid]
def __getitem__(self, idx: int):
entry = self._data[idx]
content = entry.get("input", "")
summary = entry.get("output", "")
prompt = entry.get("prompt", "")
return prompt, content, summary
def __len__(self):
return len(self._data)
def load(args):
def load_and_check(name):
dataset_path = Path(args.data) / f"{name}.json"
try:
return ConversationDataset(dataset_path)
except Exception as e:
print(f"Unable to build dataset {dataset_path} ({e})")
raise
names = ("train", "dev", "dev")
train, valid, test = (load_and_check(n) for n in names)
if args.train and len(train) == 0:
raise ValueError(
"Training set not found or empty. Must provide training set for fine-tuning."
)
if args.train and len(valid) == 0:
raise ValueError(
"Validation set not found or empty. Must provide validation set for fine-tuning."
)
if args.test and len(test) == 0:
raise ValueError(
"Test set not found or empty. Must provide test set for evaluation."
)
return train, valid, test
def loss(model, inputs, targets, lengths):
logits, _ = model(inputs)
logits = logits.astype(mx.float32)
length_mask = mx.arange(inputs.shape[1])[None, :] < lengths[:, None]
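    # Mask out padded positions so only real tokens contribute to the loss.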
ce = nn.losses.cross_entropy(logits, targets) * length_mask
ntoks = length_mask.sum()
ce = ce.sum() / ntoks
return ce, ntoks
def iterate_batches(dset, tokenizer, batch_size, train=False):
# Shuffle indices
while True:
indices = np.arange(len(dset))
if train:
indices = np.random.permutation(indices)
# Collect batches from dataset
for i in range(0, len(indices) - batch_size + 1, batch_size):
# Encode batch
batch_samples=[dset[indices[i + j]] for j in range(batch_size)]
batch_format_text=['<用户>{}<AI>{}'.format(i[1]+i[0],i[2]) for i in batch_samples]
batch = [tokenizer.encode(i)+[tokenizer.eos_token_id] for i in batch_format_text]
lengths = [len(x) for x in batch]
# Check if any sequence is longer than 2048 tokens
if max(lengths) > 2048:
print(
"[WARNING] Some sequences are longer than 2048 tokens. "
"Consider pre-splitting your data to save memory."
)
# Pad to the max length
batch_arr = np.zeros((batch_size, max(lengths)), np.int32)
for j in range(batch_size):
batch_arr[j, : lengths[j]] = batch[j]
batch = mx.array(batch_arr)
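            # Shift by one token: inputs are positions [0, T-1) and targets are
            # positions [1, T), so each position is trained to predict the next token;
            # `lengths` lets the loss mask out the padding.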
yield batch[:, :-1], batch[:, 1:], mx.array(lengths)
if not train:
break
def load_model(path_or_hf_repo: str):
    # If the path exists, try to load the model from it;
    # otherwise download it from the Hugging Face Hub and cache it.
model_path = Path(path_or_hf_repo)
if not model_path.exists():
model_path = Path(
snapshot_download(
repo_id=path_or_hf_repo,
allow_patterns=["*.json", "*.safetensors", "tokenizer.model"],
)
)
with open(model_path / "config.json", "r") as f:
config = json.loads(f.read())
quantization = config.get("quantization", None)
weight_files = glob.glob(str(model_path / "*.safetensors"))
if len(weight_files) == 0:
raise FileNotFoundError("No safetensors found in {}".format(model_path))
weights = {}
for wf in weight_files:
weights.update(mx.load(wf).items())
model_args = ModelArgs.from_dict(config)
model = Model(model_args)
if quantization is not None:
nn.QuantizedLinear.quantize_module(
model,
**quantization,
linear_class_predicate=lambda m: isinstance(m, nn.Linear)
and m.weight.shape[0] != 8,
)
model.load_weights(list(weights.items()))
mx.eval(model.parameters())
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
return model, tokenizer, config
def generate(
prompt: mx.array, model: nn.Module, temp: float = 0.0
) -> Generator[mx.array, None, None]:
"""
Generate text based on the given prompt and model.
Args:
prompt (mx.array): The input prompt.
model (nn.Module): The model to use for generation.
temp (float): The temperature for sampling. If temp is 0, use max sampling.
Yields:
mx.array: The generated text.
"""
def sample(logits: mx.array) -> mx.array:
return (
mx.argmax(logits, axis=-1)
if temp == 0
else mx.random.categorical(logits * (1 / temp))
)
y = prompt
cache = None
while True:
logits, cache = model(y[None], cache=cache)
logits = logits[:, -1, :]
y = sample(logits)
yield y
def evaluate(model, dataset, loss, tokenizer, batch_size, num_batches):
all_losses = []
ntokens = 0
for it, batch in zip(
range(num_batches),
iterate_batches(dataset, tokenizer, batch_size),
):
losses, toks = loss(model, *batch)
all_losses.append((losses * toks).item())
ntokens += toks.item()
return np.sum(all_losses) / ntokens
def train(model, train_set, val_set, optimizer, loss, tokenizer, args):
# Create value and grad function for loss
loss_value_and_grad = nn.value_and_grad(model, loss)
losses = []
n_tokens = 0
# Main training loop
start = time.perf_counter()
for it, batch in zip(
range(args.iters),
iterate_batches(train_set, tokenizer, args.batch_size, train=True),
):
# Forward and backward pass
(lvalue, toks), grad = loss_value_and_grad(model, *batch)
# Model update
optimizer.update(model, grad)
mx.eval(model.parameters(), optimizer.state, lvalue)
# Record loss
losses.append(lvalue.item())
n_tokens += toks.item()
if (it + 1) % args.steps_per_report == 0:
train_loss = np.mean(losses)
stop = time.perf_counter()
print(
f"Iter {it + 1}: Train loss {train_loss:.3f}, "
f"It/sec {args.steps_per_report / (stop - start):.3f}, "
f"Tokens/sec {float(n_tokens) / (stop - start):.3f}"
)
losses = []
n_tokens = 0
start = time.perf_counter()
# Report validation loss if needed
if it == 0 or (it + 1) % args.steps_per_eval == 0:
stop = time.perf_counter()
val_loss = evaluate(
model, val_set, loss, tokenizer, args.batch_size, args.val_batches
)
print(
f"Iter {it + 1}: "
f"Val loss {val_loss:.3f}, "
f"Val took {(time.perf_counter() - stop):.3f}s"
)
start = time.perf_counter()
# Save adapter weights if needed
if (it + 1) % args.save_every == 0:
mx.savez(
args.adapter_file, **dict(tree_flatten(model.trainable_parameters()))
)
print(f"Iter {it + 1}: Saved adapter weights to {args.adapter_file}.")
def generate_string(model, prompt, tokenizer, args):
print(prompt, end="", flush=True)
prompt = mx.array(tokenizer.encode(prompt))
tokens = []
skip = 0
for token, n in zip(
generate(prompt, model, args.temp),
range(args.max_tokens),
):
if token == tokenizer.eos_token_id:
break
tokens.append(token.item())
s = tokenizer.decode(tokens)
if len(s) - skip > 1:
print(s[skip:-1], end="", flush=True)
skip = len(s) - 1
print(tokenizer.decode(tokens)[skip:], flush=True)
print("=" * 10)
if len(tokens) == 0:
print("No tokens generated for this prompt")
return
if __name__ == "__main__":
parser = build_parser()
args = parser.parse_args()
np.random.seed(args.seed)
print("Loading pretrained model")
model, tokenizer, _ = load_model(args.model)
# Freeze all layers other than LORA linears
model.freeze()
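    # Wrap q_proj and v_proj of the last `--lora-layers` transformer blocks with LoRA
    # adapters; after model.freeze() these adapters are the only trainable parameters.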
for l in model.model.layers[len(model.model.layers) - args.lora_layers:]:
l.self_attn.q_proj = LoRALinear.from_linear(l.self_attn.q_proj)
l.self_attn.v_proj = LoRALinear.from_linear(l.self_attn.v_proj)
if hasattr(l, "block_sparse_moe"):
l.block_sparse_moe.gate = LoRALinear.from_linear(l.block_sparse_moe.gate)
p = sum(v.size for _, v in tree_flatten(model.parameters())) / 10 ** 6
print(f"Total parameters {p:.3f}M")
p = sum(v.size for _, v in tree_flatten(model.trainable_parameters())) / 10 ** 6
print(f"Trainable parameters {p:.3f}M")
print("Loading datasets")
train_set, valid_set, test_set = load(args)
# Resume training the given adapters.
if args.resume_adapter_file is not None:
print(f"Loading pretrained adapters from {args.resume_adapter_file}")
model.load_weights(args.resume_adapter_file, strict=False)
if args.train:
print("Training")
opt = optim.Adam(learning_rate=args.learning_rate)
# Train model
train(model, train_set, valid_set, opt, loss, tokenizer, args)
# Save adapter weights
mx.savez(args.adapter_file, **dict(tree_flatten(model.trainable_parameters())))
# Load the LoRA adapter weights which we assume should exist by this point
if not Path(args.adapter_file).is_file():
raise ValueError(
f"Adapter file {args.adapter_file} missing. "
"Use --train to learn and save the adapters.npz."
)
model.load_weights(args.adapter_file, strict=False)
if args.test:
print("Testing")
model.eval()
test_loss = evaluate(
model,
test_set,
loss,
tokenizer,
args.batch_size,
num_batches=args.test_batches,
)
test_ppl = math.exp(test_loss)
print(f"Test loss {test_loss:.3f}, Test ppl {test_ppl:.3f}.")
if args.prompt is not None:
print("Generating")
generate_string(model, args.prompt, tokenizer, args)
# for finetune
jieba>=0.42.1
ruamel_yaml>=0.18.5
rouge_chinese>=1.0.3
jupyter>=1.0.0
datasets>=2.16.1
peft>=0.7.1
deepspeed>=0.13.1
flash_attn>=2.5.1
transformers>=4.39.1
torch>=2.2.0
triton>=2.2.0
httpx>=0.27.0
gradio>=4.26.0
accelerate>=0.29.2
sentence_transformers>=2.6.1
sse_starlette>=2.1.0
tiktoken>=0.6.0
mlx_lm>=0.8.0
openai>=0.16.2
# -*- coding: utf-8 -*-
import json
import os
from dataclasses import dataclass, field
from typing import Dict, Optional, Union, Any
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from torch.utils.data import Dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
Trainer,
TrainingArguments as HFTrainingArguments,
BitsAndBytesConfig,
)
import copy
@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(default="openbmb/MiniCPM-2B-sft-bf16")
@dataclass
class DataArguments:
train_data_path: str = field(
default="data/AdvertiseGenChatML/train.json",
metadata={"help": "Path to the training data."},
)
eval_data_path: str = field(
default="data/AdvertiseGenChatML/dev.json",
metadata={"help": "Path to the test data."},
)
@dataclass
class TrainingArguments(HFTrainingArguments):
cache_dir: Optional[str] = field(default=None)
optim: str = field(default="adamw_torch")
model_max_length: int = field(
default=512,
metadata={
"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
},
)
use_lora: bool = field(default=False)
qlora: bool = field(default=False)
    # DPO-related arguments
use_dpo: bool = field(default=False, metadata={"help": "Whether to use DPO training"})
dpo_beta: float = field(default=0.1, metadata={"help": "Beta parameter for DPO loss"})
reference_model_path: Optional[str] = field(default=None, metadata={"help": "Path to reference model for DPO"})
    # SFT loss weight (used when combined with DPO)
sft_loss_weight: float = field(default=0.0, metadata={"help": "Weight for SFT loss when combined with DPO"})
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(
self,
data_path,
tokenizer,
model_max_length=4096,
):
super(SupervisedDataset, self).__init__()
self.data = json.load(open(data_path))
self.tokenizer = tokenizer
self.model_max_length = model_max_length
self.ignore_index = -100
item = self.preprocessing(self.data[0])
print("input:", self.tokenizer.decode(item["input_ids"]))
labels = []
for id_ in item["labels"]:
if id_ == -100:
continue
labels.append(id_)
print("label:", self.tokenizer.decode(labels))
def __len__(self):
return len(self.data)
def preprocessing(self, example):
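        # Build token ids and labels in parallel: prompt tokens (system/user turns) get
        # ignore_index (-100) so the cross-entropy loss is computed only on the
        # assistant tokens and the trailing EOS.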
input_ids = [self.tokenizer.bos_token_id]
label_ids = [self.ignore_index]
for message in example["messages"]:
role = message["role"]
content = message["content"]
content_ids = self.tokenizer.apply_chat_template([message])
if role == "user":
if self.tokenizer.eos_token_id == 73440: # minicpm3.0 and minicpm4.0
input_ids += self.tokenizer.apply_chat_template(
[message], add_generation_prompt=True
)
label_ids += [self.ignore_index] * len(
self.tokenizer.apply_chat_template(
[message], add_generation_prompt=True
)
)
else: # minicpm2.0
input_ids += content_ids
label_ids += [self.ignore_index] * len(content_ids)
elif role == "system":
input_ids += content_ids
label_ids += [self.ignore_index] * len(content_ids)
elif role == "assistant":
if self.tokenizer.eos_token_id == 73440: # minicpm3.0 and minicpm4.0
input_ids += self.tokenizer.encode(content, add_special_tokens=False)
label_ids += self.tokenizer.encode(content, add_special_tokens=False)
else: # minicpm2.0
input_ids += content_ids
label_ids += content_ids
input_ids.append(self.tokenizer.eos_token_id)
label_ids.append(self.tokenizer.eos_token_id)
# truncate to max len
input_ids = input_ids[: self.model_max_length]
label_ids = label_ids[: self.model_max_length]
attention_mask = [1] * len(input_ids)
# pad to max len
input_ids += [self.tokenizer.eos_token_id] * (
self.model_max_length - len(input_ids)
)
label_ids += [self.ignore_index] * (self.model_max_length - len(label_ids))
attention_mask += [0] * (self.model_max_length - len(attention_mask))
# convert to pt tensor
input_ids = torch.LongTensor(input_ids)
label_ids = torch.LongTensor(label_ids)
attention_mask = torch.LongTensor(attention_mask)
return {
"input_ids": input_ids,
"labels": label_ids,
"attention_mask": attention_mask,
}
def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
return self.preprocessing(self.data[idx])
class DPODataset(Dataset):
"""Dataset for DPO training with optional SFT data."""
def __init__(self, data_path, tokenizer, model_max_length=4096, include_sft_data=False):
super(DPODataset, self).__init__()
self.data = json.load(open(data_path))
self.tokenizer = tokenizer
self.model_max_length = model_max_length
self.ignore_index = -100
self.include_sft_data = include_sft_data
# 展示第一个样本的处理结果
if len(self.data) > 0:
item = self.preprocessing(self.data[0])
print("DPO Dataset Sample:")
print("Chosen input:", self.tokenizer.decode(item["chosen_input_ids"], skip_special_tokens=True))
print("Rejected input:", self.tokenizer.decode(item["rejected_input_ids"], skip_special_tokens=True))
def __len__(self):
return len(self.data)
def build_conversation(self, instruction, input_text="", history=None):
"""构建对话格式"""
messages = []
# 添加历史对话
if history:
for user_msg, assistant_msg in history:
messages.append({"role": "user", "content": user_msg})
messages.append({"role": "assistant", "content": assistant_msg})
# 添加当前指令
current_input = instruction
if input_text:
current_input = f"{instruction}\n{input_text}"
messages.append({"role": "user", "content": current_input})
return messages
def encode_conversation(self, messages, response):
"""编码对话和回复"""
# 构建完整对话
full_messages = messages + [{"role": "assistant", "content": response}]
# 使用chat template编码
if hasattr(self.tokenizer, 'apply_chat_template'):
input_ids = self.tokenizer.apply_chat_template(
full_messages,
tokenize=True,
add_generation_prompt=False,
return_tensors="pt"
).squeeze(0)
else:
# 如果没有chat template,使用简单拼接
text = ""
for msg in full_messages:
if msg["role"] == "user":
text += f"User: {msg['content']}\n"
elif msg["role"] == "assistant":
text += f"Assistant: {msg['content']}\n"
input_ids = self.tokenizer.encode(text, return_tensors="pt").squeeze(0)
# 截断到最大长度
if len(input_ids) > self.model_max_length:
input_ids = input_ids[:self.model_max_length]
# 计算attention mask
attention_mask = torch.ones_like(input_ids)
# 填充到固定长度
if len(input_ids) < self.model_max_length:
pad_length = self.model_max_length - len(input_ids)
input_ids = torch.cat([
input_ids,
torch.full((pad_length,), self.tokenizer.pad_token_id, dtype=input_ids.dtype)
])
attention_mask = torch.cat([
attention_mask,
torch.zeros(pad_length, dtype=attention_mask.dtype)
])
return input_ids, attention_mask
def encode_conversation_with_labels(self, messages, response):
"""编码对话和回复,同时生成SFT训练所需的labels"""
# 构建完整对话
full_messages = messages + [{"role": "assistant", "content": response}]
# 使用chat template编码
if hasattr(self.tokenizer, 'apply_chat_template'):
input_ids = self.tokenizer.apply_chat_template(
full_messages,
tokenize=True,
add_generation_prompt=False,
return_tensors="pt"
).squeeze(0)
else:
# 如果没有chat template,使用简单拼接
text = ""
for msg in full_messages:
if msg["role"] == "user":
text += f"User: {msg['content']}\n"
elif msg["role"] == "assistant":
text += f"Assistant: {msg['content']}\n"
input_ids = self.tokenizer.encode(text, return_tensors="pt").squeeze(0)
# 创建labels,只对assistant回复部分计算损失
labels = input_ids.clone()
# 编码不包含assistant回复的部分,用于确定哪些token需要ignore
prompt_messages = messages
if hasattr(self.tokenizer, 'apply_chat_template'):
prompt_ids = self.tokenizer.apply_chat_template(
prompt_messages,
tokenize=True,
add_generation_prompt=True,
return_tensors="pt"
).squeeze(0)
else:
prompt_text = ""
for msg in prompt_messages:
if msg["role"] == "user":
prompt_text += f"User: {msg['content']}\n"
prompt_text += "Assistant: "
prompt_ids = self.tokenizer.encode(prompt_text, return_tensors="pt").squeeze(0)
# 对prompt部分设置ignore_index
prompt_len = len(prompt_ids)
if prompt_len < len(labels):
labels[:prompt_len] = self.ignore_index
# 截断到最大长度
if len(input_ids) > self.model_max_length:
input_ids = input_ids[:self.model_max_length]
labels = labels[:self.model_max_length]
# 计算attention mask
attention_mask = torch.ones_like(input_ids)
# 填充到固定长度
if len(input_ids) < self.model_max_length:
pad_length = self.model_max_length - len(input_ids)
input_ids = torch.cat([
input_ids,
torch.full((pad_length,), self.tokenizer.pad_token_id, dtype=input_ids.dtype)
])
labels = torch.cat([
labels,
torch.full((pad_length,), self.ignore_index, dtype=labels.dtype)
])
attention_mask = torch.cat([
attention_mask,
torch.zeros(pad_length, dtype=attention_mask.dtype)
])
return input_ids, attention_mask, labels
def preprocessing(self, example):
"""预处理DPO数据样本"""
instruction = example["instruction"]
input_text = example.get("input", "")
chosen = example["chosen"]
rejected = example["rejected"]
history = example.get("history", [])
# 构建对话消息
messages = self.build_conversation(instruction, input_text, history)
# 编码chosen和rejected回复
chosen_input_ids, chosen_attention_mask = self.encode_conversation(messages, chosen)
rejected_input_ids, rejected_attention_mask = self.encode_conversation(messages, rejected)
result = {
"chosen_input_ids": chosen_input_ids,
"chosen_attention_mask": chosen_attention_mask,
"rejected_input_ids": rejected_input_ids,
"rejected_attention_mask": rejected_attention_mask,
}
# 只有在需要SFT损失时才生成相关数据
if self.include_sft_data:
chosen_input_ids_sft, chosen_attention_mask_sft, chosen_labels = self.encode_conversation_with_labels(messages, chosen)
result.update({
"chosen_input_ids_sft": chosen_input_ids_sft,
"chosen_attention_mask_sft": chosen_attention_mask_sft,
"chosen_labels": chosen_labels,
})
return result
def __getitem__(self, idx):
return self.preprocessing(self.data[idx])
class DPODataCollator:
"""自定义的DPO数据collator,处理特殊的DPO数据格式"""
def __init__(self, tokenizer, include_sft_data=False):
self.tokenizer = tokenizer
self.include_sft_data = include_sft_data
def __call__(self, features):
batch = {}
# 处理基本的DPO字段
dpo_keys = ["chosen_input_ids", "chosen_attention_mask", "rejected_input_ids", "rejected_attention_mask"]
for key in dpo_keys:
if key in features[0]:
batch[key] = torch.stack([f[key] for f in features])
# 如果包含SFT数据,也处理SFT相关字段
if self.include_sft_data:
sft_keys = ["chosen_input_ids_sft", "chosen_attention_mask_sft", "chosen_labels"]
for key in sft_keys:
if key in features[0]:
batch[key] = torch.stack([f[key] for f in features])
return batch
class DPOTrainer(Trainer):
"""Custom Trainer for DPO with optional SFT loss."""
def __init__(self, reference_model=None, dpo_beta=0.1, sft_loss_weight=0.0, **kwargs):
super().__init__(**kwargs)
self.reference_model = reference_model
self.dpo_beta = dpo_beta
self.sft_loss_weight = sft_loss_weight
self.use_sft = sft_loss_weight > 0
# 将参考模型移动到正确的设备
if self.reference_model is not None:
self.reference_model.to(self.args.device)
self.reference_model.eval()
# 确保参考模型不需要梯度
for param in self.reference_model.parameters():
param.requires_grad = False
def get_log_probabilities(self, model, input_ids, attention_mask):
"""计算序列的log概率"""
outputs = model(input_ids=input_ids, attention_mask=attention_mask)
logits = outputs.logits
# 计算每个token的log概率
log_probs = F.log_softmax(logits, dim=-1)
# 获取实际token的log概率
# shift操作:预测下一个token
shift_log_probs = log_probs[..., :-1, :].contiguous()
shift_labels = input_ids[..., 1:].contiguous()
shift_attention = attention_mask[..., 1:].contiguous()
# 收集每个位置的log概率
gathered_log_probs = torch.gather(
shift_log_probs,
dim=-1,
index=shift_labels.unsqueeze(-1)
).squeeze(-1)
# 只计算非padding部分的平均log概率
masked_log_probs = gathered_log_probs * shift_attention.float()
sequence_log_prob = masked_log_probs.sum(dim=-1) / (shift_attention.sum(dim=-1).float() + 1e-8)
return sequence_log_prob
def compute_dpo_loss(self, policy_chosen_logps, policy_rejected_logps,
reference_chosen_logps, reference_rejected_logps):
"""计算DPO损失函数"""
# 计算相对于参考模型的log概率比值
policy_ratio_chosen = policy_chosen_logps - reference_chosen_logps
policy_ratio_rejected = policy_rejected_logps - reference_rejected_logps
# DPO损失
logits = self.dpo_beta * (policy_ratio_chosen - policy_ratio_rejected)
loss = -F.logsigmoid(logits).mean()
# 计算准确率(chosen概率高于rejected的比例)
accuracy = (policy_ratio_chosen > policy_ratio_rejected).float().mean()
return loss, accuracy
def compute_sft_loss(self, logits, labels):
"""计算SFT损失"""
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
shift_logits = shift_logits.view(-1, shift_logits.size(-1))
shift_labels = shift_labels.view(-1)
loss = loss_fct(shift_logits, shift_labels)
return loss
def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
"""计算DPO损失和可选的SFT损失,并分别输出"""
# 计算策略模型的log概率(用于DPO)
policy_chosen_logps = self.get_log_probabilities(
model, inputs["chosen_input_ids"], inputs["chosen_attention_mask"]
)
policy_rejected_logps = self.get_log_probabilities(
model, inputs["rejected_input_ids"], inputs["rejected_attention_mask"]
)
# 计算参考模型的log概率(用于DPO)
with torch.no_grad():
reference_chosen_logps = self.get_log_probabilities(
self.reference_model, inputs["chosen_input_ids"], inputs["chosen_attention_mask"]
)
reference_rejected_logps = self.get_log_probabilities(
self.reference_model, inputs["rejected_input_ids"], inputs["rejected_attention_mask"]
)
# 计算DPO损失
dpo_loss, accuracy = self.compute_dpo_loss(
policy_chosen_logps, policy_rejected_logps,
reference_chosen_logps, reference_rejected_logps
)
# 初始化总损失为DPO损失
total_loss = dpo_loss
# 准备日志字典
log_dict = {
"dpo_loss": dpo_loss.item(),
"dpo_accuracy": accuracy.item()
}
# 计算SFT损失(如果启用且数据可用)
sft_loss = None
if self.use_sft and "chosen_labels" in inputs:
# 使用chosen回复计算SFT损失
outputs = model(
input_ids=inputs["chosen_input_ids_sft"],
attention_mask=inputs["chosen_attention_mask_sft"]
)
sft_loss = self.compute_sft_loss(outputs.logits, inputs["chosen_labels"])
# 将SFT损失加入总损失
total_loss = total_loss + self.sft_loss_weight * sft_loss
# 添加SFT损失到日志
log_dict.update({
"sft_loss": sft_loss.item(),
"sft_loss_weight": self.sft_loss_weight,
"total_loss": total_loss.item()
})
else:
# 如果没有SFT损失,总损失就是DPO损失
log_dict["total_loss"] = total_loss.item()
# 记录所有指标
self.log(log_dict)
return (total_loss, None) if return_outputs else total_loss
def load_model_and_tokenizer(
model_path: str,
max_length: int = 4096,
use_lora: bool = True,
qlora: bool = False,
bf16: bool = False,
fp16: bool = False,
):
"""load model and tokenizer"""
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
assert not (bf16 and fp16), "bf16 or fp16, not both"
if bf16:
dtype = torch.bfloat16
elif fp16:
dtype = torch.float16
else:
dtype = torch.float32
if qlora:
assert use_lora, "use_lora must be True when use_qlora is True"
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,  # quantize to 4 bits
            load_in_8bit=False,  # quantize to 8 bits
            bnb_4bit_compute_dtype=torch.float16,  # compute dtype
            bnb_4bit_quant_storage=torch.uint8,  # storage dtype for the quantized weights
            bnb_4bit_quant_type="nf4",  # quantization format: 4-bit normal float (nf4)
            bnb_4bit_use_double_quant=True,  # double quantization (also quantize the zero-point and scaling factors)
            llm_int8_enable_fp32_cpu_offload=False,  # keep CPU-offloaded parameters in fp32 when using int8
            llm_int8_has_fp16_weight=False,  # mixed-precision int8
            # llm_int8_skip_modules=["out_proj", "kv_proj", "lm_head"],  # modules to skip during quantization
            llm_int8_threshold=6.0,  # outlier threshold for the llm.int8() algorithm
        )
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=dtype,
trust_remote_code=True,
quantization_config=quantization_config,
)
else:
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=dtype,
trust_remote_code=True,
)
if use_lora:
from peft import LoraConfig, TaskType, get_peft_model
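        # MiniCPM3 uses MLA-style attention (decomposed low-rank q/kv projections), so
        # LoRA targets those projection layers; other MiniCPM variants use the standard
        # q_proj / v_proj modules.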
lora_config = LoraConfig(
init_lora_weights="gaussian",
task_type=TaskType.CAUSAL_LM,
target_modules=(
["q_a_proj", "kv_a_proj_with_mqa", "q_b_proj", "kv_b_proj"]
if hasattr(model.config, 'architectures') and model.config.architectures == ["MiniCPM3ForCausalLM"]
else ["q_proj", "v_proj"]
),
r=64,
lora_alpha=32,
lora_dropout=0.1,
inference_mode=False,
)
model = get_peft_model(model, lora_config)
# trainable params: 2,949,120 || all params: 3,010,652,928 || trainable%: 0.09795616002669305
model.print_trainable_parameters()
model.enable_input_require_grads() # need when using adapter
return model, tokenizer
def load_reference_model(model_path, bf16=False, fp16=False):
"""加载参考模型(用于DPO训练)"""
assert not (bf16 and fp16), "bf16 or fp16, not both"
if bf16:
dtype = torch.bfloat16
elif fp16:
dtype = torch.float16
else:
dtype = torch.float32
reference_model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=dtype,
trust_remote_code=True,
)
# 参考模型不需要梯度
for param in reference_model.parameters():
param.requires_grad = False
return reference_model
if __name__ == "__main__":
parser = transformers.HfArgumentParser(
(ModelArguments, DataArguments, TrainingArguments)
)
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Create output directory
os.makedirs(training_args.output_dir, exist_ok=True)
model, tokenizer = load_model_and_tokenizer(
model_path=model_args.model_name_or_path,
max_length=training_args.model_max_length,
use_lora=training_args.use_lora,
qlora=training_args.qlora,
bf16=training_args.bf16,
fp16=training_args.fp16,
)
if training_args.use_dpo:
        # If no reference model path is given, use the base model itself as the reference
reference_model_path = training_args.reference_model_path or model_args.model_name_or_path
reference_model = load_reference_model(
model_path=reference_model_path,
bf16=training_args.bf16,
fp16=training_args.fp16,
)
train_dataset = DPODataset(
data_path=data_args.train_data_path,
tokenizer=tokenizer,
model_max_length=training_args.model_max_length,
include_sft_data=training_args.sft_loss_weight > 0,
)
eval_dataset = DPODataset(
data_path=data_args.eval_data_path,
tokenizer=tokenizer,
model_max_length=training_args.model_max_length,
include_sft_data=training_args.sft_loss_weight > 0,
) if os.path.exists(data_args.eval_data_path) else None
# 创建自定义数据collator
data_collator = DPODataCollator(
tokenizer=tokenizer,
include_sft_data=training_args.sft_loss_weight > 0
)
trainer = DPOTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
data_collator=data_collator, # 使用自定义data collator
reference_model=reference_model,
dpo_beta=training_args.dpo_beta,
sft_loss_weight=training_args.sft_loss_weight,
)
else:
train_dataset = SupervisedDataset(
data_path=data_args.train_data_path,
tokenizer=tokenizer,
model_max_length=training_args.model_max_length,
)
eval_dataset = SupervisedDataset(
data_path=data_args.eval_data_path,
tokenizer=tokenizer,
model_max_length=training_args.model_max_length,
) if os.path.exists(data_args.eval_data_path) else None
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
)
trainer.train()
# save the incremental PEFT weights, more details can be found in https://huggingface.co/blog/peft
trainer.save_model()
#!/bin/bash
# Example DPO training script
# DPO training with an auxiliary SFT loss
python finetune_dpo_trainer.py \
--model_name_or_path "/root/autodl-tmp/MiniCPM3-4B" \
--train_data_path "/root/autodl-tmp/dpo_train_data.json" \
--eval_data_path "/root/autodl-tmp/dpo_train_data.json" \
--output_dir "./output_dpo_sft" \
--num_train_epochs 3 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 8 \
--learning_rate 5e-6 \
--weight_decay 0.01 \
--warmup_steps 100 \
--logging_steps 10 \
--save_steps 500 \
--eval_steps 500 \
--model_max_length 512 \
--use_lora True \
--bf16 True \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--remove_unused_columns False \
--use_dpo True \
--dpo_beta 0.1 \
--sft_loss_weight 0.5
#!/bin/bash
# Current timestamp (format: YYYYMMDDHHMMSS), used to name the output directory
formatted_time=$(date +"%Y%m%d%H%M%S")
echo $formatted_time
# Launch distributed SFT with DeepSpeed on GPUs 0 and 1.
# Point --train_data_path / --eval_data_path at your ChatML-format json files.
# (Comments cannot be interleaved with the backslash-continued arguments below,
# or bash would cut the command short, so they are kept up here instead.)
deepspeed --include localhost:0,1 finetune.py \
--model_name_or_path MiniCPM-2B-sft-bf16 \
--output_dir output/AdvertiseGenSFT/$formatted_time/ \
--train_data_path data/AdvertiseGenChatML/train.json \
--eval_data_path data/AdvertiseGenChatML/dev.json \
--learning_rate 5e-5 \
--per_device_train_batch_size 14 \
--per_device_eval_batch_size 32 \
--bf16 \
--gradient_accumulation_steps 2 \
--warmup_steps 100 \
--max_steps 3000 \
--weight_decay 0.01 \
--evaluation_strategy steps \
--eval_steps 100 \
--save_strategy steps \
--save_steps 500 \
--seed 42 \
--log_level info \
--logging_strategy steps \
--logging_steps 10 \
--deepspeed configs/ds_config_zero2.json
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
torch.manual_seed(0)
path = 'openbmb/MiniCPM4-8B'
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map=device, trust_remote_code=True)
# User can directly use the chat interface
# responds, history = model.chat(tokenizer, "Write an article about Artificial Intelligence.", temperature=0.7, top_p=0.7)
# print(responds)
# User can also use the generate interface
messages = [
{"role": "user", "content": "Write an article about Artificial Intelligence."},
]
prompt_text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
model_inputs = tokenizer([prompt_text], return_tensors="pt").to(device)
model_outputs = model.generate(
**model_inputs,
max_new_tokens=1024,
top_p=0.7,
temperature=0.7
)
output_token_ids = [
model_outputs[i][len(model_inputs[i]):] for i in range(len(model_inputs['input_ids']))
]
responses = tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)[0]
print(responses)
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
model_name = "openbmb/MiniCPM4-8B"
prompt = [{"role": "user", "content": "推荐5个北京的景点。"}]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
input_text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
llm = LLM(
model=model_name,
trust_remote_code=True,
max_num_batched_tokens=32768,
dtype="bfloat16",
gpu_memory_utilization=0.8,
)
sampling_params = SamplingParams(top_p=0.7, temperature=0.7, max_tokens=1024, repetition_penalty=1.02)
outputs = llm.generate(prompts=input_text, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
# Model code
modelCode=1675
# Model name
modelName=MiniCPM4_pytorch
# Model description
modelDescription=速度狂飙,快至220倍!MiniCPM4.0-8B是首个原生稀疏模型,5%的极高稀疏度加持系统级创新技术的大爆发,宣告了端侧长文本时代到来!
# Application scenarios
appScenario=推理,训练,对话问答,制造,广媒,金融,能源,医疗,家居,教育
# Framework type
frameType=pytorch
---
license: apache-2.0
language:
- zh
- en
pipeline_tag: text-generation
library_name: transformers
---
<div align="center">
<img src="https://github.com/OpenBMB/MiniCPM/blob/main/assets/minicpm_logo.png?raw=true" width="500em" ></img>
</div>
<p align="center">
<a href="https://github.com/OpenBMB/MiniCPM/" target="_blank">GitHub Repo</a> |
<a href="https://arxiv.org/abs/2506.07900" target="_blank">Technical Report</a> |
<a href="https://mp.weixin.qq.com/s/KIhH2nCURBXuFXAtYRpuXg?poc_token=HBIsUWijxino8oJ5s6HcjcfXFRi0Xj2LJlxPYD9c">Join Us</a>
</p>
<p align="center">
👋 Contact us in <a href="https://discord.gg/3cGQn9b3YM" target="_blank">Discord</a> and <a href="https://github.com/OpenBMB/MiniCPM/blob/main/assets/wechat.jpg" target="_blank">WeChat</a>
</p>
## What's New
- [2025.06.06] **MiniCPM4** series are released! This model achieves ultimate efficiency improvements while maintaining optimal performance at the same scale! It can achieve over 5x generation acceleration on typical end-side chips! You can find technical report [here](https://github.com/OpenBMB/MiniCPM/tree/main/report/MiniCPM_4_Technical_Report.pdf).🔥🔥🔥
## MiniCPM4 Series
MiniCPM4 series are highly efficient large language models (LLMs) designed explicitly for end-side devices, achieving this efficiency through systematic innovation in four key dimensions: model architecture, training data, training algorithms, and inference systems.
- [MiniCPM4-8B](https://huggingface.co/openbmb/MiniCPM4-8B): The flagship of MiniCPM4, with 8B parameters, trained on 8T tokens. (**<-- you are here**)
- [MiniCPM4-0.5B](https://huggingface.co/openbmb/MiniCPM4-0.5B): The small version of MiniCPM4, with 0.5B parameters, trained on 1T tokens.
- [MiniCPM4-8B-Eagle-FRSpec](https://huggingface.co/openbmb/MiniCPM4-8B-Eagle-FRSpec): Eagle head for FRSpec, accelerating speculative inference for MiniCPM4-8B.
- [MiniCPM4-8B-Eagle-FRSpec-QAT-cpmcu](https://huggingface.co/openbmb/MiniCPM4-8B-Eagle-FRSpec-QAT-cpmcu): Eagle head trained with QAT for FRSpec, efficiently integrate speculation and quantization to achieve ultra acceleration for MiniCPM4-8B.
- [MiniCPM4-8B-Eagle-vLLM](https://huggingface.co/openbmb/MiniCPM4-8B-Eagle-vLLM): Eagle head in vLLM format, accelerating speculative inference for MiniCPM4-8B.
- [MiniCPM4-8B-marlin-Eagle-vLLM](https://huggingface.co/openbmb/MiniCPM4-8B-marlin-Eagle-vLLM): Quantized Eagle head for vLLM format, accelerating speculative inference for MiniCPM4-8B.
- [BitCPM4-0.5B](https://huggingface.co/openbmb/BitCPM4-0.5B): Extreme ternary quantization applied to MiniCPM4-0.5B compresses model parameters into ternary values, achieving a 90% reduction in bit width.
- [BitCPM4-1B](https://huggingface.co/openbmb/BitCPM4-1B): Extreme ternary quantization applied to MiniCPM3-1B compresses model parameters into ternary values, achieving a 90% reduction in bit width.
- [MiniCPM4-Survey](https://huggingface.co/openbmb/MiniCPM4-Survey): Based on MiniCPM4-8B, accepts users' queries as input and autonomously generates trustworthy, long-form survey papers.
- [MiniCPM4-MCP](https://huggingface.co/openbmb/MiniCPM4-MCP): Based on MiniCPM4-8B, accepts users' queries and available MCP tools as input and autonomously calls relevant MCP tools to satisfy users' requirements.
## Introduction
MiniCPM 4 is an extremely efficient edge-side large model that has undergone efficient optimization across four dimensions: model architecture, learning algorithms, training data, and inference systems, achieving ultimate efficiency improvements.
- 🏗️ **Efficient Model Architecture:**
- InfLLM v2 -- Trainable Sparse Attention Mechanism: Adopts a trainable sparse attention mechanism architecture where each token only needs to compute relevance with less than 5% of tokens in 128K long text processing, significantly reducing computational overhead for long texts
- 🧠 **Efficient Learning Algorithms:**
- Model Wind Tunnel 2.0 -- Efficient Predictable Scaling: Introduces scaling prediction methods for performance of downstream tasks, enabling more precise model training configuration search
- BitCPM -- Ultimate Ternary Quantization: Compresses model parameters to ternary values, achieving a 90% reduction in bit width
- Efficient Training Engineering Optimization: Adopts FP8 low-precision computing technology combined with Multi-token Prediction training strategy
- 📚 **High-Quality Training Data:**
- UltraClean -- High-quality Pre-training Data Filtering and Generation: Builds iterative data cleaning strategies based on efficient data verification, open-sourcing the high-quality Chinese and English pre-training dataset [Ultra-FineWeb](https://huggingface.co/datasets/openbmb/Ultra-FineWeb)
- UltraChat v2 -- High-quality Supervised Fine-tuning Data Generation: Constructs large-scale high-quality supervised fine-tuning datasets covering multiple dimensions including knowledge-intensive data, reasoning-intensive data, instruction-following data, long text understanding data, and tool calling data
- ⚡ **Efficient Inference System:**
- CPM.cu -- Lightweight and Efficient CUDA Inference Framework: Integrates sparse attention, model quantization, and speculative sampling to achieve efficient prefilling and decoding
- ArkInfer -- Cross-platform Deployment System: Supports efficient deployment across multiple backend environments, providing flexible cross-platform adaptation capabilities
## Usage
### Inference with [CPM.cu](https://github.com/OpenBMB/cpm.cu)
We recommend using [CPM.cu](https://github.com/OpenBMB/cpm.cu) for the inference of MiniCPM4. CPM.cu is a CUDA inference framework developed by OpenBMB, which integrates efficient sparse attention, speculative sampling, and quantization techniques, fully leveraging the efficiency advantages of MiniCPM4.
You can install CPM.cu by running the following command:
```bash
git clone https://github.com/OpenBMB/cpm.cu.git --recursive
cd cpm.cu
python3 setup.py install
```
MiniCPM4 natively supports context lengths of up to 32,768 tokens. To reproduce the long-text acceleration effect in the paper, we recommend using the LongRoPE factors that have been validated. Change the `rope_scaling` field in the `config.json` file as the following to enable LongRoPE.
```json
{
...,
"rope_scaling": {
"rope_type": "longrope",
"long_factor": [0.9977997200264581, 1.014658295992452, 1.0349680404997148, 1.059429246056193, 1.0888815016813513, 1.1243301355211495, 1.166977103606075, 1.2182568066927284, 1.2798772354275727, 1.3538666751582975, 1.4426259039919596, 1.5489853358570191, 1.6762658237220625, 1.8283407612492941, 2.0096956085876183, 2.225478927469756, 2.481536379650452, 2.784415934557119, 3.1413289096347365, 3.560047844772632, 4.048719380066383, 4.752651957515948, 5.590913044973868, 6.584005926629993, 7.7532214876576155, 9.119754865903639, 10.704443927019176, 12.524994176518703, 14.59739595363613, 16.93214476166354, 19.53823297353041, 22.417131025031697, 25.568260840911098, 28.991144156566317, 32.68408069090375, 36.65174474170465, 40.90396065611201, 45.4664008671033, 50.37147343433591, 55.6804490772103, 61.470816952306556, 67.8622707390618, 75.00516023410414, 83.11898235973767, 92.50044360202462, 103.57086856690864, 116.9492274587385, 118.16074567836519, 119.18497548708795, 120.04810876261652, 120.77352815196981, 121.38182790207875, 121.89094985353891, 122.31638758099915, 122.6714244963338, 122.9673822552567, 123.21386397019609, 123.41898278254268, 123.58957065488238, 123.73136519024158, 123.84917421274221, 123.94701903496814, 124.02825801299717, 124.09569231686116],
"short_factor": [0.9977997200264581, 1.014658295992452, 1.0349680404997148, 1.059429246056193, 1.0888815016813513, 1.1243301355211495, 1.166977103606075, 1.2182568066927284, 1.2798772354275727, 1.3538666751582975, 1.4426259039919596, 1.5489853358570191, 1.6762658237220625, 1.8283407612492941, 2.0096956085876183, 2.225478927469756, 2.481536379650452, 2.784415934557119, 3.1413289096347365, 3.560047844772632, 4.048719380066383, 4.752651957515948, 5.590913044973868, 6.584005926629993, 7.7532214876576155, 9.119754865903639, 10.704443927019176, 12.524994176518703, 14.59739595363613, 16.93214476166354, 19.53823297353041, 22.417131025031697, 25.568260840911098, 28.991144156566317, 32.68408069090375, 36.65174474170465, 40.90396065611201, 45.4664008671033, 50.37147343433591, 55.6804490772103, 61.470816952306556, 67.8622707390618, 75.00516023410414, 83.11898235973767, 92.50044360202462, 103.57086856690864, 116.9492274587385, 118.16074567836519, 119.18497548708795, 120.04810876261652, 120.77352815196981, 121.38182790207875, 121.89094985353891, 122.31638758099915, 122.6714244963338, 122.9673822552567, 123.21386397019609, 123.41898278254268, 123.58957065488238, 123.73136519024158, 123.84917421274221, 123.94701903496814, 124.02825801299717, 124.09569231686116],
"original_max_position_embeddings": 32768
}
}
```
After modification, you can run the following command to reproduce the long-context acceleration effect (the script will automatically download the model weights from HuggingFace)
```bash
python3 tests/test_generate.py
```
For more details about CPM.cu, please refer to [the repo CPM.cu](https://github.com/OpenBMB/cpm.cu).
### Inference with Transformers
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
torch.manual_seed(0)
path = 'openbmb/MiniCPM4-8B'
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map=device, trust_remote_code=True)
# User can directly use the chat interface
# responds, history = model.chat(tokenizer, "Write an article about Artificial Intelligence.", temperature=0.7, top_p=0.7)
# print(responds)
# User can also use the generate interface
messages = [
{"role": "user", "content": "Write an article about Artificial Intelligence."},
]
prompt_text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
model_inputs = tokenizer([prompt_text], return_tensors="pt").to(device)
model_outputs = model.generate(
**model_inputs,
max_new_tokens=1024,
top_p=0.7,
temperature=0.7
)
output_token_ids = [
model_outputs[i][len(model_inputs[i]):] for i in range(len(model_inputs['input_ids']))
]
responses = tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)[0]
print(responses)
```
MiniCPM4-8B supports `InfLLM v2`, a sparse attention mechanism designed for efficient long-sequence inference. It requires the [infllmv2_cuda_impl](https://github.com/OpenBMB/infllmv2_cuda_impl) library.
You can install it by running the following command:
```bash
git clone -b feature_infer https://github.com/OpenBMB/infllmv2_cuda_impl.git
cd infllmv2_cuda_impl
git submodule update --init --recursive
pip install -e . # or python setup.py install
```
To enable InfLLM v2, you need to add the `sparse_config` field in `config.json`:
```json
{
...,
"sparse_config": {
"kernel_size": 32,
"kernel_stride": 16,
"init_blocks": 1,
"block_size": 64,
"window_size": 2048,
"topk": 64,
"use_nope": false,
"dense_len": 8192
}
}
```
These parameters control the behavior of InfLLM v2:
* `kernel_size` (default: 32): The size of semantic kernels.
* `kernel_stride` (default: 16): The stride between adjacent kernels.
* `init_blocks` (default: 1): The number of initial blocks that every query token attends to. This ensures attention to the beginning of the sequence.
* `block_size` (default: 64): The block size for key-value blocks.
* `window_size` (default: 2048): The size of the local sliding window.
* `topk` (default: 64): Each token computes attention over only the top-k most relevant key-value blocks.
* `use_nope` (default: false): Whether to use the NOPE technique in block selection for improved performance.
* `dense_len` (default: 8192): Since Sparse Attention offers limited benefits for short sequences, the model can use standard (dense) attention for shorter texts. The model will use dense attention for sequences with a token length below `dense_len` and switch to sparse attention for sequences exceeding this length. Set this to `-1` to always use sparse attention regardless of sequence length.
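If you prefer to patch the checkpoint programmatically instead of editing `config.json` by hand, a minimal sketch (assuming a local copy of the checkpoint in `./MiniCPM4-8B`; adjust the path to your setup) could look like this:
```python
import json
from pathlib import Path

# Hypothetical local path to a downloaded MiniCPM4-8B checkpoint
config_path = Path("./MiniCPM4-8B/config.json")

config = json.loads(config_path.read_text())
# Enable InfLLM v2 with the default parameters documented above
config["sparse_config"] = {
    "kernel_size": 32,
    "kernel_stride": 16,
    "init_blocks": 1,
    "block_size": 64,
    "window_size": 2048,
    "topk": 64,
    "use_nope": False,
    "dense_len": 8192,
}
config_path.write_text(json.dumps(config, indent=2, ensure_ascii=False))
```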
MiniCPM4 natively supports context lengths of up to 32,768 tokens. For conversations where the total length (including both input and output) significantly exceeds this limit, we recommend using RoPE scaling techniques for effective handling of long texts. We have validated the model's performance on context lengths of up to 131,072 tokens by modifying the LongRoPE factor.
You can apply the LongRoPE factor modification by modifying the model files. Specifically, in the `config.json` file, adjust the `rope_scaling` fields.
```json
{
...,
"rope_scaling": {
"rope_type": "longrope",
"long_factor": [0.9977997200264581, 1.014658295992452, 1.0349680404997148, 1.059429246056193, 1.0888815016813513, 1.1243301355211495, 1.166977103606075, 1.2182568066927284, 1.2798772354275727, 1.3538666751582975, 1.4426259039919596, 1.5489853358570191, 1.6762658237220625, 1.8283407612492941, 2.0096956085876183, 2.225478927469756, 2.481536379650452, 2.784415934557119, 3.1413289096347365, 3.560047844772632, 4.048719380066383, 4.752651957515948, 5.590913044973868, 6.584005926629993, 7.7532214876576155, 9.119754865903639, 10.704443927019176, 12.524994176518703, 14.59739595363613, 16.93214476166354, 19.53823297353041, 22.417131025031697, 25.568260840911098, 28.991144156566317, 32.68408069090375, 36.65174474170465, 40.90396065611201, 45.4664008671033, 50.37147343433591, 55.6804490772103, 61.470816952306556, 67.8622707390618, 75.00516023410414, 83.11898235973767, 92.50044360202462, 103.57086856690864, 116.9492274587385, 118.16074567836519, 119.18497548708795, 120.04810876261652, 120.77352815196981, 121.38182790207875, 121.89094985353891, 122.31638758099915, 122.6714244963338, 122.9673822552567, 123.21386397019609, 123.41898278254268, 123.58957065488238, 123.73136519024158, 123.84917421274221, 123.94701903496814, 124.02825801299717, 124.09569231686116],
"short_factor": [0.9977997200264581, 1.014658295992452, 1.0349680404997148, 1.059429246056193, 1.0888815016813513, 1.1243301355211495, 1.166977103606075, 1.2182568066927284, 1.2798772354275727, 1.3538666751582975, 1.4426259039919596, 1.5489853358570191, 1.6762658237220625, 1.8283407612492941, 2.0096956085876183, 2.225478927469756, 2.481536379650452, 2.784415934557119, 3.1413289096347365, 3.560047844772632, 4.048719380066383, 4.752651957515948, 5.590913044973868, 6.584005926629993, 7.7532214876576155, 9.119754865903639, 10.704443927019176, 12.524994176518703, 14.59739595363613, 16.93214476166354, 19.53823297353041, 22.417131025031697, 25.568260840911098, 28.991144156566317, 32.68408069090375, 36.65174474170465, 40.90396065611201, 45.4664008671033, 50.37147343433591, 55.6804490772103, 61.470816952306556, 67.8622707390618, 75.00516023410414, 83.11898235973767, 92.50044360202462, 103.57086856690864, 116.9492274587385, 118.16074567836519, 119.18497548708795, 120.04810876261652, 120.77352815196981, 121.38182790207875, 121.89094985353891, 122.31638758099915, 122.6714244963338, 122.9673822552567, 123.21386397019609, 123.41898278254268, 123.58957065488238, 123.73136519024158, 123.84917421274221, 123.94701903496814, 124.02825801299717, 124.09569231686116],
"original_max_position_embeddings": 32768
}
}
```
### Inference with [SGLang](https://github.com/sgl-project/sglang)
For now, you need to install our forked version of SGLang.
```bash
git clone -b openbmb https://github.com/OpenBMB/sglang.git
cd sglang
pip install --upgrade pip
pip install -e "python[all]"
```
You can start the inference server by running the following command:
```bash
python -m sglang.launch_server --model openbmb/MiniCPM4-8B --trust-remote-code --port 30000 --chat-template chatml
```
Then you can use the chat interface by running the following command:
```python
import openai
client = openai.Client(base_url=f"http://localhost:30000/v1", api_key="None")
response = client.chat.completions.create(
model="openbmb/MiniCPM4-8B",
messages=[
{"role": "user", "content": "Write an article about Artificial Intelligence."},
],
temperature=0.7,
max_tokens=1024,
)
print(response.choices[0].message.content)
```
### Inference with [vLLM](https://github.com/vllm-project/vllm)
For now, you need to install the latest version of vLLM.
```bash
pip install -U vllm \
--pre \
--extra-index-url https://wheels.vllm.ai/nightly
```
Then you can inference MiniCPM4-8B with vLLM:
```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
model_name = "openbmb/MiniCPM4-8B"
prompt = [{"role": "user", "content": "Please recommend 5 tourist attractions in Beijing. "}]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
input_text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
llm = LLM(
model=model_name,
trust_remote_code=True,
max_num_batched_tokens=32768,
dtype="bfloat16",
gpu_memory_utilization=0.8,
)
sampling_params = SamplingParams(top_p=0.7, temperature=0.7, max_tokens=1024, repetition_penalty=1.02)
outputs = llm.generate(prompts=input_text, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```
Also, you can start the inference server by running the following command:
> **Note**: In vLLM's chat API, `add_special_tokens` is `False` by default. This means important special tokens—such as the beginning-of-sequence (BOS) token—will not be added automatically. To ensure the input prompt is correctly formatted for the model, you should explicitly set `extra_body={"add_special_tokens": True}`.
```bash
vllm serve openbmb/MiniCPM4-8B
```
Then you can use the chat interface by running the following code:
```python
import openai
client = openai.Client(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
model="openbmb/MiniCPM4-8B",
messages=[
{"role": "user", "content": "Write an article about Artificial Intelligence."},
],
temperature=0.7,
max_tokens=1024,
extra_body=dict(add_special_tokens=True), # Ensures special tokens are added for chat template
)
print(response.choices[0].message.content)
```
## Evaluation Results
On two typical end-side chips, Jetson AGX Orin and RTX 4090, MiniCPM4 demonstrates significantly faster processing speed compared to similar-size models in long text processing tasks. As text length increases, MiniCPM4's efficiency advantage becomes more pronounced. On the Jetson AGX Orin platform, compared to Qwen3-8B, MiniCPM4 achieves approximately 7x decoding speed improvement.
![benchmark](https://github.com/OpenBMB/MiniCPM/blob/main/assets/minicpm4/efficiency.png?raw=true)
#### Comprehensive Evaluation
MiniCPM4 launches end-side versions with 8B and 0.5B parameter scales, both achieving best-in-class performance in their respective categories.
![benchmark](https://github.com/OpenBMB/MiniCPM/blob/main/assets/minicpm4/benchmark.png?raw=true)
#### Long Text Evaluation
MiniCPM4 is pre-trained on 32K long texts and achieves length extension through YaRN technology. In the 128K long text needle-in-a-haystack task, MiniCPM4 demonstrates outstanding performance.
![long-niah](https://github.com/OpenBMB/MiniCPM/blob/main/assets/minicpm4/128k-niah.png?raw=true)
## Statement
- As a language model, MiniCPM generates content by learning from a vast amount of text.
- However, it does not possess the ability to comprehend or express personal opinions or value judgments.
- Any content generated by MiniCPM does not represent the viewpoints or positions of the model developers.
- Therefore, when using content generated by MiniCPM, users should take full responsibility for evaluating and verifying it on their own.
## LICENSE
- This repository and MiniCPM models are released under the [Apache-2.0](https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE) License.
## Citation
- Please cite our [paper](https://github.com/OpenBMB/MiniCPM/tree/main/report/MiniCPM_4_Technical_Report.pdf) if you find our work valuable.
```bibtex
@article{minicpm4,
title={{MiniCPM4}: Ultra-Efficient LLMs on End Devices},
author={MiniCPM Team},
year={2025}
}
```
from datasets import load_dataset
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import os
model_path = '/root/ld/ld_model_pretrained/minicpm3' # model_path or model_id
quant_path = '/root/ld/ld_model_pretrained/minicpm3_awq' # quant_save_path
quant_data_path='/Users/liudan/ai/pull_request/MiniCPM/quantize/quantize_data/wikitext'  # path to the local calibration dataset
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } # "w_bit":4 or 8
quant_samples=512 # how many samples to use for calibration
custom_data=[ # first custom data
[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "assistant", "content": "你好,有什么我可以帮助你的吗?"},
{"role": "user", "content": "我想了解如何编写Python代码。"},
], # second custom data
[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "assistant", "content": "你好,有什么我可以帮助你的吗?"},
{"role": "user", "content": "我想了解如何编写Python代码。"},
]
#....more custom data
]
# Load model
model = AutoAWQForCausalLM.from_pretrained(model_path,safetensors=False)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True,device_map={"": "cuda:0"})
# Define data loading methods
def load_alpaca(quant_data_path):
data = load_dataset(quant_data_path, split="train") # Set the absolute path to alpaca or huggingface id
# concatenate data
def concatenate_data(x):
if x['input'] and x['instruction']:
line = [
{"role": "system", "content": x['instruction']},
{"role": "user", "content": x['input']},
{"role": "assistant", "content": x['output']},
]
elif x['input']:
line = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": x['input']},
{"role": "assistant", "content": x['output']},
]
else:
line = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": x['instruction']},
{"role": "assistant", "content": x['output']},
]
if model.config.architectures == ["MiniCPM3ForCausalLM"]:
print(tokenizer.decode(tokenizer.apply_chat_template(line)))
return {"text":tokenizer.decode(tokenizer.apply_chat_template(line))}
else:
return {"text": '<用户>'+x['instruction'] + x['input'] + '<AI>' + '\n' + x['output']}
concatenated = data.map(concatenate_data)[:quant_samples]
return [text for text in concatenated["text"]]
def load_wikitext(quant_data_path):
data = load_dataset(quant_data_path, split="train")
return [text for text in data["text"] if text.strip() != '' and len(text.split(' ')) > 20][:quant_samples]
def load_cust_data(custom_data):
quant_data=[tokenizer.decode(tokenizer.apply_chat_template(i)) for i in custom_data]
return quant_data[:quant_samples]
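# Pick one of the loaders above as calib_data: load_wikitext for raw text,
# load_alpaca for instruction-style data, or load_cust_data(custom_data) for the
# inline chat examples defined at the top of this file.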
# Quantize
model.quantize(tokenizer, quant_config=quant_config, calib_data=load_wikitext(quant_data_path=quant_data_path))
# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
"""
the script will use bitandbytes to quantize the MiniCPM language model.
the be quantized model can be finetuned by MiniCPM or not.
you only need to set the model_path 、save_path and run bash code
cd MiniCPM
python quantize/bnb_quantize.py
you will get the quantized model in save_path、quantized_model test time and gpu usage
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import time
import torch
import GPUtil
import os
model_path = "/root/ld/ld_model_pretrain/MiniCPM-1B-sft-bf16" # 模型下载地址
save_path = "/root/ld/ld_model_pretrain/MiniCPM-1B-sft-bf16_int4" # 量化模型保存地址
device = "cuda" if torch.cuda.is_available() else "cpu"
# 创建一个配置对象来指定量化参数
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # quantize to 4 bits
    load_in_8bit=False,  # quantize to 8 bits
    bnb_4bit_compute_dtype=torch.float16,  # compute dtype
    bnb_4bit_quant_storage=torch.uint8,  # storage dtype for the quantized weights
    bnb_4bit_quant_type="nf4",  # quantization format: 4-bit normal float (nf4)
    bnb_4bit_use_double_quant=True,  # double quantization (also quantize the zero-point and scaling factors)
    llm_int8_enable_fp32_cpu_offload=False,  # keep CPU-offloaded parameters in fp32 when using int8
    llm_int8_has_fp16_weight=False,  # mixed-precision int8
    # llm_int8_skip_modules=["out_proj", "kv_proj", "lm_head"],  # modules to skip during quantization
    llm_int8_threshold=6.0,  # outlier threshold for the llm.int8() algorithm
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_path,
    device_map=device,  # place the model on the selected device
quantization_config=quantization_config,
trust_remote_code=True,
)
gpu_usage = GPUtil.getGPUs()[0].memoryUsed
start = time.time()
response = model.chat(tokenizer, "<用户>给我讲一个故事<AI>", history=[], temperature=0.5, top_p=0.8, repetition_penalty=1.02)  # run inference
print("Output after quantization:", response)
print("Inference time after quantization:", time.time() - start)
print(f"GPU memory usage after quantization: {round(gpu_usage / 1024, 2)} GB")
# Save the model and tokenizer
os.makedirs(save_path, exist_ok=True)
model.save_pretrained(save_path, safe_serialization=True)
tokenizer.save_pretrained(save_path)
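# Reloading the saved checkpoint -- a minimal sketch, assuming a transformers/bitsandbytes
# version that supports serialized 4-bit weights (the quantization_config is stored inside
# the saved config, so no BitsAndBytesConfig is needed on reload):
#
# reloaded = AutoModelForCausalLM.from_pretrained(save_path, device_map=device, trust_remote_code=True)
# reloaded_tokenizer = AutoTokenizer.from_pretrained(save_path, trust_remote_code=True)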
"""
使用gptq量化前,请先安装我们的autogptq分支,否则代码无法正常运行。
‘’‘bash
git clone https://github.com/LDLINGLINGLING/AutoGPTQ/tree/minicpm_gptq
cd Autogptq
# 如果量化minicpm3.0
git checkout minicpm3
# 如果量化minicpm2.0
git checkout minicpm_autogptq
pip install e .
‘’‘
"""
import json
import random
import time
from argparse import ArgumentParser
import torch
from datasets import Dataset
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import os
import shutil
def copy_missing_files(src_path, dst_path):
    src_files = os.listdir(src_path)
    dst_files = os.listdir(dst_path)
    for src_file in src_files:
        # copy auxiliary files (e.g. remote-code and tokenizer scripts) that the quantizer does not write itself
        if src_file not in dst_files and not src_file.endswith(('.bin', '.json')) and not src_file.startswith('.'):
src_file_path = os.path.join(src_path, src_file)
dst_file_path = os.path.join(dst_path, src_file)
shutil.copy2(src_file_path, dst_file_path)
def load_data(data_path, tokenizer, n_samples):
with open(data_path, "r", encoding="utf-8") as f:
raw_data = json.load(f)
raw_data = random.sample(raw_data, k=min(n_samples, len(raw_data)))
def dummy_gen():
return raw_data
def tokenize(examples):
instructions = examples["instruction"]
inputs = examples["input"]
outputs = examples["output"]
prompts = []
texts = []
input_ids = []
attention_mask = []
        for istr, inp, opt in zip(instructions, inputs, outputs):
            if inp:
                line = [
                    {"role": "system", "content": istr},
                    {"role": "user", "content": inp},
                    {"role": "assistant", "content": opt},
                ]
                prompt = tokenizer.decode(tokenizer.apply_chat_template(line[:2]))
                text = tokenizer.decode(tokenizer.apply_chat_template(line))
            else:
                line = [
                    {"role": "user", "content": istr},
                    {"role": "assistant", "content": opt},
                ]
                prompt = tokenizer.decode(tokenizer.apply_chat_template(line[:1]))
                text = tokenizer.decode(tokenizer.apply_chat_template(line))
if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length:
continue
tokenized_data = tokenizer(text)
input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length])
attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length])
prompts.append(prompt)
texts.append(text)
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"prompt": prompts,
}
dataset = Dataset.from_generator(dummy_gen)
dataset = dataset.map(
tokenize,
batched=True,
batch_size=len(dataset),
num_proc=1,
keep_in_memory=True,
load_from_cache_file=False,
remove_columns=["instruction", "input"],
)
dataset = dataset.to_list()
for sample in dataset:
sample["input_ids"] = torch.LongTensor(sample["input_ids"])
sample["attention_mask"] = torch.LongTensor(sample["attention_mask"])
return dataset
def main():
parser = ArgumentParser()
parser.add_argument("--pretrained_model_dir", type=str,default='/root/ld/ld_model_pretrained/minicpm3')
parser.add_argument("--quantized_model_dir", type=str, default='/root/ld/ld_model_pretrained/minicpm3_gptq_4bit')
parser.add_argument("--bits", type=int, default=4, choices=[2, 3, 4])#do not use 8 bit
parser.add_argument(
"--group_size",
type=int,
default=128,
help="group size, -1 means no grouping or full rank",
)
parser.add_argument("--desc_act", action="store_true", default=True,help="whether to quantize with desc_act")
parser.add_argument(
"--num_samples",
type=int,
default=256,
help="how many samples will be used to quantize model",
)
parser.add_argument(
"--save_and_reload",
action="store_true",
default=True,
help="whether save quantized model to disk and reload back",
)
parser.add_argument("--fast_tokenizer", action="store_true", help="whether use fast tokenizer")
parser.add_argument(
"--use_triton",
action="store_true",
help="whether use triton to speedup at inference",
)
parser.add_argument(
"--per_gpu_max_memory",
type=int,
default=None,
help="max memory used to load model per gpu",
)
parser.add_argument(
"--cpu_max_memory",
type=int,
default=None,
help="max memory used to offload model to cpu",
)
parser.add_argument(
"--quant_batch_size",
type=int,
default=8,
help="examples batch size for quantization",
)
parser.add_argument(
"--trust_remote_code",
default=True,
action="store_true",
help="whether to trust remote code when loading model",
)
parser.add_argument(
"--quant_data",
default='quantize_data/alpaca_data_cleaned.json',
help="the quant data path",
)
args = parser.parse_args()
max_memory = {}
if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
if torch.cuda.is_available():
max_memory.update({i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())})
if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
if not max_memory:
max_memory = None
tokenizer = AutoTokenizer.from_pretrained(
args.pretrained_model_dir,
use_fast=args.fast_tokenizer,
trust_remote_code=args.trust_remote_code,
)
model = AutoGPTQForCausalLM.from_pretrained(
args.pretrained_model_dir,
quantize_config=BaseQuantizeConfig(bits=args.bits, group_size=args.group_size, desc_act=args.desc_act),
max_memory=max_memory,
trust_remote_code=args.trust_remote_code,
)
examples = load_data(args.quant_data, tokenizer, args.num_samples)
examples_for_quant = [
{"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]} for example in examples
]
start = time.time()
model.quantize(
examples_for_quant,
batch_size=args.quant_batch_size,
use_triton=args.use_triton,
autotune_warmup_after_quantized=args.use_triton,
)
end = time.time()
print(f"quantization took: {end - start: .4f}s")
if not args.quantized_model_dir:
args.quantized_model_dir = args.pretrained_model_dir
if args.save_and_reload:
model.save_quantized(args.quantized_model_dir)
tokenizer.save_pretrained(args.quantized_model_dir)
        copy_missing_files(args.pretrained_model_dir, args.quantized_model_dir)
del model
if torch.cuda.is_available():
torch.cuda.empty_cache()
model = AutoGPTQForCausalLM.from_quantized(
args.quantized_model_dir,
device="cuda:0",
use_triton=args.use_triton,
max_memory=max_memory,
inject_fused_mlp=True,
inject_fused_attention=True,
trust_remote_code=args.trust_remote_code,
)
pipeline_init_kwargs = {"model": model, "tokenizer": tokenizer}
if not max_memory:
pipeline_init_kwargs["device"] = "cuda:0"
for example in random.sample(examples, k=min(4, len(examples))):
print(f"prompt: {example['prompt']}")
print("-" * 42)
print(f"golden: {example['output']}")
print("-" * 42)
start = time.time()
        output_ids = model.generate(
            **tokenizer(example["prompt"], return_tensors="pt").to(model.device),
            max_new_tokens=100,
        )
        print(tokenizer.decode(output_ids[0]))
if __name__ == "__main__":
import logging
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S",
)
main()