#!/bin/bash
NPROC_PER_NODE=8
NNODES=1
RANK=0
MASTER_ADDR=127.0.0.1
MASTER_PORT=29500
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun \
--nproc_per_node $NPROC_PER_NODE \
--nnodes $NNODES \
--node_rank $RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
src/train.py /root/ld/ld_project/LLaMA-Factory/examples/minicpm/minicpm_sft.yaml
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MiniCPM-2B 参数高效微调(LoRA)A100 80G 单卡示例\n",
"\n",
"显存更小的显卡可用 batch size 和 grad_accum 间时间换空间\n",
"\n",
"本 notebook 是一个使用 `AdvertiseGen` 数据集对 MiniCPM-2B 进行 LoRA 微调,使其具备专业的广告生成能力的代码示例。\n",
"\n",
"## 最低硬件需求\n",
"- 显存:12GB\n",
"- 显卡架构:安培架构(推荐)\n",
"- 内存:16GB"
]
},
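{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is an added illustrative sketch, not part of the original training setup: it only shows the arithmetic behind the batch-size / gradient-accumulation trade-off. Shrinking `per_device_train_batch_size` while growing `gradient_accumulation_steps` keeps the effective batch size unchanged and lowers peak GPU memory, at the cost of more optimizer steps and wall-clock time. All numbers are assumptions for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: the effective batch size is unchanged when the per-device\n",
"# batch size is traded against gradient accumulation (values are assumptions).\n",
"def effective_batch_size(per_device_batch, grad_accum_steps, num_gpus=1):\n",
"    return per_device_batch * grad_accum_steps * num_gpus\n",
"\n",
"# A100 80G style setting vs. a smaller-VRAM setting (hypothetical values)\n",
"print(effective_batch_size(32, 1))  # 32\n",
"print(effective_batch_size(4, 8))   # 32: same effective batch, less VRAM, more steps\n"
]
},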
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. 准备数据集\n",
"\n",
"将数据集转换为更通用的格式\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 转换为 ChatML 格式\n",
"import os\n",
"import shutil\n",
"import json\n",
"\n",
"input_dir = \"data/AdvertiseGen\"\n",
"output_dir = \"data/AdvertiseGenChatML\"\n",
"if os.path.exists(output_dir):\n",
" shutil.rmtree(output_dir)\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"for fn in [\"train.json\", \"dev.json\"]:\n",
" data_out_list = []\n",
" with open(os.path.join(input_dir, fn), \"r\") as f, open(os.path.join(output_dir, fn), \"w\") as fo:\n",
" for line in f:\n",
" if len(line.strip()) > 0:\n",
" data = json.loads(line)\n",
" data_out = {\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": data[\"content\"],\n",
" },\n",
" {\n",
" \"role\": \"assistant\",\n",
" \"content\": data[\"summary\"],\n",
" },\n",
" ]\n",
" }\n",
" data_out_list.append(data_out)\n",
" json.dump(data_out_list, fo, ensure_ascii=False, indent=4)\n"
]
},
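{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (an added sketch, not required by the rest of the notebook): reload the file written by the cell above, reusing its `os`, `json`, and `output_dir` names, and inspect one record to confirm the ChatML structure."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check: reload the converted file and print one ChatML record\n",
"with open(os.path.join(output_dir, \"train.json\"), \"r\") as f:\n",
"    converted = json.load(f)\n",
"print(len(converted))\n",
"print(json.dumps(converted[0], ensure_ascii=False, indent=2))\n"
]
},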
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. 使用 LoRA 进行微调\n",
"\n",
"命令行一键运行"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!bash lora_finetune.sh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 推理验证"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from tqdm import tqdm\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"path = \"output/AdvertiseGenLoRA/20240315224356/checkpoint-3000\"\n",
"tokenizer = AutoTokenizer.from_pretrained(path)\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" path, torch_dtype=torch.bfloat16, device_map=\"cuda\", trust_remote_code=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res, history = model.chat(tokenizer, query=\"<用户>类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞<AI>\", max_length=80, top_p=0.5)\n",
"res, history"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
formatted_time=$(date +"%Y%m%d%H%M%S")
echo $formatted_time
deepspeed --include localhost:1 finetune.py \
--model_name_or_path MiniCPM-2B-sft-bf16 \
--output_dir output/AdvertiseGenLoRA/$formatted_time/ \
--train_data_path data/AdvertiseGenChatML/train.json \
--eval_data_path data/AdvertiseGenChatML/dev.json \
--learning_rate 5e-5 --per_device_train_batch_size 32 \
--per_device_eval_batch_size 64 --model_max_length 384 --bf16 --use_lora \
--gradient_accumulation_steps 1 --warmup_steps 100 \
--max_steps 3000 --weight_decay 0.01 \
--evaluation_strategy steps --eval_steps 500 \
--save_strategy steps --save_steps 500 --seed 42 \
--log_level info --logging_strategy steps --logging_steps 10 \
--deepspeed configs/ds_config_zero3_offload.json
formatted_time=$(date +"%Y%m%d%H%M%S")
echo $formatted_time
export HIP_VISIBLE_DEVICES=0,1,2,3
deepspeed --include localhost:0,1,2,3 --master_port 19888 finetune.py \
--model_name_or_path ../openbmb/MiniCPM4-8B \
--output_dir output/OCNLILoRA/$formatted_time/ \
--train_data_path data/ocnli_public_chatml/train.json \
--eval_data_path data/ocnli_public_chatml/dev.json \
--learning_rate 5e-5 --per_device_train_batch_size 40 \
--per_device_eval_batch_size 128 --model_max_length 128 --bf16 --use_lora \
--gradient_accumulation_steps 1 --warmup_steps 100 \
--max_steps 1000 --weight_decay 0.01 \
--evaluation_strategy steps --eval_steps 500 \
--save_strategy steps --save_steps 500 --seed 42 \
--log_level info --logging_strategy steps --logging_steps 10 \
--deepspeed configs/ds_config_zero3_offload.json
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MiniCPM-2B 参数高效微调(LoRA)A100 80G 单卡示例\n",
"\n",
"显存更小的显卡可用 batch size 和 grad_accum 间时间换空间\n",
"\n",
"本 notebook 是一个使用 `OCNLI` 数据集对 MiniCPM-2B 进行 LoRA 微调,使其具备专业的广告生成能力的代码示例。\n",
"\n",
"## 最低硬件需求\n",
"- 显存:12GB\n",
"- 显卡架构:安培架构(推荐)\n",
"- 内存:16GB"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. 准备数据集\n",
"\n",
"将数据转换为更通用的格式"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# 转换为 ChatML 格式\n",
"import os\n",
"import shutil\n",
"import json\n",
"\n",
"input_dir = \"data/ocnli_public\"\n",
"output_dir = \"data/ocnli_public_chatml\"\n",
"if os.path.exists(output_dir):\n",
" shutil.rmtree(output_dir)\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"for fn in [\"train.json\", \"dev.json\"]:\n",
" data_out_list = []\n",
" with open(os.path.join(input_dir, fn), \"r\") as f, open(os.path.join(output_dir, fn), \"w\") as fo:\n",
" for line in f:\n",
" if len(line.strip()) > 0:\n",
" data = json.loads(line)\n",
" data_out = {\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": f\"请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种?\\n句子1: {data['sentence1']}\\n句子2:{data['sentence2']}\\n\"\n",
" },\n",
" {\n",
" \"role\": \"assistant\",\n",
" \"content\": data[\"label\"],\n",
" },\n",
" ]\n",
" }\n",
" data_out_list.append(data_out)\n",
" json.dump(data_out_list, fo, ensure_ascii=False, indent=4)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. 使用 LoRA 进行微调\n",
"\n",
"命令行一键运行"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20240315212836\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-03-15 21:28:38,758] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
"[2024-03-15 21:28:45,799] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.\n",
"[2024-03-15 21:28:45,799] [INFO] [runner.py:568:main] cmd = /usr/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=19888 --enable_each_rank_log=None finetune.py --model_name_or_path MiniCPM-2B-sft-bf16 --output_dir output/ocnli_public_chatml/20240315212836/ --train_data_path data/ocnli_public_chatml/train.json --eval_data_path data/ocnli_public_chatml/dev.json --learning_rate 5e-5 --per_device_train_batch_size 64 --per_device_eval_batch_size 128 --model_max_length 128 --bf16 --use_lora --gradient_accumulation_steps 1 --warmup_steps 100 --max_steps 1000 --weight_decay 0.01 --evaluation_strategy steps --eval_steps 500 --save_strategy steps --save_steps 500 --seed 42 --log_level info --logging_strategy steps --logging_steps 10 --deepspeed configs/ds_config_zero3_offload.json\n",
"[2024-03-15 21:28:47,849] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
"[2024-03-15 21:28:54,904] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0]}\n",
"[2024-03-15 21:28:54,905] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=1, node_rank=0\n",
"[2024-03-15 21:28:54,905] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]})\n",
"[2024-03-15 21:28:54,905] [INFO] [launch.py:163:main] dist_world_size=1\n",
"[2024-03-15 21:28:54,905] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0\n",
"[2024-03-15 21:28:54,905] [INFO] [launch.py:253:main] process 86577 spawned with command: ['/usr/bin/python3', '-u', 'finetune.py', '--local_rank=0', '--model_name_or_path', 'MiniCPM-2B-sft-bf16', '--output_dir', 'output/ocnli_public_chatml/20240315212836/', '--train_data_path', 'data/ocnli_public_chatml/train.json', '--eval_data_path', 'data/ocnli_public_chatml/dev.json', '--learning_rate', '5e-5', '--per_device_train_batch_size', '64', '--per_device_eval_batch_size', '128', '--model_max_length', '128', '--bf16', '--use_lora', '--gradient_accumulation_steps', '1', '--warmup_steps', '100', '--max_steps', '1000', '--weight_decay', '0.01', '--evaluation_strategy', 'steps', '--eval_steps', '500', '--save_strategy', 'steps', '--save_steps', '500', '--seed', '42', '--log_level', 'info', '--logging_strategy', 'steps', '--logging_steps', '10', '--deepspeed', 'configs/ds_config_zero3_offload.json']\n",
"[2024-03-15 21:29:03,964] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
"[2024-03-15 21:29:04,250] [INFO] [comm.py:637:init_distributed] cdb=None\n",
"[2024-03-15 21:29:04,250] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n",
"/usr/local/lib/python3.10/dist-packages/torch/_utils.py:836: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
" return self.fget.__get__(instance, owner)()\n",
"Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in MiniCPMForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n",
"You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n",
"Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in MiniCPMModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n",
"[2024-03-15 21:29:08,998] [INFO] [partition_parameters.py:343:__exit__] finished initializing model - num_params = 363, num_elems = 3.01B\n",
"trainable params: 2,949,120 || all params: 2,727,830,016 || trainable%: 0.10811230841738784\n",
"input: <s> <用户> 请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种?\n",
"句子1: 一月份跟二月份肯定有一个月份有.\n",
"句子2:肯定有一个月份有\n",
" <AI> entailment</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>\n",
"label: entailment\n",
"input: <s> <用户> 请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种?\n",
"句子1: 身上裹一件工厂发的棉大衣,手插在袖筒里\n",
"句子2:身上至少一件衣服\n",
" <AI> entailment</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>\n",
"label: entailment\n",
"Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n",
"max_steps is given, it will override any value given in num_train_epochs\n",
"Using auto half precision backend\n",
"Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)\n",
"Using /home/jeeves/.cache/torch_extensions/py310_cu123 as PyTorch extensions root...\n",
"Detected CUDA files, patching ldflags\n",
"Emitting ninja build file /home/jeeves/.cache/torch_extensions/py310_cu123/cpu_adam/build.ninja...\n",
"Building extension module cpu_adam...\n",
"Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
"ninja: no work to do.\n",
"Loading extension module cpu_adam...\n",
"Time to load cpu_adam op: 2.3341457843780518 seconds\n",
"Adam Optimizer #0 is created with AVX512 arithmetic capability.\n",
"Config: alpha=0.000050, betas=(0.900000, 0.999000), weight_decay=0.010000, adam_w=1\n",
"[2024-03-15 21:29:15,864] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.0, git-hash=unknown, git-branch=unknown\n",
"[2024-03-15 21:29:15,884] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n",
"[2024-03-15 21:29:15,886] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer\n",
"[2024-03-15 21:29:15,886] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer\n",
"[2024-03-15 21:29:15,895] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam\n",
"[2024-03-15 21:29:15,896] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>\n",
"[2024-03-15 21:29:15,896] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False\n",
"[2024-03-15 21:29:15,896] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer\n",
"[2024-03-15 21:29:16,049] [INFO] [utils.py:800:see_memory_usage] Stage 3 initialize beginning\n",
"[2024-03-15 21:29:16,049] [INFO] [utils.py:801:see_memory_usage] MA 0.03 GB Max_MA 1.62 GB CA 0.04 GB Max_CA 2 GB \n",
"[2024-03-15 21:29:16,049] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.03 GB, percent = 13.7%\n",
"[2024-03-15 21:29:16,053] [INFO] [stage3.py:130:__init__] Reduce bucket size 5308416\n",
"[2024-03-15 21:29:16,053] [INFO] [stage3.py:131:__init__] Prefetch bucket size 4777574\n",
"[2024-03-15 21:29:16,201] [INFO] [utils.py:800:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]\n",
"[2024-03-15 21:29:16,201] [INFO] [utils.py:801:see_memory_usage] MA 0.03 GB Max_MA 0.03 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:16,201] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.03 GB, percent = 13.7%\n",
"Parameter Offload: Total persistent parameters: 3135744 in 241 params\n",
"[2024-03-15 21:29:16,449] [INFO] [utils.py:800:see_memory_usage] DeepSpeedZeRoOffload initialize [end]\n",
"[2024-03-15 21:29:16,450] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.03 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:16,450] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.04 GB, percent = 13.7%\n",
"[2024-03-15 21:29:16,608] [INFO] [utils.py:800:see_memory_usage] Before creating fp16 partitions\n",
"[2024-03-15 21:29:16,609] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:16,609] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.05 GB, percent = 13.7%\n",
"[2024-03-15 21:29:16,776] [INFO] [utils.py:800:see_memory_usage] After creating fp16 partitions: 1\n",
"[2024-03-15 21:29:16,777] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:16,777] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.05 GB, percent = 13.7%\n",
"[2024-03-15 21:29:16,931] [INFO] [utils.py:800:see_memory_usage] Before creating fp32 partitions\n",
"[2024-03-15 21:29:16,932] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:16,932] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.05 GB, percent = 13.7%\n",
"[2024-03-15 21:29:17,099] [INFO] [utils.py:800:see_memory_usage] After creating fp32 partitions\n",
"[2024-03-15 21:29:17,100] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:17,100] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.04 GB, percent = 13.7%\n",
"[2024-03-15 21:29:17,254] [INFO] [utils.py:800:see_memory_usage] Before initializing optimizer states\n",
"[2024-03-15 21:29:17,254] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:17,254] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.04 GB, percent = 13.7%\n",
"[2024-03-15 21:29:17,425] [INFO] [utils.py:800:see_memory_usage] After initializing optimizer states\n",
"[2024-03-15 21:29:17,425] [INFO] [utils.py:801:see_memory_usage] MA 0.02 GB Max_MA 0.02 GB CA 0.04 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:17,425] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.04 GB, percent = 13.7%\n",
"[2024-03-15 21:29:17,426] [INFO] [stage3.py:486:_setup_for_real_optimizer] optimizer state initialized\n",
"[2024-03-15 21:29:17,633] [INFO] [utils.py:800:see_memory_usage] After initializing ZeRO optimizer\n",
"[2024-03-15 21:29:17,633] [INFO] [utils.py:801:see_memory_usage] MA 0.03 GB Max_MA 0.03 GB CA 0.06 GB Max_CA 0 GB \n",
"[2024-03-15 21:29:17,634] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 138.05 GB, percent = 13.7%\n",
"[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedCPUAdam\n",
"[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler\n",
"[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None\n",
"[2024-03-15 21:29:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[(0.9, 0.999)]\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:996:print] DeepSpeedEngine configuration:\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] activation_checkpointing_config {\n",
" \"partition_activations\": false, \n",
" \"contiguous_memory_optimization\": false, \n",
" \"cpu_checkpointing\": false, \n",
" \"number_checkpoints\": null, \n",
" \"synchronize_checkpoint_boundary\": false, \n",
" \"profile\": false\n",
"}\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] amp_enabled .................. False\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] amp_params ................... False\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] autotuning_config ............ {\n",
" \"enabled\": false, \n",
" \"start_step\": null, \n",
" \"end_step\": null, \n",
" \"metric_path\": null, \n",
" \"arg_mappings\": null, \n",
" \"metric\": \"throughput\", \n",
" \"model_info\": null, \n",
" \"results_dir\": \"autotuning_results\", \n",
" \"exps_dir\": \"autotuning_exps\", \n",
" \"overwrite\": true, \n",
" \"fast\": true, \n",
" \"start_profile_step\": 3, \n",
" \"end_profile_step\": 5, \n",
" \"tuner_type\": \"gridsearch\", \n",
" \"tuner_early_stopping\": 5, \n",
" \"tuner_num_trials\": 50, \n",
" \"model_info_path\": null, \n",
" \"mp_size\": 1, \n",
" \"max_train_batch_size\": null, \n",
" \"min_train_batch_size\": 1, \n",
" \"max_train_micro_batch_size_per_gpu\": 1.024000e+03, \n",
" \"min_train_micro_batch_size_per_gpu\": 1, \n",
" \"num_tuning_micro_batch_sizes\": 3\n",
"}\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] bfloat16_enabled ............. True\n",
"[2024-03-15 21:29:17,636] [INFO] [config.py:1000:print] bfloat16_immediate_grad_update False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] checkpoint_parallel_write_pipeline False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] checkpoint_tag_validation_enabled True\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] checkpoint_tag_validation_fail False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f095baedab0>\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] communication_data_type ...... None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] compile_config ............... enabled=False backend='inductor' kwargs={}\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] curriculum_enabled_legacy .... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] curriculum_params_legacy ..... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] data_efficiency_enabled ...... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] dataloader_drop_last ......... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] disable_allgather ............ False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] dump_state ................... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] dynamic_loss_scale_args ...... None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_enabled ........... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_gas_boundary_resolution 1\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_layer_name ........ bert.encoder.layer\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_layer_num ......... 0\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_max_iter .......... 100\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_stability ......... 1e-06\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_tol ............... 0.01\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] eigenvalue_verbose ........... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] elasticity_enabled ........... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] flops_profiler_config ........ {\n",
" \"enabled\": false, \n",
" \"recompute_fwd_factor\": 0.0, \n",
" \"profile_step\": 1, \n",
" \"module_depth\": -1, \n",
" \"top_modules\": 1, \n",
" \"detailed\": true, \n",
" \"output_file\": null\n",
"}\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] fp16_auto_cast ............... None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] fp16_enabled ................. False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] fp16_master_weights_and_gradients False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] global_rank .................. 0\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] grad_accum_dtype ............. None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] gradient_accumulation_steps .. 1\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] gradient_clipping ............ 1.0\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] gradient_predivide_factor .... 1.0\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] graph_harvesting ............. False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] initial_dynamic_scale ........ 1\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] load_universal_checkpoint .... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] loss_scale ................... 1.0\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] memory_breakdown ............. False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] mics_hierarchial_params_gather False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] mics_shard_size .............. -1\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] nebula_config ................ {\n",
" \"enabled\": false, \n",
" \"persistent_storage_path\": null, \n",
" \"persistent_time_interval\": 100, \n",
" \"num_of_version_in_retention\": 2, \n",
" \"enable_nebula_load\": true, \n",
" \"load_path\": null\n",
"}\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] optimizer_legacy_fusion ...... False\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] optimizer_name ............... None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] optimizer_params ............. None\n",
"[2024-03-15 21:29:17,637] [INFO] [config.py:1000:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] pld_enabled .................. False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] pld_params ................... False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] prescale_gradients ........... False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] scheduler_name ............... None\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] scheduler_params ............. None\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] seq_parallel_communication_data_type torch.float32\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] sparse_attention ............. None\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] sparse_gradients_enabled ..... False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] steps_per_print .............. inf\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] train_batch_size ............. 64\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] train_micro_batch_size_per_gpu 64\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] use_data_before_expert_parallel_ False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] use_node_local_storage ....... False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] wall_clock_breakdown ......... False\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] weight_quantization_config ... None\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] world_size ................... 1\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] zero_allow_untested_optimizer True\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=5308416 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='cpu', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=True) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=4777574 param_persistence_threshold=23040 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] zero_enabled ................. True\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] zero_force_ds_cpu_optimizer .. True\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:1000:print] zero_optimization_stage ...... 3\n",
"[2024-03-15 21:29:17,638] [INFO] [config.py:986:print_user_config] json = {\n",
" \"fp16\": {\n",
" \"enabled\": false, \n",
" \"loss_scale\": 0, \n",
" \"loss_scale_window\": 1000, \n",
" \"initial_scale_power\": 16, \n",
" \"hysteresis\": 2, \n",
" \"min_loss_scale\": 1\n",
" }, \n",
" \"bf16\": {\n",
" \"enabled\": true\n",
" }, \n",
" \"zero_optimization\": {\n",
" \"stage\": 3, \n",
" \"allgather_partitions\": true, \n",
" \"allgather_bucket_size\": 5.000000e+08, \n",
" \"reduce_scatter\": true, \n",
" \"contiguous_gradients\": true, \n",
" \"overlap_comm\": true, \n",
" \"reduce_bucket_size\": 5.308416e+06, \n",
" \"stage3_prefetch_bucket_size\": 4.777574e+06, \n",
" \"stage3_param_persistence_threshold\": 2.304000e+04, \n",
" \"stage3_gather_16bit_weights_on_model_save\": true, \n",
" \"offload_optimizer\": {\n",
" \"device\": \"cpu\", \n",
" \"pin_memory\": true\n",
" }, \n",
" \"offload_param\": {\n",
" \"device\": \"cpu\", \n",
" \"pin_memory\": true\n",
" }\n",
" }, \n",
" \"train_batch_size\": 64, \n",
" \"train_micro_batch_size_per_gpu\": 64, \n",
" \"gradient_accumulation_steps\": 1, \n",
" \"gradient_clipping\": 1.0, \n",
" \"wall_clock_breakdown\": false, \n",
" \"flops_profiler\": {\n",
" \"enabled\": false, \n",
" \"profile_step\": 1, \n",
" \"module_depth\": -1, \n",
" \"top_modules\": 1, \n",
" \"detailed\": true, \n",
" \"output_file\": null\n",
" }, \n",
" \"steps_per_print\": inf, \n",
" \"zero_allow_untested_optimizer\": true\n",
"}\n",
"***** Running training *****\n",
" Num examples = 50,486\n",
" Num Epochs = 2\n",
" Instantaneous batch size per device = 64\n",
" Total train batch size (w. parallel, distributed & accumulation) = 64\n",
" Gradient Accumulation steps = 1\n",
" Total optimization steps = 1,000\n",
" Number of trainable parameters = 2,949,120\n",
"{'loss': 2.2004, 'grad_norm': 44.037304409869364, 'learning_rate': 5e-06, 'epoch': 0.01}\n",
"{'loss': 1.4786, 'grad_norm': 39.531078618699645, 'learning_rate': 1e-05, 'epoch': 0.03}\n",
"{'loss': 0.9955, 'grad_norm': 16.66467873479667, 'learning_rate': 1.5e-05, 'epoch': 0.04}\n",
"{'loss': 0.7026, 'grad_norm': 7.417151045965821, 'learning_rate': 2e-05, 'epoch': 0.05}\n",
"{'loss': 0.6713, 'grad_norm': 7.608669365784156, 'learning_rate': 2.5e-05, 'epoch': 0.06}\n",
"{'loss': 0.5867, 'grad_norm': 12.552373192106195, 'learning_rate': 3e-05, 'epoch': 0.08}\n",
"{'loss': 0.6067, 'grad_norm': 10.342863016044076, 'learning_rate': 3.5e-05, 'epoch': 0.09}\n",
"{'loss': 0.5857, 'grad_norm': 10.985433470517048, 'learning_rate': 4e-05, 'epoch': 0.1}\n",
"{'loss': 0.5306, 'grad_norm': 5.22097493330033, 'learning_rate': 4.5e-05, 'epoch': 0.11}\n",
"{'loss': 0.5517, 'grad_norm': 3.9679057507396682, 'learning_rate': 5e-05, 'epoch': 0.13}\n",
"{'loss': 0.4573, 'grad_norm': 4.77643976524929, 'learning_rate': 4.9444444444444446e-05, 'epoch': 0.14}\n",
"{'loss': 0.469, 'grad_norm': 7.6144285869051345, 'learning_rate': 4.888888888888889e-05, 'epoch': 0.15}\n",
"{'loss': 0.4748, 'grad_norm': 4.787471338888486, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.16}\n",
"{'loss': 0.433, 'grad_norm': 3.3189167275368225, 'learning_rate': 4.7777777777777784e-05, 'epoch': 0.18}\n",
"{'loss': 0.4282, 'grad_norm': 7.248232922110331, 'learning_rate': 4.722222222222222e-05, 'epoch': 0.19}\n",
"{'loss': 0.409, 'grad_norm': 6.293684915700438, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.2}\n",
"{'loss': 0.4451, 'grad_norm': 3.8753855113566833, 'learning_rate': 4.6111111111111115e-05, 'epoch': 0.22}\n",
"{'loss': 0.4288, 'grad_norm': 3.625475227512274, 'learning_rate': 4.555555555555556e-05, 'epoch': 0.23}\n",
"{'loss': 0.4506, 'grad_norm': 4.2449874489534665, 'learning_rate': 4.5e-05, 'epoch': 0.24}\n",
"{'loss': 0.4484, 'grad_norm': 6.084320127673726, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.25}\n",
"{'loss': 0.4487, 'grad_norm': 8.363684454316004, 'learning_rate': 4.388888888888889e-05, 'epoch': 0.27}\n",
"{'loss': 0.4878, 'grad_norm': 3.747181659840593, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.28}\n",
"{'loss': 0.412, 'grad_norm': 8.645140642353612, 'learning_rate': 4.277777777777778e-05, 'epoch': 0.29}\n",
"{'loss': 0.4558, 'grad_norm': 4.5260457637696625, 'learning_rate': 4.222222222222222e-05, 'epoch': 0.3}\n",
"{'loss': 0.4108, 'grad_norm': 4.781991938451388, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.32}\n",
"{'loss': 0.4407, 'grad_norm': 5.893275628361186, 'learning_rate': 4.111111111111111e-05, 'epoch': 0.33}\n",
"{'loss': 0.4475, 'grad_norm': 4.100649312404707, 'learning_rate': 4.055555555555556e-05, 'epoch': 0.34}\n",
"{'loss': 0.4041, 'grad_norm': 7.0290388233232255, 'learning_rate': 4e-05, 'epoch': 0.35}\n",
"{'loss': 0.3599, 'grad_norm': 3.511374655086493, 'learning_rate': 3.944444444444445e-05, 'epoch': 0.37}\n",
"{'loss': 0.4706, 'grad_norm': 5.813953833114259, 'learning_rate': 3.888888888888889e-05, 'epoch': 0.38}\n",
"{'loss': 0.3911, 'grad_norm': 4.0524183329331604, 'learning_rate': 3.8333333333333334e-05, 'epoch': 0.39}\n",
"{'loss': 0.4033, 'grad_norm': 3.875046268309963, 'learning_rate': 3.777777777777778e-05, 'epoch': 0.41}\n",
"{'loss': 0.4199, 'grad_norm': 5.059711960144461, 'learning_rate': 3.722222222222222e-05, 'epoch': 0.42}\n",
"{'loss': 0.4216, 'grad_norm': 3.959248018825387, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.43}\n",
"{'loss': 0.367, 'grad_norm': 4.493383842056094, 'learning_rate': 3.611111111111111e-05, 'epoch': 0.44}\n",
"{'loss': 0.3686, 'grad_norm': 6.826580929267439, 'learning_rate': 3.555555555555556e-05, 'epoch': 0.46}\n",
"{'loss': 0.3566, 'grad_norm': 6.61801729550354, 'learning_rate': 3.5e-05, 'epoch': 0.47}\n",
"{'loss': 0.3932, 'grad_norm': 4.124116051492338, 'learning_rate': 3.444444444444445e-05, 'epoch': 0.48}\n",
"{'loss': 0.3514, 'grad_norm': 4.545406773056064, 'learning_rate': 3.388888888888889e-05, 'epoch': 0.49}\n",
"{'loss': 0.4364, 'grad_norm': 5.868492580695467, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.51}\n",
"{'loss': 0.346, 'grad_norm': 5.245615445258653, 'learning_rate': 3.277777777777778e-05, 'epoch': 0.52}\n",
"{'loss': 0.335, 'grad_norm': 3.6031965739940257, 'learning_rate': 3.222222222222223e-05, 'epoch': 0.53}\n",
"{'loss': 0.37, 'grad_norm': 5.240535743057915, 'learning_rate': 3.1666666666666666e-05, 'epoch': 0.54}\n",
"{'loss': 0.3732, 'grad_norm': 7.290964612314844, 'learning_rate': 3.111111111111111e-05, 'epoch': 0.56}\n",
"{'loss': 0.378, 'grad_norm': 5.352972449129333, 'learning_rate': 3.055555555555556e-05, 'epoch': 0.57}\n",
"{'loss': 0.3512, 'grad_norm': 3.2834858860521705, 'learning_rate': 3e-05, 'epoch': 0.58}\n",
"{'loss': 0.3963, 'grad_norm': 5.047726585891225, 'learning_rate': 2.9444444444444448e-05, 'epoch': 0.6}\n",
"{'loss': 0.3825, 'grad_norm': 3.6864211233732562, 'learning_rate': 2.8888888888888888e-05, 'epoch': 0.61}\n",
"{'loss': 0.3715, 'grad_norm': 4.97593217867295, 'learning_rate': 2.8333333333333335e-05, 'epoch': 0.62}\n",
"{'loss': 0.4358, 'grad_norm': 5.702141663942072, 'learning_rate': 2.777777777777778e-05, 'epoch': 0.63}\n",
" 50%|████████████████████ | 500/1000 [10:03<09:58, 1.20s/it]***** Running Evaluation *****\n",
" Num examples = 3000\n",
" Batch size = 128\n",
"\n",
" 0%| | 0/24 [00:00<?, ?it/s]\u001b[A\n",
" 8%|███▋ | 2/24 [00:01<00:19, 1.11it/s]\u001b[A\n",
" 12%|█████▌ | 3/24 [00:02<00:18, 1.11it/s]\u001b[A\n",
" 17%|███████▎ | 4/24 [00:03<00:17, 1.11it/s]\u001b[A\n",
" 21%|█████████▏ | 5/24 [00:04<00:17, 1.11it/s]\u001b[A\n",
" 25%|███████████ | 6/24 [00:05<00:16, 1.11it/s]\u001b[A\n",
" 29%|████████████▊ | 7/24 [00:06<00:15, 1.11it/s]\u001b[A\n",
" 33%|██████████████▋ | 8/24 [00:07<00:14, 1.11it/s]\u001b[A\n",
" 38%|████████████████▌ | 9/24 [00:08<00:13, 1.11it/s]\u001b[A\n",
" 42%|█████████████████▉ | 10/24 [00:08<00:12, 1.11it/s]\u001b[A\n",
" 46%|███████████████████▋ | 11/24 [00:09<00:11, 1.11it/s]\u001b[A\n",
" 50%|█████████████████████▌ | 12/24 [00:10<00:10, 1.11it/s]\u001b[A\n",
" 54%|███████████████████████▎ | 13/24 [00:11<00:09, 1.11it/s]\u001b[A\n",
" 58%|█████████████████████████ | 14/24 [00:12<00:08, 1.11it/s]\u001b[A\n",
" 62%|██████████████████████████▉ | 15/24 [00:13<00:08, 1.11it/s]\u001b[A\n",
" 67%|████████████████████████████▋ | 16/24 [00:14<00:07, 1.11it/s]\u001b[A\n",
" 71%|██████████████████████████████▍ | 17/24 [00:15<00:06, 1.11it/s]\u001b[A\n",
" 75%|████████████████████████████████▎ | 18/24 [00:16<00:05, 1.11it/s]\u001b[A\n",
" 79%|██████████████████████████████████ | 19/24 [00:17<00:04, 1.11it/s]\u001b[A\n",
" 83%|███████████████████████████████████▊ | 20/24 [00:17<00:03, 1.11it/s]\u001b[A\n",
" 88%|█████████████████████████████████████▋ | 21/24 [00:18<00:02, 1.11it/s]\u001b[A\n",
" 92%|███████████████████████████████████████▍ | 22/24 [00:19<00:01, 1.11it/s]\u001b[A\n",
" 96%|█████████████████████████████████████████▏ | 23/24 [00:20<00:00, 1.12it/s]\u001b[A\n",
" \u001b[A\n",
"\u001b[A{'eval_loss': 0.4814399480819702, 'eval_runtime': 23.5015, 'eval_samples_per_second': 127.651, 'eval_steps_per_second': 1.021, 'epoch': 0.63}\n",
" 50%|████████████████████ | 500/1000 [10:26<09:58, 1.20s/it]\n",
"100%|███████████████████████████████████████████| 24/24 [00:21<00:00, 1.22it/s]\u001b[A\n",
" \u001b[ASaving model checkpoint to output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500\n",
"tokenizer config file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/tokenizer_config.json\n",
"Special tokens file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/special_tokens_map.json\n",
"[2024-03-15 21:39:48,407] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step500 is about to be saved!\n",
"/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1876: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
" warnings.warn(\n",
"[2024-03-15 21:39:48,447] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt\n",
"[2024-03-15 21:39:48,447] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt...\n",
"[2024-03-15 21:39:48,455] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt.\n",
"[2024-03-15 21:39:48,455] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...\n",
"[2024-03-15 21:39:48,493] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.\n",
"[2024-03-15 21:39:48,493] [INFO] [engine.py:3488:_save_zero_checkpoint] zero checkpoint saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt\n",
"[2024-03-15 21:39:48,498] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step500 is ready now!\n",
"[2024-03-15 21:39:49,718] [WARNING] [stage3.py:2069:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time\n",
"{'loss': 0.4598, 'grad_norm': 6.5312558406821974, 'learning_rate': 2.7222222222222223e-05, 'epoch': 0.65}\n",
"{'loss': 0.355, 'grad_norm': 3.9302654106847914, 'learning_rate': 2.6666666666666667e-05, 'epoch': 0.66}\n",
"{'loss': 0.3781, 'grad_norm': 4.25997203692361, 'learning_rate': 2.6111111111111114e-05, 'epoch': 0.67}\n",
"{'loss': 0.3668, 'grad_norm': 3.5989513406349776, 'learning_rate': 2.5555555555555554e-05, 'epoch': 0.68}\n",
"{'loss': 0.3585, 'grad_norm': 3.6575850959103717, 'learning_rate': 2.5e-05, 'epoch': 0.7}\n",
"{'loss': 0.3674, 'grad_norm': 4.911812708486751, 'learning_rate': 2.4444444444444445e-05, 'epoch': 0.71}\n",
"{'loss': 0.368, 'grad_norm': 4.194735979358348, 'learning_rate': 2.3888888888888892e-05, 'epoch': 0.72}\n",
"{'loss': 0.3891, 'grad_norm': 3.5460606114800868, 'learning_rate': 2.3333333333333336e-05, 'epoch': 0.74}\n",
"{'loss': 0.3977, 'grad_norm': 3.150838310468473, 'learning_rate': 2.277777777777778e-05, 'epoch': 0.75}\n",
"{'loss': 0.3533, 'grad_norm': 3.9069432978502756, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.76}\n",
"{'loss': 0.3811, 'grad_norm': 5.105086367004499, 'learning_rate': 2.1666666666666667e-05, 'epoch': 0.77}\n",
"{'loss': 0.325, 'grad_norm': 4.369369589510735, 'learning_rate': 2.111111111111111e-05, 'epoch': 0.79}\n",
"{'loss': 0.3641, 'grad_norm': 6.171511559710524, 'learning_rate': 2.0555555555555555e-05, 'epoch': 0.8}\n",
"{'loss': 0.3316, 'grad_norm': 3.7044215769355313, 'learning_rate': 2e-05, 'epoch': 0.81}\n",
"{'loss': 0.3898, 'grad_norm': 3.788686076864363, 'learning_rate': 1.9444444444444445e-05, 'epoch': 0.82}\n",
"{'loss': 0.3732, 'grad_norm': 6.75853923792821, 'learning_rate': 1.888888888888889e-05, 'epoch': 0.84}\n",
"{'loss': 0.3827, 'grad_norm': 5.165864430975117, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.85}\n",
"{'loss': 0.3565, 'grad_norm': 3.535604172460323, 'learning_rate': 1.777777777777778e-05, 'epoch': 0.86}\n",
"{'loss': 0.3345, 'grad_norm': 3.633280931030727, 'learning_rate': 1.7222222222222224e-05, 'epoch': 0.87}\n",
"{'loss': 0.3639, 'grad_norm': 4.485584268777012, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.89}\n",
"{'loss': 0.402, 'grad_norm': 3.7925929660253317, 'learning_rate': 1.6111111111111115e-05, 'epoch': 0.9}\n",
"{'loss': 0.3452, 'grad_norm': 5.183220810399684, 'learning_rate': 1.5555555555555555e-05, 'epoch': 0.91}\n",
"{'loss': 0.3936, 'grad_norm': 9.733180087550997, 'learning_rate': 1.5e-05, 'epoch': 0.93}\n",
"{'loss': 0.3367, 'grad_norm': 5.1834921923924755, 'learning_rate': 1.4444444444444444e-05, 'epoch': 0.94}\n",
"{'loss': 0.3681, 'grad_norm': 4.409917292781669, 'learning_rate': 1.388888888888889e-05, 'epoch': 0.95}\n",
"{'loss': 0.3348, 'grad_norm': 3.335369553115092, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.96}\n",
"{'loss': 0.3972, 'grad_norm': 5.322684365694768, 'learning_rate': 1.2777777777777777e-05, 'epoch': 0.98}\n",
"{'loss': 0.3835, 'grad_norm': 6.105565593241867, 'learning_rate': 1.2222222222222222e-05, 'epoch': 0.99}\n",
"{'loss': 0.3916, 'grad_norm': 3.49133044485143, 'learning_rate': 1.1666666666666668e-05, 'epoch': 1.0}\n",
"{'loss': 0.3597, 'grad_norm': 2.888336925676786, 'learning_rate': 1.1111111111111112e-05, 'epoch': 1.01}\n",
"{'loss': 0.3304, 'grad_norm': 2.9537925974792714, 'learning_rate': 1.0555555555555555e-05, 'epoch': 1.03}\n",
"{'loss': 0.3392, 'grad_norm': 5.712451906231322, 'learning_rate': 1e-05, 'epoch': 1.04}\n",
"{'loss': 0.3393, 'grad_norm': 5.12273971212701, 'learning_rate': 9.444444444444445e-06, 'epoch': 1.05}\n",
"{'loss': 0.3018, 'grad_norm': 3.2845513584107033, 'learning_rate': 8.88888888888889e-06, 'epoch': 1.06}\n",
"{'loss': 0.3384, 'grad_norm': 3.2604963558968145, 'learning_rate': 8.333333333333334e-06, 'epoch': 1.08}\n",
"{'loss': 0.3252, 'grad_norm': 6.04878965518926, 'learning_rate': 7.777777777777777e-06, 'epoch': 1.09}\n",
"{'loss': 0.384, 'grad_norm': 5.226938733071884, 'learning_rate': 7.222222222222222e-06, 'epoch': 1.1}\n",
"{'loss': 0.2914, 'grad_norm': 3.8905566106093925, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.12}\n",
"{'loss': 0.2984, 'grad_norm': 3.3599598929872525, 'learning_rate': 6.111111111111111e-06, 'epoch': 1.13}\n",
"{'loss': 0.3459, 'grad_norm': 5.669365782344921, 'learning_rate': 5.555555555555556e-06, 'epoch': 1.14}\n",
"{'loss': 0.3393, 'grad_norm': 3.078993311756746, 'learning_rate': 5e-06, 'epoch': 1.15}\n",
"{'loss': 0.3314, 'grad_norm': 5.3827552737002495, 'learning_rate': 4.444444444444445e-06, 'epoch': 1.17}\n",
"{'loss': 0.3345, 'grad_norm': 3.2322873367016665, 'learning_rate': 3.888888888888889e-06, 'epoch': 1.18}\n",
"{'loss': 0.3363, 'grad_norm': 3.3300669560846425, 'learning_rate': 3.3333333333333333e-06, 'epoch': 1.19}\n",
"{'loss': 0.344, 'grad_norm': 3.7589742724407653, 'learning_rate': 2.777777777777778e-06, 'epoch': 1.2}\n",
"{'loss': 0.3195, 'grad_norm': 2.8061902793867626, 'learning_rate': 2.2222222222222225e-06, 'epoch': 1.22}\n",
"{'loss': 0.3128, 'grad_norm': 3.3215568095822516, 'learning_rate': 1.6666666666666667e-06, 'epoch': 1.23}\n",
"{'loss': 0.3035, 'grad_norm': 4.30331459929754, 'learning_rate': 1.1111111111111112e-06, 'epoch': 1.24}\n",
"{'loss': 0.3374, 'grad_norm': 3.9324447635716995, 'learning_rate': 5.555555555555556e-07, 'epoch': 1.25}\n",
"{'loss': 0.3254, 'grad_norm': 4.112509804571923, 'learning_rate': 0.0, 'epoch': 1.27}\n",
"100%|███████████████████████████████████████| 1000/1000 [20:30<00:00, 1.19s/it]***** Running Evaluation *****\n",
" Num examples = 3000\n",
" Batch size = 128\n",
"\n",
" 0%| | 0/24 [00:00<?, ?it/s]\u001b[A\n",
" 8%|███▋ | 2/24 [00:00<00:09, 2.23it/s]\u001b[A\n",
" 12%|█████▌ | 3/24 [00:01<00:13, 1.58it/s]\u001b[A\n",
" 17%|███████▎ | 4/24 [00:02<00:14, 1.37it/s]\u001b[A\n",
" 21%|█████████▏ | 5/24 [00:03<00:14, 1.27it/s]\u001b[A\n",
" 25%|███████████ | 6/24 [00:04<00:14, 1.21it/s]\u001b[A\n",
" 29%|████████████▊ | 7/24 [00:05<00:14, 1.18it/s]\u001b[A\n",
" 33%|██████████████▋ | 8/24 [00:06<00:13, 1.16it/s]\u001b[A\n",
" 38%|████████████████▌ | 9/24 [00:07<00:13, 1.14it/s]\u001b[A\n",
" 42%|█████████████████▉ | 10/24 [00:08<00:12, 1.13it/s]\u001b[A\n",
" 46%|███████████████████▋ | 11/24 [00:08<00:11, 1.13it/s]\u001b[A\n",
" 50%|█████████████████████▌ | 12/24 [00:09<00:10, 1.12it/s]\u001b[A\n",
" 54%|███████████████████████▎ | 13/24 [00:10<00:09, 1.12it/s]\u001b[A\n",
" 58%|█████████████████████████ | 14/24 [00:11<00:08, 1.12it/s]\u001b[A\n",
" 62%|██████████████████████████▉ | 15/24 [00:12<00:08, 1.12it/s]\u001b[A\n",
" 67%|████████████████████████████▋ | 16/24 [00:13<00:07, 1.12it/s]\u001b[A\n",
" 71%|██████████████████████████████▍ | 17/24 [00:14<00:06, 1.11it/s]\u001b[A\n",
" 75%|████████████████████████████████▎ | 18/24 [00:15<00:05, 1.12it/s]\u001b[A\n",
" 79%|██████████████████████████████████ | 19/24 [00:16<00:04, 1.12it/s]\u001b[A\n",
" 83%|███████████████████████████████████▊ | 20/24 [00:17<00:03, 1.12it/s]\u001b[A\n",
" 88%|█████████████████████████████████████▋ | 21/24 [00:17<00:02, 1.12it/s]\u001b[A\n",
" 92%|███████████████████████████████████████▍ | 22/24 [00:18<00:01, 1.12it/s]\u001b[A\n",
" 96%|█████████████████████████████████████████▏ | 23/24 [00:19<00:00, 1.12it/s]\u001b[A\n",
" \u001b[A\n",
"\u001b[A{'eval_loss': 0.414621502161026, 'eval_runtime': 21.2011, 'eval_samples_per_second': 141.502, 'eval_steps_per_second': 1.132, 'epoch': 1.27}\n",
"100%|███████████████████████████████████████| 1000/1000 [20:52<00:00, 1.19s/it]\n",
"100%|███████████████████████████████████████████| 24/24 [00:20<00:00, 1.22it/s]\u001b[A\n",
" \u001b[ASaving model checkpoint to output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000\n",
"tokenizer config file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/tokenizer_config.json\n",
"Special tokens file saved in output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/special_tokens_map.json\n",
"[2024-03-15 21:50:12,793] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is about to be saved!\n",
"/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1876: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
" warnings.warn(\n",
"[2024-03-15 21:50:12,809] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt\n",
"[2024-03-15 21:50:12,809] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt...\n",
"[2024-03-15 21:50:12,817] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt.\n",
"[2024-03-15 21:50:12,818] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...\n",
"[2024-03-15 21:50:12,851] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.\n",
"[2024-03-15 21:50:12,852] [INFO] [engine.py:3488:_save_zero_checkpoint] zero checkpoint saved output/ocnli_public_chatml/20240315212836/tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt\n",
"[2024-03-15 21:50:12,856] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1000 is ready now!\n",
"\n",
"\n",
"Training completed. Do not forget to share your model on huggingface.co/models =)\n",
"\n",
"\n",
"{'train_runtime': 1255.2202, 'train_samples_per_second': 50.987, 'train_steps_per_second': 0.797, 'train_loss': 0.43027476024627687, 'epoch': 1.27}\n",
"100%|███████████████████████████████████████| 1000/1000 [20:55<00:00, 1.26s/it]\n",
"[2024-03-15 21:50:18,203] [INFO] [launch.py:348:main] Process 86577 exits successfully.\n"
]
}
],
"source": [
"!bash lora_finetune_ocnli.sh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. 推理验证"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import torch\n",
"from tqdm import tqdm\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"path = \"output/ocnli_public_chatml/20240316002856/checkpoint-1500\"\n",
"tokenizer = AutoTokenizer.from_pretrained(path)\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" path, torch_dtype=torch.bfloat16, device_map=\"cuda\", trust_remote_code=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
]
},
{
"data": {
"text/plain": [
"('entailment',\n",
" [{'role': 'user',\n",
" 'content': '<用户>请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种?\\n句子1: 身上裹一件工厂发的棉大衣,手插在袖筒里\\n句子2:身上至少一件衣服\\n<AI>'},\n",
" {'role': 'assistant', 'content': 'entailment'}])"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"res, history = model.chat(tokenizer, query=\"<用户>请判断下边两个句子的关系属于 [entailment, neutral, contradiction]中的哪一种?\\n句子1: 身上裹一件工厂发的棉大衣,手插在袖筒里\\n句子2:身上至少一件衣服\\n<AI>\")\n",
"res, history"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"with open(\"data/ocnli_public_chatml/dev.json\", 'r') as f:\n",
" dev_sample_list = json.load(f)\n"
]
},
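{
"cell_type": "markdown",
"metadata": {},
"source": [
"Judging from the progress output below, the next cell scores roughly 500 dev samples with `model.chat`. For reference, here is a minimal sketch of such an accuracy loop; it assumes `model.chat` returns the predicted label string, and the helper name `evaluate_accuracy` is hypothetical rather than part of the original notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch: score dev samples and report label accuracy.\n",
"# Assumes model, tokenizer and dev_sample_list from the cells above.\n",
"def evaluate_accuracy(samples):\n",
"    correct = 0\n",
"    for sample in tqdm(samples):\n",
"        query = \"<用户>\" + sample[\"messages\"][0][\"content\"] + \"<AI>\"\n",
"        pred, _ = model.chat(tokenizer, query=query)\n",
"        correct += int(pred.strip() == sample[\"messages\"][1][\"content\"])\n",
"    return correct / len(samples)\n",
"\n",
"# evaluate_accuracy(dev_sample_list[:500])\n"
]
},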
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/500 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 0%| | 1/500 [00:00<00:54, 9.12it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 0%| | 2/500 [00:00<00:54, 9.09it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 1%| | 3/500 [00:00<00:55, 8.98it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 1%| | 5/500 [00:00<00:49, 9.99it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 1%| | 6/500 [00:00<00:51, 9.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 2%|▏ | 8/500 [00:00<00:44, 11.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 2%|▏ | 10/500 [00:00<00:48, 10.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 2%|▏ | 12/500 [00:01<00:47, 10.30it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 3%|▎ | 14/500 [00:01<00:46, 10.51it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 3%|▎ | 16/500 [00:01<00:42, 11.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 4%|▎ | 18/500 [00:01<00:40, 11.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 4%|▍ | 20/500 [00:01<00:38, 12.42it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 4%|▍ | 22/500 [00:02<00:42, 11.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 5%|▍ | 24/500 [00:02<00:40, 11.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 5%|▌ | 26/500 [00:02<00:38, 12.32it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 6%|▌ | 28/500 [00:02<00:39, 11.87it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 6%|▌ | 30/500 [00:02<00:40, 11.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 6%|▋ | 32/500 [00:02<00:41, 11.34it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 7%|▋ | 34/500 [00:03<00:39, 11.94it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 7%|▋ | 36/500 [00:03<00:37, 12.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 8%|▊ | 38/500 [00:03<00:38, 11.92it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 8%|▊ | 40/500 [00:03<00:37, 12.37it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 8%|▊ | 42/500 [00:03<00:36, 12.71it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 9%|▉ | 44/500 [00:03<00:40, 11.39it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 9%|▉ | 46/500 [00:04<00:37, 11.98it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 10%|▉ | 48/500 [00:04<00:38, 11.66it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 10%|█ | 50/500 [00:04<00:36, 12.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 10%|█ | 52/500 [00:04<00:35, 12.58it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 11%|█ | 54/500 [00:04<00:34, 12.87it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 11%|█ | 56/500 [00:04<00:34, 13.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 12%|█▏ | 58/500 [00:05<00:38, 11.58it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 12%|█▏ | 60/500 [00:05<00:38, 11.39it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 12%|█▏ | 62/500 [00:05<00:36, 11.97it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 13%|█▎ | 64/500 [00:05<00:35, 12.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 13%|█▎ | 66/500 [00:05<00:36, 11.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 14%|█▎ | 68/500 [00:05<00:37, 11.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 14%|█▍ | 70/500 [00:06<00:39, 10.75it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 14%|█▍ | 72/500 [00:06<00:41, 10.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 15%|█▍ | 74/500 [00:06<00:40, 10.44it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 15%|█▌ | 76/500 [00:06<00:38, 11.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 16%|█▌ | 78/500 [00:06<00:36, 11.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 16%|█▌ | 80/500 [00:06<00:37, 11.35it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 16%|█▋ | 82/500 [00:07<00:37, 11.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 17%|█▋ | 84/500 [00:07<00:37, 11.15it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 17%|█▋ | 86/500 [00:07<00:37, 11.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 18%|█▊ | 88/500 [00:07<00:37, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 18%|█▊ | 90/500 [00:07<00:34, 11.72it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 18%|█▊ | 92/500 [00:08<00:35, 11.50it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 19%|█▉ | 94/500 [00:08<00:33, 12.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 19%|█▉ | 96/500 [00:08<00:34, 11.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 20%|█▉ | 98/500 [00:08<00:32, 12.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 20%|██ | 100/500 [00:08<00:33, 11.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 20%|██ | 102/500 [00:08<00:34, 11.57it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 21%|██ | 104/500 [00:08<00:32, 12.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 21%|██ | 106/500 [00:09<00:31, 12.53it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 22%|██▏ | 108/500 [00:09<00:30, 12.85it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 22%|██▏ | 110/500 [00:09<00:29, 13.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 22%|██▏ | 112/500 [00:09<00:29, 13.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 23%|██▎ | 114/500 [00:09<00:28, 13.35it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 23%|██▎ | 116/500 [00:09<00:28, 13.43it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 24%|██▎ | 118/500 [00:10<00:31, 12.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 24%|██▍ | 120/500 [00:10<00:33, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 24%|██▍ | 122/500 [00:10<00:33, 11.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 25%|██▍ | 124/500 [00:10<00:31, 11.81it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 25%|██▌ | 126/500 [00:10<00:30, 12.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 26%|██▌ | 128/500 [00:10<00:31, 11.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 26%|██▌ | 130/500 [00:11<00:30, 12.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 26%|██▋ | 132/500 [00:11<00:29, 12.63it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 27%|██▋ | 134/500 [00:11<00:32, 11.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 27%|██▋ | 136/500 [00:11<00:34, 10.62it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 28%|██▊ | 138/500 [00:11<00:33, 10.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 28%|██▊ | 140/500 [00:12<00:33, 10.80it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 28%|██▊ | 142/500 [00:12<00:31, 11.52it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 29%|██▉ | 144/500 [00:12<00:31, 11.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 29%|██▉ | 146/500 [00:12<00:29, 11.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 30%|██▉ | 148/500 [00:12<00:30, 11.65it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 30%|███ | 150/500 [00:12<00:28, 12.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 30%|███ | 152/500 [00:12<00:27, 12.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 31%|███ | 154/500 [00:13<00:28, 12.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 31%|███ | 156/500 [00:13<00:27, 12.49it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 32%|███▏ | 158/500 [00:13<00:26, 12.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 32%|███▏ | 160/500 [00:13<00:27, 12.21it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 32%|███▏ | 162/500 [00:13<00:30, 11.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 33%|███▎ | 164/500 [00:14<00:30, 11.07it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 33%|███▎ | 166/500 [00:14<00:28, 11.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 34%|███▎ | 168/500 [00:14<00:27, 12.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 34%|███▍ | 170/500 [00:14<00:29, 11.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 34%|███▍ | 172/500 [00:14<00:29, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 35%|███▍ | 174/500 [00:14<00:27, 11.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 35%|███▌ | 176/500 [00:15<00:26, 12.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 36%|███▌ | 178/500 [00:15<00:25, 12.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 36%|███▌ | 180/500 [00:15<00:24, 12.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 36%|███▋ | 182/500 [00:15<00:26, 12.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 37%|███▋ | 184/500 [00:15<00:26, 11.79it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 37%|███▋ | 186/500 [00:15<00:25, 12.26it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 38%|███▊ | 188/500 [00:16<00:26, 11.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 38%|███▊ | 190/500 [00:16<00:28, 10.88it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 38%|███▊ | 192/500 [00:16<00:26, 11.57it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 39%|███▉ | 194/500 [00:16<00:26, 11.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 39%|███▉ | 196/500 [00:16<00:27, 11.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 40%|███▉ | 198/500 [00:16<00:25, 11.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 40%|████ | 200/500 [00:17<00:24, 12.21it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 40%|████ | 202/500 [00:17<00:23, 12.59it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 41%|████ | 204/500 [00:17<00:24, 12.04it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 41%|████ | 206/500 [00:17<00:25, 11.69it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 42%|████▏ | 208/500 [00:17<00:25, 11.45it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 42%|████▏ | 210/500 [00:17<00:25, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 42%|████▏ | 212/500 [00:18<00:24, 11.86it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 43%|████▎ | 214/500 [00:18<00:24, 11.52it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 43%|████▎ | 216/500 [00:18<00:23, 12.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 44%|████▎ | 218/500 [00:18<00:25, 11.02it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 44%|████▍ | 220/500 [00:18<00:23, 11.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 44%|████▍ | 222/500 [00:18<00:24, 11.45it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 45%|████▍ | 224/500 [00:19<00:25, 10.64it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 45%|████▌ | 226/500 [00:19<00:25, 10.73it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 46%|████▌ | 228/500 [00:19<00:25, 10.79it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 46%|████▌ | 230/500 [00:19<00:24, 10.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 46%|████▋ | 232/500 [00:19<00:24, 10.86it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 47%|████▋ | 234/500 [00:20<00:26, 10.16it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 47%|████▋ | 236/500 [00:20<00:25, 10.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 48%|████▊ | 238/500 [00:20<00:23, 11.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 48%|████▊ | 240/500 [00:20<00:22, 11.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 48%|████▊ | 242/500 [00:20<00:23, 10.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 49%|████▉ | 244/500 [00:21<00:22, 11.53it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 49%|████▉ | 246/500 [00:21<00:23, 10.71it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 50%|████▉ | 248/500 [00:21<00:24, 10.22it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 50%|█████ | 250/500 [00:21<00:22, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 50%|█████ | 252/500 [00:21<00:21, 11.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 51%|█████ | 254/500 [00:21<00:20, 12.27it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 51%|█████ | 256/500 [00:22<00:20, 11.81it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 52%|█████▏ | 258/500 [00:22<00:19, 12.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 52%|█████▏ | 260/500 [00:22<00:19, 12.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 52%|█████▏ | 262/500 [00:22<00:18, 12.82it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 53%|█████▎ | 264/500 [00:22<00:19, 12.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 53%|█████▎ | 266/500 [00:22<00:18, 12.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 54%|█████▎ | 268/500 [00:22<00:18, 12.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 54%|█████▍ | 270/500 [00:23<00:17, 13.04it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 54%|█████▍ | 272/500 [00:23<00:17, 13.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 55%|█████▍ | 274/500 [00:23<00:17, 13.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 55%|█████▌ | 276/500 [00:23<00:17, 12.48it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 56%|█████▌ | 278/500 [00:23<00:17, 12.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 56%|█████▌ | 280/500 [00:23<00:17, 12.94it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 56%|█████▋ | 282/500 [00:24<00:17, 12.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 57%|█████▋ | 284/500 [00:24<00:17, 12.51it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 57%|█████▋ | 286/500 [00:24<00:18, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 58%|█████▊ | 288/500 [00:24<00:18, 11.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 58%|█████▊ | 290/500 [00:24<00:20, 10.43it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 58%|█████▊ | 292/500 [00:25<00:19, 10.57it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 59%|█████▉ | 294/500 [00:25<00:19, 10.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 59%|█████▉ | 296/500 [00:25<00:17, 11.39it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 60%|█████▉ | 298/500 [00:25<00:17, 11.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 60%|██████ | 300/500 [00:25<00:17, 11.15it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 60%|██████ | 302/500 [00:25<00:17, 11.09it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 61%|██████ | 304/500 [00:26<00:16, 11.72it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 61%|██████ | 306/500 [00:26<00:16, 11.47it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 62%|██████▏ | 308/500 [00:26<00:16, 11.31it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 62%|██████▏ | 310/500 [00:26<00:16, 11.85it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 62%|██████▏ | 312/500 [00:26<00:16, 11.47it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 63%|██████▎ | 314/500 [00:26<00:16, 11.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 63%|██████▎ | 316/500 [00:27<00:16, 11.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 64%|██████▎ | 318/500 [00:27<00:16, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 64%|██████▍ | 320/500 [00:27<00:15, 11.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 64%|██████▍ | 322/500 [00:27<00:14, 12.19it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 65%|██████▍ | 324/500 [00:27<00:13, 12.58it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 65%|██████▌ | 326/500 [00:27<00:14, 12.04it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 66%|██████▌ | 328/500 [00:28<00:13, 12.46it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 66%|██████▌ | 330/500 [00:28<00:14, 11.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 66%|██████▋ | 332/500 [00:28<00:13, 12.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 67%|██████▋ | 334/500 [00:28<00:14, 11.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 67%|██████▋ | 336/500 [00:28<00:13, 12.30it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 68%|██████▊ | 338/500 [00:28<00:13, 11.78it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 68%|██████▊ | 340/500 [00:29<00:13, 12.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 68%|██████▊ | 342/500 [00:29<00:12, 12.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 69%|██████▉ | 344/500 [00:29<00:12, 12.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 69%|██████▉ | 346/500 [00:29<00:13, 11.68it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 70%|██████▉ | 348/500 [00:29<00:14, 10.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 70%|███████ | 350/500 [00:30<00:14, 10.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 70%|███████ | 352/500 [00:30<00:14, 10.43it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 71%|███████ | 354/500 [00:30<00:13, 10.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 71%|███████ | 356/500 [00:30<00:12, 11.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 72%|███████▏ | 358/500 [00:30<00:13, 10.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 72%|███████▏ | 360/500 [00:31<00:13, 10.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 72%|███████▏ | 362/500 [00:31<00:12, 10.77it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 73%|███████▎ | 364/500 [00:31<00:12, 11.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 73%|███████▎ | 366/500 [00:31<00:12, 10.81it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 74%|███████▎ | 368/500 [00:31<00:13, 9.74it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 74%|███████▍ | 370/500 [00:31<00:12, 10.21it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 74%|███████▍ | 372/500 [00:32<00:12, 9.95it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 75%|███████▍ | 374/500 [00:32<00:12, 10.08it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 75%|███████▌ | 376/500 [00:32<00:11, 10.46it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 76%|███████▌ | 378/500 [00:32<00:11, 10.17it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 76%|███████▌ | 380/500 [00:32<00:11, 10.26it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 76%|███████▋ | 382/500 [00:33<00:11, 10.05it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 77%|███████▋ | 384/500 [00:33<00:11, 10.47it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 77%|███████▋ | 386/500 [00:33<00:10, 10.96it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 78%|███████▊ | 388/500 [00:33<00:10, 10.85it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 78%|███████▊ | 390/500 [00:33<00:10, 10.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 78%|███████▊ | 392/500 [00:34<00:10, 10.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 79%|███████▉ | 394/500 [00:34<00:09, 11.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 79%|███████▉ | 396/500 [00:34<00:08, 11.67it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 80%|███████▉ | 398/500 [00:34<00:08, 12.15it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 80%|████████ | 400/500 [00:34<00:09, 11.03it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 80%|████████ | 402/500 [00:34<00:08, 11.54it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 81%|████████ | 404/500 [00:35<00:08, 11.31it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 81%|████████ | 406/500 [00:35<00:07, 11.87it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 82%|████████▏ | 408/500 [00:35<00:07, 12.30it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 82%|████████▏ | 410/500 [00:35<00:07, 11.83it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 82%|████████▏ | 412/500 [00:35<00:08, 10.84it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 83%|████████▎ | 414/500 [00:35<00:07, 11.50it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 83%|████████▎ | 416/500 [00:36<00:06, 12.02it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 84%|████████▎ | 418/500 [00:36<00:07, 11.65it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 84%|████████▍ | 420/500 [00:36<00:06, 12.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 84%|████████▍ | 422/500 [00:36<00:06, 11.69it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 85%|████████▍ | 424/500 [00:36<00:06, 12.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 85%|████████▌ | 426/500 [00:36<00:06, 11.60it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 86%|████████▌ | 428/500 [00:37<00:05, 12.09it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 86%|████████▌ | 430/500 [00:37<00:05, 12.46it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 86%|████████▋ | 432/500 [00:37<00:06, 11.20it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 87%|████████▋ | 434/500 [00:37<00:05, 11.79it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 87%|████████▋ | 436/500 [00:37<00:05, 12.26it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 88%|████████▊ | 438/500 [00:37<00:04, 12.62it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 88%|████████▊ | 440/500 [00:38<00:04, 12.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 88%|████████▊ | 442/500 [00:38<00:04, 12.42it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 89%|████████▉ | 444/500 [00:38<00:04, 12.72it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 89%|████████▉ | 446/500 [00:38<00:04, 12.94it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 90%|████████▉ | 448/500 [00:38<00:03, 13.10it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 90%|█████████ | 450/500 [00:38<00:04, 12.36it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 90%|█████████ | 452/500 [00:39<00:04, 11.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 91%|█████████ | 454/500 [00:39<00:03, 12.33it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 91%|█████████ | 456/500 [00:39<00:03, 12.66it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 92%|█████████▏| 458/500 [00:39<00:03, 12.08it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 92%|█████████▏| 460/500 [00:39<00:03, 12.49it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 92%|█████████▏| 462/500 [00:39<00:03, 11.24it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 93%|█████████▎| 464/500 [00:40<00:03, 11.14it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 93%|█████████▎| 466/500 [00:40<00:03, 10.38it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 94%|█████████▎| 468/500 [00:40<00:03, 10.50it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 94%|█████████▍| 470/500 [00:40<00:02, 11.25it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 94%|█████████▍| 472/500 [00:40<00:02, 11.11it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 95%|█████████▍| 474/500 [00:40<00:02, 11.64it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 95%|█████████▌| 476/500 [00:41<00:02, 11.37it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 96%|█████████▌| 478/500 [00:41<00:01, 11.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 96%|█████████▌| 480/500 [00:41<00:01, 12.29it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 96%|█████████▋| 482/500 [00:41<00:01, 12.63it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 97%|█████████▋| 484/500 [00:41<00:01, 12.89it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 97%|█████████▋| 486/500 [00:41<00:01, 12.23it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 98%|█████████▊| 488/500 [00:42<00:01, 11.80it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 98%|█████████▊| 490/500 [00:42<00:00, 12.28it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 98%|█████████▊| 492/500 [00:42<00:00, 11.12it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 99%|█████████▉| 494/500 [00:42<00:00, 11.06it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
" 99%|█████████▉| 496/500 [00:42<00:00, 10.40it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"100%|█████████▉| 498/500 [00:43<00:00, 10.55it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
"100%|██████████| 500/500 [00:43<00:00, 11.56it/s]\n"
]
}
],
"source": [
"pos = 0\n",
"neg = 0\n",
"for sample in tqdm(dev_sample_list[:500]):\n",
" res, history = model.chat(tokenizer, query=\"<用户>{}<AI>\".format(sample[\"messages\"][0][\"content\"]), max_length=128, top_p=0.5, temperature=0.8)\n",
" if sample[\"messages\"][1][\"content\"] in res.strip().lower():\n",
" pos += 1\n",
" else:\n",
" neg += 1"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.81, 405, 95)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pos / (pos+neg), pos, neg"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
formatted_time=$(date +"%Y%m%d%H%M%S")
echo $formatted_time
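# Single-GPU (GPU 1) LoRA fine-tuning of MiniCPM-2B on the OCNLI ChatML data with DeepSpeed ZeRO-3 offload;
# a timestamped run directory is created under output/OCNLILoRA/ and evaluation/checkpointing happen every 500 steps.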
deepspeed --include localhost:1 --master_port 19888 finetune.py \
--model_name_or_path MiniCPM-2B-sft-bf16 \
--output_dir output/OCNLILoRA/$formatted_time/ \
--train_data_path data/ocnli_public_chatml/train.json \
--eval_data_path data/ocnli_public_chatml/dev.json \
--learning_rate 5e-5 --per_device_train_batch_size 80 \
--per_device_eval_batch_size 128 --model_max_length 128 --bf16 --use_lora \
--gradient_accumulation_steps 1 --warmup_steps 100 \
--max_steps 1000 --weight_decay 0.01 \
--evaluation_strategy steps --eval_steps 500 \
--save_strategy steps --save_steps 500 --seed 42 \
--log_level info --logging_strategy steps --logging_steps 10 \
--deepspeed configs/ds_config_zero3_offload.json
# Copyright © 2023-2024 Apple Inc.
"""
This script demonstrates how to fine-tune MiniCPM-2B with LoRA on the AdvertiseGen dataset using MLX.
The code is adapted from https://github.com/ml-explore/mlx-examples.
The model used is https://huggingface.co/mlx-community/MiniCPM-2B-sft-bf16-llama-format-mlx.
Run this script with the following commands:
train:
First, preprocess the data by running data_processing.ipynb, then run:
python mlx_finetune.py --model MiniCPM-2B-sft-bf16-llama-format-mlx --data data/mlx_AdvertiseGen --train --seed 2024 --iters 500
The output looks like:
Training
Iter 1: Val loss 4.015, Val took 1067.669s
Iter 2: Val loss 4.001, Val took 1061.649s
...
After training finishes, an adapters.npz file is written to the folder and is used for the subsequent test. Then run the test command:
test:
python mlx_finetune.py --model MiniCPM-2B-sft-bf16-llama-format-mlx --data data/mlx_AdvertiseGen --test --seed 2024
The output looks like:
Testing
Test loss 3.977, Test ppl 53.350.
"""
import argparse
import json
import time
from pathlib import Path
from typing import Generator
import transformers
import numpy as np
from huggingface_hub import snapshot_download
import glob
import inspect
import math
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union
from mlx.utils import tree_flatten, tree_unflatten
import mlx.optimizers as optim
import mlx.core as mx
import mlx.nn as nn
@dataclass
class ModelArgs:
hidden_size: int
num_hidden_layers: int
intermediate_size: int
num_attention_heads: int
rms_norm_eps: float
vocab_size: int
num_key_value_heads: int = None
rope_theta: float = 10000
rope_traditional: bool = False
model_type: str = None
rope_scaling: Optional[Dict[str, Union[float, str]]] = None
def __post_init__(self):
if self.num_key_value_heads is None:
self.num_key_value_heads = self.num_attention_heads
if self.rope_scaling:
required_keys = {"factor", "type"}
if not all(key in self.rope_scaling for key in required_keys):
raise ValueError(f"rope_scaling must contain keys {required_keys}")
if self.rope_scaling["type"] != "linear":
raise ValueError("rope_scaling 'type' currently only supports 'linear'")
@classmethod
def from_dict(cls, params):
return cls(
**{
k: v
for k, v in params.items()
if k in inspect.signature(cls).parameters
}
)
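# LoRA adapter around an nn.Linear: the forward pass computes
# y = linear(x) + scale * (x @ lora_a) @ lora_b, so the low-rank factors carry the adaptation.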
class LoRALinear(nn.Module):
@staticmethod
def from_linear(linear: nn.Linear, rank: int = 8):
# TODO remove when input_dims and output_dims are attributes
# on linear and quantized linear
output_dims, input_dims = linear.weight.shape
if isinstance(linear, nn.QuantizedLinear):
input_dims *= 32 // linear.bits
lora_lin = LoRALinear(input_dims, output_dims, rank)
lora_lin.linear = linear
return lora_lin
def to_linear(self):
linear = self.linear
bias = "bias" in linear
weight = linear.weight
is_quantized = isinstance(linear, nn.QuantizedLinear)
# Use the same type as the linear weight if not quantized
dtype = weight.dtype
if is_quantized:
dtype = mx.float16
weight = mx.dequantize(
weight,
linear.scales,
linear.biases,
linear.group_size,
linear.bits,
)
output_dims, input_dims = weight.shape
fused_linear = nn.Linear(input_dims, output_dims, bias=bias)
lora_b = (self.scale * self.lora_b.T).astype(dtype)
lora_a = self.lora_a.T.astype(dtype)
fused_linear.weight = weight + lora_b @ lora_a
if bias:
fused_linear.bias = linear.bias
if is_quantized:
fused_linear = nn.QuantizedLinear.from_linear(
fused_linear,
linear.group_size,
linear.bits,
)
return fused_linear
def __init__(
self,
input_dims: int,
output_dims: int,
lora_rank: int = 8,
bias: bool = False,
scale: float = 20.0,
):
super().__init__()
# Regular linear layer weights
self.linear = nn.Linear(input_dims, output_dims, bias=bias)
# Scale for low-rank update
self.scale = scale
# Low rank lora weights
scale = 1 / math.sqrt(input_dims)
self.lora_a = mx.random.uniform(
low=-scale,
high=scale,
shape=(input_dims, lora_rank),
)
self.lora_b = mx.zeros(shape=(lora_rank, output_dims))
def __call__(self, x):
dtype = self.linear.weight.dtype
if isinstance(self.linear, nn.QuantizedLinear):
dtype = self.linear.scales.dtype
y = self.linear(x.astype(dtype))
z = (x @ self.lora_a) @ self.lora_b
return y + self.scale * z
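# Grouped-query attention with rotary position embeddings: n_heads query heads
# attend over n_kv_heads key/value heads, with an optional KV cache for generation.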
class Attention(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
dim = args.hidden_size
self.n_heads = n_heads = args.num_attention_heads
self.n_kv_heads = n_kv_heads = args.num_key_value_heads
self.repeats = n_heads // n_kv_heads
head_dim = args.hidden_size // n_heads
self.scale = head_dim ** -0.5
self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=False)
self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=False)
rope_scale = (
1 / args.rope_scaling["factor"]
if args.rope_scaling is not None and args.rope_scaling["type"] == "linear"
else 1
)
self.rope = nn.RoPE(
head_dim,
traditional=args.rope_traditional,
base=args.rope_theta,
scale=rope_scale,
)
def __call__(
self,
x: mx.array,
mask: Optional[mx.array] = None,
cache: Optional[Tuple[mx.array, mx.array]] = None,
) -> mx.array:
B, L, D = x.shape
queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)
# Prepare the queries, keys and values for the attention computation
queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)
keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = mx.concatenate([key_cache, keys], axis=2)
values = mx.concatenate([value_cache, values], axis=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
output = mx.fast.scaled_dot_product_attention(
queries, keys, values, scale=self.scale, mask=mask
)
output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
return self.o_proj(output), (keys, values)
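# SwiGLU-style feed-forward: down_proj(silu(gate_proj(x)) * up_proj(x)).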
class MLP(nn.Module):
def __init__(self, dim, hidden_dim):
super().__init__()
self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
def __call__(self, x) -> mx.array:
return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))
class TransformerBlock(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.num_attention_heads = args.num_attention_heads
self.hidden_size = args.hidden_size
self.self_attn = Attention(args)
self.mlp = MLP(args.hidden_size, args.intermediate_size)
self.input_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
self.post_attention_layernorm = nn.RMSNorm(
args.hidden_size, eps=args.rms_norm_eps
)
self.args = args
def __call__(
self,
x: mx.array,
mask: Optional[mx.array] = None,
cache: Optional[Tuple[mx.array, mx.array]] = None,
) -> mx.array:
r, cache = self.self_attn(self.input_layernorm(x), mask, cache)
h = x + r
r = self.mlp(self.post_attention_layernorm(h))
out = h + r
return out, cache
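# Decoder-only backbone: token embedding, a stack of TransformerBlocks with a causal mask, and a final RMSNorm.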
class LlamaModel(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.args = args
self.vocab_size = args.vocab_size
self.num_hidden_layers = args.num_hidden_layers
assert self.vocab_size > 0
self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
self.layers = [
TransformerBlock(args=args) for _ in range(args.num_hidden_layers)
]
self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
def __call__(
self,
inputs: mx.array,
cache=None,
):
h = self.embed_tokens(inputs)
mask = None
if h.shape[1] > 1:
mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])
mask = mask.astype(h.dtype)
if cache is None:
cache = [None] * len(self.layers)
for e, layer in enumerate(self.layers):
h, cache[e] = layer(h, mask, cache[e])
return self.norm(h), cache
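# Causal LM head: the Llama-format backbone followed by a separate lm_head projection to vocabulary logits.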
class Model(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.model = LlamaModel(args)
self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
def __call__(
self,
inputs: mx.array,
cache=None,
):
out, cache = self.model(inputs, cache)
return self.lm_head(out), cache
def build_parser():
parser = argparse.ArgumentParser(description="LoRA or QLoRA finetuning.")
parser.add_argument(
"--model",
default="/Users/liudan/Downloads/模型/llamaformat_minicpm",
help="The path to the local model directory or Hugging Face repo.",
)
# Generation args
parser.add_argument(
"--max-tokens",
"-m",
type=int,
default=100,
help="The maximum number of tokens to generate",
)
parser.add_argument(
"--temp", type=float, default=0.8, help="The sampling temperature"
)
parser.add_argument(
"--prompt",
"-p",
type=str,
help="The prompt for generation"
)
# Training args
parser.add_argument(
"--train",
action="store_true",
help="Do training",
)
parser.add_argument(
"--data",
type=str,
default="data/mlx_AdvertiseGen",
help="Directory with {train, valid, test}.json files",
)
parser.add_argument(
"--lora-layers",
type=int,
default=16,
help="Number of layers to fine-tune",
)
parser.add_argument("--batch-size", type=int, default=4, help="Minibatch size.")
parser.add_argument(
"--iters", type=int, default=1000, help="Iterations to train for."
)
parser.add_argument(
"--val-batches",
type=int,
default=25,
help="Number of validation batches, -1 uses the entire validation set.",
)
parser.add_argument(
"--learning-rate", type=float, default=1e-5, help="Adam learning rate."
)
parser.add_argument(
"--steps-per-report",
type=int,
default=10,
help="Number of training steps between loss reporting.",
)
parser.add_argument(
"--steps-per-eval",
type=int,
default=200,
help="Number of training steps between validations.",
)
parser.add_argument(
"--resume-adapter-file",
type=str,
default=None,
help="Load path to resume training with the given adapter weights.",
)
parser.add_argument(
"--adapter-file",
type=str,
default="adapters.npz",
help="Save/load path for the trained adapter weights.",
)
parser.add_argument(
"--save-every",
type=int,
default=100,
help="Save the model every N iterations.",
)
parser.add_argument(
"--test",
action="store_true",
help="Evaluate on the test set after training",
)
parser.add_argument(
"--test-batches",
type=int,
default=500,
help="Number of test set batches, -1 uses the entire test set.",
)
parser.add_argument("--seed", type=int, default=0, help="The PRNG seed")
return parser
class ConversationDataset:
def __init__(self, path: Path):
with open(path, "r") as fid:
self._data = [json.loads(l) for l in fid]
def __getitem__(self, idx: int):
entry = self._data[idx]
content = entry.get("input", "")
summary = entry.get("output", "")
prompt = entry.get("prompt", "")
return prompt, content, summary
def __len__(self):
return len(self._data)
def load(args):
def load_and_check(name):
dataset_path = Path(args.data) / f"{name}.json"
try:
return ConversationDataset(dataset_path)
except Exception as e:
print(f"Unable to build dataset {dataset_path} ({e})")
raise
names = ("train", "dev", "dev")
train, valid, test = (load_and_check(n) for n in names)
if args.train and len(train) == 0:
raise ValueError(
"Training set not found or empty. Must provide training set for fine-tuning."
)
if args.train and len(valid) == 0:
raise ValueError(
"Validation set not found or empty. Must provide validation set for fine-tuning."
)
if args.test and len(test) == 0:
raise ValueError(
"Test set not found or empty. Must provide test set for evaluation."
)
return train, valid, test
def loss(model, inputs, targets, lengths):
logits, _ = model(inputs)
logits = logits.astype(mx.float32)
length_mask = mx.arange(inputs.shape[1])[None, :] < lengths[:, None]
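    # Mask out padded positions so only real tokens contribute to the loss.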
ce = nn.losses.cross_entropy(logits, targets) * length_mask
ntoks = length_mask.sum()
ce = ce.sum() / ntoks
return ce, ntoks
def iterate_batches(dset, tokenizer, batch_size, train=False):
# Shuffle indices
while True:
indices = np.arange(len(dset))
if train:
indices = np.random.permutation(indices)
# Collect batches from dataset
for i in range(0, len(indices) - batch_size + 1, batch_size):
# Encode batch
batch_samples=[dset[indices[i + j]] for j in range(batch_size)]
batch_format_text=['<用户>{}<AI>{}'.format(i[1]+i[0],i[2]) for i in batch_samples]
batch = [tokenizer.encode(i)+[tokenizer.eos_token_id] for i in batch_format_text]
lengths = [len(x) for x in batch]
# Check if any sequence is longer than 2048 tokens
if max(lengths) > 2048:
print(
"[WARNING] Some sequences are longer than 2048 tokens. "
"Consider pre-splitting your data to save memory."
)
# Pad to the max length
batch_arr = np.zeros((batch_size, max(lengths)), np.int32)
for j in range(batch_size):
batch_arr[j, : lengths[j]] = batch[j]
batch = mx.array(batch_arr)
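            # Shift by one token: inputs are positions [0, T-1) and targets are
            # positions [1, T), so each position is trained to predict the next token;
            # `lengths` lets the loss mask out the padding.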
yield batch[:, :-1], batch[:, 1:], mx.array(lengths)
if not train:
break
def load_model(path_or_hf_repo: str):
    # If the path exists, try to load the model from it;
    # otherwise download it from the Hugging Face Hub and cache it.
model_path = Path(path_or_hf_repo)
if not model_path.exists():
model_path = Path(
snapshot_download(
repo_id=path_or_hf_repo,
allow_patterns=["*.json", "*.safetensors", "tokenizer.model"],
)
)
with open(model_path / "config.json", "r") as f:
config = json.loads(f.read())
quantization = config.get("quantization", None)
weight_files = glob.glob(str(model_path / "*.safetensors"))
if len(weight_files) == 0:
raise FileNotFoundError("No safetensors found in {}".format(model_path))
weights = {}
for wf in weight_files:
weights.update(mx.load(wf).items())
model_args = ModelArgs.from_dict(config)
model = Model(model_args)
if quantization is not None:
nn.QuantizedLinear.quantize_module(
model,
**quantization,
linear_class_predicate=lambda m: isinstance(m, nn.Linear)
and m.weight.shape[0] != 8,
)
model.load_weights(list(weights.items()))
mx.eval(model.parameters())
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
return model, tokenizer, config
def generate(
prompt: mx.array, model: nn.Module, temp: float = 0.0
) -> Generator[mx.array, None, None]:
"""
Generate text based on the given prompt and model.
Args:
prompt (mx.array): The input prompt.
model (nn.Module): The model to use for generation.
temp (float): The temperature for sampling. If temp is 0, use max sampling.
Yields:
mx.array: The generated text.
"""
def sample(logits: mx.array) -> mx.array:
return (
mx.argmax(logits, axis=-1)
if temp == 0
else mx.random.categorical(logits * (1 / temp))
)
y = prompt
cache = None
while True:
logits, cache = model(y[None], cache=cache)
logits = logits[:, -1, :]
y = sample(logits)
yield y
def evaluate(model, dataset, loss, tokenizer, batch_size, num_batches):
all_losses = []
ntokens = 0
for it, batch in zip(
range(num_batches),
iterate_batches(dataset, tokenizer, batch_size),
):
losses, toks = loss(model, *batch)
all_losses.append((losses * toks).item())
ntokens += toks.item()
return np.sum(all_losses) / ntokens
def train(model, train_set, val_set, optimizer, loss, tokenizer, args):
# Create value and grad function for loss
loss_value_and_grad = nn.value_and_grad(model, loss)
losses = []
n_tokens = 0
# Main training loop
start = time.perf_counter()
for it, batch in zip(
range(args.iters),
iterate_batches(train_set, tokenizer, args.batch_size, train=True),
):
# Forward and backward pass
(lvalue, toks), grad = loss_value_and_grad(model, *batch)
# Model update
optimizer.update(model, grad)
mx.eval(model.parameters(), optimizer.state, lvalue)
# Record loss
losses.append(lvalue.item())
n_tokens += toks.item()
if (it + 1) % args.steps_per_report == 0:
train_loss = np.mean(losses)
stop = time.perf_counter()
print(
f"Iter {it + 1}: Train loss {train_loss:.3f}, "
f"It/sec {args.steps_per_report / (stop - start):.3f}, "
f"Tokens/sec {float(n_tokens) / (stop - start):.3f}"
)
losses = []
n_tokens = 0
start = time.perf_counter()
# Report validation loss if needed
if it == 0 or (it + 1) % args.steps_per_eval == 0:
stop = time.perf_counter()
val_loss = evaluate(
model, val_set, loss, tokenizer, args.batch_size, args.val_batches
)
print(
f"Iter {it + 1}: "
f"Val loss {val_loss:.3f}, "
f"Val took {(time.perf_counter() - stop):.3f}s"
)
start = time.perf_counter()
# Save adapter weights if needed
if (it + 1) % args.save_every == 0:
mx.savez(
args.adapter_file, **dict(tree_flatten(model.trainable_parameters()))
)
print(f"Iter {it + 1}: Saved adapter weights to {args.adapter_file}.")
def generate_string(model, prompt, tokenizer, args):
print(prompt, end="", flush=True)
prompt = mx.array(tokenizer.encode(prompt))
tokens = []
skip = 0
for token, n in zip(
generate(prompt, model, args.temp),
range(args.max_tokens),
):
if token == tokenizer.eos_token_id:
break
tokens.append(token.item())
s = tokenizer.decode(tokens)
if len(s) - skip > 1:
print(s[skip:-1], end="", flush=True)
skip = len(s) - 1
print(tokenizer.decode(tokens)[skip:], flush=True)
print("=" * 10)
if len(tokens) == 0:
print("No tokens generated for this prompt")
return
if __name__ == "__main__":
parser = build_parser()
args = parser.parse_args()
np.random.seed(args.seed)
print("Loading pretrained model")
model, tokenizer, _ = load_model(args.model)
# Freeze all layers other than LORA linears
model.freeze()
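    # Wrap q_proj and v_proj of the last `--lora-layers` transformer blocks with LoRA
    # adapters; after model.freeze() these adapters are the only trainable parameters.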
for l in model.model.layers[len(model.model.layers) - args.lora_layers:]:
l.self_attn.q_proj = LoRALinear.from_linear(l.self_attn.q_proj)
l.self_attn.v_proj = LoRALinear.from_linear(l.self_attn.v_proj)
if hasattr(l, "block_sparse_moe"):
l.block_sparse_moe.gate = LoRALinear.from_linear(l.block_sparse_moe.gate)
p = sum(v.size for _, v in tree_flatten(model.parameters())) / 10 ** 6
print(f"Total parameters {p:.3f}M")
p = sum(v.size for _, v in tree_flatten(model.trainable_parameters())) / 10 ** 6
print(f"Trainable parameters {p:.3f}M")
print("Loading datasets")
train_set, valid_set, test_set = load(args)
# Resume training the given adapters.
if args.resume_adapter_file is not None:
print(f"Loading pretrained adapters from {args.resume_adapter_file}")
model.load_weights(args.resume_adapter_file, strict=False)
if args.train:
print("Training")
opt = optim.Adam(learning_rate=args.learning_rate)
# Train model
train(model, train_set, valid_set, opt, loss, tokenizer, args)
# Save adapter weights
mx.savez(args.adapter_file, **dict(tree_flatten(model.trainable_parameters())))
# Load the LoRA adapter weights which we assume should exist by this point
if not Path(args.adapter_file).is_file():
raise ValueError(
f"Adapter file {args.adapter_file} missing. "
"Use --train to learn and save the adapters.npz."
)
model.load_weights(args.adapter_file, strict=False)
if args.test:
print("Testing")
model.eval()
test_loss = evaluate(
model,
test_set,
loss,
tokenizer,
args.batch_size,
num_batches=args.test_batches,
)
test_ppl = math.exp(test_loss)
print(f"Test loss {test_loss:.3f}, Test ppl {test_ppl:.3f}.")
if args.prompt is not None:
print("Generating")
generate_string(model, args.prompt, tokenizer, args)
# for finetune
jieba>=0.42.1
ruamel_yaml>=0.18.5
rouge_chinese>=1.0.3
jupyter>=1.0.0
datasets>=2.16.1
peft>=0.7.1
deepspeed>=0.13.1
flash_attn>=2.5.1
transformers>=4.39.1
torch>=2.2.0
triton>=2.2.0
httpx>=0.27.0
gradio>=4.26.0
accelerate>=0.29.2
sentence_transformers>=2.6.1
sse_starlette>=2.1.0
tiktoken>=0.6.0
mlx_lm>=0.8.0
openai>=0.16.2
# -*- coding: utf-8 -*-
import json
import os
from dataclasses import dataclass, field
from typing import Dict, Optional, Union, Any
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from torch.utils.data import Dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
Trainer,
TrainingArguments as HFTrainingArguments,
BitsAndBytesConfig,
)
import copy
@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(default="openbmb/MiniCPM-2B-sft-bf16")
@dataclass
class DataArguments:
train_data_path: str = field(
default="data/AdvertiseGenChatML/train.json",
metadata={"help": "Path to the training data."},
)
eval_data_path: str = field(
default="data/AdvertiseGenChatML/dev.json",
metadata={"help": "Path to the test data."},
)
@dataclass
class TrainingArguments(HFTrainingArguments):
cache_dir: Optional[str] = field(default=None)
optim: str = field(default="adamw_torch")
model_max_length: int = field(
default=512,
metadata={
"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
},
)
use_lora: bool = field(default=False)
qlora: bool = field(default=False)
    # DPO-related arguments
use_dpo: bool = field(default=False, metadata={"help": "Whether to use DPO training"})
dpo_beta: float = field(default=0.1, metadata={"help": "Beta parameter for DPO loss"})
reference_model_path: Optional[str] = field(default=None, metadata={"help": "Path to reference model for DPO"})
    # SFT loss weight (used when combined with DPO)
sft_loss_weight: float = field(default=0.0, metadata={"help": "Weight for SFT loss when combined with DPO"})
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(
self,
data_path,
tokenizer,
model_max_length=4096,
):
super(SupervisedDataset, self).__init__()
self.data = json.load(open(data_path))
self.tokenizer = tokenizer
self.model_max_length = model_max_length
self.ignore_index = -100
item = self.preprocessing(self.data[0])
print("input:", self.tokenizer.decode(item["input_ids"]))
labels = []
for id_ in item["labels"]:
if id_ == -100:
continue
labels.append(id_)
print("label:", self.tokenizer.decode(labels))
def __len__(self):
return len(self.data)
def preprocessing(self, example):
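        # Build token ids and labels in parallel: prompt tokens (system/user turns) get
        # ignore_index (-100) so the cross-entropy loss is computed only on the
        # assistant tokens and the trailing EOS.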
input_ids = [self.tokenizer.bos_token_id]
label_ids = [self.ignore_index]
for message in example["messages"]:
role = message["role"]
content = message["content"]
content_ids = self.tokenizer.apply_chat_template([message])
if role == "user":
if self.tokenizer.eos_token_id == 73440: # minicpm3.0 and minicpm4.0
input_ids += self.tokenizer.apply_chat_template(
[message], add_generation_prompt=True
)
label_ids += [self.ignore_index] * len(
self.tokenizer.apply_chat_template(
[message], add_generation_prompt=True
)
)
else: # minicpm2.0
input_ids += content_ids
label_ids += [self.ignore_index] * len(content_ids)
elif role == "system":
input_ids += content_ids
label_ids += [self.ignore_index] * len(content_ids)
elif role == "assistant":
if self.tokenizer.eos_token_id == 73440: # minicpm3.0 and minicpm4.0
input_ids += self.tokenizer.encode(content, add_special_tokens=False)
label_ids += self.tokenizer.encode(content, add_special_tokens=False)
else: # minicpm2.0
input_ids += content_ids
label_ids += content_ids
input_ids.append(self.tokenizer.eos_token_id)
label_ids.append(self.tokenizer.eos_token_id)
# truncate to max len
input_ids = input_ids[: self.model_max_length]
label_ids = label_ids[: self.model_max_length]
attention_mask = [1] * len(input_ids)
# pad to max len
input_ids += [self.tokenizer.eos_token_id] * (
self.model_max_length - len(input_ids)
)
label_ids += [self.ignore_index] * (self.model_max_length - len(label_ids))
attention_mask += [0] * (self.model_max_length - len(attention_mask))
# convert to pt tensor
input_ids = torch.LongTensor(input_ids)
label_ids = torch.LongTensor(label_ids)
attention_mask = torch.LongTensor(attention_mask)
return {
"input_ids": input_ids,
"labels": label_ids,
"attention_mask": attention_mask,
}
def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
return self.preprocessing(self.data[idx])
class DPODataset(Dataset):
"""Dataset for DPO training with optional SFT data."""
def __init__(self, data_path, tokenizer, model_max_length=4096, include_sft_data=False):
super(DPODataset, self).__init__()
self.data = json.load(open(data_path))
self.tokenizer = tokenizer
self.model_max_length = model_max_length
self.ignore_index = -100
self.include_sft_data = include_sft_data
# 展示第一个样本的处理结果
if len(self.data) > 0:
item = self.preprocessing(self.data[0])
print("DPO Dataset Sample:")
print("Chosen input:", self.tokenizer.decode(item["chosen_input_ids"], skip_special_tokens=True))
print("Rejected input:", self.tokenizer.decode(item["rejected_input_ids"], skip_special_tokens=True))
def __len__(self):
return len(self.data)
def build_conversation(self, instruction, input_text="", history=None):
"""构建对话格式"""
messages = []
# 添加历史对话
if history:
for user_msg, assistant_msg in history:
messages.append({"role": "user", "content": user_msg})
messages.append({"role": "assistant", "content": assistant_msg})
# 添加当前指令
current_input = instruction
if input_text:
current_input = f"{instruction}\n{input_text}"
messages.append({"role": "user", "content": current_input})
return messages
def encode_conversation(self, messages, response):
"""编码对话和回复"""
# 构建完整对话
full_messages = messages + [{"role": "assistant", "content": response}]
# 使用chat template编码
if hasattr(self.tokenizer, 'apply_chat_template'):
input_ids = self.tokenizer.apply_chat_template(
full_messages,
tokenize=True,
add_generation_prompt=False,
return_tensors="pt"
).squeeze(0)
else:
# 如果没有chat template,使用简单拼接
text = ""
for msg in full_messages:
if msg["role"] == "user":
text += f"User: {msg['content']}\n"
elif msg["role"] == "assistant":
text += f"Assistant: {msg['content']}\n"
input_ids = self.tokenizer.encode(text, return_tensors="pt").squeeze(0)
# 截断到最大长度
if len(input_ids) > self.model_max_length:
input_ids = input_ids[:self.model_max_length]
# 计算attention mask
attention_mask = torch.ones_like(input_ids)
# 填充到固定长度
if len(input_ids) < self.model_max_length:
pad_length = self.model_max_length - len(input_ids)
input_ids = torch.cat([
input_ids,
torch.full((pad_length,), self.tokenizer.pad_token_id, dtype=input_ids.dtype)
])
attention_mask = torch.cat([
attention_mask,
torch.zeros(pad_length, dtype=attention_mask.dtype)
])
return input_ids, attention_mask
def encode_conversation_with_labels(self, messages, response):
"""编码对话和回复,同时生成SFT训练所需的labels"""
# 构建完整对话
full_messages = messages + [{"role": "assistant", "content": response}]
# 使用chat template编码
if hasattr(self.tokenizer, 'apply_chat_template'):
input_ids = self.tokenizer.apply_chat_template(
full_messages,
tokenize=True,
add_generation_prompt=False,
return_tensors="pt"
).squeeze(0)
else:
# 如果没有chat template,使用简单拼接
text = ""
for msg in full_messages:
if msg["role"] == "user":
text += f"User: {msg['content']}\n"
elif msg["role"] == "assistant":
text += f"Assistant: {msg['content']}\n"
input_ids = self.tokenizer.encode(text, return_tensors="pt").squeeze(0)
# 创建labels,只对assistant回复部分计算损失
labels = input_ids.clone()
# 编码不包含assistant回复的部分,用于确定哪些token需要ignore
prompt_messages = messages
if hasattr(self.tokenizer, 'apply_chat_template'):
prompt_ids = self.tokenizer.apply_chat_template(
prompt_messages,
tokenize=True,
add_generation_prompt=True,
return_tensors="pt"
).squeeze(0)
else:
prompt_text = ""
for msg in prompt_messages:
if msg["role"] == "user":
prompt_text += f"User: {msg['content']}\n"
prompt_text += "Assistant: "
prompt_ids = self.tokenizer.encode(prompt_text, return_tensors="pt").squeeze(0)
# 对prompt部分设置ignore_index
prompt_len = len(prompt_ids)
if prompt_len < len(labels):
labels[:prompt_len] = self.ignore_index
# 截断到最大长度
if len(input_ids) > self.model_max_length:
input_ids = input_ids[:self.model_max_length]
labels = labels[:self.model_max_length]
# 计算attention mask
attention_mask = torch.ones_like(input_ids)
# 填充到固定长度
if len(input_ids) < self.model_max_length:
pad_length = self.model_max_length - len(input_ids)
input_ids = torch.cat([
input_ids,
torch.full((pad_length,), self.tokenizer.pad_token_id, dtype=input_ids.dtype)
])
labels = torch.cat([
labels,
torch.full((pad_length,), self.ignore_index, dtype=labels.dtype)
])
attention_mask = torch.cat([
attention_mask,
torch.zeros(pad_length, dtype=attention_mask.dtype)
])
return input_ids, attention_mask, labels
def preprocessing(self, example):
"""预处理DPO数据样本"""
instruction = example["instruction"]
input_text = example.get("input", "")
chosen = example["chosen"]
rejected = example["rejected"]
history = example.get("history", [])
# 构建对话消息
messages = self.build_conversation(instruction, input_text, history)
# 编码chosen和rejected回复
chosen_input_ids, chosen_attention_mask = self.encode_conversation(messages, chosen)
rejected_input_ids, rejected_attention_mask = self.encode_conversation(messages, rejected)
result = {
"chosen_input_ids": chosen_input_ids,
"chosen_attention_mask": chosen_attention_mask,
"rejected_input_ids": rejected_input_ids,
"rejected_attention_mask": rejected_attention_mask,
}
# 只有在需要SFT损失时才生成相关数据
if self.include_sft_data:
chosen_input_ids_sft, chosen_attention_mask_sft, chosen_labels = self.encode_conversation_with_labels(messages, chosen)
result.update({
"chosen_input_ids_sft": chosen_input_ids_sft,
"chosen_attention_mask_sft": chosen_attention_mask_sft,
"chosen_labels": chosen_labels,
})
return result
def __getitem__(self, idx):
return self.preprocessing(self.data[idx])
class DPODataCollator:
"""自定义的DPO数据collator,处理特殊的DPO数据格式"""
def __init__(self, tokenizer, include_sft_data=False):
self.tokenizer = tokenizer
self.include_sft_data = include_sft_data
def __call__(self, features):
batch = {}
# 处理基本的DPO字段
dpo_keys = ["chosen_input_ids", "chosen_attention_mask", "rejected_input_ids", "rejected_attention_mask"]
for key in dpo_keys:
if key in features[0]:
batch[key] = torch.stack([f[key] for f in features])
# 如果包含SFT数据,也处理SFT相关字段
if self.include_sft_data:
sft_keys = ["chosen_input_ids_sft", "chosen_attention_mask_sft", "chosen_labels"]
for key in sft_keys:
if key in features[0]:
batch[key] = torch.stack([f[key] for f in features])
return batch
class DPOTrainer(Trainer):
"""Custom Trainer for DPO with optional SFT loss."""
def __init__(self, reference_model=None, dpo_beta=0.1, sft_loss_weight=0.0, **kwargs):
super().__init__(**kwargs)
self.reference_model = reference_model
self.dpo_beta = dpo_beta
self.sft_loss_weight = sft_loss_weight
self.use_sft = sft_loss_weight > 0
# 将参考模型移动到正确的设备
if self.reference_model is not None:
self.reference_model.to(self.args.device)
self.reference_model.eval()
# 确保参考模型不需要梯度
for param in self.reference_model.parameters():
param.requires_grad = False
def get_log_probabilities(self, model, input_ids, attention_mask):
"""计算序列的log概率"""
outputs = model(input_ids=input_ids, attention_mask=attention_mask)
logits = outputs.logits
# 计算每个token的log概率
log_probs = F.log_softmax(logits, dim=-1)
# 获取实际token的log概率
# shift操作:预测下一个token
shift_log_probs = log_probs[..., :-1, :].contiguous()
shift_labels = input_ids[..., 1:].contiguous()
shift_attention = attention_mask[..., 1:].contiguous()
# 收集每个位置的log概率
gathered_log_probs = torch.gather(
shift_log_probs,
dim=-1,
index=shift_labels.unsqueeze(-1)
).squeeze(-1)
# 只计算非padding部分的平均log概率
masked_log_probs = gathered_log_probs * shift_attention.float()
sequence_log_prob = masked_log_probs.sum(dim=-1) / (shift_attention.sum(dim=-1).float() + 1e-8)
return sequence_log_prob
def compute_dpo_loss(self, policy_chosen_logps, policy_rejected_logps,
reference_chosen_logps, reference_rejected_logps):
"""计算DPO损失函数"""
# 计算相对于参考模型的log概率比值
policy_ratio_chosen = policy_chosen_logps - reference_chosen_logps
policy_ratio_rejected = policy_rejected_logps - reference_rejected_logps
# DPO损失
logits = self.dpo_beta * (policy_ratio_chosen - policy_ratio_rejected)
loss = -F.logsigmoid(logits).mean()
# 计算准确率(chosen概率高于rejected的比例)
accuracy = (policy_ratio_chosen > policy_ratio_rejected).float().mean()
return loss, accuracy
def compute_sft_loss(self, logits, labels):
"""计算SFT损失"""
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
shift_logits = shift_logits.view(-1, shift_logits.size(-1))
shift_labels = shift_labels.view(-1)
loss = loss_fct(shift_logits, shift_labels)
return loss
def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
"""计算DPO损失和可选的SFT损失,并分别输出"""
# 计算策略模型的log概率(用于DPO)
policy_chosen_logps = self.get_log_probabilities(
model, inputs["chosen_input_ids"], inputs["chosen_attention_mask"]
)
policy_rejected_logps = self.get_log_probabilities(
model, inputs["rejected_input_ids"], inputs["rejected_attention_mask"]
)
# 计算参考模型的log概率(用于DPO)
with torch.no_grad():
reference_chosen_logps = self.get_log_probabilities(
self.reference_model, inputs["chosen_input_ids"], inputs["chosen_attention_mask"]
)
reference_rejected_logps = self.get_log_probabilities(
self.reference_model, inputs["rejected_input_ids"], inputs["rejected_attention_mask"]
)
# 计算DPO损失
dpo_loss, accuracy = self.compute_dpo_loss(
policy_chosen_logps, policy_rejected_logps,
reference_chosen_logps, reference_rejected_logps
)
# 初始化总损失为DPO损失
total_loss = dpo_loss
# 准备日志字典
log_dict = {
"dpo_loss": dpo_loss.item(),
"dpo_accuracy": accuracy.item()
}
# 计算SFT损失(如果启用且数据可用)
sft_loss = None
if self.use_sft and "chosen_labels" in inputs:
# 使用chosen回复计算SFT损失
outputs = model(
input_ids=inputs["chosen_input_ids_sft"],
attention_mask=inputs["chosen_attention_mask_sft"]
)
sft_loss = self.compute_sft_loss(outputs.logits, inputs["chosen_labels"])
# 将SFT损失加入总损失
total_loss = total_loss + self.sft_loss_weight * sft_loss
# 添加SFT损失到日志
log_dict.update({
"sft_loss": sft_loss.item(),
"sft_loss_weight": self.sft_loss_weight,
"total_loss": total_loss.item()
})
else:
# 如果没有SFT损失,总损失就是DPO损失
log_dict["total_loss"] = total_loss.item()
# 记录所有指标
self.log(log_dict)
return (total_loss, None) if return_outputs else total_loss
def load_model_and_tokenizer(
model_path: str,
max_length: int = 4096,
use_lora: bool = True,
qlora: bool = False,
bf16: bool = False,
fp16: bool = False,
):
"""load model and tokenizer"""
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
assert not (bf16 and fp16), "bf16 or fp16, not both"
if bf16:
dtype = torch.bfloat16
elif fp16:
dtype = torch.float16
else:
dtype = torch.float32
if qlora:
assert use_lora, "use_lora must be True when use_qlora is True"
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,  # quantize to 4 bits
            load_in_8bit=False,  # quantize to 8 bits
            bnb_4bit_compute_dtype=torch.float16,  # compute dtype
            bnb_4bit_quant_storage=torch.uint8,  # storage dtype for the quantized weights
            bnb_4bit_quant_type="nf4",  # quantization format: 4-bit normal float (nf4)
            bnb_4bit_use_double_quant=True,  # double quantization (also quantize the zero-point and scaling factors)
            llm_int8_enable_fp32_cpu_offload=False,  # keep CPU-offloaded parameters in fp32 when using int8
            llm_int8_has_fp16_weight=False,  # mixed-precision int8
            # llm_int8_skip_modules=["out_proj", "kv_proj", "lm_head"],  # modules to skip during quantization
            llm_int8_threshold=6.0,  # outlier threshold for the llm.int8() algorithm
        )
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=dtype,
trust_remote_code=True,
quantization_config=quantization_config,
)
else:
model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=dtype,
trust_remote_code=True,
)
if use_lora:
from peft import LoraConfig, TaskType, get_peft_model
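        # MiniCPM3 uses MLA-style attention (decomposed low-rank q/kv projections), so
        # LoRA targets those projection layers; other MiniCPM variants use the standard
        # q_proj / v_proj modules.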
lora_config = LoraConfig(
init_lora_weights="gaussian",
task_type=TaskType.CAUSAL_LM,
target_modules=(
["q_a_proj", "kv_a_proj_with_mqa", "q_b_proj", "kv_b_proj"]
if hasattr(model.config, 'architectures') and model.config.architectures == ["MiniCPM3ForCausalLM"]
else ["q_proj", "v_proj"]
),
r=64,
lora_alpha=32,
lora_dropout=0.1,
inference_mode=False,
)
model = get_peft_model(model, lora_config)
# trainable params: 2,949,120 || all params: 3,010,652,928 || trainable%: 0.09795616002669305
model.print_trainable_parameters()
model.enable_input_require_grads() # need when using adapter
return model, tokenizer
def load_reference_model(model_path, bf16=False, fp16=False):
"""加载参考模型(用于DPO训练)"""
assert not (bf16 and fp16), "bf16 or fp16, not both"
if bf16:
dtype = torch.bfloat16
elif fp16:
dtype = torch.float16
else:
dtype = torch.float32
reference_model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=dtype,
trust_remote_code=True,
)
# 参考模型不需要梯度
for param in reference_model.parameters():
param.requires_grad = False
return reference_model
if __name__ == "__main__":
parser = transformers.HfArgumentParser(
(ModelArguments, DataArguments, TrainingArguments)
)
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Create output directory
os.makedirs(training_args.output_dir, exist_ok=True)
model, tokenizer = load_model_and_tokenizer(
model_path=model_args.model_name_or_path,
max_length=training_args.model_max_length,
use_lora=training_args.use_lora,
qlora=training_args.qlora,
bf16=training_args.bf16,
fp16=training_args.fp16,
)
if training_args.use_dpo:
        # If no reference model path is given, use the base model itself as the reference
reference_model_path = training_args.reference_model_path or model_args.model_name_or_path
reference_model = load_reference_model(
model_path=reference_model_path,
bf16=training_args.bf16,
fp16=training_args.fp16,
)
train_dataset = DPODataset(
data_path=data_args.train_data_path,
tokenizer=tokenizer,
model_max_length=training_args.model_max_length,
include_sft_data=training_args.sft_loss_weight > 0,
)
eval_dataset = DPODataset(
data_path=data_args.eval_data_path,
tokenizer=tokenizer,
model_max_length=training_args.model_max_length,
include_sft_data=training_args.sft_loss_weight > 0,
) if os.path.exists(data_args.eval_data_path) else None
# 创建自定义数据collator
data_collator = DPODataCollator(
tokenizer=tokenizer,
include_sft_data=training_args.sft_loss_weight > 0
)
trainer = DPOTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
data_collator=data_collator, # 使用自定义data collator
reference_model=reference_model,
dpo_beta=training_args.dpo_beta,
sft_loss_weight=training_args.sft_loss_weight,
)
else:
train_dataset = SupervisedDataset(
data_path=data_args.train_data_path,
tokenizer=tokenizer,
model_max_length=training_args.model_max_length,
)
eval_dataset = SupervisedDataset(
data_path=data_args.eval_data_path,
tokenizer=tokenizer,
model_max_length=training_args.model_max_length,
) if os.path.exists(data_args.eval_data_path) else None
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer,
)
trainer.train()
# save the incremental PEFT weights, more details can be found in https://huggingface.co/blog/peft
trainer.save_model()
#!/bin/bash
# Example DPO training script
# DPO training with an auxiliary SFT loss
python finetune_dpo_trainer.py \
--model_name_or_path "/root/autodl-tmp/MiniCPM3-4B" \
--train_data_path "/root/autodl-tmp/dpo_train_data.json" \
--eval_data_path "/root/autodl-tmp/dpo_train_data.json" \
--output_dir "./output_dpo_sft" \
--num_train_epochs 3 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 8 \
--learning_rate 5e-6 \
--weight_decay 0.01 \
--warmup_steps 100 \
--logging_steps 10 \
--save_steps 500 \
--eval_steps 500 \
--model_max_length 512 \
--use_lora True \
--bf16 True \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--remove_unused_columns False \
--use_dpo True \
--dpo_beta 0.1 \
--sft_loss_weight 0.5
#!/bin/bash
# Current timestamp (format: YYYYMMDDHHMMSS), used to name the output directory
formatted_time=$(date +"%Y%m%d%H%M%S")
echo $formatted_time
# Launch distributed SFT with DeepSpeed on GPUs 0 and 1.
# Point --train_data_path / --eval_data_path at your ChatML-format json files.
# (Comments cannot be interleaved with the backslash-continued arguments below,
# or bash would cut the command short, so they are kept up here instead.)
deepspeed --include localhost:0,1 finetune.py \
--model_name_or_path MiniCPM-2B-sft-bf16 \
--output_dir output/AdvertiseGenSFT/$formatted_time/ \
--train_data_path data/AdvertiseGenChatML/train.json \
--eval_data_path data/AdvertiseGenChatML/dev.json \
--learning_rate 5e-5 \
--per_device_train_batch_size 14 \
--per_device_eval_batch_size 32 \
--bf16 \
--gradient_accumulation_steps 2 \
--warmup_steps 100 \
--max_steps 3000 \
--weight_decay 0.01 \
--evaluation_strategy steps \
--eval_steps 100 \
--save_strategy steps \
--save_steps 500 \
--seed 42 \
--log_level info \
--logging_strategy steps \
--logging_steps 10 \
--deepspeed configs/ds_config_zero2.json
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
torch.manual_seed(0)
path = 'openbmb/MiniCPM4-8B'
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map=device, trust_remote_code=True)
# User can directly use the chat interface
# responds, history = model.chat(tokenizer, "Write an article about Artificial Intelligence.", temperature=0.7, top_p=0.7)
# print(responds)
# User can also use the generate interface
messages = [
{"role": "user", "content": "Write an article about Artificial Intelligence."},
]
prompt_text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
model_inputs = tokenizer([prompt_text], return_tensors="pt").to(device)
model_outputs = model.generate(
**model_inputs,
max_new_tokens=1024,
top_p=0.7,
temperature=0.7
)
output_token_ids = [
model_outputs[i][len(model_inputs[i]):] for i in range(len(model_inputs['input_ids']))
]
responses = tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)[0]
print(responses)
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
model_name = "openbmb/MiniCPM4-8B"
prompt = [{"role": "user", "content": "推荐5个北京的景点。"}]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
input_text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
llm = LLM(
model=model_name,
trust_remote_code=True,
max_num_batched_tokens=32768,
dtype="bfloat16",
gpu_memory_utilization=0.8,
)
sampling_params = SamplingParams(top_p=0.7, temperature=0.7, max_tokens=1024, repetition_penalty=1.02)
outputs = llm.generate(prompts=input_text, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
# Model code
modelCode=1675
# Model name
modelName=MiniCPM4_pytorch
# Model description
modelDescription=速度狂飙,快至220倍!MiniCPM4.0-8B是首个原生稀疏模型,5%的极高稀疏度加持系统级创新技术的大爆发,宣告了端侧长文本时代到来!
# Application scenarios
appScenario=推理,训练,对话问答,制造,广媒,金融,能源,医疗,家居,教育
# Framework type
frameType=pytorch
---
license: apache-2.0
language:
- zh
- en
pipeline_tag: text-generation
library_name: transformers
---
<div align="center">
<img src="https://github.com/OpenBMB/MiniCPM/blob/main/assets/minicpm_logo.png?raw=true" width="500em" ></img>
</div>
<p align="center">
<a href="https://github.com/OpenBMB/MiniCPM/" target="_blank">GitHub Repo</a> |
<a href="https://arxiv.org/abs/2506.07900" target="_blank">Technical Report</a> |
<a href="https://mp.weixin.qq.com/s/KIhH2nCURBXuFXAtYRpuXg?poc_token=HBIsUWijxino8oJ5s6HcjcfXFRi0Xj2LJlxPYD9c">Join Us</a>
</p>
<p align="center">
👋 Contact us in <a href="https://discord.gg/3cGQn9b3YM" target="_blank">Discord</a> and <a href="https://github.com/OpenBMB/MiniCPM/blob/main/assets/wechat.jpg" target="_blank">WeChat</a>
</p>
## What's New
- [2025.06.06] **MiniCPM4** series are released! This model achieves ultimate efficiency improvements while maintaining optimal performance at the same scale! It can achieve over 5x generation acceleration on typical end-side chips! You can find technical report [here](https://github.com/OpenBMB/MiniCPM/tree/main/report/MiniCPM_4_Technical_Report.pdf).🔥🔥🔥
## MiniCPM4 Series
MiniCPM4 series are highly efficient large language models (LLMs) designed explicitly for end-side devices, achieving this efficiency through systematic innovation in four key dimensions: model architecture, training data, training algorithms, and inference systems.
- [MiniCPM4-8B](https://huggingface.co/openbmb/MiniCPM4-8B): The flagship of MiniCPM4, with 8B parameters, trained on 8T tokens. (**<-- you are here**)
- [MiniCPM4-0.5B](https://huggingface.co/openbmb/MiniCPM4-0.5B): The small version of MiniCPM4, with 0.5B parameters, trained on 1T tokens.
- [MiniCPM4-8B-Eagle-FRSpec](https://huggingface.co/openbmb/MiniCPM4-8B-Eagle-FRSpec): Eagle head for FRSpec, accelerating speculative inference for MiniCPM4-8B.
- [MiniCPM4-8B-Eagle-FRSpec-QAT-cpmcu](https://huggingface.co/openbmb/MiniCPM4-8B-Eagle-FRSpec-QAT-cpmcu): Eagle head trained with QAT for FRSpec, efficiently integrate speculation and quantization to achieve ultra acceleration for MiniCPM4-8B.
- [MiniCPM4-8B-Eagle-vLLM](https://huggingface.co/openbmb/MiniCPM4-8B-Eagle-vLLM): Eagle head in vLLM format, accelerating speculative inference for MiniCPM4-8B.
- [MiniCPM4-8B-marlin-Eagle-vLLM](https://huggingface.co/openbmb/MiniCPM4-8B-marlin-Eagle-vLLM): Quantized Eagle head for vLLM format, accelerating speculative inference for MiniCPM4-8B.
- [BitCPM4-0.5B](https://huggingface.co/openbmb/BitCPM4-0.5B): Extreme ternary quantization applied to MiniCPM4-0.5B compresses model parameters into ternary values, achieving a 90% reduction in bit width.
- [BitCPM4-1B](https://huggingface.co/openbmb/BitCPM4-1B): Extreme ternary quantization applied to MiniCPM3-1B compresses model parameters into ternary values, achieving a 90% reduction in bit width.
- [MiniCPM4-Survey](https://huggingface.co/openbmb/MiniCPM4-Survey): Based on MiniCPM4-8B, accepts users' queries as input and autonomously generates trustworthy, long-form survey papers.
- [MiniCPM4-MCP](https://huggingface.co/openbmb/MiniCPM4-MCP): Based on MiniCPM4-8B, accepts users' queries and available MCP tools as input and autonomously calls relevant MCP tools to satisfy users' requirements.
## Introduction
MiniCPM 4 is an extremely efficient edge-side large model that has undergone efficient optimization across four dimensions: model architecture, learning algorithms, training data, and inference systems, achieving ultimate efficiency improvements.
- 🏗️ **Efficient Model Architecture:**
- InfLLM v2 -- Trainable Sparse Attention Mechanism: Adopts a trainable sparse attention mechanism architecture where each token only needs to compute relevance with less than 5% of tokens in 128K long text processing, significantly reducing computational overhead for long texts
- 🧠 **Efficient Learning Algorithms:**
- Model Wind Tunnel 2.0 -- Efficient Predictable Scaling: Introduces scaling prediction methods for performance of downstream tasks, enabling more precise model training configuration search
- BitCPM -- Ultimate Ternary Quantization: Compresses model parameters to ternary values, achieving a 90% reduction in bit width
- Efficient Training Engineering Optimization: Adopts FP8 low-precision computing technology combined with Multi-token Prediction training strategy
- 📚 **High-Quality Training Data:**
- UltraClean -- High-quality Pre-training Data Filtering and Generation: Builds iterative data cleaning strategies based on efficient data verification, open-sourcing the high-quality Chinese and English pre-training dataset [Ultra-FineWeb](https://huggingface.co/datasets/openbmb/Ultra-FineWeb)
- UltraChat v2 -- High-quality Supervised Fine-tuning Data Generation: Constructs large-scale high-quality supervised fine-tuning datasets covering multiple dimensions including knowledge-intensive data, reasoning-intensive data, instruction-following data, long text understanding data, and tool calling data
- ⚡ **Efficient Inference System:**
- CPM.cu -- Lightweight and Efficient CUDA Inference Framework: Integrates sparse attention, model quantization, and speculative sampling to achieve efficient prefilling and decoding
- ArkInfer -- Cross-platform Deployment System: Supports efficient deployment across multiple backend environments, providing flexible cross-platform adaptation capabilities
## Usage
### Inference with [CPM.cu](https://github.com/OpenBMB/cpm.cu)
We recommend using [CPM.cu](https://github.com/OpenBMB/cpm.cu) for the inference of MiniCPM4. CPM.cu is a CUDA inference framework developed by OpenBMB, which integrates efficient sparse attention, speculative sampling, and quantization techniques, fully leveraging the efficiency advantages of MiniCPM4.
You can install CPM.cu by running the following command:
```bash
git clone https://github.com/OpenBMB/cpm.cu.git --recursive
cd cpm.cu
python3 setup.py install
```
MiniCPM4 natively supports context lengths of up to 32,768 tokens. To reproduce the long-text acceleration effect in the paper, we recommend using the LongRoPE factors that have been validated. Change the `rope_scaling` field in the `config.json` file as the following to enable LongRoPE.
```json
{
...,
"rope_scaling": {
"rope_type": "longrope",
"long_factor": [0.9977997200264581, 1.014658295992452, 1.0349680404997148, 1.059429246056193, 1.0888815016813513, 1.1243301355211495, 1.166977103606075, 1.2182568066927284, 1.2798772354275727, 1.3538666751582975, 1.4426259039919596, 1.5489853358570191, 1.6762658237220625, 1.8283407612492941, 2.0096956085876183, 2.225478927469756, 2.481536379650452, 2.784415934557119, 3.1413289096347365, 3.560047844772632, 4.048719380066383, 4.752651957515948, 5.590913044973868, 6.584005926629993, 7.7532214876576155, 9.119754865903639, 10.704443927019176, 12.524994176518703, 14.59739595363613, 16.93214476166354, 19.53823297353041, 22.417131025031697, 25.568260840911098, 28.991144156566317, 32.68408069090375, 36.65174474170465, 40.90396065611201, 45.4664008671033, 50.37147343433591, 55.6804490772103, 61.470816952306556, 67.8622707390618, 75.00516023410414, 83.11898235973767, 92.50044360202462, 103.57086856690864, 116.9492274587385, 118.16074567836519, 119.18497548708795, 120.04810876261652, 120.77352815196981, 121.38182790207875, 121.89094985353891, 122.31638758099915, 122.6714244963338, 122.9673822552567, 123.21386397019609, 123.41898278254268, 123.58957065488238, 123.73136519024158, 123.84917421274221, 123.94701903496814, 124.02825801299717, 124.09569231686116],
"short_factor": [0.9977997200264581, 1.014658295992452, 1.0349680404997148, 1.059429246056193, 1.0888815016813513, 1.1243301355211495, 1.166977103606075, 1.2182568066927284, 1.2798772354275727, 1.3538666751582975, 1.4426259039919596, 1.5489853358570191, 1.6762658237220625, 1.8283407612492941, 2.0096956085876183, 2.225478927469756, 2.481536379650452, 2.784415934557119, 3.1413289096347365, 3.560047844772632, 4.048719380066383, 4.752651957515948, 5.590913044973868, 6.584005926629993, 7.7532214876576155, 9.119754865903639, 10.704443927019176, 12.524994176518703, 14.59739595363613, 16.93214476166354, 19.53823297353041, 22.417131025031697, 25.568260840911098, 28.991144156566317, 32.68408069090375, 36.65174474170465, 40.90396065611201, 45.4664008671033, 50.37147343433591, 55.6804490772103, 61.470816952306556, 67.8622707390618, 75.00516023410414, 83.11898235973767, 92.50044360202462, 103.57086856690864, 116.9492274587385, 118.16074567836519, 119.18497548708795, 120.04810876261652, 120.77352815196981, 121.38182790207875, 121.89094985353891, 122.31638758099915, 122.6714244963338, 122.9673822552567, 123.21386397019609, 123.41898278254268, 123.58957065488238, 123.73136519024158, 123.84917421274221, 123.94701903496814, 124.02825801299717, 124.09569231686116],
"original_max_position_embeddings": 32768
}
}
```
After modification, you can run the following command to reproduce the long-context acceleration effect (the script will automatically download the model weights from HuggingFace)
```bash
python3 tests/test_generate.py
```
For more details about CPM.cu, please refer to [the repo CPM.cu](https://github.com/OpenBMB/cpm.cu).
### Inference with Transformers
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
torch.manual_seed(0)
path = 'openbmb/MiniCPM4-8B'
device = "cuda"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, device_map=device, trust_remote_code=True)
# User can directly use the chat interface
# responds, history = model.chat(tokenizer, "Write an article about Artificial Intelligence.", temperature=0.7, top_p=0.7)
# print(responds)
# User can also use the generate interface
messages = [
{"role": "user", "content": "Write an article about Artificial Intelligence."},
]
prompt_text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
model_inputs = tokenizer([prompt_text], return_tensors="pt").to(device)
model_outputs = model.generate(
**model_inputs,
max_new_tokens=1024,
top_p=0.7,
temperature=0.7
)
output_token_ids = [
model_outputs[i][len(model_inputs[i]):] for i in range(len(model_inputs['input_ids']))
]
responses = tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)[0]
print(responses)
```
MiniCPM4-8B supports `InfLLM v2`, a sparse attention mechanism designed for efficient long-sequence inference. It requires the [infllmv2_cuda_impl](https://github.com/OpenBMB/infllmv2_cuda_impl) library.
You can install it by running the following command:
```bash
git clone -b feature_infer https://github.com/OpenBMB/infllmv2_cuda_impl.git
cd infllmv2_cuda_impl
git submodule update --init --recursive
pip install -e . # or python setup.py install
```
To enable InfLLM v2, you need to add the `sparse_config` field in `config.json`:
```json
{
...,
"sparse_config": {
"kernel_size": 32,
"kernel_stride": 16,
"init_blocks": 1,
"block_size": 64,
"window_size": 2048,
"topk": 64,
"use_nope": false,
"dense_len": 8192
}
}
```
These parameters control the behavior of InfLLM v2:
* `kernel_size` (default: 32): The size of semantic kernels.
* `kernel_stride` (default: 16): The stride between adjacent kernels.
* `init_blocks` (default: 1): The number of initial blocks that every query token attends to. This ensures attention to the beginning of the sequence.
* `block_size` (default: 64): The block size for key-value blocks.
* `window_size` (default: 2048): The size of the local sliding window.
* `topk` (default: 64): Each token computes attention over only the top-k most relevant key-value blocks.
* `use_nope` (default: false): Whether to use the NOPE technique in block selection for improved performance.
* `dense_len` (default: 8192): Since Sparse Attention offers limited benefits for short sequences, the model can use standard (dense) attention for shorter texts. The model will use dense attention for sequences with a token length below `dense_len` and switch to sparse attention for sequences exceeding this length. Set this to `-1` to always use sparse attention regardless of sequence length.
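If you prefer to patch the checkpoint programmatically instead of editing `config.json` by hand, a minimal sketch (assuming a local copy of the checkpoint in `./MiniCPM4-8B`; adjust the path to your setup) could look like this:
```python
import json
from pathlib import Path

# Hypothetical local path to a downloaded MiniCPM4-8B checkpoint
config_path = Path("./MiniCPM4-8B/config.json")

config = json.loads(config_path.read_text())
# Enable InfLLM v2 with the default parameters documented above
config["sparse_config"] = {
    "kernel_size": 32,
    "kernel_stride": 16,
    "init_blocks": 1,
    "block_size": 64,
    "window_size": 2048,
    "topk": 64,
    "use_nope": False,
    "dense_len": 8192,
}
config_path.write_text(json.dumps(config, indent=2, ensure_ascii=False))
```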
MiniCPM4 natively supports context lengths of up to 32,768 tokens. For conversations where the total length (including both input and output) significantly exceeds this limit, we recommend using RoPE scaling techniques for effective handling of long texts. We have validated the model's performance on context lengths of up to 131,072 tokens by modifying the LongRoPE factor.
You can apply the LongRoPE factor modification by modifying the model files. Specifically, in the `config.json` file, adjust the `rope_scaling` fields.
```json
{
...,
"rope_scaling": {
"rope_type": "longrope",
"long_factor": [0.9977997200264581, 1.014658295992452, 1.0349680404997148, 1.059429246056193, 1.0888815016813513, 1.1243301355211495, 1.166977103606075, 1.2182568066927284, 1.2798772354275727, 1.3538666751582975, 1.4426259039919596, 1.5489853358570191, 1.6762658237220625, 1.8283407612492941, 2.0096956085876183, 2.225478927469756, 2.481536379650452, 2.784415934557119, 3.1413289096347365, 3.560047844772632, 4.048719380066383, 4.752651957515948, 5.590913044973868, 6.584005926629993, 7.7532214876576155, 9.119754865903639, 10.704443927019176, 12.524994176518703, 14.59739595363613, 16.93214476166354, 19.53823297353041, 22.417131025031697, 25.568260840911098, 28.991144156566317, 32.68408069090375, 36.65174474170465, 40.90396065611201, 45.4664008671033, 50.37147343433591, 55.6804490772103, 61.470816952306556, 67.8622707390618, 75.00516023410414, 83.11898235973767, 92.50044360202462, 103.57086856690864, 116.9492274587385, 118.16074567836519, 119.18497548708795, 120.04810876261652, 120.77352815196981, 121.38182790207875, 121.89094985353891, 122.31638758099915, 122.6714244963338, 122.9673822552567, 123.21386397019609, 123.41898278254268, 123.58957065488238, 123.73136519024158, 123.84917421274221, 123.94701903496814, 124.02825801299717, 124.09569231686116],
"short_factor": [0.9977997200264581, 1.014658295992452, 1.0349680404997148, 1.059429246056193, 1.0888815016813513, 1.1243301355211495, 1.166977103606075, 1.2182568066927284, 1.2798772354275727, 1.3538666751582975, 1.4426259039919596, 1.5489853358570191, 1.6762658237220625, 1.8283407612492941, 2.0096956085876183, 2.225478927469756, 2.481536379650452, 2.784415934557119, 3.1413289096347365, 3.560047844772632, 4.048719380066383, 4.752651957515948, 5.590913044973868, 6.584005926629993, 7.7532214876576155, 9.119754865903639, 10.704443927019176, 12.524994176518703, 14.59739595363613, 16.93214476166354, 19.53823297353041, 22.417131025031697, 25.568260840911098, 28.991144156566317, 32.68408069090375, 36.65174474170465, 40.90396065611201, 45.4664008671033, 50.37147343433591, 55.6804490772103, 61.470816952306556, 67.8622707390618, 75.00516023410414, 83.11898235973767, 92.50044360202462, 103.57086856690864, 116.9492274587385, 118.16074567836519, 119.18497548708795, 120.04810876261652, 120.77352815196981, 121.38182790207875, 121.89094985353891, 122.31638758099915, 122.6714244963338, 122.9673822552567, 123.21386397019609, 123.41898278254268, 123.58957065488238, 123.73136519024158, 123.84917421274221, 123.94701903496814, 124.02825801299717, 124.09569231686116],
"original_max_position_embeddings": 32768
}
}
```
### Inference with [SGLang](https://github.com/sgl-project/sglang)
For now, you need to install our forked version of SGLang.
```bash
git clone -b openbmb https://github.com/OpenBMB/sglang.git
cd sglang
pip install --upgrade pip
pip install -e "python[all]"
```
You can start the inference server by running the following command:
```bash
python -m sglang.launch_server --model openbmb/MiniCPM4-8B --trust-remote-code --port 30000 --chat-template chatml
```
Then you can use the chat interface by running the following command:
```python
import openai
client = openai.Client(base_url=f"http://localhost:30000/v1", api_key="None")
response = client.chat.completions.create(
model="openbmb/MiniCPM4-8B",
messages=[
{"role": "user", "content": "Write an article about Artificial Intelligence."},
],
temperature=0.7,
max_tokens=1024,
)
print(response.choices[0].message.content)
```
### Inference with [vLLM](https://github.com/vllm-project/vllm)
For now, you need to install the latest version of vLLM.
```bash
pip install -U vllm \
--pre \
--extra-index-url https://wheels.vllm.ai/nightly
```
Then you can inference MiniCPM4-8B with vLLM:
```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
model_name = "openbmb/MiniCPM4-8B"
prompt = [{"role": "user", "content": "Please recommend 5 tourist attractions in Beijing. "}]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
input_text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
llm = LLM(
model=model_name,
trust_remote_code=True,
max_num_batched_tokens=32768,
dtype="bfloat16",
gpu_memory_utilization=0.8,
)
sampling_params = SamplingParams(top_p=0.7, temperature=0.7, max_tokens=1024, repetition_penalty=1.02)
outputs = llm.generate(prompts=input_text, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```
Also, you can start the inference server by running the following command:
> **Note**: In vLLM's chat API, `add_special_tokens` is `False` by default. This means important special tokens—such as the beginning-of-sequence (BOS) token—will not be added automatically. To ensure the input prompt is correctly formatted for the model, you should explicitly set `extra_body={"add_special_tokens": True}`.
```bash
vllm serve openbmb/MiniCPM4-8B
```
Then you can use the chat interface by running the following code:
```python
import openai
client = openai.Client(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
model="openbmb/MiniCPM4-8B",
messages=[
{"role": "user", "content": "Write an article about Artificial Intelligence."},
],
temperature=0.7,
max_tokens=1024,
extra_body=dict(add_special_tokens=True), # Ensures special tokens are added for chat template
)
print(response.choices[0].message.content)
```
## Evaluation Results
On two typical end-side chips, Jetson AGX Orin and RTX 4090, MiniCPM4 demonstrates significantly faster processing speed compared to similar-size models in long text processing tasks. As text length increases, MiniCPM4's efficiency advantage becomes more pronounced. On the Jetson AGX Orin platform, compared to Qwen3-8B, MiniCPM4 achieves approximately 7x decoding speed improvement.
![benchmark](https://github.com/OpenBMB/MiniCPM/blob/main/assets/minicpm4/efficiency.png?raw=true)
#### Comprehensive Evaluation
MiniCPM4 launches end-side versions with 8B and 0.5B parameter scales, both achieving best-in-class performance in their respective categories.
![benchmark](https://github.com/OpenBMB/MiniCPM/blob/main/assets/minicpm4/benchmark.png?raw=true)
#### Long Text Evaluation
MiniCPM4 is pre-trained on 32K long texts and achieves length extension through YaRN technology. In the 128K long text needle-in-a-haystack task, MiniCPM4 demonstrates outstanding performance.
![long-niah](https://github.com/OpenBMB/MiniCPM/blob/main/assets/minicpm4/128k-niah.png?raw=true)
## Statement
- As a language model, MiniCPM generates content by learning from a vast amount of text.
- However, it does not possess the ability to comprehend or express personal opinions or value judgments.
- Any content generated by MiniCPM does not represent the viewpoints or positions of the model developers.
- Therefore, when using content generated by MiniCPM, users should take full responsibility for evaluating and verifying it on their own.
## LICENSE
- This repository and MiniCPM models are released under the [Apache-2.0](https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE) License.
## Citation
- Please cite our [paper](https://github.com/OpenBMB/MiniCPM/tree/main/report/MiniCPM_4_Technical_Report.pdf) if you find our work valuable.
```bibtex
@article{minicpm4,
title={{MiniCPM4}: Ultra-Efficient LLMs on End Devices},
author={MiniCPM Team},
year={2025}
}
```
from datasets import load_dataset
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import os
model_path = '/root/ld/ld_model_pretrained/minicpm3' # model_path or model_id
quant_path = '/root/ld/ld_model_pretrained/minicpm3_awq' # quant_save_path
quant_data_path='/Users/liudan/ai/pull_request/MiniCPM/quantize/quantize_data/wikitext'  # path to the local calibration dataset
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } # "w_bit":4 or 8
quant_samples=512 # how many samples to use for calibration
custom_data=[ # first custom data
[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "assistant", "content": "你好,有什么我可以帮助你的吗?"},
{"role": "user", "content": "我想了解如何编写Python代码。"},
], # second custom data
[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "assistant", "content": "你好,有什么我可以帮助你的吗?"},
{"role": "user", "content": "我想了解如何编写Python代码。"},
]
#....more custom data
]
# Load model
model = AutoAWQForCausalLM.from_pretrained(model_path,safetensors=False)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True,device_map={"": "cuda:0"})
# Define data loading methods
def load_alpaca(quant_data_path):
data = load_dataset(quant_data_path, split="train") # Set the absolute path to alpaca or huggingface id
# concatenate data
def concatenate_data(x):
if x['input'] and x['instruction']:
line = [
{"role": "system", "content": x['instruction']},
{"role": "user", "content": x['input']},
{"role": "assistant", "content": x['output']},
]
elif x['input']:
line = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": x['input']},
{"role": "assistant", "content": x['output']},
]
else:
line = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": x['instruction']},
{"role": "assistant", "content": x['output']},
]
if model.config.architectures == ["MiniCPM3ForCausalLM"]:
print(tokenizer.decode(tokenizer.apply_chat_template(line)))
return {"text":tokenizer.decode(tokenizer.apply_chat_template(line))}
else:
return {"text": '<用户>'+x['instruction'] + x['input'] + '<AI>' + '\n' + x['output']}
concatenated = data.map(concatenate_data)[:quant_samples]
return [text for text in concatenated["text"]]
def load_wikitext(quant_data_path):
data = load_dataset(quant_data_path, split="train")
return [text for text in data["text"] if text.strip() != '' and len(text.split(' ')) > 20][:quant_samples]
def load_cust_data(custom_data):
quant_data=[tokenizer.decode(tokenizer.apply_chat_template(i)) for i in custom_data]
return quant_data[:quant_samples]
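# Pick one of the loaders above as calib_data: load_wikitext for raw text,
# load_alpaca for instruction-style data, or load_cust_data(custom_data) for the
# inline chat examples defined at the top of this file.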
# Quantize
model.quantize(tokenizer, quant_config=quant_config, calib_data=load_wikitext(quant_data_path=quant_data_path))
# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
"""
the script will use bitandbytes to quantize the MiniCPM language model.
the be quantized model can be finetuned by MiniCPM or not.
you only need to set the model_path 、save_path and run bash code
cd MiniCPM
python quantize/bnb_quantize.py
you will get the quantized model in save_path、quantized_model test time and gpu usage
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import time
import torch
import GPUtil
import os
model_path = "/root/ld/ld_model_pretrain/MiniCPM-1B-sft-bf16" # 模型下载地址
save_path = "/root/ld/ld_model_pretrain/MiniCPM-1B-sft-bf16_int4" # 量化模型保存地址
device = "cuda" if torch.cuda.is_available() else "cpu"
# 创建一个配置对象来指定量化参数
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # quantize to 4 bits
    load_in_8bit=False,  # quantize to 8 bits
    bnb_4bit_compute_dtype=torch.float16,  # compute dtype
    bnb_4bit_quant_storage=torch.uint8,  # storage dtype for the quantized weights
    bnb_4bit_quant_type="nf4",  # quantization format: 4-bit normal float (nf4)
    bnb_4bit_use_double_quant=True,  # double quantization (also quantize the zero-point and scaling factors)
    llm_int8_enable_fp32_cpu_offload=False,  # keep CPU-offloaded parameters in fp32 when using int8
    llm_int8_has_fp16_weight=False,  # mixed-precision int8
    # llm_int8_skip_modules=["out_proj", "kv_proj", "lm_head"],  # modules to skip during quantization
    llm_int8_threshold=6.0,  # outlier threshold for the llm.int8() algorithm
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_path,
    device_map=device,  # place the model on the selected device
quantization_config=quantization_config,
trust_remote_code=True,
)
gpu_usage = GPUtil.getGPUs()[0].memoryUsed
start = time.time()
response = model.chat(tokenizer, "<用户>给我讲一个故事<AI>", history=[], temperature=0.5, top_p=0.8, repetition_penalty=1.02)  # run inference
print("Output after quantization:", response)
print("Inference time after quantization:", time.time() - start)
print(f"GPU memory usage after quantization: {round(gpu_usage / 1024, 2)} GB")
# Save the model and tokenizer
os.makedirs(save_path, exist_ok=True)
model.save_pretrained(save_path, safe_serialization=True)
tokenizer.save_pretrained(save_path)
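# Reloading the saved checkpoint -- a minimal sketch, assuming a transformers/bitsandbytes
# version that supports serialized 4-bit weights (the quantization_config is stored inside
# the saved config, so no BitsAndBytesConfig is needed on reload):
#
# reloaded = AutoModelForCausalLM.from_pretrained(save_path, device_map=device, trust_remote_code=True)
# reloaded_tokenizer = AutoTokenizer.from_pretrained(save_path, trust_remote_code=True)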
"""
使用gptq量化前,请先安装我们的autogptq分支,否则代码无法正常运行。
‘’‘bash
git clone https://github.com/LDLINGLINGLING/AutoGPTQ/tree/minicpm_gptq
cd Autogptq
# 如果量化minicpm3.0
git checkout minicpm3
# 如果量化minicpm2.0
git checkout minicpm_autogptq
pip install e .
‘’‘
"""
import json
import random
import time
from argparse import ArgumentParser
import torch
from datasets import Dataset
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import os
import shutil
def copy_missing_files(src_path, dst_path):
    src_files = os.listdir(src_path)
    dst_files = os.listdir(dst_path)
    for src_file in src_files:
        # copy auxiliary files (e.g. remote-code and tokenizer scripts) that the quantizer does not write itself
        if src_file not in dst_files and not src_file.endswith(('.bin', '.json')) and not src_file.startswith('.'):
src_file_path = os.path.join(src_path, src_file)
dst_file_path = os.path.join(dst_path, src_file)
shutil.copy2(src_file_path, dst_file_path)
def load_data(data_path, tokenizer, n_samples):
with open(data_path, "r", encoding="utf-8") as f:
raw_data = json.load(f)
raw_data = random.sample(raw_data, k=min(n_samples, len(raw_data)))
def dummy_gen():
return raw_data
def tokenize(examples):
instructions = examples["instruction"]
inputs = examples["input"]
outputs = examples["output"]
prompts = []
texts = []
input_ids = []
attention_mask = []
        for istr, inp, opt in zip(instructions, inputs, outputs):
            if inp:
                line = [
                    {"role": "system", "content": istr},
                    {"role": "user", "content": inp},
                    {"role": "assistant", "content": opt},
                ]
                prompt = tokenizer.decode(tokenizer.apply_chat_template(line[:2]))
                text = tokenizer.decode(tokenizer.apply_chat_template(line))
            else:
                line = [
                    {"role": "user", "content": istr},
                    {"role": "assistant", "content": opt},
                ]
                prompt = tokenizer.decode(tokenizer.apply_chat_template(line[:1]))
                text = tokenizer.decode(tokenizer.apply_chat_template(line))
if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length:
continue
tokenized_data = tokenizer(text)
input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length])
attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length])
prompts.append(prompt)
texts.append(text)
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"prompt": prompts,
}
dataset = Dataset.from_generator(dummy_gen)
dataset = dataset.map(
tokenize,
batched=True,
batch_size=len(dataset),
num_proc=1,
keep_in_memory=True,
load_from_cache_file=False,
remove_columns=["instruction", "input"],
)
dataset = dataset.to_list()
for sample in dataset:
sample["input_ids"] = torch.LongTensor(sample["input_ids"])
sample["attention_mask"] = torch.LongTensor(sample["attention_mask"])
return dataset
def main():
parser = ArgumentParser()
parser.add_argument("--pretrained_model_dir", type=str,default='/root/ld/ld_model_pretrained/minicpm3')
parser.add_argument("--quantized_model_dir", type=str, default='/root/ld/ld_model_pretrained/minicpm3_gptq_4bit')
parser.add_argument("--bits", type=int, default=4, choices=[2, 3, 4])#do not use 8 bit
parser.add_argument(
"--group_size",
type=int,
default=128,
help="group size, -1 means no grouping or full rank",
)
parser.add_argument("--desc_act", action="store_true", default=True,help="whether to quantize with desc_act")
parser.add_argument(
"--num_samples",
type=int,
default=256,
help="how many samples will be used to quantize model",
)
parser.add_argument(
"--save_and_reload",
action="store_true",
default=True,
help="whether save quantized model to disk and reload back",
)
parser.add_argument("--fast_tokenizer", action="store_true", help="whether use fast tokenizer")
parser.add_argument(
"--use_triton",
action="store_true",
help="whether use triton to speedup at inference",
)
parser.add_argument(
"--per_gpu_max_memory",
type=int,
default=None,
help="max memory used to load model per gpu",
)
parser.add_argument(
"--cpu_max_memory",
type=int,
default=None,
help="max memory used to offload model to cpu",
)
parser.add_argument(
"--quant_batch_size",
type=int,
default=8,
help="examples batch size for quantization",
)
parser.add_argument(
"--trust_remote_code",
default=True,
action="store_true",
help="whether to trust remote code when loading model",
)
parser.add_argument(
"--quant_data",
default='quantize_data/alpaca_data_cleaned.json',
help="the quant data path",
)
args = parser.parse_args()
max_memory = {}
if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
if torch.cuda.is_available():
max_memory.update({i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())})
if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
if not max_memory:
max_memory = None
tokenizer = AutoTokenizer.from_pretrained(
args.pretrained_model_dir,
use_fast=args.fast_tokenizer,
trust_remote_code=args.trust_remote_code,
)
model = AutoGPTQForCausalLM.from_pretrained(
args.pretrained_model_dir,
quantize_config=BaseQuantizeConfig(bits=args.bits, group_size=args.group_size, desc_act=args.desc_act),
max_memory=max_memory,
trust_remote_code=args.trust_remote_code,
)
examples = load_data(args.quant_data, tokenizer, args.num_samples)
examples_for_quant = [
{"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]} for example in examples
]
start = time.time()
model.quantize(
examples_for_quant,
batch_size=args.quant_batch_size,
use_triton=args.use_triton,
autotune_warmup_after_quantized=args.use_triton,
)
end = time.time()
print(f"quantization took: {end - start: .4f}s")
if not args.quantized_model_dir:
args.quantized_model_dir = args.pretrained_model_dir
if args.save_and_reload:
model.save_quantized(args.quantized_model_dir)
tokenizer.save_pretrained(args.quantized_model_dir)
        copy_missing_files(args.pretrained_model_dir, args.quantized_model_dir)
del model
if torch.cuda.is_available():
torch.cuda.empty_cache()
model = AutoGPTQForCausalLM.from_quantized(
args.quantized_model_dir,
device="cuda:0",
use_triton=args.use_triton,
max_memory=max_memory,
inject_fused_mlp=True,
inject_fused_attention=True,
trust_remote_code=args.trust_remote_code,
)
pipeline_init_kwargs = {"model": model, "tokenizer": tokenizer}
if not max_memory:
pipeline_init_kwargs["device"] = "cuda:0"
for example in random.sample(examples, k=min(4, len(examples))):
print(f"prompt: {example['prompt']}")
print("-" * 42)
print(f"golden: {example['output']}")
print("-" * 42)
start = time.time()
        output_ids = model.generate(
            **tokenizer(example["prompt"], return_tensors="pt").to(model.device),
            max_new_tokens=100,
        )
        print(tokenizer.decode(output_ids[0]))
if __name__ == "__main__":
import logging
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S",
)
main()