"lm_eval/tasks/tinyBenchmarks/utils_winogrande.py" did not exist on "1980a13c9d7bcdc6e2a19228c203f9f7834ac9b8"
Commit 467ec853 authored by lvzhen

Merge branch 'master' into 'master'

ChatGLM3-6B fine-tuning

See merge request !2
parents 971c0aee 0006ad16
import json
import ast
import astunparse
from transformers import PreTrainedTokenizer
from torch.utils.data import Dataset
from copy import deepcopy
from typing import Dict, List
# text constants
FUNCTION_CALL_NAME = 'tool_call'
FUNCTION_CALL_PREFIX = '```python\n'
FUNCTION_CALL_POSTFIX = '\n```'
TOOL_DEFINITION_PREFIX = 'Answer the following questions as best as you can. You have access to the following tools:\n'
CONVERSATION_KEY = 'conversations'
TOOL_DESC_KEY = 'tools'
def format_function_call(function_name: str, parameters: Dict[str, str]):
function_name = ast.Name(id=function_name)
keywords = [
ast.keyword(arg=arg_name, value=ast.Constant(arg_value))
for arg_name, arg_value in parameters.items()
]
func_call = ast.Call(func=function_name, args=[], keywords=keywords)
return astunparse.unparse(func_call).strip()
def format_conversation(item, tokenizer, conversation_key: str, tool_key: str):
conversations = deepcopy(item[conversation_key])
# Note: `loss_mask` here means whether *the prediction* of the token should take loss
tokens, loss_masks = [tokenizer.get_command("[gMASK]"), tokenizer.get_command("sop")], [0, 0]
def _update(_tokens: List[int], value: int = 1):
value = int(value)
tokens.extend(_tokens)
loss_masks.extend([value] * len(_tokens))
# insert system prompt for tools
if tool_key in item:
conversations.insert(0,
{
"role": "system",
"content": TOOL_DEFINITION_PREFIX + json.dumps(item[tool_key], indent=4, ensure_ascii=False)
}
)
for idx, conv in enumerate(conversations):
loss = conv.get("loss", True)
if conv['role'] in {'system', 'user'}:
loss = False
if conv['role'] == 'tool':
# function call python code
value = FUNCTION_CALL_PREFIX + format_function_call(FUNCTION_CALL_NAME, conv["parameters"]) + FUNCTION_CALL_POSTFIX
text = tokenizer.build_single_message("assistant", conv["name"], value)
_update(text, loss)
# function call result
value = conv.get('observation', None)
if not isinstance(value, str):
value = json.dumps(value, ensure_ascii=False)
text = tokenizer.build_single_message("observation", "", value)
_update(text, False)
else:
text = tokenizer.build_single_message(conv['role'], "", conv["content"])
_update(text, loss)
_update([tokenizer.eos_token_id], False)
assert len(tokens) == len(loss_masks), f"length mismatch: {len(tokens)} vs {len(loss_masks)}"
return tokens, loss_masks
def sanity_check(tokens: List[int], target: List[int], tokenizer: PreTrainedTokenizer):
print("Sanity Check >>>>>>>>>>>>>")
for t, m in zip(tokens, target):
decoded = tokenizer.tokenizer.index_special_tokens[t] \
if t in tokenizer.tokenizer.index_special_tokens \
else tokenizer.decode([t])
print("%20s: %6d -> %6d" % (repr(decoded), t, m))
print("<<<<<<<<<<<<< Sanity Check")
assert len(tokens) == len(target), f"length mismatch: {len(tokens)} vs {len(target)}"
class MultiTurnDataset(Dataset):
def __init__(self, data: List[dict], tokenizer: PreTrainedTokenizer, max_seq_length: int):
super(MultiTurnDataset, self).__init__()
self.tokenizer = tokenizer
self.max_seq_length = max_seq_length
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, i) -> dict:
data_item = self.data[i]
        tokens, loss_masks = format_conversation(data_item, self.tokenizer, CONVERSATION_KEY, TOOL_DESC_KEY)
# labels are used inside the model
target_based_loss_mask = [False] + loss_masks[:-1]
labels = [(t if m else -100) for t, m in zip(tokens, target_based_loss_mask)]
tokens = tokens[:self.max_seq_length]
labels = labels[:self.max_seq_length]
tokens += [self.tokenizer.pad_token_id] * (self.max_seq_length - len(tokens))
labels += [-100] * (self.max_seq_length - len(labels))
assert len(tokens) == len(labels), f"length mismatch: {len(tokens)} vs {len(labels)}"
return {
"input_ids": tokens,
"labels": labels
}
class InputOutputDataset(Dataset):
def __init__(self, data: List[dict], tokenizer: PreTrainedTokenizer, max_source_length: int, max_target_length: int):
super(InputOutputDataset, self).__init__()
self.tokenizer = tokenizer
self.max_source_length = max_source_length
self.max_target_length = max_target_length
self.max_seq_length = max_source_length + max_target_length + 1
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, i) -> dict:
data_item = self.data[i]
a_ids = self.tokenizer.encode(text=data_item['prompt'], add_special_tokens=True, truncation=True,
max_length=self.max_source_length)
b_ids = self.tokenizer.encode(text=data_item['response'], add_special_tokens=False, truncation=True,
max_length=self.max_target_length)
context_length = len(a_ids)
input_ids = a_ids + b_ids + [self.tokenizer.eos_token_id]
labels = [self.tokenizer.pad_token_id] * context_length + b_ids + [self.tokenizer.eos_token_id]
pad_len = self.max_seq_length - len(input_ids)
input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
labels = labels + [self.tokenizer.pad_token_id] * pad_len
labels = [(l if l != self.tokenizer.pad_token_id else -100) for l in labels]
assert len(input_ids) == len(labels), f"length mismatch: {len(input_ids)} vs {len(labels)}"
return {
"input_ids": input_ids,
"labels": labels
}
transformers==4.30.2
accelerate
sentencepiece
astunparse
deepspeed
#! /usr/bin/env bash
set -ex
LR=1e-4
NUM_GPUS=8
MAX_SOURCE_LEN=1024
MAX_TARGET_LEN=128
DEV_BATCH_SIZE=4
GRAD_ACCUMULATION_STEPS=1
MAX_STEP=20
SAVE_INTERVAL=500
RUN_NAME=advertise_gen_ft
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/advertise_gen.jsonl
DATESTR=`date +%Y%m%d-%H%M%S`
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${LR}
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
mkdir -p $OUTPUT_DIR
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format input-output \
--train_file $DATASET_PATH \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--max_source_length $MAX_SOURCE_LEN \
--max_target_length $MAX_TARGET_LEN \
--per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--learning_rate $LR \
--fp16 \
--deepspeed ../configs/deepspeed.json 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env bash
set -ex
LR=1e-4
NUM_GPUS=8
MAX_SEQ_LEN=2048
DEV_BATCH_SIZE=2
GRAD_ACCUMULATION_STEPS=1
MAX_STEP=200
SAVE_INTERVAL=50
DATESTR=`date +%Y%m%d-%H%M%S`
RUN_NAME=tool_alpaca_ft
DATASET_PATH=../formatted_data/tool_alpaca.jsonl
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${LR}
mkdir -p $OUTPUT_DIR
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format multi-turn \
--train_file $DATASET_PATH \
--max_seq_length $MAX_SEQ_LEN \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--fp16 \
--deepspeed ../configs/deepspeed.json 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env bash
set -ex
PRE_SEQ_LEN=128
LR=2e-2
NUM_GPUS=1
MAX_SOURCE_LEN=1024
MAX_TARGET_LEN=128
DEV_BATCH_SIZE=1
GRAD_ACCUMULATION_STEPS=1
MAX_STEP=20
SAVE_INTERVAL=500
DATESTR=`date +%Y%m%d-%H%M%S`
RUN_NAME=advertise_gen_pt
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/advertise_gen.jsonl
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${PRE_SEQ_LEN}-${LR}
mkdir -p $OUTPUT_DIR
export HIP_VISIBLE_DEVICES=4,5,6,7
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format input-output \
--train_file $DATASET_PATH \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--max_source_length $MAX_SOURCE_LEN \
--max_target_length $MAX_TARGET_LEN \
--per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--learning_rate $LR \
--pre_seq_len $PRE_SEQ_LEN 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env bash
set -ex
PRE_SEQ_LEN=128
LR=2e-2
NUM_GPUS=1
MAX_SEQ_LEN=2048
DEV_BATCH_SIZE=1
GRAD_ACCUMULATION_STEPS=16
MAX_STEP=1000
SAVE_INTERVAL=500
DATESTR=`date +%Y%m%d-%H%M%S`
RUN_NAME=tool_alpaca_pt
BASE_MODEL_PATH=/chatglm3/chatglm3-6b
DATASET_PATH=../formatted_data/tool_alpaca.jsonl
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${PRE_SEQ_LEN}-${LR}
mkdir -p $OUTPUT_DIR
torchrun --standalone --nnodes=1 --nproc_per_node=$NUM_GPUS ../finetune.py \
--train_format multi-turn \
--train_file $DATASET_PATH \
--max_seq_length $MAX_SEQ_LEN \
--preprocessing_num_workers 1 \
--model_name_or_path $BASE_MODEL_PATH \
--output_dir $OUTPUT_DIR \
--per_device_train_batch_size $DEV_BATCH_SIZE \
    --gradient_accumulation_steps $GRAD_ACCUMULATION_STEPS \
--max_steps $MAX_STEP \
--logging_steps 1 \
--save_steps $SAVE_INTERVAL \
--learning_rate $LR \
--pre_seq_len $PRE_SEQ_LEN 2>&1 | tee ${OUTPUT_DIR}/train.log
#! /usr/bin/env python
import json
from argparse import ArgumentParser
import os
parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
args = parser.parse_args()
with open(args.path) as f:
data = [json.loads(line) for line in f]
train_examples = [{
"prompt": x['content'],
"response": x['summary'],
} for x in data]
os.makedirs("formatted_data", exist_ok=True)
with open("formatted_data/advertise_gen.jsonl", "w") as f:
for e in train_examples:
f.write(json.dumps(e, ensure_ascii=False) + "\n")
#! /usr/bin/env python
import json
from collections import Counter
from argparse import ArgumentParser
import os
parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
args = parser.parse_args()
with open(args.path) as f:
data = json.load(f)
train_examples = []
err_count = 0
for setting in data:
api_desc = [setting["NLDocumentation"]]
for instance in setting["Instances"]:
try:
conv = [{
"role": "user",
"content": instance['input'],
}]
for step in instance['intermediate_steps']:
tool_name, params, react = step[0]
step_thought = react.split("Action:")[0].strip()
observation = step[1]
conv.append({
"role": "assistant",
"content": step_thought,
})
conv.append({
"role": "tool",
"name": tool_name,
"parameters": json.loads(params),
"observation": observation,
})
conv.append({
"role": "assistant",
"content": instance['Final Thought'] + "\n" + instance['output'],
})
        except Exception:
err_count += 1
else:
train_examples.append({
"tools": api_desc,
"conversations": conv
})
print("err_count:", err_count)
print("train_examples:", len(train_examples))
print("conversation distribution:", Counter([len(e["conversations"]) for e in train_examples]))
os.makedirs("formatted_data", exist_ok=True)
with open("formatted_data/tool_alpaca.jsonl", "w") as f:
for e in train_examples:
f.write(json.dumps(e, ensure_ascii=False) + "\n")
# coding=utf-8
# Copyright 2020-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The Trainer class, to easily train a 🤗 Transformers model from scratch or finetune it on a new task.
"""
import os
from typing import Optional
from transformers import Trainer
import torch
from transformers.modeling_utils import PreTrainedModel, unwrap_model
from transformers.utils import logging
logger = logging.get_logger(__name__)
WEIGHTS_NAME = "pytorch_model.bin"
TRAINING_ARGS_NAME = "training_args.bin"
class PrefixTrainer(Trainer):
def __init__(self, *args, save_changed=False, **kwargs):
self.save_changed = save_changed
super().__init__(*args, **kwargs)
def _save(self, output_dir: Optional[str] = None, state_dict=None):
# If we are executing this function, we are the process zero, so we don't check for that.
output_dir = output_dir if output_dir is not None else self.args.output_dir
os.makedirs(output_dir, exist_ok=True)
logger.info(f"Saving model checkpoint to {output_dir}")
# Save a trained model and configuration using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
if not isinstance(self.model, PreTrainedModel):
if isinstance(unwrap_model(self.model), PreTrainedModel):
if state_dict is None:
state_dict = self.model.state_dict()
unwrap_model(self.model).save_pretrained(output_dir, state_dict=state_dict)
else:
logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
if state_dict is None:
state_dict = self.model.state_dict()
torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
else:
if self.save_changed:
print("Saving PrefixEncoder")
state_dict = self.model.state_dict()
filtered_state_dict = {}
for k, v in self.model.named_parameters():
if v.requires_grad:
filtered_state_dict[k] = state_dict[k]
self.model.save_pretrained(output_dir, state_dict=filtered_state_dict)
else:
print("Saving the whole model")
self.model.save_pretrained(output_dir, state_dict=state_dict)
if self.tokenizer is not None:
self.tokenizer.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
# ChatGLM3-6B finetune
This directory provides fine-tuning examples for the ChatGLM3-6B model, including full fine-tuning and P-Tuning v2. In terms of data format, it provides samples for multi-turn dialogue fine-tuning and for input/output (prompt-response) fine-tuning.
If you have downloaded the model locally, replace the `THUDM/chatglm3-6b` field in this document and in the code with the corresponding local path so that the model is loaded from disk.
Running the examples requires `python>=3.10`. In addition to the basic `torch` dependency, the example code also needs the dependencies listed below.
**We provide a [sample notebook](lora_finetune.ipynb) that demonstrates how to use our fine-tuning code.**
```bash
pip install -r requirements.txt
```
## Tested hardware
We only provide single-machine multi-GPU / multi-machine multi-GPU examples, so you will need at least one machine with multiple GPUs. With the **default configuration files** in this repository, the observed GPU memory usage is:
+ SFT full fine-tuning: spread evenly across 4 GPUs, each GPU using `48346MiB` of memory.
+ P-Tuning v2 fine-tuning: 1 GPU, using `18426MiB` of memory.
+ LoRA fine-tuning: 1 GPU, using `14082MiB` of memory.
> Please note that these numbers are for reference only; memory usage may differ with different parameters.
> Adjust them to your hardware.
## Multi-turn dialogue format
The multi-turn dialogue fine-tuning example follows the ChatGLM3 dialogue format convention and assigns a different `loss_mask` to each role, so that the `loss` over all responses in a conversation is computed in a single pass.
If you only want to fine-tune the model's conversational ability, and not its tool-calling ability, organize your data in the following format.
```json
[
{
"conversations": [
{
"role": "system",
"content": "<system prompt text>"
},
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant response text>"
},
// ... multi-turn
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant response text>"
}
]
}
// ...
]
```
**Please note that with many fine-tuning steps, this method will degrade the model's tool-calling ability.**
If you want to fine-tune both the model's dialogue and tool-calling capabilities, organize your data in the following format.
```json
[
{
"tools": [
// available tools, format is not restricted
],
"conversations": [
{
"role": "system",
"content": "<system prompt text>"
},
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant thought to text>"
},
{
"role": "tool",
"name": "<name of the tool to be called",
"parameters": {
"<parameter_name>": "<parameter_value>"
},
"observation": "<observation>"
// does not have to be a string
},
{
"role": "assistant",
"content": "<assistant response to observation>"
},
// ... multi-turn
{
"role": "user",
"content": "<user prompt text>"
},
{
"role": "assistant",
"content": "<assistant response text>"
}
]
}
// ...
]
```
- There is no need to insert the system prompt describing the tools manually: during preprocessing the `tools` field is formatted with `json.dumps(..., ensure_ascii=False)` and inserted as the first system prompt.
- Each role can carry a boolean `loss` field indicating whether the content predicted for that turn contributes to the `loss`. If the field is absent, the sample implementation skips `loss` for `system` and `user` turns by default and computes `loss` for all other roles (a minimal sketch of this masking follows the list).
- `tool` is not a native role in ChatGLM3. During preprocessing, a `tool` turn is automatically converted into an `assistant` turn carrying tool-call `metadata` (for which `loss` is computed by default) and an `observation` turn holding the tool's return value (for which `loss` is not computed).
- Fine-tuning for the `Code interpreter` task has not been implemented yet.
- The `system` role is optional, but if present it must appear before the first `user` turn, and it may appear only once in a complete conversation (whether single-turn or multi-turn).
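As a reference, here is a minimal sketch of how one conversation is turned into `input_ids` and `labels` under this masking scheme. It mirrors `process_batch` in `finetune_hf.py`; `get_command` and `build_single_message` come from the ChatGLM3 tokenizer loaded with `trust_remote_code=True`, and the `loss`-field handling shown here is a simplification.
```python
# Simplified sketch of per-role loss masking (see process_batch in finetune_hf.py).
# `tokenizer` is assumed to be the ChatGLM3 tokenizer, `conv` one "conversations" list.
def build_labels(conv, tokenizer, max_length):
    input_ids = [tokenizer.get_command('[gMASK]'), tokenizer.get_command('sop')]
    loss_masks = [False, False]
    for message in conv:
        # loss defaults to False for system/user turns and True for all other roles;
        # an explicit boolean `loss` field overrides the default.
        take_loss = message.get('loss', message['role'] not in ('system', 'user'))
        new_ids = tokenizer.build_single_message(message['role'], '', message['content'])
        input_ids += new_ids
        loss_masks += [take_loss] * len(new_ids)
    input_ids.append(tokenizer.eos_token_id)
    # prepend False so the mask lines up with input_ids (which just gained the eos token);
    # the eos position inherits the mask of the last content token.
    loss_masks = [False, *loss_masks]
    labels = [tok if mask else -100 for tok, mask in zip(input_ids, loss_masks)]
    return input_ids[:max_length], labels[:max_length]
```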
## Dataset format example
Here we take the AdvertiseGen dataset as an example. You can download it from [Google Drive](https://drive.google.com/file/d/13_vf0xRTQsyneRKdD1bZIr93vBGOczrk/view?usp=sharing) or from [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1). Place the extracted AdvertiseGen directory under the `data` directory and convert it yourself into the format shown below (a conversion sketch follows the example).
> Please note that the current fine-tuning code uses a validation set. A complete fine-tuning dataset must therefore
> contain a training set and a validation set; the test set is optional, or the validation set can simply be reused
> as the test set.
```
{"conversations": [{"role": "user", "content": "Type#skirt*skirt length#skirt"}, {"role": "assistant", "content": "This is versatile Fashionable fairy skirt, the overall design is very elegant and casual. Every girl can instantly turn into a fairy after wearing it. The material is very light and breathable, making it very comfortable to wear in summer."} ]}
```
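For illustration, a minimal conversion sketch follows. Paths and file names are assumptions; the raw AdvertiseGen files are jsonl with `content`/`summary` fields, as in the formatting scripts above.
```python
# Hedged sketch: convert raw AdvertiseGen lines ({"content": ..., "summary": ...})
# into the {"conversations": [...]} format shown above. Paths are illustrative.
import json
from pathlib import Path

def convert(src: str, dst: str) -> None:
    with open(src, encoding='utf-8') as fin, open(dst, 'w', encoding='utf-8') as fout:
        for line in fin:
            raw = json.loads(line)
            example = {
                'conversations': [
                    {'role': 'user', 'content': raw['content']},
                    {'role': 'assistant', 'content': raw['summary']},
                ]
            }
            fout.write(json.dumps(example, ensure_ascii=False) + '\n')

if __name__ == '__main__':
    Path('data/AdvertiseGen').mkdir(parents=True, exist_ok=True)
    convert('AdvertiseGen/train.json', 'data/AdvertiseGen/train.json')  # training set
    convert('AdvertiseGen/dev.json', 'data/AdvertiseGen/dev.json')      # validation set
```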
## Configuration files
The fine-tuning configuration files are located in the `configs` directory and include the following files:
1. `ds_zero_2.json / ds_zero_3.json`: DeepSpeed configuration files.
2. `lora.yaml / ptuning.yaml / sft.yaml`: configuration files for the different fine-tuning methods, covering model parameters,
   optimizer parameters, training parameters, etc. Some important parameters are explained as follows:
+ data_config section
+ train_file: The file path of the training data set.
+ val_file: The file path of the verification data set.
+ test_file: The file path of the test data set.
+ num_proc: Number of processes used when loading data.
+ max_input_length: The maximum length of the input sequence.
+ max_output_length: The maximum length of the output sequence.
+ training_args section
+ output_dir: Directory for saving models and other outputs.
+ max_steps: The maximum number of steps for training.
+ per_device_train_batch_size: training batch size per device (e.g. GPU).
+ dataloader_num_workers: The number of worker threads used when loading data.
+ remove_unused_columns: Whether to remove unused columns in the data.
        + save_strategy: the checkpoint saving strategy (e.g. save every `save_steps` steps).
        + save_steps: save a checkpoint every this many steps.
+ log_level: log level (such as info).
+ logging_strategy: logging strategy.
        + logging_steps: log every this many steps.
+ per_device_eval_batch_size: Evaluation batch size per device.
        + evaluation_strategy: the evaluation strategy (e.g. evaluate every `eval_steps` steps).
        + eval_steps: run evaluation every this many steps.
+ predict_with_generate: Whether to use generate mode for prediction.
+ generation_config section
+ max_new_tokens: The maximum number of new tokens generated.
    + peft_config section (a minimal sketch of how this section becomes a `peft` config object follows this list)
        + peft_type: the parameter-efficient fine-tuning method to use (e.g. LORA).
        + task_type: the task type, here causal language modeling (CAUSAL_LM).
        + LoRA parameters:
            + r: the rank of LoRA.
            + lora_alpha: the scaling factor of LoRA.
            + lora_dropout: the dropout probability used in the LoRA layers.
+ P-TuningV2 parameters:
+ num_virtual_tokens: The number of virtual tokens.
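For orientation, here is a rough sketch of how the `peft_config` section becomes a `peft` configuration object. It mirrors `FinetuningConfig.from_dict` in `finetune_hf.py`, which passes the raw mapping to `peft.get_peft_config`; the values below are the ones from `configs/lora.yaml`.
```python
# Sketch only: build the LoRA config the same way the fine-tuning code does.
from peft import get_peft_config

peft_section = {
    'peft_type': 'LORA',       # parameter-efficient fine-tuning method
    'task_type': 'CAUSAL_LM',  # causal language modeling
    'r': 8,                    # LoRA rank
    'lora_alpha': 32,          # LoRA scaling factor
    'lora_dropout': 0.1,       # dropout inside the LoRA layers
}
peft_config = get_peft_config(peft_section)  # returns a peft LoraConfig
print(type(peft_config).__name__, peft_config.r, peft_config.lora_alpha)
```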
## Start fine-tuning
Use the following command for **single-machine multi-GPU / multi-machine multi-GPU** training.
```bash
cd finetune_demo
OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc_per_node=8 finetune_hf.py data/AdvertiseGen/ THUDM/chatglm3-6b configs/lora.yaml configs/ds_zero_2.json
```
Use the following command for **single-machine single-GPU** training.
```bash
cd finetune_demo
python finetune_hf.py data/AdvertiseGen/ THUDM/chatglm3-6b configs/lora.yaml
```
## Fine-tuning from a checkpoint
If you train with the commands above, each fine-tuning run starts from scratch. To resume from a partially trained model, add a fourth parameter, which can be passed in two ways:
1. `yes`: automatically resume training from the last saved checkpoint.
2. `XX`: a checkpoint number, e.g. `600` resumes training from checkpoint 600.
For example, to continue fine-tuning from the last saved checkpoint:
```bash
cd finetune_demo
python finetune_hf.py data/AdvertiseGen/ THUDM/chatglm3-6b configs/lora.yaml yes
```
## Use the fine-tuned model
### Verify the fine-tuned model in inference_hf.py
You can verify the fine-tuned model with `finetune_demo/inference_hf.py`; a single command is enough:
```bash
python inference_hf.py your_finetune_path --prompt your prompt
```
The answer you get this way comes from the fine-tuned model.
### Use the fine-tuned model in other demos in this repo or in external repos
You can use our `lora` and fully fine-tuned models in any demo, as follows:
1. Replace the model-loading code in the demo with the model-loading code from `finetune_demo/inference_hf.py`.
> Please note that for LoRA and P-Tuning v2 we do not merge the trained weights into the base model; instead, the
> adapter path is recorded in `adapter_config.json`. If the location of your base model changes, update
> `base_model_name_or_path` in `adapter_config.json`.
> Please also note that we have only tested NVIDIA Hopper (representative GPU: H100) and Ampere (representative GPU:
> A100) architectures. If you use a GPU of another architecture, you may run into:
> 1. unknown training problems, or GPU memory usage that differs from the numbers above;
> 2. features that are unsupported because the architecture is too old;
> 3. degraded inference quality.
> These three situations have been reported by the community before. Although the probability is very low, if you
> encounter any of them, you can try to resolve them with the community.
```python
def load_model_and_tokenizer(
model_dir: Union[str, Path], trust_remote_code: bool = True
) -> tuple[ModelType, TokenizerType]:
model_dir = _resolve_path(model_dir)
if (model_dir / 'adapter_config.json').exists():
model = AutoPeftModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=trust_remote_code, device_map='auto'
)
tokenizer_dir = model.peft_config['default'].base_model_name_or_path
else:
model = AutoModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=trust_remote_code, device_map='auto'
)
tokenizer_dir = model_dir
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, trust_remote_code=trust_remote_code
)
return model, tokenizer
```
2. Load the fine-tuned model using the location of the fine-tuned weights. For example, if your fine-tuned model is at `/path/to/finetune_adapter_model` and the base model is at `path/to/base_model`, then pass `/path/to/finetune_adapter_model` as `model_dir`.
3. After completing the steps above, you can use the fine-tuned model normally; all other calling conventions remain unchanged (a minimal usage sketch follows).
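A minimal usage sketch of the loader above. The adapter path is a placeholder, and `model.chat` assumes the ChatGLM3 remote-code chat interface; adapt the call if your demo drives `generate` directly.
```python
# Hypothetical usage of load_model_and_tokenizer from inference_hf.py.
# '/path/to/finetune_adapter_model' is a placeholder for your own output directory.
model, tokenizer = load_model_and_tokenizer('/path/to/finetune_adapter_model')
model = model.eval()

# ChatGLM3's remote-code model exposes a chat() helper; the interface may vary by revision.
response, history = model.chat(tokenizer, "Hello", history=[])
print(response)
```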
### Hints
1. Before training starts, the fine-tuning code can print the preprocessing result of the first training example (the call is commented out by default and can be uncommented), which looks like this:
```log
Sanity Check >>>>>>>>>>>>>
'[gMASK]': 64790 -> -100
'sop': 64792 -> -100
'<|system|>': 64794 -> -100
'': 30910 -> -100
'\n': 13 -> -100
'Answer': 20115 -> -100
'the': 267 -> -100
'following': 1762 -> -100
...
'know': 683 -> -100
'the': 267 -> -100
'response': 3010 -> -100
'details': 3296 -> -100
'.': 30930 -> -100
'<|assistant|>': 64796 -> -100
'': 30910 -> 30910
'\n': 13 -> 13
'I': 307 -> 307
'need': 720 -> 720
'to': 289 -> 289
'use': 792 -> 792
...
<<<<<<<<<<<<< Sanity Check
```
Each line shows, in order, the detokenized string, the `token_id`, and the `target_id`. The `target_id` equals the `token_id` (i.e. the token's index in the model vocabulary) when the token is used as a training target, and `-100` means the token does not participate in the `loss` calculation.
2. `_prepare_model_for_training` iterates over all trainable parameters of the model and makes sure their data type is `torch.float32`. This is necessary in some cases, because mixed-precision training or other operations may change the data type of the model parameters. The call is enabled by default and can be commented out, but if training in `half` precision runs into problems, you can switch it back on; GPU memory usage may increase.
3. In our [Huggingface model code](https://huggingface.co/THUDM/chatglm3-6b/blob/main/modeling_chatglm.py), there is the
following content:
```python
if self.gradient_checkpointing and self.training:
layer_ret = torch.utils.checkpoint.checkpoint(
layer,
hidden_states,
attention_mask,
rotary_pos_emb,
kv_caches[index],
use_cache,
use_reentrant=False
)
```
This may increase GPU memory usage during training, so if you run out of GPU memory you can try changing `use_reentrant` to `True`.
4. The fine-tuned model can be loaded with any model-acceleration framework that supports `peft`; we do not provide a demo here.
5. The fine-tuning dataset format of this repository differs slightly from the ZhipuAI API fine-tuning dataset format (a conversion sketch follows):
+ The `messages` field in a ZhipuAI API fine-tuning dataset corresponds to the `conversations` field in this repository.
+ The ZhipuAI API fine-tuning file is `jsonl`; in this repository, you simply need to change the file extension to `json`.
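A hedged sketch of that conversion (file names are assumptions): rename the `messages` field to `conversations` and write the lines to a file with a `.json` extension.
```python
# Sketch: adapt a ZhipuAI-API-style fine-tuning file to this repository's format.
import json

with open('zhipuai_finetune.jsonl', encoding='utf-8') as fin, \
        open('train.json', 'w', encoding='utf-8') as fout:
    for line in fin:
        record = json.loads(line)
        record['conversations'] = record.pop('messages')  # rename the field
        fout.write(json.dumps(record, ensure_ascii=False) + '\n')
```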
## Citation
```
@inproceedings{liu2022p,
title={P-tuning: Prompt tuning can be comparable to fine-tuning across scales and tasks},
author={Liu, Xiao and Ji, Kaixuan and Fu, Yicheng and Tam, Weng and Du, Zhengxiao and Yang, Zhilin and Tang, Jie},
booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short
Papers)},
pages={61--68},
year={2022}
}
@misc{tang2023toolalpaca,
title={ToolAlpaca: Generalized Tool Learning for Language Models with 3000 Simulated Cases},
author={Qiaoyu Tang and Ziliang Deng and Hongyu Lin and Xianpei Han and Qiao Liang and Le Sun},
year={2023},
eprint={2306.05301},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"train_micro_batch_size_per_gpu": "auto",
"zero_allow_untested_optimizer": true,
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"zero_optimization": {
"stage": 3,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"reduce_scatter": true,
"contiguous_gradients": true,
"overlap_comm": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
data_config:
train_file: train.json
val_file: dev.json
test_file: dev.json
num_proc: 16
max_input_length: 256
max_output_length: 512
training_args:
# see `transformers.Seq2SeqTrainingArguments`
output_dir: ./output
max_steps: 3000
# needed to be fit for the dataset
learning_rate: 5e-5
# settings for data loading
per_device_train_batch_size: 4
dataloader_num_workers: 16
remove_unused_columns: false
# settings for saving checkpoints
save_strategy: steps
save_steps: 500
# settings for logging
log_level: info
logging_strategy: steps
logging_steps: 10
# settings for evaluation
per_device_eval_batch_size: 16
evaluation_strategy: steps
eval_steps: 500
# settings for optimizer
# adam_epsilon: 1e-6
# uncomment the following line to detect nan or inf values
# debug: underflow_overflow
predict_with_generate: true
# see `transformers.GenerationConfig`
generation_config:
max_new_tokens: 512
# set your absolute deepspeed path here
#deepspeed: ds_zero_2.json
# set to true if train with cpu.
use_cpu: false
peft_config:
peft_type: LORA
task_type: CAUSAL_LM
r: 8
lora_alpha: 32
lora_dropout: 0.1
data_config:
train_file: train.json
val_file: dev.json
test_file: dev.json
num_proc: 16
max_input_length: 256
max_output_length: 512
training_args:
# see `transformers.Seq2SeqTrainingArguments`
output_dir: ./output
max_steps: 3000
# needed to be fit for the dataset
learning_rate: 5e-5
# settings for data loading
per_device_train_batch_size: 4
dataloader_num_workers: 16
remove_unused_columns: false
# settings for saving checkpoints
save_strategy: steps
save_steps: 500
# settings for logging
log_level: info
logging_strategy: steps
logging_steps: 10
# settings for evaluation
per_device_eval_batch_size: 16
evaluation_strategy: steps
eval_steps: 500
# settings for optimizer
# adam_epsilon: 1e-6
# uncomment the following line to detect nan or inf values
# debug: underflow_overflow
predict_with_generate: true
# see `transformers.GenerationConfig`
generation_config:
max_new_tokens: 512
# set your absolute deepspeed path here
#deepspeed: ds_zero_3.json
use_cpu: false
peft_config:
peft_type: PREFIX_TUNING
task_type: CAUSAL_LM
num_virtual_tokens: 128
data_config:
train_file: train.json
val_file: dev.json
test_file: dev.json
num_proc: 16
max_input_length: 256
max_output_length: 512
training_args:
# see `transformers.Seq2SeqTrainingArguments`
output_dir: ./output
max_steps: 3000
# needed to be fit for the dataset
learning_rate: 5e-5
# settings for data loading
per_device_train_batch_size: 4
dataloader_num_workers: 16
remove_unused_columns: false
# settings for saving checkpoints
save_strategy: steps
save_steps: 500
# settings for logging
log_level: info
logging_strategy: steps
logging_steps: 10
# settings for evaluation
per_device_eval_batch_size: 16
evaluation_strategy: steps
eval_steps: 500
# settings for optimizer
# adam_epsilon: 1e-6
# uncomment the following line to detect nan or inf values
# debug: underflow_overflow
predict_with_generate: true
generation_config:
max_new_tokens: 512
# set your absolute deepspeed path here
deepspeed: ds_zero_3.json
# -*- coding: utf-8 -*-
import os
import jieba
import dataclasses as dc
import functools
from collections.abc import Callable, Mapping, Sequence
from pathlib import Path
from typing import Annotated, Any, Optional, Union
import numpy as np
import ruamel.yaml as yaml
import torch
import typer
from datasets import Dataset, DatasetDict, NamedSplit, Split, load_dataset
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from peft import (
PeftConfig,
PeftModelForCausalLM,
get_peft_config,
get_peft_model
)
from rouge_chinese import Rouge
from torch import nn
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
EvalPrediction,
GenerationConfig,
PreTrainedModel,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
Seq2SeqTrainingArguments, AutoConfig,
)
from transformers import DataCollatorForSeq2Seq as _DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer as _Seq2SeqTrainer
ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
app = typer.Typer(pretty_exceptions_show_locals=False)
class DataCollatorForSeq2Seq(_DataCollatorForSeq2Seq):
def __call__(self, features, return_tensors=None):
output_ids = (
[feature['output_ids'] for feature in features]
if 'output_ids' in features[0].keys()
else None
)
if output_ids is not None:
max_output_length = max(len(out) for out in output_ids)
if self.pad_to_multiple_of is not None:
max_output_length = (
(
max_output_length + self.pad_to_multiple_of - 1) //
self.pad_to_multiple_of * self.pad_to_multiple_of
)
for feature in features:
remainder = [self.tokenizer.pad_token_id] * (
max_output_length - len(feature['output_ids'])
)
if isinstance(feature['output_ids'], list):
feature['output_ids'] = feature['output_ids'] + remainder
else:
feature['output_ids'] = np.concatenate(
[feature['output_ids'], remainder]
).astype(np.int64)
return super().__call__(features, return_tensors)
class Seq2SeqTrainer(_Seq2SeqTrainer):
def prediction_step(
self,
model: nn.Module,
inputs: dict[str, Any],
prediction_loss_only: bool,
ignore_keys=None,
**gen_kwargs,
) -> tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
if self.args.predict_with_generate:
output_ids = inputs.pop('output_ids')
input_ids = inputs['input_ids']
loss, generated_tokens, labels = super().prediction_step(
model, inputs, prediction_loss_only, ignore_keys, **gen_kwargs
)
generated_tokens = generated_tokens[:, input_ids.size()[1]:]
if self.args.predict_with_generate:
labels = output_ids
return loss, generated_tokens, labels
# For P-Tuning a new save_model function is fine for the prefix_encoder model
    # but may cause problems when loading the whole model
# def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
# if output_dir is None:
# output_dir = self.args.output_dir
# os.makedirs(output_dir, exist_ok=True)
# ptuning_params = {k: v for k, v in self.model.transformer.prefix_encoder.state_dict().items()}
#
# torch.save(ptuning_params, os.path.join(output_dir, 'pytorch_model.bin'))
#
# print(f"P-Tuning model weights saved in {output_dir}")
#
# if self.tokenizer is not None:
# self.tokenizer.save_pretrained(output_dir)
def _resolve_path(path: Union[str, Path]) -> Path:
return Path(path).expanduser().resolve()
def _sanity_check(
input_ids: Sequence[int],
output_ids: Sequence[int],
tokenizer: PreTrainedTokenizer,
):
print('--> Sanity check')
for in_id, out_id in zip(input_ids, output_ids):
if in_id == 0:
continue
if in_id in tokenizer.tokenizer.index_special_tokens:
in_text = tokenizer.tokenizer.index_special_tokens[in_id]
else:
in_text = tokenizer.decode([in_id])
print(f'{repr(in_text):>20}: {in_id} -> {out_id}')
@functools.cache
def _get_yaml_parser() -> yaml.YAML:
parser = yaml.YAML(typ='safe', pure=True)
parser.indent(mapping=2, offset=2, sequence=4)
parser.default_flow_style = False
return parser
@dc.dataclass
class DataConfig(object):
train_file: str
val_file: Optional[str] = None
test_file: Optional[str] = None
num_proc: Optional[int] = None
@property
def data_format(self) -> str:
return Path(self.train_file).suffix
@property
def data_files(self) -> dict[NamedSplit, str]:
return {
split: data_file
for split, data_file in zip(
[Split.TRAIN, Split.VALIDATION, Split.TEST],
[self.train_file, self.val_file, self.test_file],
)
if data_file is not None
}
@dc.dataclass
class FinetuningConfig(object):
data_config: DataConfig
max_input_length: int
max_output_length: int
training_args: Seq2SeqTrainingArguments = dc.field(
default_factory=lambda: Seq2SeqTrainingArguments(output_dir='./output')
)
peft_config: Optional[PeftConfig] = None
def __post_init__(self):
if not self.training_args.do_eval or self.data_config.val_file is None:
# skips the evaluation stage when `do_eval` or `eval_file` is not provided
self.training_args.do_eval = False
self.training_args.evaluation_strategy = 'no'
self.data_config.val_file = None
else:
self.training_args.per_device_eval_batch_size = (
self.training_args.per_device_eval_batch_size
or self.training_args.per_device_train_batch_size
)
@classmethod
def from_dict(cls, **kwargs) -> 'FinetuningConfig':
training_args = kwargs.get('training_args', None)
if training_args is not None and not isinstance(
training_args, Seq2SeqTrainingArguments
):
gen_config = training_args.get('generation_config')
# TODO: a bit hacky
if not isinstance(gen_config, GenerationConfig):
training_args['generation_config'] = GenerationConfig(
**gen_config
)
kwargs['training_args'] = Seq2SeqTrainingArguments(**training_args)
data_config = kwargs.get('data_config')
if not isinstance(data_config, DataConfig):
kwargs['data_config'] = DataConfig(**data_config)
peft_config = kwargs.get('peft_config', None)
if peft_config is not None and not isinstance(peft_config, PeftConfig):
kwargs['peft_config'] = get_peft_config(peft_config)
return cls(**kwargs)
@classmethod
def from_file(cls, path: Union[str, Path]) -> 'FinetuningConfig':
path = _resolve_path(path)
kwargs = _get_yaml_parser().load(path)
return cls.from_dict(**kwargs)
def _load_datasets(
data_dir: Path,
data_format: str,
data_files: dict[NamedSplit, str],
num_proc: Optional[int],
) -> DatasetDict:
if data_format in ('.csv', '.json', '.jsonl'):
dataset_dct = load_dataset(
data_format[1:],
data_dir=data_dir,
data_files=data_files,
num_proc=num_proc,
)
else:
err_msg = f"Cannot load dataset in the '{data_format}' format."
raise NotImplementedError(err_msg)
return dataset_dct
class DataManager(object):
def __init__(self, data_dir: str, data_config: DataConfig):
self._num_proc = data_config.num_proc
self._dataset_dct = _load_datasets(
_resolve_path(data_dir),
data_config.data_format,
data_config.data_files,
self._num_proc,
)
def _get_dataset(self, split: NamedSplit) -> Optional[Dataset]:
return self._dataset_dct.get(split, None)
def get_dataset(
self,
split: NamedSplit,
process_fn: Callable[[dict[str, Any]], dict[str, Any]],
batched: bool = True,
remove_orig_columns: bool = True,
) -> Optional[Dataset]:
orig_dataset = self._get_dataset(split)
if orig_dataset is None:
return
if remove_orig_columns:
remove_columns = orig_dataset.column_names
else:
remove_columns = None
return orig_dataset.map(
process_fn,
batched=batched,
remove_columns=remove_columns,
num_proc=self._num_proc,
)
def print_model_size(model: PreTrainedModel):
print("--> Model")
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n--> model has {total_params / 1e6}M params\n")
def process_batch(
batch: Mapping[str, Sequence],
tokenizer: PreTrainedTokenizer,
max_input_length: int,
max_output_length: int,
) -> dict[str, list]:
batched_tools = batch.get('tools', None)
batched_conv = batch['conversations']
batched_input_ids = []
batched_labels = []
if batched_tools is None:
batched_tools = [None] * len(batched_conv)
for tools, conv in zip(batched_tools, batched_conv):
input_ids, loss_masks = [
tokenizer.get_command('[gMASK]'),
tokenizer.get_command('sop'),
], [False, False]
if tools is not None:
raise NotImplementedError()
for message in conv:
if message['role'] in ('system', 'user'):
loss_mask_val = False
else:
loss_mask_val = True
if message['role'] == 'tool':
raise NotImplementedError()
else:
new_input_ids = tokenizer.build_single_message(
message['role'], '', message['content']
)
new_loss_masks = [loss_mask_val] * len(new_input_ids)
input_ids += new_input_ids
loss_masks += new_loss_masks
input_ids.append(tokenizer.eos_token_id)
loss_masks = [False, *loss_masks]
labels = []
for input_id, mask in zip(input_ids, loss_masks):
if mask:
labels.append(input_id)
else:
labels.append(-100)
max_length = max_input_length + max_output_length + 1
batched_input_ids.append(input_ids[:max_length])
batched_labels.append(labels[:max_length])
return {'input_ids': batched_input_ids, 'labels': batched_labels}
def process_batch_eval(
batch: Mapping[str, Sequence],
tokenizer: PreTrainedTokenizer,
max_input_length: int,
max_output_length: int,
) -> dict[str, list]:
batched_tools = batch.get('tools', None)
batched_conv = batch['conversations']
batched_input_ids = []
# To avoid computing loss, we do not provide the `labels` field in the input dictionary.
batched_output_ids = []
if batched_tools is None:
batched_tools = [None] * len(batched_conv)
for tools, conv in zip(batched_tools, batched_conv):
input_ids = [
tokenizer.get_command('[gMASK]'),
tokenizer.get_command('sop'),
]
if tools is not None:
raise NotImplementedError()
for message in conv:
if len(input_ids) >= max_input_length:
break
if message['role'] == 'tool':
raise NotImplementedError()
else:
new_input_ids = tokenizer.build_single_message(
message['role'], '', message['content']
)
if message['role'] == 'assistant':
output_prompt, output_ids = (
new_input_ids[:1],
new_input_ids[1:],
)
output_ids.append(tokenizer.eos_token_id)
batched_input_ids.append(
input_ids[:max_input_length] + output_prompt[:1]
)
batched_output_ids.append(output_ids[:max_output_length])
input_ids += new_input_ids
return {'input_ids': batched_input_ids, 'output_ids': batched_output_ids}
# Not sure if this is necessary; the model could arguably stay in half precision.
# When training on CPU, cast all params to fp32 instead of only the trainable ones.
def _prepare_model_for_training(model: nn.Module, use_cpu: bool):
for param in model.parameters():
if param.requires_grad or use_cpu:
param.data = param.data.to(torch.float32)
def load_tokenizer_and_model(
model_dir: str,
peft_config: Optional[PeftConfig] = None,
) -> tuple[PreTrainedTokenizer, nn.Module]:
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
if peft_config is not None:
if peft_config.peft_type.name == "PREFIX_TUNING":
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
config.pre_seq_len = peft_config.num_virtual_tokens
config.use_cache = False
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
config=config,
)
if peft_config.peft_type.name == "LORA":
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
empty_init=False,
use_cache=False
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
else:
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
empty_init=False,
use_cache=False
)
print_model_size(model)
return tokenizer, model
def compute_metrics(eval_preds: EvalPrediction, tokenizer: PreTrainedTokenizer):
batched_pred_ids, batched_label_ids = eval_preds
metrics_dct = {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 'bleu-4': []}
for pred_ids, label_ids in zip(batched_pred_ids, batched_label_ids):
pred_txt = tokenizer.decode(pred_ids).strip()
label_txt = tokenizer.decode(label_ids).strip()
pred_tokens = list(jieba.cut(pred_txt))
label_tokens = list(jieba.cut(label_txt))
rouge = Rouge()
scores = rouge.get_scores(' '.join(pred_tokens), ' '.join(label_tokens))
for k, v in scores[0].items():
metrics_dct[k].append(round(v['f'] * 100, 4))
metrics_dct['bleu-4'].append(
sentence_bleu(
[label_tokens],
pred_tokens,
smoothing_function=SmoothingFunction().method3,
)
)
return {k: np.mean(v) for k, v in metrics_dct.items()}
@app.command()
def main(
data_dir: Annotated[str, typer.Argument(help='')],
model_dir: Annotated[
str,
typer.Argument(
help='A string that specifies the model id of a pretrained model configuration hosted on huggingface.co, or a path to a directory containing a model configuration file.'
),
],
config_file: Annotated[str, typer.Argument(help='')],
auto_resume_from_checkpoint: str = typer.Argument(
default='',
            help='If "yes", automatically resume from the latest saved checkpoint. If a number, e.g. 12 or 15, resume from the corresponding saved checkpoint. If "no", restart training from scratch.'
),
):
ft_config = FinetuningConfig.from_file(config_file)
tokenizer, model = load_tokenizer_and_model(model_dir, peft_config=ft_config.peft_config)
data_manager = DataManager(data_dir, ft_config.data_config)
train_dataset = data_manager.get_dataset(
Split.TRAIN,
functools.partial(
process_batch,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
print('train_dataset:', train_dataset)
val_dataset = data_manager.get_dataset(
Split.VALIDATION,
functools.partial(
process_batch_eval,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
if val_dataset is not None:
print('val_dataset:', val_dataset)
test_dataset = data_manager.get_dataset(
Split.TEST,
functools.partial(
process_batch_eval,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
if test_dataset is not None:
print('test_dataset:', test_dataset)
# checks encoded dataset
_sanity_check(
train_dataset[0]["input_ids"], train_dataset[0]["labels"], tokenizer
)
# turn model to fp32
_prepare_model_for_training(model, ft_config.training_args.use_cpu)
ft_config.training_args.generation_config.pad_token_id = (
tokenizer.pad_token_id
)
ft_config.training_args.generation_config.eos_token_id = [
tokenizer.eos_token_id,
tokenizer.get_command('<|user|>'),
tokenizer.get_command('<|observation|>'),
]
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
trainer = Seq2SeqTrainer(
model=model,
args=ft_config.training_args,
data_collator=DataCollatorForSeq2Seq(
tokenizer=tokenizer,
padding='longest',
return_tensors='pt',
),
train_dataset=train_dataset,
eval_dataset=val_dataset.select(list(range(50))),
tokenizer=tokenizer if ft_config.peft_config.peft_type != "LORA" else None, # LORA does not need tokenizer
compute_metrics=functools.partial(compute_metrics, tokenizer=tokenizer),
)
    if auto_resume_from_checkpoint is None or auto_resume_from_checkpoint.upper() == "":
trainer.train()
else:
output_dir = ft_config.training_args.output_dir
dirlist = os.listdir(output_dir)
checkpoint_sn = 0
for checkpoint_str in dirlist:
if checkpoint_str.find("eckpoint") > 0 and checkpoint_str.find("tmp") == -1:
checkpoint = int(checkpoint_str.replace("checkpoint-", ""))
if checkpoint > checkpoint_sn:
checkpoint_sn = checkpoint
if auto_resume_from_checkpoint.upper() == "YES":
if checkpoint_sn > 0:
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
checkpoint_directory = os.path.join(output_dir, "checkpoint-" + str(checkpoint_sn))
print("resume checkpoint from checkpoint-" + str(checkpoint_sn))
trainer.train(resume_from_checkpoint=checkpoint_directory)
else:
trainer.train()
else:
if auto_resume_from_checkpoint.isdigit():
if int(auto_resume_from_checkpoint) > 0:
checkpoint_sn = int(auto_resume_from_checkpoint)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
checkpoint_directory = os.path.join(output_dir, "checkpoint-" + str(checkpoint_sn))
print("resume checkpoint from checkpoint-" + str(checkpoint_sn))
trainer.train(resume_from_checkpoint=checkpoint_directory)
else:
print(auto_resume_from_checkpoint,
"The specified checkpoint sn(" + auto_resume_from_checkpoint + ") has not been saved. Please search for the correct chkeckpoint in the model output directory")
# test stage
if test_dataset is not None:
trainer.predict(test_dataset)
if __name__ == '__main__':
app()
# -*- coding: utf-8 -*-
import os
import jieba
import dataclasses as dc
import functools
from collections.abc import Callable, Mapping, Sequence
from pathlib import Path
from typing import Annotated, Any, Optional, Union
import numpy as np
import ruamel.yaml as yaml
import torch
import typer
from datasets import Dataset, DatasetDict, NamedSplit, Split, load_dataset
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from peft import (
PeftConfig,
PeftModelForCausalLM,
get_peft_config,
get_peft_model
)
from rouge_chinese import Rouge
from torch import nn
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
EvalPrediction,
GenerationConfig,
PreTrainedModel,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
Seq2SeqTrainingArguments, AutoConfig,
)
from transformers import DataCollatorForSeq2Seq as _DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer as _Seq2SeqTrainer
ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
app = typer.Typer(pretty_exceptions_show_locals=False)
class DataCollatorForSeq2Seq(_DataCollatorForSeq2Seq):
def __call__(self, features, return_tensors=None):
output_ids = (
[feature['output_ids'] for feature in features]
if 'output_ids' in features[0].keys()
else None
)
if output_ids is not None:
max_output_length = max(len(out) for out in output_ids)
if self.pad_to_multiple_of is not None:
max_output_length = (
(
max_output_length + self.pad_to_multiple_of - 1) //
self.pad_to_multiple_of * self.pad_to_multiple_of
)
for feature in features:
remainder = [self.tokenizer.pad_token_id] * (
max_output_length - len(feature['output_ids'])
)
if isinstance(feature['output_ids'], list):
feature['output_ids'] = feature['output_ids'] + remainder
else:
feature['output_ids'] = np.concatenate(
[feature['output_ids'], remainder]
).astype(np.int64)
return super().__call__(features, return_tensors)
class Seq2SeqTrainer(_Seq2SeqTrainer):
def prediction_step(
self,
model: nn.Module,
inputs: dict[str, Any],
prediction_loss_only: bool,
ignore_keys=None,
**gen_kwargs,
) -> tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
if self.args.predict_with_generate:
output_ids = inputs.pop('output_ids')
input_ids = inputs['input_ids']
loss, generated_tokens, labels = super().prediction_step(
model, inputs, prediction_loss_only, ignore_keys, **gen_kwargs
)
generated_tokens = generated_tokens[:, input_ids.size()[1]:]
if self.args.predict_with_generate:
labels = output_ids
return loss, generated_tokens, labels
# For P-Tuning a new save_model function is fine for the prefix_encoder model
# but may cost problems for the whole model loading
# def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
# if output_dir is None:
# output_dir = self.args.output_dir
# os.makedirs(output_dir, exist_ok=True)
# ptuning_params = {k: v for k, v in self.model.transformer.prefix_encoder.state_dict().items()}
#
# torch.save(ptuning_params, os.path.join(output_dir, 'pytorch_model.bin'))
#
# print(f"P-Tuning model weights saved in {output_dir}")
#
# if self.tokenizer is not None:
# self.tokenizer.save_pretrained(output_dir)
def _resolve_path(path: Union[str, Path]) -> Path:
return Path(path).expanduser().resolve()
def _sanity_check(
input_ids: Sequence[int],
output_ids: Sequence[int],
tokenizer: PreTrainedTokenizer,
):
print('--> Sanity check')
for in_id, out_id in zip(input_ids, output_ids):
if in_id == 0:
continue
if in_id in tokenizer.tokenizer.index_special_tokens:
in_text = tokenizer.tokenizer.index_special_tokens[in_id]
else:
in_text = tokenizer.decode([in_id])
print(f'{repr(in_text):>20}: {in_id} -> {out_id}')
@functools.cache
def _get_yaml_parser() -> yaml.YAML:
parser = yaml.YAML(typ='safe', pure=True)
parser.indent(mapping=2, offset=2, sequence=4)
parser.default_flow_style = False
return parser
@dc.dataclass
class DataConfig(object):
train_file: str
val_file: Optional[str] = None
test_file: Optional[str] = None
num_proc: Optional[int] = None
@property
def data_format(self) -> str:
return Path(self.train_file).suffix
@property
def data_files(self) -> dict[NamedSplit, str]:
return {
split: data_file
for split, data_file in zip(
[Split.TRAIN, Split.VALIDATION, Split.TEST],
[self.train_file, self.val_file, self.test_file],
)
if data_file is not None
}
@dc.dataclass
class FinetuningConfig(object):
data_config: DataConfig
max_input_length: int
max_output_length: int
training_args: Seq2SeqTrainingArguments = dc.field(
default_factory=lambda: Seq2SeqTrainingArguments(output_dir='./output')
)
peft_config: Optional[PeftConfig] = None
def __post_init__(self):
if not self.training_args.do_eval or self.data_config.val_file is None:
# skips the evaluation stage when `do_eval` or `eval_file` is not provided
self.training_args.do_eval = False
self.training_args.evaluation_strategy = 'no'
self.data_config.val_file = None
else:
self.training_args.per_device_eval_batch_size = (
self.training_args.per_device_eval_batch_size
or self.training_args.per_device_train_batch_size
)
@classmethod
def from_dict(cls, **kwargs) -> 'FinetuningConfig':
training_args = kwargs.get('training_args', None)
if training_args is not None and not isinstance(
training_args, Seq2SeqTrainingArguments
):
gen_config = training_args.get('generation_config')
# TODO: a bit hacky
if not isinstance(gen_config, GenerationConfig):
training_args['generation_config'] = GenerationConfig(
**gen_config
)
kwargs['training_args'] = Seq2SeqTrainingArguments(**training_args)
data_config = kwargs.get('data_config')
if not isinstance(data_config, DataConfig):
kwargs['data_config'] = DataConfig(**data_config)
peft_config = kwargs.get('peft_config', None)
if peft_config is not None and not isinstance(peft_config, PeftConfig):
kwargs['peft_config'] = get_peft_config(peft_config)
return cls(**kwargs)
@classmethod
def from_file(cls, path: Union[str, Path]) -> 'FinetuningConfig':
path = _resolve_path(path)
kwargs = _get_yaml_parser().load(path)
return cls.from_dict(**kwargs)
def _load_datasets(
data_dir: Path,
data_format: str,
data_files: dict[NamedSplit, str],
num_proc: Optional[int],
) -> DatasetDict:
if data_format in ('.csv', '.json', '.jsonl'):
dataset_dct = load_dataset(
data_format[1:],
data_dir=data_dir,
data_files=data_files,
num_proc=num_proc,
)
else:
err_msg = f"Cannot load dataset in the '{data_format}' format."
raise NotImplementedError(err_msg)
return dataset_dct
class DataManager(object):
def __init__(self, data_dir: str, data_config: DataConfig):
self._num_proc = data_config.num_proc
self._dataset_dct = _load_datasets(
_resolve_path(data_dir),
data_config.data_format,
data_config.data_files,
self._num_proc,
)
def _get_dataset(self, split: NamedSplit) -> Optional[Dataset]:
return self._dataset_dct.get(split, None)
def get_dataset(
self,
split: NamedSplit,
process_fn: Callable[[dict[str, Any]], dict[str, Any]],
batched: bool = True,
remove_orig_columns: bool = True,
) -> Optional[Dataset]:
orig_dataset = self._get_dataset(split)
if orig_dataset is None:
return
if remove_orig_columns:
remove_columns = orig_dataset.column_names
else:
remove_columns = None
return orig_dataset.map(
process_fn,
batched=batched,
remove_columns=remove_columns,
num_proc=self._num_proc,
)
def print_model_size(model: PreTrainedModel):
print("--> Model")
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\n--> model has {total_params / 1e6}M params\n")
def process_batch(
batch: Mapping[str, Sequence],
tokenizer: PreTrainedTokenizer,
max_input_length: int,
max_output_length: int,
) -> dict[str, list]:
batched_tools = batch.get('tools', None)
batched_conv = batch['conversations']
batched_input_ids = []
batched_labels = []
if batched_tools is None:
batched_tools = [None] * len(batched_conv)
for tools, conv in zip(batched_tools, batched_conv):
input_ids, loss_masks = [
tokenizer.get_command('[gMASK]'),
tokenizer.get_command('sop'),
], [False, False]
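        # Every sample starts with the [gMASK] and sop special tokens, which never take loss.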
if tools is not None:
raise NotImplementedError()
for message in conv:
if message['role'] in ('system', 'user'):
loss_mask_val = False
else:
loss_mask_val = True
if message['role'] == 'tool':
raise NotImplementedError()
else:
new_input_ids = tokenizer.build_single_message(
message['role'], '', message['content']
)
new_loss_masks = [loss_mask_val] * len(new_input_ids)
input_ids += new_input_ids
loss_masks += new_loss_masks
input_ids.append(tokenizer.eos_token_id)
loss_masks = [False, *loss_masks]
labels = []
for input_id, mask in zip(input_ids, loss_masks):
if mask:
labels.append(input_id)
else:
labels.append(-100)
max_length = max_input_length + max_output_length + 1
batched_input_ids.append(input_ids[:max_length])
batched_labels.append(labels[:max_length])
return {'input_ids': batched_input_ids, 'labels': batched_labels}
def process_batch_eval(
batch: Mapping[str, Sequence],
tokenizer: PreTrainedTokenizer,
max_input_length: int,
max_output_length: int,
) -> dict[str, list]:
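    # Build evaluation features: for each assistant turn, pair the conversation prefix
    # (`input_ids`) with the reference reply (`output_ids`) for generation-based metrics.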
batched_tools = batch.get('tools', None)
batched_conv = batch['conversations']
batched_input_ids = []
# To avoid computing loss, we do not provide the `labels` field in the input dictionary.
batched_output_ids = []
if batched_tools is None:
batched_tools = [None] * len(batched_conv)
for tools, conv in zip(batched_tools, batched_conv):
input_ids = [
tokenizer.get_command('[gMASK]'),
tokenizer.get_command('sop'),
]
if tools is not None:
raise NotImplementedError()
for message in conv:
if len(input_ids) >= max_input_length:
break
if message['role'] == 'tool':
raise NotImplementedError()
else:
new_input_ids = tokenizer.build_single_message(
message['role'], '', message['content']
)
if message['role'] == 'assistant':
output_prompt, output_ids = (
new_input_ids[:1],
new_input_ids[1:],
)
output_ids.append(tokenizer.eos_token_id)
batched_input_ids.append(
input_ids[:max_input_length] + output_prompt[:1]
)
batched_output_ids.append(output_ids[:max_output_length])
input_ids += new_input_ids
return {'input_ids': batched_input_ids, 'output_ids': batched_output_ids}
# Cast trainable parameters to fp32 for stable training (it is unclear whether this is
# strictly necessary; half precision may also work). When training on CPU, cast all
# parameters to fp32, not just the trainable ones.
def _prepare_model_for_training(model: nn.Module, use_cpu: bool):
for param in model.parameters():
if param.requires_grad or use_cpu:
param.data = param.data.to(torch.float32)
def load_tokenizer_and_model(
model_dir: str,
peft_config: Optional[PeftConfig] = None,
) -> tuple[PreTrainedTokenizer, nn.Module]:
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
if peft_config is not None:
if peft_config.peft_type.name == "PREFIX_TUNING":
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
config.pre_seq_len = peft_config.num_virtual_tokens
config.use_cache = False
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
config=config,
)
if peft_config.peft_type.name == "LORA":
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
empty_init=False,
use_cache=False
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
else:
model = AutoModelForCausalLM.from_pretrained(
model_dir,
trust_remote_code=True,
empty_init=False,
use_cache=False
)
print_model_size(model)
return tokenizer, model
def compute_metrics(eval_preds: EvalPrediction, tokenizer: PreTrainedTokenizer):
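    # Decode predictions and references, segment them with jieba, then compute
    # ROUGE-1/2/L F1 scores and sentence-level BLEU-4.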
batched_pred_ids, batched_label_ids = eval_preds
metrics_dct = {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 'bleu-4': []}
for pred_ids, label_ids in zip(batched_pred_ids, batched_label_ids):
pred_txt = tokenizer.decode(pred_ids).strip()
label_txt = tokenizer.decode(label_ids).strip()
pred_tokens = list(jieba.cut(pred_txt))
label_tokens = list(jieba.cut(label_txt))
rouge = Rouge()
scores = rouge.get_scores(' '.join(pred_tokens), ' '.join(label_tokens))
for k, v in scores[0].items():
metrics_dct[k].append(round(v['f'] * 100, 4))
metrics_dct['bleu-4'].append(
sentence_bleu(
[label_tokens],
pred_tokens,
smoothing_function=SmoothingFunction().method3,
)
)
return {k: np.mean(v) for k, v in metrics_dct.items()}
@app.command()
def main(
    data_dir: Annotated[str, typer.Argument(help='Directory containing the train/validation/test data files.')],
model_dir: Annotated[
str,
typer.Argument(
help='A string that specifies the model id of a pretrained model configuration hosted on huggingface.co, or a path to a directory containing a model configuration file.'
),
],
    config_file: Annotated[str, typer.Argument(help='Path to the fine-tuning YAML configuration file.')],
auto_resume_from_checkpoint: str = typer.Argument(
default='',
        help='"yes": automatically resume from the latest saved checkpoint; a number such as 12: resume from that specific checkpoint; empty: start training from scratch.'
),
):
ft_config = FinetuningConfig.from_file(config_file)
tokenizer, model = load_tokenizer_and_model(model_dir, peft_config=ft_config.peft_config)
data_manager = DataManager(data_dir, ft_config.data_config)
train_dataset = data_manager.get_dataset(
Split.TRAIN,
functools.partial(
process_batch,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
print('train_dataset:', train_dataset)
val_dataset = data_manager.get_dataset(
Split.VALIDATION,
functools.partial(
process_batch_eval,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
if val_dataset is not None:
print('val_dataset:', val_dataset)
test_dataset = data_manager.get_dataset(
Split.TEST,
functools.partial(
process_batch_eval,
tokenizer=tokenizer,
max_input_length=ft_config.max_input_length,
max_output_length=ft_config.max_output_length,
),
batched=True,
)
if test_dataset is not None:
print('test_dataset:', test_dataset)
# checks encoded dataset
_sanity_check(
train_dataset[0]["input_ids"], train_dataset[0]["labels"], tokenizer
)
    # cast trainable parameters (all parameters when training on CPU) to fp32
_prepare_model_for_training(model, ft_config.training_args.use_cpu)
ft_config.training_args.generation_config.pad_token_id = (
tokenizer.pad_token_id
)
ft_config.training_args.generation_config.eos_token_id = [
tokenizer.eos_token_id,
tokenizer.get_command('<|user|>'),
tokenizer.get_command('<|observation|>'),
]
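    # Gradient checkpointing trades compute for memory; enable_input_require_grads() is
    # needed for it to work when the input embeddings are frozen (e.g. under PEFT).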
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
trainer = Seq2SeqTrainer(
model=model,
args=ft_config.training_args,
data_collator=DataCollatorForSeq2Seq(
tokenizer=tokenizer,
padding='longest',
return_tensors='pt',
),
train_dataset=train_dataset,
        eval_dataset=(
            val_dataset.select(list(range(min(50, len(val_dataset)))))
            if val_dataset is not None
            else None
        ),
tokenizer=tokenizer, # LORA does not need tokenizer
compute_metrics=functools.partial(compute_metrics, tokenizer=tokenizer),
)
    if auto_resume_from_checkpoint is None or auto_resume_from_checkpoint.upper() == "":
trainer.train()
else:
output_dir = ft_config.training_args.output_dir
dirlist = os.listdir(output_dir)
checkpoint_sn = 0
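        # Scan the output directory for "checkpoint-<step>" folders and remember the latest step.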
for checkpoint_str in dirlist:
if checkpoint_str.find("eckpoint") > 0 and checkpoint_str.find("tmp") == -1:
checkpoint = int(checkpoint_str.replace("checkpoint-", ""))
if checkpoint > checkpoint_sn:
checkpoint_sn = checkpoint
if auto_resume_from_checkpoint.upper() == "YES":
if checkpoint_sn > 0:
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
checkpoint_directory = os.path.join(output_dir, "checkpoint-" + str(checkpoint_sn))
print("resume checkpoint from checkpoint-" + str(checkpoint_sn))
trainer.train(resume_from_checkpoint=checkpoint_directory)
else:
trainer.train()
else:
if auto_resume_from_checkpoint.isdigit():
if int(auto_resume_from_checkpoint) > 0:
checkpoint_sn = int(auto_resume_from_checkpoint)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
checkpoint_directory = os.path.join(output_dir, "checkpoint-" + str(checkpoint_sn))
print("resume checkpoint from checkpoint-" + str(checkpoint_sn))
trainer.train(resume_from_checkpoint=checkpoint_directory)
else:
                print(auto_resume_from_checkpoint,
                      "The specified checkpoint sn (" + auto_resume_from_checkpoint + ") has not been saved. Please look for the correct checkpoint in the model output directory.")
# test stage
if test_dataset is not None:
trainer.predict(test_dataset)
if __name__ == '__main__':
app()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pathlib import Path
from typing import Annotated, Union
import typer
from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
PreTrainedModel,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
)
ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
app = typer.Typer(pretty_exceptions_show_locals=False)
def _resolve_path(path: Union[str, Path]) -> Path:
return Path(path).expanduser().resolve()
def load_model_and_tokenizer(model_dir: Union[str, Path]) -> tuple[ModelType, TokenizerType]:
model_dir = _resolve_path(model_dir)
if (model_dir / 'adapter_config.json').exists():
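        # A PEFT adapter directory: load the adapter and read the base model path from its config.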
model = AutoPeftModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=True, device_map='auto'
)
tokenizer_dir = model.peft_config['default'].base_model_name_or_path
else:
model = AutoModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=True, device_map='auto'
)
tokenizer_dir = model_dir
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, trust_remote_code=True
)
return model, tokenizer
@app.command()
def main(
    model_dir: Annotated[str, typer.Argument(help='Path to the fine-tuned model directory (a full model or a PEFT adapter output).')],
    prompt: Annotated[str, typer.Option(help='Prompt to send to the model.')],
):
model, tokenizer = load_model_and_tokenizer(model_dir)
response, _ = model.chat(tokenizer, prompt)
print(response)
if __name__ == '__main__':
app()
export HIP_VISIBLE_DEVICES=7
python finetune_hf.py data/AdvertiseGen_fix /path/to/chatglm3-6b configs/lora.yaml
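# Usage: finetune_hf.py <data_dir> <model_dir> <config_file>; HIP_VISIBLE_DEVICES selects the ROCm (AMD) GPU, analogous to CUDA_VISIBLE_DEVICES.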