import hashlib
from enum import Enum, unique
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
from ..extras.logging import get_logger
if TYPE_CHECKING:
from datasets import Dataset, IterableDataset
from transformers import TrainingArguments
from llmtuner.hparams import DataArguments
logger = get_logger(__name__)
@unique
class Role(str, Enum):
USER = "user"
ASSISTANT = "assistant"
SYSTEM = "system"
FUNCTION = "function"
OBSERVATION = "observation"
def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None:
if file_sha1 is None:
logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json.")
return
if len(data_files) != 1:
logger.warning("Checksum failed: too many files.")
return
with open(data_files[0], "rb") as f:
sha1 = hashlib.sha1(f.read()).hexdigest()
if sha1 != file_sha1:
logger.warning("Checksum failed: mismatched SHA-1 hash value at {}.".format(data_files[0]))
def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]:
max_target_len = int(max_len * (target_len / (source_len + target_len)))
max_target_len = max(max_target_len, reserved_label_len)
max_source_len = max_len - max_target_len
return max_source_len, max_target_len
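# Hedged usage sketch: infer_max_len splits a token budget between prompt and
# response in proportion to their raw lengths, reserving at least
# `reserved_label_len` tokens for the response. With max_len=512, a 300-token
# prompt and a 100-token response:
#   max_target_len = max(int(512 * 100 / 400), 1) = 128
#   max_source_len = 512 - 128 = 384
#   infer_max_len(300, 100, 512, 1)  # -> (384, 128)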
def split_dataset(
dataset: Union["Dataset", "IterableDataset"], data_args: "DataArguments", training_args: "TrainingArguments"
) -> Dict[str, "Dataset"]:
if training_args.do_train:
if data_args.val_size > 1e-6: # Split the dataset
if data_args.streaming:
val_set = dataset.take(int(data_args.val_size))
train_set = dataset.skip(int(data_args.val_size))
train_set = train_set.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
return {"train_dataset": train_set, "eval_dataset": val_set}
else:
val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size
dataset = dataset.train_test_split(test_size=val_size, seed=training_args.seed)
return {"train_dataset": dataset["train"], "eval_dataset": dataset["test"]}
else:
if data_args.streaming:
dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
return {"train_dataset": dataset}
else: # do_eval or do_predict
return {"eval_dataset": dataset}
from .evaluator import Evaluator
__all__ = ["Evaluator"]
# Inspired by: https://github.com/hendrycks/test/blob/master/evaluate_flan.py
import inspect
import json
import os
from typing import Any, Dict, List, Optional
import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm, trange
from transformers.utils import cached_file
from ..data import get_template_and_fix_tokenizer
from ..extras.constants import CHOICES, SUBJECTS
from ..hparams import get_eval_args
from ..model import dispatch_model, load_model_and_tokenizer
from .template import get_eval_template
class Evaluator:
def __init__(self, args: Optional[Dict[str, Any]] = None) -> None:
self.model_args, self.data_args, self.eval_args, finetuning_args = get_eval_args(args)
self.model, self.tokenizer = load_model_and_tokenizer(self.model_args, finetuning_args)
self.tokenizer.padding_side = "right" # avoid overflow issue in batched inference for llama2
self.model = dispatch_model(self.model)
self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template)
self.eval_template = get_eval_template(self.eval_args.lang)
self.choice_inputs = [
self.tokenizer.encode(self.eval_template.prefix + ch, add_special_tokens=False)[-1] for ch in CHOICES
]
@torch.inference_mode()
def batch_inference(self, batch_input: Dict[str, torch.Tensor]) -> List[str]:
logits = self.model(**batch_input).logits
lengths = torch.sum(batch_input["attention_mask"], dim=-1)
word_probs = torch.stack([logits[i, lengths[i] - 1] for i in range(len(lengths))], dim=0)
choice_probs = torch.nn.functional.softmax(word_probs[:, self.choice_inputs], dim=-1).detach()
return [chr(ord("A") + offset.item()) for offset in torch.argmax(choice_probs, dim=-1)]
def eval(self) -> None:
mapping = cached_file(
path_or_repo_id=os.path.join(self.eval_args.task_dir, self.eval_args.task),
filename="mapping.json",
cache_dir=self.model_args.cache_dir,
token=self.model_args.hf_hub_token,
)
with open(mapping, "r", encoding="utf-8") as f:
categories: Dict[str, Dict[str, str]] = json.load(f)
category_corrects = {subj: np.array([], dtype="bool") for subj in SUBJECTS}
pbar = tqdm(categories.keys(), desc="Processing subjects", position=0)
results = {}
for subject in pbar:
if "trust_remote_code" in inspect.signature(load_dataset).parameters: # for datasets==2.16.0
kwargs = {"trust_remote_code": True}
else:
kwargs = {}
dataset = load_dataset(
path=os.path.join(self.eval_args.task_dir, self.eval_args.task),
name=subject,
cache_dir=self.model_args.cache_dir,
download_mode=self.eval_args.download_mode,
token=self.model_args.hf_hub_token,
**kwargs,
)
pbar.set_postfix_str(categories[subject]["name"])
inputs, outputs, labels = [], [], []
for i in trange(len(dataset[self.data_args.split]), desc="Formatting batches", position=1, leave=False):
support_set = (
dataset["train"].shuffle().select(range(min(self.eval_args.n_shot, len(dataset["train"]))))
)
messages = self.eval_template.format_example(
target_data=dataset[self.data_args.split][i],
support_set=support_set,
subject_name=categories[subject]["name"],
)
input_ids, _ = self.template.encode_oneturn(tokenizer=self.tokenizer, messages=messages)
inputs.append({"input_ids": input_ids, "attention_mask": [1] * len(input_ids)})
labels.append(messages[-1]["content"])
for i in trange(
0, len(inputs), self.eval_args.batch_size, desc="Predicting batches", position=1, leave=False
):
batch_input = self.tokenizer.pad(
inputs[i : i + self.eval_args.batch_size], return_attention_mask=True, return_tensors="pt"
).to(self.model.device)
preds = self.batch_inference(batch_input)
outputs += preds
corrects = np.array(outputs) == np.array(labels)
category_name = categories[subject]["category"]
category_corrects[category_name] = np.concatenate([category_corrects[category_name], corrects], axis=0)
category_corrects["Average"] = np.concatenate([category_corrects["Average"], corrects], axis=0)
results[subject] = {str(i): outputs[i] for i in range(len(outputs))}
pbar.close()
self._save_results(category_corrects, results)
def _save_results(self, category_corrects: Dict[str, np.ndarray], results: Dict[str, Dict[str, str]]) -> None:
score_info = "\n".join(
[
"{:>15}: {:.2f}".format(category_name, 100 * np.mean(category_correct))
for category_name, category_correct in category_corrects.items()
if len(category_correct)
]
)
print(score_info)
if self.eval_args.save_dir is not None:
os.makedirs(self.eval_args.save_dir, exist_ok=False)
with open(os.path.join(self.eval_args.save_dir, "results.json"), "w", encoding="utf-8", newline="\n") as f:
json.dump(results, f, indent=2)
with open(os.path.join(self.eval_args.save_dir, "results.log"), "w", encoding="utf-8", newline="\n") as f:
f.write(score_info)
if __name__ == "__main__":
evaluator = Evaluator()
evaluator.eval()
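# Hedged CLI sketch: the __main__ block above lets this file run as a script; the
# flag names mirror the hparams defined in this repo, while the model path, template,
# and task name below are placeholders:
#   python evaluator.py --model_name_or_path meta-llama/Llama-2-7b-hf \
#       --template vanilla --task mmlu --lang en --n_shot 5 --batch_size 4 \
#       --save_dir outputs/mmlu_eval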
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, List, Tuple
from ..data import Role
from ..extras.constants import CHOICES
if TYPE_CHECKING:
from datasets import Dataset
@dataclass
class EvalTemplate:
system: str
choice: str
answer: str
prefix: str
def parse_example(self, example: Dict[str, str]) -> Tuple[str, str]:
candidates = [self.choice.format(choice=ch, content=example[ch]) for ch in CHOICES if ch in example]
return "".join([example["question"]] + candidates + [self.answer]), example["answer"]
def format_example(
self, target_data: Dict[str, str], support_set: "Dataset", subject_name: str
) -> List[Dict[str, str]]:
messages = []
for k in range(len(support_set)):
prompt, response = self.parse_example(support_set[k])
messages.append({"role": Role.USER, "content": prompt})
messages.append({"role": Role.ASSISTANT, "content": response})
prompt, response = self.parse_example(target_data)
messages.append({"role": Role.USER, "content": prompt})
messages.append({"role": Role.ASSISTANT, "content": response})
messages[0]["content"] = self.system.format(subject=subject_name) + messages[0]["content"]
return messages
eval_templates: Dict[str, "EvalTemplate"] = {}
def register_eval_template(name: str, system: str, choice: str, answer: str, prefix: str) -> None:
eval_templates[name] = EvalTemplate(system=system, choice=choice, answer=answer, prefix=prefix)
def get_eval_template(name: str) -> "EvalTemplate":
eval_template = eval_templates.get(name, None)
assert eval_template is not None, "Template {} does not exist.".format(name)
return eval_template
register_eval_template(
name="en",
system="The following are multiple choice questions (with answers) about {subject}.\n\n",
choice="\n{choice}. {content}",
answer="\nAnswer: ",
prefix=" ",
)
register_eval_template(
name="zh",
system="以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n",
choice="\n{choice}. {content}",
answer="\n答案:",
prefix="\n",
)
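# Hedged usage sketch: the example dict mirrors the fields parse_example reads
# ("question", one entry per letter in CHOICES, and "answer"):
#   tmpl = get_eval_template("en")
#   prompt, answer = tmpl.parse_example(
#       {"question": "1 + 1 = ?", "A": "1", "B": "2", "C": "3", "D": "4", "answer": "B"}
#   )
#   # prompt == "1 + 1 = ?\nA. 1\nB. 2\nC. 3\nD. 4\nAnswer: ", answer == "B"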
import json
import os
import time
from datetime import timedelta
from typing import TYPE_CHECKING
from transformers import TrainerCallback
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length
from .constants import LOG_FILE_NAME
from .logging import get_logger
from .misc import fix_valuehead_checkpoint
if TYPE_CHECKING:
from transformers import TrainerControl, TrainerState, TrainingArguments
logger = get_logger(__name__)
class FixValueHeadModelCallback(TrainerCallback):
def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
r"""
Event called after a checkpoint save.
"""
if args.should_save:
fix_valuehead_checkpoint(
model=kwargs.pop("model"),
output_dir=os.path.join(args.output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, state.global_step)),
safe_serialization=args.save_safetensors,
)
class LogCallback(TrainerCallback):
def __init__(self, runner=None):
self.runner = runner
self.in_training = False
self.start_time = time.time()
self.cur_steps = 0
self.max_steps = 0
self.elapsed_time = ""
self.remaining_time = ""
def timing(self):
cur_time = time.time()
elapsed_time = cur_time - self.start_time
avg_time_per_step = elapsed_time / self.cur_steps if self.cur_steps != 0 else 0
remaining_time = (self.max_steps - self.cur_steps) * avg_time_per_step
self.elapsed_time = str(timedelta(seconds=int(elapsed_time)))
self.remaining_time = str(timedelta(seconds=int(remaining_time)))
def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
r"""
Event called at the beginning of training.
"""
if state.is_local_process_zero:
self.in_training = True
self.start_time = time.time()
self.max_steps = state.max_steps
if os.path.exists(os.path.join(args.output_dir, LOG_FILE_NAME)) and args.overwrite_output_dir:
logger.warning("Previous log file in this folder will be deleted.")
os.remove(os.path.join(args.output_dir, LOG_FILE_NAME))
def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
r"""
Event called at the end of training.
"""
if state.is_local_process_zero:
self.in_training = False
self.cur_steps = 0
self.max_steps = 0
def on_substep_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
r"""
Event called at the end of a substep during gradient accumulation.
"""
if state.is_local_process_zero and self.runner is not None and self.runner.aborted:
control.should_epoch_stop = True
control.should_training_stop = True
def on_step_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
r"""
Event called at the end of a training step.
"""
if state.is_local_process_zero:
self.cur_steps = state.global_step
self.timing()
if self.runner is not None and self.runner.aborted:
control.should_epoch_stop = True
control.should_training_stop = True
def on_evaluate(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
r"""
Event called after an evaluation phase.
"""
if state.is_local_process_zero and not self.in_training:
self.cur_steps = 0
self.max_steps = 0
def on_predict(
self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", *other, **kwargs
):
r"""
Event called after a successful prediction.
"""
if state.is_local_process_zero and not self.in_training:
self.cur_steps = 0
self.max_steps = 0
def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs) -> None:
r"""
Event called after logging the last logs.
"""
if not state.is_local_process_zero:
return
logs = dict(
current_steps=self.cur_steps,
total_steps=self.max_steps,
loss=state.log_history[-1].get("loss", None),
eval_loss=state.log_history[-1].get("eval_loss", None),
predict_loss=state.log_history[-1].get("predict_loss", None),
reward=state.log_history[-1].get("reward", None),
learning_rate=state.log_history[-1].get("learning_rate", None),
epoch=state.log_history[-1].get("epoch", None),
percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
elapsed_time=self.elapsed_time,
remaining_time=self.remaining_time,
)
if self.runner is not None:
logger.info(
"{{'loss': {:.4f}, 'learning_rate': {:2.4e}, 'epoch': {:.2f}}}".format(
logs["loss"] or 0, logs["learning_rate"] or 0, logs["epoch"] or 0
)
)
os.makedirs(args.output_dir, exist_ok=True)
with open(os.path.join(args.output_dir, "trainer_log.jsonl"), "a", encoding="utf-8") as f:
f.write(json.dumps(logs) + "\n")
def on_prediction_step(
self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs
):
r"""
Event called after a prediction step.
"""
eval_dataloader = kwargs.pop("eval_dataloader", None)
if state.is_local_process_zero and has_length(eval_dataloader) and not self.in_training:
if self.max_steps == 0:
self.max_steps = len(eval_dataloader)
self.cur_steps += 1
self.timing()
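# Hedged usage sketch: both callbacks plug into a transformers Trainer through its
# `callbacks` argument (construction shown schematically; model and args assumed):
#   trainer = Trainer(model=model, args=training_args, callbacks=[LogCallback()])
# LogCallback then appends one JSON line per logging event to
# <output_dir>/trainer_log.jsonl with loss, learning rate, progress, and ETA fields.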
from collections import OrderedDict, defaultdict
from enum import Enum
from typing import Dict, Optional
CHOICES = ["A", "B", "C", "D"]
DATA_CONFIG = "dataset_info.json"
DEFAULT_MODULE = defaultdict(str)
DEFAULT_TEMPLATE = defaultdict(str)
FILEEXT2TYPE = {
"arrow": "arrow",
"csv": "csv",
"json": "json",
"jsonl": "json",
"parquet": "parquet",
"txt": "text",
}
IGNORE_INDEX = -100
LAYERNORM_NAMES = {"norm", "ln"}
LOG_FILE_NAME = "trainer_log.jsonl"
METHODS = ["full", "freeze", "lora"]
PEFT_METHODS = ["lora"]
SUBJECTS = ["Average", "STEM", "Social Sciences", "Humanities", "Other"]
SUPPORTED_MODELS = OrderedDict()
TRAINING_STAGES = {
"Supervised Fine-Tuning": "sft",
"Reward Modeling": "rm",
"PPO": "ppo",
"DPO": "dpo",
"Pre-Training": "pt",
}
V_HEAD_WEIGHTS_NAME = "value_head.bin"
V_HEAD_SAFE_WEIGHTS_NAME = "value_head.safetensors"
class DownloadSource(str, Enum):
DEFAULT = "hf"
MODELSCOPE = "ms"
def register_model_group(
models: Dict[str, Dict[DownloadSource, str]],
module: Optional[str] = None,
template: Optional[str] = None,
) -> None:
prefix = None
for name, path in models.items():
if prefix is None:
prefix = name.split("-")[0]
else:
assert prefix == name.split("-")[0], "prefix should be identical."
SUPPORTED_MODELS[name] = path
if module is not None:
DEFAULT_MODULE[prefix] = module
if template is not None:
DEFAULT_TEMPLATE[prefix] = template
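# Hedged usage sketch (a hypothetical group; the real registrations follow below):
#   register_model_group(
#       models={"Foo-7B": {DownloadSource.DEFAULT: "org/foo-7b"}},
#       module="q_proj",
#       template="foo",
#   )
#   SUPPORTED_MODELS["Foo-7B"][DownloadSource.DEFAULT]  # -> "org/foo-7b"
#   DEFAULT_MODULE["Foo"], DEFAULT_TEMPLATE["Foo"]      # -> ("q_proj", "foo")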
register_model_group(
models={
"Baichuan-7B-Base": {
DownloadSource.DEFAULT: "baichuan-inc/Baichuan-7B",
DownloadSource.MODELSCOPE: "baichuan-inc/baichuan-7B",
},
"Baichuan-13B-Base": {
DownloadSource.DEFAULT: "baichuan-inc/Baichuan-13B-Base",
DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan-13B-Base",
},
"Baichuan-13B-Chat": {
DownloadSource.DEFAULT: "baichuan-inc/Baichuan-13B-Chat",
DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan-13B-Chat",
},
},
module="W_pack",
template="baichuan",
)
register_model_group(
models={
"Baichuan2-7B-Base": {
DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-7B-Base",
DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-7B-Base",
},
"Baichuan2-13B-Base": {
DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-13B-Base",
DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-13B-Base",
},
"Baichuan2-7B-Chat": {
DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-7B-Chat",
DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-7B-Chat",
},
"Baichuan2-13B-Chat": {
DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-13B-Chat",
DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-13B-Chat",
},
},
module="W_pack",
template="baichuan2",
)
register_model_group(
models={
"BLOOM-560M": {
DownloadSource.DEFAULT: "bigscience/bloom-560m",
DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-560m",
},
"BLOOM-3B": {
DownloadSource.DEFAULT: "bigscience/bloom-3b",
DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-3b",
},
"BLOOM-7B1": {
DownloadSource.DEFAULT: "bigscience/bloom-7b1",
DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-7b1",
},
},
module="query_key_value",
)
register_model_group(
models={
"BLOOMZ-560M": {
DownloadSource.DEFAULT: "bigscience/bloomz-560m",
DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-560m",
},
"BLOOMZ-3B": {
DownloadSource.DEFAULT: "bigscience/bloomz-3b",
DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-3b",
},
"BLOOMZ-7B1-mt": {
DownloadSource.DEFAULT: "bigscience/bloomz-7b1-mt",
DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-7b1-mt",
},
},
module="query_key_value",
)
register_model_group(
models={
"BlueLM-7B-Base": {
DownloadSource.DEFAULT: "vivo-ai/BlueLM-7B-Base",
DownloadSource.MODELSCOPE: "vivo-ai/BlueLM-7B-Base",
},
"BlueLM-7B-Chat": {
DownloadSource.DEFAULT: "vivo-ai/BlueLM-7B-Chat",
DownloadSource.MODELSCOPE: "vivo-ai/BlueLM-7B-Chat",
},
},
template="bluelm",
)
register_model_group(
models={
"ChatGLM2-6B-Chat": {
DownloadSource.DEFAULT: "THUDM/chatglm2-6b",
DownloadSource.MODELSCOPE: "ZhipuAI/chatglm2-6b",
}
},
module="query_key_value",
template="chatglm2",
)
register_model_group(
models={
"ChatGLM3-6B-Base": {
DownloadSource.DEFAULT: "THUDM/chatglm3-6b-base",
DownloadSource.MODELSCOPE: "ZhipuAI/chatglm3-6b-base",
},
"ChatGLM3-6B-Chat": {
DownloadSource.DEFAULT: "THUDM/chatglm3-6b",
DownloadSource.MODELSCOPE: "ZhipuAI/chatglm3-6b",
},
},
module="query_key_value",
template="chatglm3",
)
register_model_group(
models={
"ChineseLLaMA2-1.3B": {
DownloadSource.DEFAULT: "hfl/chinese-llama-2-1.3b",
DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-1.3b",
},
"ChineseLLaMA2-7B": {
DownloadSource.DEFAULT: "hfl/chinese-llama-2-7b",
DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-7b",
},
"ChineseLLaMA2-13B": {
DownloadSource.DEFAULT: "hfl/chinese-llama-2-13b",
DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-13b",
},
"ChineseLLaMA2-1.3B-Chat": {
DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-1.3b",
DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-1.3b",
},
"ChineseLLaMA2-7B-Chat": {
DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-7b",
DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-7b",
},
"ChineseLLaMA2-13B-Chat": {
DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-13b",
DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-13b",
},
},
template="llama2_zh",
)
register_model_group(
models={
"DeepSeek-LLM-7B-Base": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-7b-base",
DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-7b-base",
},
"DeepSeek-LLM-67B-Base": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-67b-base",
DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-67b-base",
},
"DeepSeek-LLM-7B-Chat": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-7b-chat",
DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-7b-chat",
},
"DeepSeek-LLM-67B-Chat": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-67b-chat",
DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-67b-chat",
},
"DeepSeek-Math-7B-Base": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-base",
},
"DeepSeek-Math-7B-Chat": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-instruct",
},
"DeepSeek-MoE-16B-Base": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-base",
DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-base",
},
"DeepSeek-MoE-16B-Chat": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-chat",
DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-chat",
},
},
template="deepseek",
)
register_model_group(
models={
"DeepSeekCoder-6.7B-Base": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-6.7b-base",
DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-6.7b-base",
},
"DeepSeekCoder-7B-Base": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-7b-base-v1.5",
},
"DeepSeekCoder-33B-Base": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-33b-base",
DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-33b-base",
},
"DeepSeekCoder-6.7B-Chat": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-6.7b-instruct",
DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-6.7b-instruct",
},
"DeepSeekCoder-7B-Chat": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
},
"DeepSeekCoder-33B-Chat": {
DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-33b-instruct",
DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-33b-instruct",
},
},
template="deepseekcoder",
)
register_model_group(
models={
"Falcon-7B": {
DownloadSource.DEFAULT: "tiiuae/falcon-7b",
DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b",
},
"Falcon-40B": {
DownloadSource.DEFAULT: "tiiuae/falcon-40b",
DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b",
},
"Falcon-180B": {
DownloadSource.DEFAULT: "tiiuae/falcon-180b",
DownloadSource.MODELSCOPE: "modelscope/falcon-180B",
},
"Falcon-7B-Chat": {
DownloadSource.DEFAULT: "tiiuae/falcon-7b-instruct",
DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b-instruct",
},
"Falcon-40B-Chat": {
DownloadSource.DEFAULT: "tiiuae/falcon-40b-instruct",
DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b-instruct",
},
"Falcon-180B-Chat": {
DownloadSource.DEFAULT: "tiiuae/falcon-180b-chat",
DownloadSource.MODELSCOPE: "modelscope/falcon-180B-chat",
},
},
module="query_key_value",
template="falcon",
)
register_model_group(
models={
"InternLM-7B": {
DownloadSource.DEFAULT: "internlm/internlm-7b",
DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-7b",
},
"InternLM-20B": {
DownloadSource.DEFAULT: "internlm/internlm-20b",
DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-20b",
},
"InternLM-7B-Chat": {
DownloadSource.DEFAULT: "internlm/internlm-chat-7b",
DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-chat-7b",
},
"InternLM-20B-Chat": {
DownloadSource.DEFAULT: "internlm/internlm-chat-20b",
DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-chat-20b",
},
},
template="intern",
)
register_model_group(
models={
"InternLM2-7B": {
DownloadSource.DEFAULT: "internlm/internlm2-7b",
DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-7b",
},
"InternLM2-20B": {
DownloadSource.DEFAULT: "internlm/internlm2-20b",
DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-20b",
},
"InternLM2-7B-Chat": {
DownloadSource.DEFAULT: "internlm/internlm2-chat-7b",
DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-chat-7b",
},
"InternLM2-20B-Chat": {
DownloadSource.DEFAULT: "internlm/internlm2-chat-20b",
DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-chat-20b",
},
},
module="wqkv",
template="intern2",
)
register_model_group(
models={
"LingoWhale-8B": {
DownloadSource.DEFAULT: "deeplang-ai/LingoWhale-8B",
DownloadSource.MODELSCOPE: "DeepLang/LingoWhale-8B",
}
},
module="qkv_proj",
)
register_model_group(
models={
"LLaMA-7B": {
DownloadSource.DEFAULT: "huggyllama/llama-7b",
DownloadSource.MODELSCOPE: "skyline2006/llama-7b",
},
"LLaMA-13B": {
DownloadSource.DEFAULT: "huggyllama/llama-13b",
DownloadSource.MODELSCOPE: "skyline2006/llama-13b",
},
"LLaMA-30B": {
DownloadSource.DEFAULT: "huggyllama/llama-30b",
DownloadSource.MODELSCOPE: "skyline2006/llama-30b",
},
"LLaMA-65B": {
DownloadSource.DEFAULT: "huggyllama/llama-65b",
DownloadSource.MODELSCOPE: "skyline2006/llama-65b",
},
}
)
register_model_group(
models={
"LLaMA2-7B": {
DownloadSource.DEFAULT: "meta-llama/Llama-2-7b-hf",
DownloadSource.MODELSCOPE: "modelscope/Llama-2-7b-ms",
},
"LLaMA2-13B": {
DownloadSource.DEFAULT: "meta-llama/Llama-2-13b-hf",
DownloadSource.MODELSCOPE: "modelscope/Llama-2-13b-ms",
},
"LLaMA2-70B": {
DownloadSource.DEFAULT: "meta-llama/Llama-2-70b-hf",
DownloadSource.MODELSCOPE: "modelscope/Llama-2-70b-ms",
},
"LLaMA2-7B-Chat": {
DownloadSource.DEFAULT: "meta-llama/Llama-2-7b-chat-hf",
DownloadSource.MODELSCOPE: "modelscope/Llama-2-7b-chat-ms",
},
"LLaMA2-13B-Chat": {
DownloadSource.DEFAULT: "meta-llama/Llama-2-13b-chat-hf",
DownloadSource.MODELSCOPE: "modelscope/Llama-2-13b-chat-ms",
},
"LLaMA2-70B-Chat": {
DownloadSource.DEFAULT: "meta-llama/Llama-2-70b-chat-hf",
DownloadSource.MODELSCOPE: "modelscope/Llama-2-70b-chat-ms",
},
},
template="llama2",
)
register_model_group(
models={
"Mistral-7B": {
DownloadSource.DEFAULT: "mistralai/Mistral-7B-v0.1",
DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.1",
},
"Mistral-7B-Chat": {
DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.1",
DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.1",
},
"Mistral-7B-v0.2-Chat": {
DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.2",
DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.2",
},
},
template="mistral",
)
register_model_group(
models={
"Mixtral-8x7B": {
DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-v0.1",
DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-v0.1",
},
"Mixtral-8x7B-Chat": {
DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-Instruct-v0.1",
DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-Instruct-v0.1",
},
},
template="mistral",
)
register_model_group(
models={
"OpenChat3.5-7B-Chat": {
DownloadSource.DEFAULT: "openchat/openchat-3.5-0106",
DownloadSource.MODELSCOPE: "myxiongmodel/openchat_3.5",
}
},
template="openchat",
)
register_model_group(
models={
"Orion-14B-Base": {
DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Base",
DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Base",
},
"Orion-14B-Chat": {
DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Chat",
DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Chat",
},
"Orion-14B-Long-Chat": {
DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-LongChat",
DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-LongChat",
},
"Orion-14B-RAG-Chat": {
DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Chat-RAG",
DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Chat-RAG",
},
"Orion-14B-Plugin-Chat": {
DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Chat-Plugin",
DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Chat-Plugin",
},
},
template="orion",
)
register_model_group(
models={
"Phi-1.5-1.3B": {
DownloadSource.DEFAULT: "microsoft/phi-1_5",
DownloadSource.MODELSCOPE: "allspace/PHI_1-5",
},
"Phi-2-2.7B": {
DownloadSource.DEFAULT: "microsoft/phi-2",
DownloadSource.MODELSCOPE: "AI-ModelScope/phi-2",
},
}
)
register_model_group(
models={
"Qwen-1.8B": {
DownloadSource.DEFAULT: "Qwen/Qwen-1_8B",
DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B",
},
"Qwen-7B": {
DownloadSource.DEFAULT: "Qwen/Qwen-7B",
DownloadSource.MODELSCOPE: "qwen/Qwen-7B",
},
"Qwen-14B": {
DownloadSource.DEFAULT: "Qwen/Qwen-14B",
DownloadSource.MODELSCOPE: "qwen/Qwen-14B",
},
"Qwen-72B": {
DownloadSource.DEFAULT: "Qwen/Qwen-72B",
DownloadSource.MODELSCOPE: "qwen/Qwen-72B",
},
"Qwen-1.8B-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat",
DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B-Chat",
},
"Qwen-7B-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen-7B-Chat"},
"Qwen-14B-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat",
DownloadSource.MODELSCOPE: "qwen/Qwen-14B-Chat",
},
"Qwen-72B-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat",
DownloadSource.MODELSCOPE: "qwen/Qwen-72B-Chat",
},
"Qwen-1.8B-int8-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat-Int8",
DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B-Chat-Int8",
},
"Qwen-1.8B-int4-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat-Int4",
DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B-Chat-Int4",
},
"Qwen-7B-int8-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat-Int8",
DownloadSource.MODELSCOPE: "qwen/Qwen-7B-Chat-Int8",
},
"Qwen-7B-int4-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat-Int4",
DownloadSource.MODELSCOPE: "qwen/Qwen-7B-Chat-Int4",
},
"Qwen-14B-int8-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat-Int8",
DownloadSource.MODELSCOPE: "qwen/Qwen-14B-Chat-Int8",
},
"Qwen-14B-int4-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat-Int4",
DownloadSource.MODELSCOPE: "qwen/Qwen-14B-Chat-Int4",
},
"Qwen-72B-int8-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat-Int8",
DownloadSource.MODELSCOPE: "qwen/Qwen-72B-Chat-Int8",
},
"Qwen-72B-int4-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat-Int4",
DownloadSource.MODELSCOPE: "qwen/Qwen-72B-Chat-Int4",
},
},
module="c_attn",
template="qwen",
)
register_model_group(
models={
"Qwen1.5-0.5B": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B",
},
"Qwen1.5-1.8B": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-1.8B",
},
"Qwen1.5-4B": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-4B",
},
"Qwen1.5-7B": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-7B",
},
"Qwen1.5-14B": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B",
},
"Qwen1.5-72B": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B",
},
"Qwen1.5-0.5B-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat",
},
"Qwen1.5-1.8B-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B-Chat",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-1.8B-Chat",
},
"Qwen1.5-4B-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B-Chat",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-4B-Chat",
},
"Qwen1.5-7B-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B-Chat",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-7B-Chat",
},
"Qwen1.5-14B-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat",
},
"Qwen1.5-72B-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat",
},
"Qwen1.5-0.5B-int8-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8",
},
"Qwen1.5-0.5B-int4-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4",
},
"Qwen1.5-1.8B-int8-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8",
},
"Qwen1.5-1.8B-int4-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4",
},
"Qwen1.5-4B-int8-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B-Chat-GPTQ-Int8",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-4B-Chat-GPTQ-Int8",
},
"Qwen1.5-4B-int4-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B-Chat-GPTQ-Int4",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-4B-Chat-GPTQ-Int4",
},
"Qwen1.5-7B-int8-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B-Chat-GPTQ-Int8",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-7B-Chat-GPTQ-Int8",
},
"Qwen1.5-7B-int4-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B-Chat-GPTQ-Int4",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-7B-Chat-GPTQ-Int4",
},
"Qwen1.5-14B-int8-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat-GPTQ-Int8",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat-GPTQ-Int8",
},
"Qwen1.5-14B-int4-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat-GPTQ-Int4",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat-GPTQ-Int4",
},
"Qwen1.5-72B-int8-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-GPTQ-Int8",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat-GPTQ-Int8",
},
"Qwen1.5-72B-int4-Chat": {
DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-GPTQ-Int4",
DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat-GPTQ-Int4",
},
},
template="qwen",
)
register_model_group(
models={
"SOLAR-10.7B": {
DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-v1.0",
},
"SOLAR-10.7B-Chat": {
DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-Instruct-v1.0",
DownloadSource.MODELSCOPE: "AI-ModelScope/SOLAR-10.7B-Instruct-v1.0",
},
},
template="solar",
)
register_model_group(
models={
"Skywork-13B-Base": {
DownloadSource.DEFAULT: "Skywork/Skywork-13B-base",
DownloadSource.MODELSCOPE: "skywork/Skywork-13B-base",
}
}
)
register_model_group(
models={
"Vicuna1.5-7B-Chat": {
DownloadSource.DEFAULT: "lmsys/vicuna-7b-v1.5",
DownloadSource.MODELSCOPE: "Xorbits/vicuna-7b-v1.5",
},
"Vicuna1.5-13B-Chat": {
DownloadSource.DEFAULT: "lmsys/vicuna-13b-v1.5",
DownloadSource.MODELSCOPE: "Xorbits/vicuna-13b-v1.5",
},
},
template="vicuna",
)
register_model_group(
models={
"XuanYuan-70B": {
DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B",
},
"XuanYuan-70B-Chat": {
DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat",
},
"XuanYuan-70B-int8-Chat": {
DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-8bit",
},
"XuanYuan-70B-int4-Chat": {
DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-4bit",
},
},
template="xuanyuan",
)
register_model_group(
models={
"XVERSE-7B": {
DownloadSource.DEFAULT: "xverse/XVERSE-7B",
DownloadSource.MODELSCOPE: "xverse/XVERSE-7B",
},
"XVERSE-13B": {
DownloadSource.DEFAULT: "xverse/XVERSE-13B",
DownloadSource.MODELSCOPE: "xverse/XVERSE-13B",
},
"XVERSE-65B": {
DownloadSource.DEFAULT: "xverse/XVERSE-65B",
DownloadSource.MODELSCOPE: "xverse/XVERSE-65B",
},
"XVERSE-65B-2": {
DownloadSource.DEFAULT: "xverse/XVERSE-65B-2",
DownloadSource.MODELSCOPE: "xverse/XVERSE-65B-2",
},
"XVERSE-7B-Chat": {
DownloadSource.DEFAULT: "xverse/XVERSE-7B-Chat",
DownloadSource.MODELSCOPE: "xverse/XVERSE-7B-Chat",
},
"XVERSE-13B-Chat": {
DownloadSource.DEFAULT: "xverse/XVERSE-13B-Chat",
DownloadSource.MODELSCOPE: "xverse/XVERSE-13B-Chat",
},
"XVERSE-65B-Chat": {
DownloadSource.DEFAULT: "xverse/XVERSE-65B-Chat",
DownloadSource.MODELSCOPE: "xverse/XVERSE-65B-Chat",
},
},
template="xverse",
)
register_model_group(
models={
"Yayi-7B": {
DownloadSource.DEFAULT: "wenge-research/yayi-7b-llama2",
DownloadSource.MODELSCOPE: "AI-ModelScope/yayi-7b-llama2",
},
"Yayi-13B": {
DownloadSource.DEFAULT: "wenge-research/yayi-13b-llama2",
DownloadSource.MODELSCOPE: "AI-ModelScope/yayi-13b-llama2",
},
},
template="yayi",
)
register_model_group(
models={
"Yi-6B": {
DownloadSource.DEFAULT: "01-ai/Yi-6B",
DownloadSource.MODELSCOPE: "01ai/Yi-6B",
},
"Yi-34B": {
DownloadSource.DEFAULT: "01-ai/Yi-34B",
DownloadSource.MODELSCOPE: "01ai/Yi-34B",
},
"Yi-6B-Chat": {
DownloadSource.DEFAULT: "01-ai/Yi-6B-Chat",
DownloadSource.MODELSCOPE: "01ai/Yi-6B-Chat",
},
"Yi-34B-Chat": {
DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat",
DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat",
},
"Yi-6B-int8-Chat": {
DownloadSource.DEFAULT: "01-ai/Yi-6B-Chat-8bits",
DownloadSource.MODELSCOPE: "01ai/Yi-6B-Chat-8bits",
},
"Yi-34B-int8-Chat": {
DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat-8bits",
DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat-8bits",
},
},
template="yi",
)
register_model_group(
models={
"Yuan2-2B-Chat": {
DownloadSource.DEFAULT: "IEITYuan/Yuan2-2B-hf",
DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-2B-hf",
},
"Yuan2-51B-Chat": {
DownloadSource.DEFAULT: "IEITYuan/Yuan2-51B-hf",
DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-51B-hf",
},
"Yuan2-102B-Chat": {
DownloadSource.DEFAULT: "IEITYuan/Yuan2-102B-hf",
DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-102B-hf",
},
},
template="yuan",
)
register_model_group(
models={
"Zephyr-7B-Alpha-Chat": {
DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-alpha",
DownloadSource.MODELSCOPE: "AI-ModelScope/zephyr-7b-alpha",
},
"Zephyr-7B-Beta-Chat": {
DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-beta",
DownloadSource.MODELSCOPE: "modelscope/zephyr-7b-beta",
},
},
template="zephyr",
)
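# Hedged lookup sketch: the registry built above maps display names to hub paths,
# so a concrete repository can be resolved per download source:
#   SUPPORTED_MODELS["LLaMA2-7B"][DownloadSource.MODELSCOPE]  # -> "modelscope/Llama-2-7b-ms"
#   DEFAULT_TEMPLATE["LLaMA2"]  # -> "llama2"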
import logging
import sys
class LoggerHandler(logging.Handler):
r"""
Logger handler used in Web UI.
"""
def __init__(self):
super().__init__()
self.log = ""
def reset(self):
self.log = ""
def emit(self, record):
if record.name == "httpx":
return
log_entry = self.format(record)
self.log += log_entry
self.log += "\n\n"
def get_logger(name: str) -> logging.Logger:
r"""
Gets a standard logger with a stream handler to stdout.
"""
formatter = logging.Formatter(
fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S"
)
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
logger = logging.getLogger(name)
logger.setLevel(logging.INFO)
logger.addHandler(handler)
return logger
def reset_logging() -> None:
r"""
Removes the basic config of the root logger. (unused in script)
"""
root = logging.getLogger()
list(map(root.removeHandler, root.handlers))
list(map(root.removeFilter, root.filters))
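# Hedged usage sketch: get_logger is the factory used throughout this repo, and
# LoggerHandler additionally buffers formatted records for display in the Web UI:
#   logger = get_logger(__name__)
#   ui_handler = LoggerHandler()
#   logger.addHandler(ui_handler)
#   logger.info("hello")  # goes to stdout and is appended to ui_handler.log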
import gc
import os
from typing import TYPE_CHECKING, Dict, Tuple
import torch
from peft import PeftModel
from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList, PreTrainedModel
from transformers.utils import (
SAFE_WEIGHTS_NAME,
WEIGHTS_NAME,
is_torch_bf16_gpu_available,
is_torch_cuda_available,
is_torch_mps_available,
is_torch_npu_available,
is_torch_xpu_available,
)
from .constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
from .logging import get_logger
_is_fp16_available = is_torch_npu_available() or is_torch_cuda_available()
try:
_is_bf16_available = is_torch_bf16_gpu_available()
except Exception:
_is_bf16_available = False
if TYPE_CHECKING:
from trl import AutoModelForCausalLMWithValueHead
from llmtuner.hparams import ModelArguments
logger = get_logger(__name__)
class AverageMeter:
r"""
Computes and stores the average and current value.
"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
r"""
Returns the number of trainable parameters and the total number of parameters in the model.
"""
trainable_params, all_param = 0, 0
for param in model.parameters():
num_params = param.numel()
# if using DS Zero 3 and the weights are initialized empty
if num_params == 0 and hasattr(param, "ds_numel"):
num_params = param.ds_numel
# Due to the design of 4bit linear layers from bitsandbytes, multiply the number of parameters by 2
if param.__class__.__name__ == "Params4bit":
num_params = num_params * 2
all_param += num_params
if param.requires_grad:
trainable_params += num_params
return trainable_params, all_param
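# Hedged usage sketch: the returned tuple formats into the familiar summary line:
#   trainable, total = count_parameters(model)
#   print("trainable params: {} || all params: {} || trainable%: {:.4f}".format(
#       trainable, total, 100 * trainable / total))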
def fix_valuehead_checkpoint(
model: "AutoModelForCausalLMWithValueHead", output_dir: str, safe_serialization: bool
) -> None:
r"""
The model is already unwrapped.
There are three cases:
1. full tuning without ds_zero3: state_dict = {"model.layers.*": ..., "v_head.summary.*": ...}
2. lora tuning without ds_zero3: state_dict = {"v_head.summary.*": ...}
3. under deepspeed zero3: state_dict = {"pretrained_model.model.layers.*": ..., "v_head.summary.*": ...}
We assume `stage3_gather_16bit_weights_on_model_save=true`.
"""
if not isinstance(model.pretrained_model, (PreTrainedModel, PeftModel)):
return
if safe_serialization:
from safetensors import safe_open
from safetensors.torch import save_file
path_to_checkpoint = os.path.join(output_dir, SAFE_WEIGHTS_NAME)
with safe_open(path_to_checkpoint, framework="pt", device="cpu") as f:
state_dict: Dict[str, torch.Tensor] = {key: f.get_tensor(key) for key in f.keys()}
else:
path_to_checkpoint = os.path.join(output_dir, WEIGHTS_NAME)
state_dict: Dict[str, torch.Tensor] = torch.load(path_to_checkpoint, map_location="cpu")
decoder_state_dict = {}
v_head_state_dict = {}
for name, param in state_dict.items():
if name.startswith("v_head."):
v_head_state_dict[name] = param
else:
decoder_state_dict[name.replace("pretrained_model.", "")] = param
os.remove(path_to_checkpoint)
model.pretrained_model.save_pretrained(
output_dir, state_dict=decoder_state_dict or None, safe_serialization=safe_serialization
)
if safe_serialization:
save_file(v_head_state_dict, os.path.join(output_dir, V_HEAD_SAFE_WEIGHTS_NAME), metadata={"format": "pt"})
else:
torch.save(v_head_state_dict, os.path.join(output_dir, V_HEAD_WEIGHTS_NAME))
logger.info("Value head model saved at: {}".format(output_dir))
def get_current_device() -> torch.device:
r"""
Gets the current available device.
"""
if is_torch_xpu_available():
device = "xpu:{}".format(os.environ.get("LOCAL_RANK", "0"))
elif is_torch_npu_available():
device = "npu:{}".format(os.environ.get("LOCAL_RANK", "0"))
elif is_torch_mps_available():
device = "mps:{}".format(os.environ.get("LOCAL_RANK", "0"))
elif is_torch_cuda_available():
device = "cuda:{}".format(os.environ.get("LOCAL_RANK", "0"))
else:
device = "cpu"
return torch.device(device)
def get_device_count() -> int:
return torch.cuda.device_count()
def get_logits_processor() -> "LogitsProcessorList":
r"""
Gets logits processor that removes NaN and Inf logits.
"""
logits_processor = LogitsProcessorList()
logits_processor.append(InfNanRemoveLogitsProcessor())
return logits_processor
def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype:
r"""
Infers the optimal dtype according to the model_dtype and device compatibility.
"""
if _is_bf16_available and model_dtype == torch.bfloat16:
return torch.bfloat16
elif _is_fp16_available:
return torch.float16
else:
return torch.float32
def torch_gc() -> None:
r"""
Collects GPU memory.
"""
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
def try_download_model_from_ms(model_args: "ModelArguments") -> None:
if not use_modelscope() or os.path.exists(model_args.model_name_or_path):
return
try:
from modelscope import snapshot_download
revision = "master" if model_args.model_revision == "main" else model_args.model_revision
model_args.model_name_or_path = snapshot_download(
model_args.model_name_or_path, revision=revision, cache_dir=model_args.cache_dir
)
except ImportError:
raise ImportError("Please install modelscope via `pip install modelscope -U`")
def use_modelscope() -> bool:
return bool(int(os.environ.get("USE_MODELSCOPE_HUB", "0")))
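# Hedged usage sketch: a typical call sequence when preparing for training
# (device and dtype resolution only; model loading is out of scope here):
#   device = get_current_device()              # e.g. cuda:0, honoring LOCAL_RANK
#   dtype = infer_optim_dtype(torch.bfloat16)  # falls back to fp16/fp32 if unsupported
#   torch_gc()                                 # reclaim cached GPU memory afterwards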
import importlib.metadata
import importlib.util
def _is_package_available(name: str) -> bool:
return importlib.util.find_spec(name) is not None
def _get_package_version(name: str) -> str:
try:
return importlib.metadata.version(name)
except Exception:
return "0.0.0"
def is_fastapi_available():
return _is_package_available("fastapi")
def is_flash_attn2_available():
return _is_package_available("flash_attn") and _get_package_version("flash_attn").startswith("2")
def is_jieba_available():
return _is_package_available("jieba")
def is_matplotlib_available():
return _is_package_available("matplotlib")
def is_nltk_available():
return _is_package_available("nltk")
def is_requests_available():
return _is_package_available("requests")
def is_rouge_available():
return _is_package_available("rouge_chinese")
def is_starlette_available():
return _is_package_available("sse_starlette")
def is_unsloth_available():
return _is_package_available("unsloth")
def is_uvicorn_available():
return _is_package_available("uvicorn")
import math
from typing import Optional, Tuple
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import (
Cache,
LlamaAttention,
LlamaFlashAttention2,
apply_rotary_pos_emb,
repeat_kv,
)
from transformers.utils import logging
logger = logging.get_logger(__name__)
# Modified from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
def llama_torch_attn_forward(
self: "LlamaAttention",
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional["Cache"] = None,
output_attentions: bool = False,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
if getattr(self.config, "group_size_ratio", None) and self.training: # shift
groupsz = int(q_len * getattr(self.config, "group_size_ratio"))
assert q_len % groupsz == 0, "q_len {} should be divisible by group size {}.".format(q_len, groupsz)
num_groups = q_len // groupsz
def shift(state: torch.Tensor) -> torch.Tensor:
state = state.transpose(1, 2) # output: (bsz, seq_len, n_heads, head_dim)
state = torch.cat(
(state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)),
dim=2,
)
return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim).transpose(1, 2)
query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states)
if attention_mask is not None:
attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1)
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attention_mask is not None:
attn_weights = attn_weights + attention_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states) # (bsz, :, seq_len, :) or (bsz*n_group, :, groupsz, :)
attn_output = attn_output.transpose(1, 2).contiguous()
if getattr(self.config, "group_size_ratio", None) and self.training: # shift back
attn_output = attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)
attn_output = torch.cat(
(
attn_output[:, :, : self.num_heads // 2],
attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1),
),
dim=2,
)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
# Modified from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
def llama_flash_attn_forward(
self: "LlamaFlashAttention2",
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
# LlamaFlashAttention2 attention does not support output_attentions
output_attentions = False
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# FlashAttention requires the input to have the shape (bsz, seq_len, n_heads, head_dim)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
query_states = query_states.transpose(1, 2) # (bsz, seq_len, n_heads, head_dim)
key_states = key_states.transpose(1, 2) # (bsz, seq_len, n_heads, head_dim)
value_states = value_states.transpose(1, 2) # (bsz, seq_len, n_heads, head_dim)
dropout_rate = self.attention_dropout if self.training else 0.0
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once("The input hidden states seems to be silently casted in float32.")
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
if getattr(self.config, "group_size_ratio", None) and self.training: # shift
groupsz = int(q_len * getattr(self.config, "group_size_ratio"))
assert q_len % groupsz == 0, "q_len {} should be divisible by group size {}.".format(q_len, groupsz)
num_groups = q_len // groupsz
def shift(state: torch.Tensor) -> torch.Tensor:
state = torch.cat(
(state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)),
dim=2,
)
return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim)
query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states)
if attention_mask is not None:
attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1)
attn_output: torch.Tensor = self._flash_attention_forward(
query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
)
if getattr(self.config, "group_size_ratio", None) and self.training: # shift back
attn_output = attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)
attn_output = torch.cat(
(
attn_output[:, :, : self.num_heads // 2],
attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1),
),
dim=2,
)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
def apply_llama_patch() -> None:
LlamaAttention.forward = llama_torch_attn_forward
LlamaFlashAttention2.forward = llama_flash_attn_forward
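# Hedged usage sketch: the patched forwards add LongLoRA-style shift-short-attention,
# which activates only when `group_size_ratio` is set on the model config:
#   apply_llama_patch()                    # install before building/running the model
#   model.config.group_size_ratio = 0.25   # attend within groups of q_len / 4 while training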
from typing import Tuple

import torch
import torch.nn.functional as F
from transformers.models.mixtral.modeling_mixtral import MixtralBLockSparseTop2MLP, MixtralSparseMoeBlock
def mlp_forward(self: "MixtralBLockSparseTop2MLP", hidden_states: torch.Tensor) -> torch.Tensor:
current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
current_hidden_states = self.w2(current_hidden_states)
return current_hidden_states
# Modified from: https://huggingface.co/deepseek-ai/deepseek-moe-16b-base/blob/main/modeling_deepseek.py
def moe_forward(self: "MixtralSparseMoeBlock", hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
batch_size, sequence_length, hidden_dim = hidden_states.shape
hidden_states = hidden_states.view(-1, hidden_dim)
# router_logits: (batch * sequence_length, n_experts)
router_logits = self.gate(hidden_states)
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
# we cast back to the input dtype
topk_weight = topk_weight.to(hidden_states.dtype)
hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
y = torch.empty_like(hidden_states)
flat_topk_idx = topk_idx.view(-1)
for i in range(self.num_experts):
expert = self.experts[i]
y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
return final_hidden_states, router_logits
def patch_mixtral_replace_moe_impl() -> None:
MixtralBLockSparseTop2MLP.forward = mlp_forward
MixtralSparseMoeBlock.forward = moe_forward
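# Hedged usage sketch: the per-expert loop above (borrowed from the DeepSeek-MoE
# implementation) replaces Mixtral's batched expert dispatch, trading some speed
# for compatibility with sharded training setups (an assumption; verify for your stack):
#   patch_mixtral_replace_moe_impl()  # call once before training a Mixtral model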
import json
import math
import os
from typing import List, Optional
from transformers.trainer import TRAINER_STATE_NAME
from .logging import get_logger
from .packages import is_matplotlib_available
if is_matplotlib_available():
import matplotlib.pyplot as plt
logger = get_logger(__name__)
def smooth(scalars: List[float]) -> List[float]:
r"""
EMA implementation according to TensorBoard.
"""
last = scalars[0]
smoothed = list()
weight = 1.8 * (1 / (1 + math.exp(-0.05 * len(scalars))) - 0.5) # a sigmoid function
for next_val in scalars:
smoothed_val = last * weight + (1 - weight) * next_val
smoothed.append(smoothed_val)
last = smoothed_val
return smoothed
def plot_loss(save_directory: os.PathLike, keys: Optional[List[str]] = ["loss"]) -> None:
with open(os.path.join(save_directory, TRAINER_STATE_NAME), "r", encoding="utf-8") as f:
data = json.load(f)
for key in keys:
steps, metrics = [], []
for i in range(len(data["log_history"])):
if key in data["log_history"][i]:
steps.append(data["log_history"][i]["step"])
metrics.append(data["log_history"][i][key])
if len(metrics) == 0:
logger.warning(f"No metric {key} to plot.")
continue
plt.figure()
plt.plot(steps, metrics, alpha=0.4, label="original")
plt.plot(steps, smooth(metrics), label="smoothed")
plt.title("training {} of {}".format(key, save_directory))
plt.xlabel("step")
plt.ylabel(key)
plt.legend()
plt.savefig(os.path.join(save_directory, "training_{}.png".format(key)), format="png", dpi=100)
print("Figure saved:", os.path.join(save_directory, "training_{}.png".format(key)))
from .data_args import DataArguments
from .evaluation_args import EvaluationArguments
from .finetuning_args import FinetuningArguments
from .generating_args import GeneratingArguments
from .model_args import ModelArguments
from .parser import get_eval_args, get_infer_args, get_train_args
__all__ = [
"DataArguments",
"EvaluationArguments",
"FinetuningArguments",
"GeneratingArguments",
"ModelArguments",
"get_eval_args",
"get_infer_args",
"get_train_args",
]
from dataclasses import dataclass, field
from typing import Literal, Optional
@dataclass
class DataArguments:
r"""
Arguments pertaining to what data we are going to input to our model for training and evaluation.
"""
template: Optional[str] = field(
default=None,
metadata={"help": "Which template to use for constructing prompts in training and inference."},
)
dataset: Optional[str] = field(
default=None,
metadata={"help": "The name of provided dataset(s) to use. Use commas to separate multiple datasets."},
)
dataset_dir: Optional[str] = field(
default="data",
metadata={"help": "Path to the folder containing the datasets."},
)
split: Optional[str] = field(
default="train",
metadata={"help": "Which dataset split to use for training and evaluation."},
)
cutoff_len: Optional[int] = field(
default=1024,
metadata={"help": "The cutoff length of the model inputs after tokenization."},
)
reserved_label_len: Optional[int] = field(
default=1,
metadata={"help": "The minimum cutoff length reserved for label after tokenization."},
)
train_on_prompt: Optional[bool] = field(
default=False,
metadata={"help": "Whether to disable the mask on the prompt or not."},
)
streaming: Optional[bool] = field(
default=False,
metadata={"help": "Enable dataset streaming."},
)
buffer_size: Optional[int] = field(
default=16384,
metadata={"help": "Size of the buffer to randomly sample examples from in dataset streaming."},
)
mix_strategy: Optional[Literal["concat", "interleave_under", "interleave_over"]] = field(
default="concat",
metadata={"help": "Strategy to use in dataset mixing (concat/interleave) (undersampling/oversampling)."},
)
interleave_probs: Optional[str] = field(
default=None,
metadata={"help": "Probabilities to sample data from datasets. Use commas to separate multiple datasets."},
)
overwrite_cache: Optional[bool] = field(
default=False,
metadata={"help": "Overwrite the cached training and evaluation sets."},
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
max_samples: Optional[int] = field(
default=None,
metadata={"help": "For debugging purposes, truncate the number of examples for each dataset."},
)
eval_num_beams: Optional[int] = field(
default=None,
        metadata={"help": "Number of beams to use for evaluation. This argument will be passed to `model.generate`."},
)
ignore_pad_token_for_loss: Optional[bool] = field(
default=True,
metadata={
"help": "Whether or not to ignore the tokens corresponding to padded labels in the loss computation."
},
)
val_size: Optional[float] = field(
default=0,
        metadata={"help": "Size of the development set; either an integer number of samples or a float in range `[0,1)`."},
)
sft_packing: Optional[bool] = field(
default=False,
        metadata={"help": "Whether or not to pack the questions and answers in the supervised fine-tuning stage."},
)
cache_path: Optional[str] = field(
default=None,
metadata={"help": "Path to save or load the preprocessed datasets."},
)
def __post_init__(self):
if self.reserved_label_len >= self.cutoff_len:
raise ValueError("`reserved_label_len` must be smaller than `cutoff_len`.")
if self.streaming and self.val_size > 1e-6 and self.val_size < 1:
            raise ValueError("`val_size` should be an integer number of samples in streaming mode.")
if self.streaming and self.max_samples is not None:
raise ValueError("`max_samples` is incompatible with `streaming`.")
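if __name__ == "__main__":
    # A small sanity check, not part of the original file; the dataset name is
    # illustrative. Default construction works, while an invalid combination
    # trips the `__post_init__` guards.
    args = DataArguments(dataset="alpaca_en", cutoff_len=2048)
    print(args.split, args.val_size)  # train 0
    try:
        DataArguments(streaming=True, val_size=0.1)  # fractional val_size is rejected when streaming
    except ValueError as err:
        print(err)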
import os
from dataclasses import dataclass, field
from typing import Literal, Optional
from datasets import DownloadMode
@dataclass
class EvaluationArguments:
r"""
    Arguments pertaining to the evaluation parameters.
"""
task: str = field(
metadata={"help": "Name of the evaluation task."},
)
task_dir: Optional[str] = field(
default="evaluation",
metadata={"help": "Path to the folder containing the evaluation datasets."},
)
batch_size: Optional[int] = field(
default=4,
metadata={"help": "The batch size per GPU for evaluation."},
)
seed: Optional[int] = field(
default=42,
metadata={"help": "Random seed to be used with data loaders."},
)
lang: Optional[Literal["en", "zh"]] = field(
default="en",
        metadata={"help": "Language used in the evaluation."},
)
n_shot: Optional[int] = field(
default=5,
        metadata={"help": "Number of exemplars for few-shot learning."},
)
save_dir: Optional[str] = field(
default=None,
metadata={"help": "Path to save the evaluation results."},
)
download_mode: Optional[DownloadMode] = field(
default=DownloadMode.REUSE_DATASET_IF_EXISTS,
metadata={"help": "Download mode used for the evaluation datasets."},
)
def __post_init__(self):
if self.save_dir is not None and os.path.exists(self.save_dir):
raise ValueError("`save_dir` already exists, use another one.")
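if __name__ == "__main__":
    # A usage sketch, not part of the original file; the task name is
    # illustrative. `__post_init__` only validates that `save_dir` is unused.
    args = EvaluationArguments(task="mmlu", n_shot=5)
    print(args.lang, args.batch_size)  # en 4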
import json
from dataclasses import asdict, dataclass, field
from typing import Literal, Optional
@dataclass
class FreezeArguments:
r"""
Arguments pertaining to the freeze (partial-parameter) training.
"""
name_module_trainable: Optional[str] = field(
default=None,
metadata={
"help": """Name of trainable modules for partial-parameter (freeze) fine-tuning. \
Use commas to separate multiple modules. \
Use "all" to specify all the available modules. \
LLaMA choices: ["mlp", "self_attn"], \
BLOOM & Falcon & ChatGLM choices: ["mlp", "self_attention"], \
Qwen choices: ["mlp", "attn"], \
InternLM2 choices: ["feed_forward", "attention"], \
Others choices: the same as LLaMA."""
},
)
num_layer_trainable: Optional[int] = field(
default=3,
metadata={"help": "The number of trainable layers for partial-parameter (freeze) fine-tuning."},
)
use_llama_pro: Optional[bool] = field(
default=False,
        metadata={"help": "Whether or not to use LLaMA Pro for partial-parameter (freeze) fine-tuning."},
)
@dataclass
class LoraArguments:
r"""
Arguments pertaining to the LoRA training.
"""
additional_target: Optional[str] = field(
default=None,
metadata={
"help": "Name(s) of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint."
},
)
lora_alpha: Optional[int] = field(
default=None,
metadata={"help": "The scale factor for LoRA fine-tuning (default: lora_rank * 2)."},
)
lora_dropout: Optional[float] = field(
default=0.0,
metadata={"help": "Dropout rate for the LoRA fine-tuning."},
)
lora_rank: Optional[int] = field(
default=8,
metadata={"help": "The intrinsic dimension for LoRA fine-tuning."},
)
lora_target: Optional[str] = field(
default=None,
metadata={
"help": """Name(s) of target modules to apply LoRA. \
Use commas to separate multiple modules. \
Use "all" to specify all the available modules. \
LLaMA choices: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], \
BLOOM & Falcon & ChatGLM choices: ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], \
Baichuan choices: ["W_pack", "o_proj", "gate_proj", "up_proj", "down_proj"], \
Qwen choices: ["c_attn", "attn.c_proj", "w1", "w2", "mlp.c_proj"], \
InternLM2 choices: ["wqkv", "wo", "w1", "w2", "w3"], \
Others choices: the same as LLaMA."""
},
)
lora_bf16_mode: Optional[bool] = field(
default=False,
        metadata={"help": "Whether or not to train LoRA adapters in bf16 precision."},
)
use_rslora: Optional[bool] = field(
default=False,
        metadata={"help": "Whether or not to use the rank-stabilized scaling factor (rsLoRA) for the LoRA layers."},
)
create_new_adapter: Optional[bool] = field(
default=False,
        metadata={"help": "Whether or not to create a new adapter with randomly initialized weights."},
)
@dataclass
class RLHFArguments:
r"""
Arguments pertaining to the PPO and DPO training.
"""
dpo_beta: Optional[float] = field(
default=0.1,
metadata={"help": "The beta parameter for the DPO loss."},
)
dpo_loss: Optional[Literal["sigmoid", "hinge", "ipo", "kto"]] = field(
default="sigmoid",
metadata={"help": "The type of DPO loss to use."},
)
dpo_ftx: Optional[float] = field(
default=0,
metadata={"help": "The supervised fine-tuning loss coefficient in DPO training."},
)
ppo_buffer_size: Optional[int] = field(
default=1,
        metadata={"help": "The number of mini-batches used to build the experience buffer in a PPO optimization step."},
)
ppo_epochs: Optional[int] = field(
default=4,
metadata={"help": "The number of epochs to perform in a PPO optimization step."},
)
ppo_logger: Optional[str] = field(
default=None,
metadata={"help": 'Log with either "wandb" or "tensorboard" in PPO training.'},
)
ppo_score_norm: Optional[bool] = field(
default=False,
metadata={"help": "Use score normalization in PPO training."},
)
ppo_target: Optional[float] = field(
default=6.0,
metadata={"help": "Target KL value for adaptive KL control in PPO training."},
)
ppo_whiten_rewards: Optional[bool] = field(
default=False,
        metadata={"help": "Whiten the rewards before computing advantages in PPO training."},
)
ref_model: Optional[str] = field(
default=None,
metadata={"help": "Path to the reference model used for the PPO or DPO training."},
)
ref_model_adapters: Optional[str] = field(
default=None,
metadata={"help": "Path to the adapters of the reference model."},
)
ref_model_quantization_bit: Optional[int] = field(
default=None,
metadata={"help": "The number of bits to quantize the reference model."},
)
reward_model: Optional[str] = field(
default=None,
metadata={"help": "Path to the reward model used for the PPO training."},
)
reward_model_adapters: Optional[str] = field(
default=None,
metadata={"help": "Path to the adapters of the reward model."},
)
reward_model_quantization_bit: Optional[int] = field(
default=None,
metadata={"help": "The number of bits to quantize the reward model."},
)
reward_model_type: Optional[Literal["lora", "full", "api"]] = field(
default="lora",
        metadata={"help": "The type of the reward model in PPO training. A lora reward model requires LoRA fine-tuning."},
)
@dataclass
class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments):
r"""
    Arguments pertaining to which techniques we are going to fine-tune with.
"""
stage: Optional[Literal["pt", "sft", "rm", "ppo", "dpo"]] = field(
default="sft",
metadata={"help": "Which stage will be performed in training."},
)
finetuning_type: Optional[Literal["lora", "freeze", "full"]] = field(
default="lora",
metadata={"help": "Which fine-tuning method to use."},
)
disable_version_checking: Optional[bool] = field(
default=False,
metadata={"help": "Whether or not to disable version checking."},
)
plot_loss: Optional[bool] = field(
default=False,
metadata={"help": "Whether or not to save the training loss curves."},
)
def __post_init__(self):
def split_arg(arg):
if isinstance(arg, str):
return [item.strip() for item in arg.split(",")]
return arg
self.name_module_trainable = split_arg(self.name_module_trainable)
self.lora_alpha = self.lora_alpha or self.lora_rank * 2
self.lora_target = split_arg(self.lora_target)
self.additional_target = split_arg(self.additional_target)
assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method."
assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."
assert self.reward_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."
if self.stage == "ppo" and self.reward_model is None:
raise ValueError("Reward model is necessary for PPO training.")
if self.stage == "ppo" and self.reward_model_type == "lora" and self.finetuning_type != "lora":
raise ValueError("Freeze/Full PPO training needs `reward_model_type=full`.")
def save_to_json(self, json_path: str):
r"""Saves the content of this instance in JSON format inside `json_path`."""
json_string = json.dumps(asdict(self), indent=2, sort_keys=True) + "\n"
with open(json_path, "w", encoding="utf-8") as f:
f.write(json_string)
@classmethod
def load_from_json(cls, json_path: str):
r"""Creates an instance from the content of `json_path`."""
with open(json_path, "r", encoding="utf-8") as f:
text = f.read()
return cls(**json.loads(text))
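if __name__ == "__main__":
    # A sketch, not part of the original file, exercising the `__post_init__`
    # conveniences: comma-separated strings are split into lists and
    # `lora_alpha` defaults to twice `lora_rank`.
    args = FinetuningArguments(lora_target="q_proj,v_proj", lora_rank=16)
    print(args.lora_target, args.lora_alpha)  # ['q_proj', 'v_proj'] 32
    try:
        FinetuningArguments(stage="ppo")  # PPO requires a reward model
    except ValueError as err:
        print(err)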
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, Optional
@dataclass
class GeneratingArguments:
r"""
    Arguments pertaining to the decoding parameters.
"""
do_sample: Optional[bool] = field(
default=True,
        metadata={"help": "Whether or not to use sampling; use greedy decoding otherwise."},
)
temperature: Optional[float] = field(
default=0.95,
metadata={"help": "The value used to modulate the next token probabilities."},
)
top_p: Optional[float] = field(
default=0.7,
        metadata={
            "help": "The smallest set of most probable tokens with probabilities that add up to top_p or higher is kept."
        },
)
top_k: Optional[int] = field(
default=50,
metadata={"help": "The number of highest probability vocabulary tokens to keep for top-k filtering."},
)
num_beams: Optional[int] = field(
default=1,
metadata={"help": "Number of beams for beam search. 1 means no beam search."},
)
max_length: Optional[int] = field(
default=512,
metadata={"help": "The maximum length the generated tokens can have. It can be overridden by max_new_tokens."},
)
max_new_tokens: Optional[int] = field(
default=512,
        metadata={"help": "The maximum number of tokens to generate, ignoring the number of tokens in the prompt."},
)
repetition_penalty: Optional[float] = field(
default=1.0,
metadata={"help": "The parameter for repetition penalty. 1.0 means no penalty."},
)
length_penalty: Optional[float] = field(
default=1.0,
metadata={"help": "Exponential penalty to the length that is used with beam-based generation."},
)
def to_dict(self) -> Dict[str, Any]:
args = asdict(self)
if args.get("max_new_tokens", -1) > 0:
args.pop("max_length", None)
else:
args.pop("max_new_tokens", None)
return args
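if __name__ == "__main__":
    # A sketch, not part of the original file: `to_dict` keeps only one of the
    # two length controls, so `max_length` is dropped whenever `max_new_tokens`
    # is positive, and vice versa.
    print("max_length" in GeneratingArguments(max_new_tokens=256).to_dict())  # False
    print("max_new_tokens" in GeneratingArguments(max_new_tokens=0).to_dict())  # False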
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, Literal, Optional
@dataclass
class ModelArguments:
r"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune.
"""
model_name_or_path: str = field(
metadata={
"help": "Path to the model weight or identifier from huggingface.co/models or modelscope.cn/models."
},
)
adapter_name_or_path: Optional[str] = field(
default=None,
metadata={"help": "Path to the adapter weight or identifier from huggingface.co/models."},
)
cache_dir: Optional[str] = field(
default=None,
metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."},
)
use_fast_tokenizer: Optional[bool] = field(
default=False,
        metadata={"help": "Whether or not to use one of the fast tokenizers (backed by the tokenizers library)."},
)
resize_vocab: Optional[bool] = field(
default=False,
metadata={"help": "Whether or not to resize the tokenizer vocab and the embedding layers."},
)
split_special_tokens: Optional[bool] = field(
default=False,
metadata={"help": "Whether or not the special tokens should be split during the tokenization process."},
)
model_revision: Optional[str] = field(
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
quantization_bit: Optional[int] = field(
default=None,
metadata={"help": "The number of bits to quantize the model."},
)
quantization_type: Optional[Literal["fp4", "nf4"]] = field(
default="nf4",
metadata={"help": "Quantization data type to use in int4 training."},
)
double_quantization: Optional[bool] = field(
default=True,
metadata={"help": "Whether or not to use double quantization in int4 training."},
)
rope_scaling: Optional[Literal["linear", "dynamic"]] = field(
default=None,
metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."},
)
flash_attn: Optional[bool] = field(
default=False,
metadata={"help": "Enable FlashAttention-2 for faster training."},
)
shift_attn: Optional[bool] = field(
default=False,
metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."},
)
use_unsloth: Optional[bool] = field(
default=False,
metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."},
)
disable_gradient_checkpointing: Optional[bool] = field(
default=False,
metadata={"help": "Whether or not to disable gradient checkpointing."},
)
upcast_layernorm: Optional[bool] = field(
default=False,
        metadata={"help": "Whether or not to upcast the layernorm weights to fp32."},
)
upcast_lmhead_output: Optional[bool] = field(
default=False,
        metadata={"help": "Whether or not to upcast the output of lm_head to fp32."},
)
hf_hub_token: Optional[str] = field(
default=None,
metadata={"help": "Auth token to log in with Hugging Face Hub."},
)
ms_hub_token: Optional[str] = field(
default=None,
metadata={"help": "Auth token to log in with ModelScope Hub."},
)
export_dir: Optional[str] = field(
default=None,
metadata={"help": "Path to the directory to save the exported model."},
)
export_size: Optional[int] = field(
default=1,
metadata={"help": "The file shard size (in GB) of the exported model."},
)
export_quantization_bit: Optional[int] = field(
default=None,
metadata={"help": "The number of bits to quantize the exported model."},
)
export_quantization_dataset: Optional[str] = field(
default=None,
metadata={"help": "Path to the dataset or dataset name to use in quantizing the exported model."},
)
export_quantization_nsamples: Optional[int] = field(
default=128,
metadata={"help": "The number of samples used for quantization."},
)
export_quantization_maxlen: Optional[int] = field(
default=1024,
metadata={"help": "The maximum length of the model inputs used for quantization."},
)
export_legacy_format: Optional[bool] = field(
default=False,
metadata={"help": "Whether or not to save the `.bin` files instead of `.safetensors`."},
)
export_hub_model_id: Optional[str] = field(
default=None,
        metadata={"help": "The name of the repository when pushing the model to the Hugging Face Hub."},
)
print_param_status: Optional[bool] = field(
default=False,
metadata={"help": "For debugging purposes, print the status of the parameters in the model."},
)
def __post_init__(self):
self.compute_dtype = None
self.model_max_length = None
if self.split_special_tokens and self.use_fast_tokenizer:
raise ValueError("`split_special_tokens` is only supported for slow tokenizers.")
if self.adapter_name_or_path is not None: # support merging multiple lora weights
self.adapter_name_or_path = [path.strip() for path in self.adapter_name_or_path.split(",")]
assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."
assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization."
if self.export_quantization_bit is not None and self.export_quantization_dataset is None:
raise ValueError("Quantization dataset is necessary for exporting.")
def to_dict(self) -> Dict[str, Any]:
return asdict(self)
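if __name__ == "__main__":
    # A sketch, not part of the original file; the paths are placeholders.
    # `__post_init__` splits comma-separated adapter paths so multiple LoRA
    # weights can be merged on top of one base model.
    args = ModelArguments(
        model_name_or_path="path/to/base_model",
        adapter_name_or_path="path/to/adapter_a, path/to/adapter_b",
    )
    print(args.adapter_name_or_path)  # ['path/to/adapter_a', 'path/to/adapter_b']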