Initial commit

53b3977b · dongchy920 · 53b3977b · 53b3977b · 53b3977b · 53b3977b
Commit 53b3977b authored Jul 11, 2025 by dongchy920
20 changed files
--- a/LLaMA-Factory/evaluation/mmlu/mapping.json
+++ b/LLaMA-Factory/evaluation/mmlu/mapping.json
+{
+  "abstract_algebra": {
+    "name": "abstract algebra",
+    "category": "STEM"
+  },
+  "anatomy": {
+    "name": "anatomy",
+    "category": "Other"
+  },
+  "astronomy": {
+    "name": "astronomy",
+    "category": "STEM"
+  },
+  "business_ethics": {
+    "name": "business ethics",
+    "category": "Other"
+  },
+  "clinical_knowledge": {
+    "name": "clinical knowledge",
+    "category": "Other"
+  },
+  "college_biology": {
+    "name": "college biology",
+    "category": "STEM"
+  },
+  "college_chemistry": {
+    "name": "college chemistry",
+    "category": "STEM"
+  },
+  "college_computer_science": {
+    "name": "college computer science",
+    "category": "STEM"
+  },
+  "college_mathematics": {
+    "name": "college mathematics",
+    "category": "STEM"
+  },
+  "college_medicine": {
+    "name": "college medicine",
+    "category": "Other"
+  },
+  "college_physics": {
+    "name": "college physics",
+    "category": "STEM"
+  },
+  "computer_security": {
+    "name": "computer security",
+    "category": "STEM"
+  },
+  "conceptual_physics": {
+    "name": "conceptual physics",
+    "category": "STEM"
+  },
+  "econometrics": {
+    "name": "econometrics",
+    "category": "Social Sciences"
+  },
+  "electrical_engineering": {
+    "name": "electrical engineering",
+    "category": "STEM"
+  },
+  "elementary_mathematics": {
+    "name": "elementary mathematics",
+    "category": "STEM"
+  },
+  "formal_logic": {
+    "name": "formal logic",
+    "category": "Humanities"
+  },
+  "global_facts": {
+    "name": "global facts",
+    "category": "Other"
+  },
+  "high_school_biology": {
+    "name": "high school biology",
+    "category": "STEM"
+  },
+  "high_school_chemistry": {
+    "name": "high school chemistry",
+    "category": "STEM"
+  },
+  "high_school_computer_science": {
+    "name": "high school computer science",
+    "category": "STEM"
+  },
+  "high_school_european_history": {
+    "name": "high school european history",
+    "category": "Humanities"
+  },
+  "high_school_geography": {
+    "name": "high school geography",
+    "category": "Social Sciences"
+  },
+  "high_school_government_and_politics": {
+    "name": "high school government and politics",
+    "category": "Social Sciences"
+  },
+  "high_school_macroeconomics": {
+    "name": "high school macroeconomics",
+    "category": "Social Sciences"
+  },
+  "high_school_mathematics": {
+    "name": "high school mathematics",
+    "category": "STEM"
+  },
+  "high_school_microeconomics": {
+    "name": "high school microeconomics",
+    "category": "Social Sciences"
+  },
+  "high_school_physics": {
+    "name": "high school physics",
+    "category": "STEM"
+  },
+  "high_school_psychology": {
+    "name": "high school psychology",
+    "category": "Social Sciences"
+  },
+  "high_school_statistics": {
+    "name": "high school statistics",
+    "category": "STEM"
+  },
+  "high_school_us_history": {
+    "name": "high school us history",
+    "category": "Humanities"
+  },
+  "high_school_world_history": {
+    "name": "high school world history",
+    "category": "Humanities"
+  },
+  "human_aging": {
+    "name": "human aging",
+    "category": "Other"
+  },
+  "human_sexuality": {
+    "name": "human sexuality",
+    "category": "Social Sciences"
+  },
+  "international_law": {
+    "name": "international law",
+    "category": "Humanities"
+  },
+  "jurisprudence": {
+    "name": "jurisprudence",
+    "category": "Humanities"
+  },
+  "logical_fallacies": {
+    "name": "logical fallacies",
+    "category": "Humanities"
+  },
+  "machine_learning": {
+    "name": "machine learning",
+    "category": "STEM"
+  },
+  "management": {
+    "name": "management",
+    "category": "Other"
+  },
+  "marketing": {
+    "name": "marketing",
+    "category": "Other"
+  },
+  "medical_genetics": {
+    "name": "medical genetics",
+    "category": "Other"
+  },
+  "miscellaneous": {
+    "name": "miscellaneous",
+    "category": "Other"
+  },
+  "moral_disputes": {
+    "name": "moral disputes",
+    "category": "Humanities"
+  },
+  "moral_scenarios": {
+    "name": "moral scenarios",
+    "category": "Humanities"
+  },
+  "nutrition": {
+    "name": "nutrition",
+    "category": "Other"
+  },
+  "philosophy": {
+    "name": "philosophy",
+    "category": "Humanities"
+  },
+  "prehistory": {
+    "name": "prehistory",
+    "category": "Humanities"
+  },
+  "professional_accounting": {
+    "name": "professional accounting",
+    "category": "Other"
+  },
+  "professional_law": {
+    "name": "professional law",
+    "category": "Humanities"
+  },
+  "professional_medicine": {
+    "name": "professional medicine",
+    "category": "Other"
+  },
+  "professional_psychology": {
+    "name": "professional psychology",
+    "category": "Social Sciences"
+  },
+  "public_relations": {
+    "name": "public relations",
+    "category": "Social Sciences"
+  },
+  "security_studies": {
+    "name": "security studies",
+    "category": "Social Sciences"
+  },
+  "sociology": {
+    "name": "sociology",
+    "category": "Social Sciences"
+  },
+  "us_foreign_policy": {
+    "name": "us foreign policy",
+    "category": "Social Sciences"
+  },
+  "virology": {
+    "name": "virology",
+    "category": "Other"
+  },
+  "world_religions": {
+    "name": "world religions",
+    "category": "Humanities"
+  }
+}
--- a/LLaMA-Factory/evaluation/mmlu/mmlu.py
+++ b/LLaMA-Factory/evaluation/mmlu/mmlu.py
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import datasets
+import pandas as pd
+
+
+_CITATION = """\
+@article{hendryckstest2021,
+  title={Measuring Massive Multitask Language Understanding},
+  author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+  journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+  year={2021}
+}
+"""
+
+_DESCRIPTION = """\
+Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).
+"""
+
+_HOMEPAGE = "https://github.com/hendrycks/test"  # reported as the dataset homepage in MMLU._info
+
+_LICENSE = "MIT"  # reported as the dataset license in MMLU._info
+
+_URL = "mmlu.zip"  # archive handed to dl_manager.download_and_extract; presumably resolved relative to this script - TODO confirm
+
+task_list = [  # MMLU subject names: one MMLUConfig is built per entry, and the name selects the per-split CSV file
+    "high_school_european_history",
+    "business_ethics",
+    "clinical_knowledge",
+    "medical_genetics",
+    "high_school_us_history",
+    "high_school_physics",
+    "high_school_world_history",
+    "virology",
+    "high_school_microeconomics",
+    "econometrics",
+    "college_computer_science",
+    "high_school_biology",
+    "abstract_algebra",
+    "professional_accounting",
+    "philosophy",
+    "professional_medicine",
+    "nutrition",
+    "global_facts",
+    "machine_learning",
+    "security_studies",
+    "public_relations",
+    "professional_psychology",
+    "prehistory",
+    "anatomy",
+    "human_sexuality",
+    "college_medicine",
+    "high_school_government_and_politics",
+    "college_chemistry",
+    "logical_fallacies",
+    "high_school_geography",
+    "elementary_mathematics",
+    "human_aging",
+    "college_mathematics",
+    "high_school_psychology",
+    "formal_logic",
+    "high_school_statistics",
+    "international_law",
+    "high_school_mathematics",
+    "high_school_computer_science",
+    "conceptual_physics",
+    "miscellaneous",
+    "high_school_chemistry",
+    "marketing",
+    "professional_law",
+    "management",
+    "college_physics",
+    "jurisprudence",
+    "world_religions",
+    "sociology",
+    "us_foreign_policy",
+    "high_school_macroeconomics",
+    "computer_security",
+    "moral_scenarios",
+    "moral_disputes",
+    "electrical_engineering",
+    "astronomy",
+    "college_biology",
+]
+
+
+class MMLUConfig(datasets.BuilderConfig):  # builder configuration shared by every MMLU subject
+    def __init__(self, **kwargs):  # kwargs (e.g. name=...) are forwarded unchanged to datasets.BuilderConfig
+        super().__init__(version=datasets.Version("1.0.0"), **kwargs)  # every config is pinned to version 1.0.0
+
+
+class MMLU(datasets.GeneratorBasedBuilder):
+    BUILDER_CONFIGS = [
+        MMLUConfig(
+            name=task_name,
+        )
+        for task_name in task_list
+    ]
+
+    def _info(self):
+        features = datasets.Features(
+            {
+                "question": datasets.Value("string"),
+                "A": datasets.Value("string"),
+                "B": datasets.Value("string"),
+                "C": datasets.Value("string"),
+                "D": datasets.Value("string"),
+                "answer": datasets.Value("string"),
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        data_dir = dl_manager.download_and_extract(_URL)
+        task_name = self.config.name
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "data", "test", f"{task_name}_test.csv"),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "data", "val", f"{task_name}_val.csv"),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "data", "dev", f"{task_name}_dev.csv"),
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        df = pd.read_csv(filepath, header=None)
+        df.columns = ["question", "A", "B", "C", "D", "answer"]
+
+        yield from enumerate(df.to_dict(orient="records"))
--- a/LLaMA-Factory/evaluation/mmlu/mmlu.zip
+++ b/LLaMA-Factory/evaluation/mmlu/mmlu.zip
--- a/LLaMA-Factory/examples/README.md
+++ b/LLaMA-Factory/examples/README.md
+We provide a diverse set of examples for fine-tuning LLMs.
+
+Make sure to execute these commands in the `LLaMA-Factory` directory.
+
+## Table of Contents
+
+- [LoRA Fine-Tuning](#lora-fine-tuning)
+- [QLoRA Fine-Tuning](#qlora-fine-tuning)
+- [Full-Parameter Fine-Tuning](#full-parameter-fine-tuning)
+- [Merging LoRA Adapters and Quantization](#merging-lora-adapters-and-quantization)
+- [Inferring LoRA Fine-Tuned Models](#inferring-lora-fine-tuned-models)
+- [Extras](#extras)
+
+Use `CUDA_VISIBLE_DEVICES` (GPU) or `ASCEND_RT_VISIBLE_DEVICES` (NPU) to choose computing devices.
+
+By default, LLaMA-Factory uses all visible computing devices.
+
+## Examples
+
+### LoRA Fine-Tuning
+
+#### (Continuous) Pre-Training
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml
+```
+
+#### Supervised Fine-Tuning
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+```
+
+#### Multimodal Supervised Fine-Tuning
+
+```bash
+llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen2vl_lora_sft.yaml
+```
+
+#### DPO/ORPO/SimPO Training
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
+```
+
+#### Multimodal DPO/ORPO/SimPO Training
+
+```bash
+llamafactory-cli train examples/train_lora/qwen2vl_lora_dpo.yaml
+```
+
+#### Reward Modeling
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml
+```
+
+#### PPO Training
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml
+```
+
+#### KTO Training
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml
+```
+
+#### Preprocess Dataset
+
+It is useful for large datasets; use `tokenized_path` in the config to load the preprocessed dataset.
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_preprocess.yaml
+```
+
+#### Evaluating on MMLU/CMMLU/C-Eval Benchmarks
+
+```bash
+llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml
+```
+
+#### Supervised Fine-Tuning on Multiple Nodes
+
+```bash
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+```
+
+#### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
+```
+
+### QLoRA Fine-Tuning
+
+#### Supervised Fine-Tuning with 4/8-bit Bitsandbytes/HQQ/EETQ Quantization (Recommended)
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml
+```
+
+#### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml
+```
+
+#### Supervised Fine-Tuning with 4-bit AWQ Quantization
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml
+```
+
+#### Supervised Fine-Tuning with 2-bit AQLM Quantization
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
+```
+
+### Full-Parameter Fine-Tuning
+
+#### Supervised Fine-Tuning on Single Node
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+```
+
+#### Supervised Fine-Tuning on Multiple Nodes
+
+```bash
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+```
+
+#### Multimodal Supervised Fine-Tuning
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
+```
+
+### Merging LoRA Adapters and Quantization
+
+#### Merge LoRA Adapters
+
+Note: DO NOT use a quantized model or the `quantization_bit` option when merging LoRA adapters.
+
+```bash
+llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+#### Quantizing Model using AutoGPTQ
+
+```bash
+llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
+```
+
+### Inferring LoRA Fine-Tuned Models
+
+#### Batch Generation using vLLM Tensor Parallel
+
+```bash
+python scripts/vllm_infer.py --model_name_or_path path_to_merged_model --dataset alpaca_en_demo
+```
+
+#### Use CLI ChatBox
+
+```bash
+llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+```
+
+#### Use Web UI ChatBox
+
+```bash
+llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
+```
+
+#### Launch OpenAI-style API
+
+```bash
+llamafactory-cli api examples/inference/llama3_lora_sft.yaml
+```
+
+### Extras
+
+#### Full-Parameter Fine-Tuning using GaLore
+
+```bash
+llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
+```
+
+#### Full-Parameter Fine-Tuning using BAdam
+
+```bash
+llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
+```
+
+#### Full-Parameter Fine-Tuning using Adam-mini
+
+```bash
+llamafactory-cli train examples/extras/adam_mini/qwen2_full_sft.yaml
+```
+
+#### LoRA+ Fine-Tuning
+
+```bash
+llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
+```
+
+#### PiSSA Fine-Tuning
+
+```bash
+llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml
+```
+
+#### Mixture-of-Depths Fine-Tuning
+
+```bash
+llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
+```
+
+#### LLaMA-Pro Fine-Tuning
+
+```bash
+bash examples/extras/llama_pro/expand.sh
+llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
+```
+
+#### FSDP+QLoRA Fine-Tuning
+
+```bash
+bash examples/extras/fsdp_qlora/train.sh
+```
+
+#### Computing BLEU and ROUGE Scores
+
+```bash
+llamafactory-cli train examples/extras/nlg_eval/llama3_lora_predict.yaml
+```
--- a/LLaMA-Factory/examples/README_zh.md
+++ b/LLaMA-Factory/examples/README_zh.md
+我们提供了多样化的大模型微调示例脚本。
+
+请确保在 `LLaMA-Factory` 目录下执行下述命令。
+
+## 目录
+
+- [LoRA 微调](#lora-微调)
+- [QLoRA 微调](#qlora-微调)
+- [全参数微调](#全参数微调)
+- [合并 LoRA 适配器与模型量化](#合并-lora-适配器与模型量化)
+- [推理 LoRA 模型](#推理-lora-模型)
+- [杂项](#杂项)
+
+使用 `CUDA_VISIBLE_DEVICES`（GPU）或 `ASCEND_RT_VISIBLE_DEVICES`（NPU）选择计算设备。
+
+LLaMA-Factory 默认使用所有可见的计算设备。
+
+## 示例
+
+### LoRA 微调
+
+#### （增量）预训练
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml
+```
+
+#### 指令监督微调
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+```
+
+#### 多模态指令监督微调
+
+```bash
+llamafactory-cli train examples/train_lora/llava1_5_lora_sft.yaml
+llamafactory-cli train examples/train_lora/qwen2vl_lora_sft.yaml
+```
+
+#### DPO/ORPO/SimPO 训练
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_dpo.yaml
+```
+
+#### 多模态 DPO/ORPO/SimPO 训练
+
+```bash
+llamafactory-cli train examples/train_lora/qwen2vl_lora_dpo.yaml
+```
+
+#### 奖励模型训练
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_reward.yaml
+```
+
+#### PPO 训练
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_ppo.yaml
+```
+
+#### KTO 训练
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_lora_kto.yaml
+```
+
+#### 预处理数据集
+
+对于大数据集有帮助，在配置中使用 `tokenized_path` 以加载预处理后的数据集。
+
+```bash
+llamafactory-cli train examples/train_lora/llama3_preprocess.yaml
+```
+
+#### 在 MMLU/CMMLU/C-Eval 上评估
+
+```bash
+llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml
+```
+
+#### 多机指令监督微调
+
+```bash
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+```
+
+#### 使用 DeepSpeed ZeRO-3 平均分配显存
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
+```
+
+### QLoRA 微调
+
+#### 基于 4/8 比特 Bitsandbytes/HQQ/EETQ 量化进行指令监督微调（推荐）
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml
+```
+
+#### 基于 4/8 比特 GPTQ 量化进行指令监督微调
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_gptq.yaml
+```
+
+#### 基于 4 比特 AWQ 量化进行指令监督微调
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_awq.yaml
+```
+
+#### 基于 2 比特 AQLM 量化进行指令监督微调
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
+```
+
+### 全参数微调
+
+#### 在单机上进行指令监督微调
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+```
+
+#### 在多机上进行指令监督微调
+
+```bash
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+```
+
+#### 多模态指令监督微调
+
+```bash
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
+```
+
+### 合并 LoRA 适配器与模型量化
+
+#### 合并 LoRA 适配器
+
+注：请勿使用量化后的模型或 `quantization_bit` 参数来合并 LoRA 适配器。
+
+```bash
+llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
+```
+
+#### 使用 AutoGPTQ 量化模型
+
+```bash
+llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
+```
+
+### 推理 LoRA 模型
+
+#### 使用 vLLM+TP 批量推理
+
+```bash
+python scripts/vllm_infer.py --model_name_or_path path_to_merged_model --dataset alpaca_en_demo
+```
+
+#### 使用命令行对话框
+
+```bash
+llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+```
+
+#### 使用浏览器对话框
+
+```bash
+llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
+```
+
+#### 启动 OpenAI 风格 API
+
+```bash
+llamafactory-cli api examples/inference/llama3_lora_sft.yaml
+```
+
+### 杂项
+
+#### 使用 GaLore 进行全参数训练
+
+```bash
+llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
+```
+
+#### 使用 BAdam 进行全参数训练
+
+```bash
+llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml
+```
+
+#### 使用 Adam-mini 进行全参数训练
+
+```bash
+llamafactory-cli train examples/extras/adam_mini/qwen2_full_sft.yaml
+```
+
+#### LoRA+ 微调
+
+```bash
+llamafactory-cli train examples/extras/loraplus/llama3_lora_sft.yaml
+```
+
+#### PiSSA 微调
+
+```bash
+llamafactory-cli train examples/extras/pissa/llama3_lora_sft.yaml
+```
+
+#### 深度混合微调
+
+```bash
+llamafactory-cli train examples/extras/mod/llama3_full_sft.yaml
+```
+
+#### LLaMA-Pro 微调
+
+```bash
+bash examples/extras/llama_pro/expand.sh
+llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
+```
+
+#### FSDP+QLoRA 微调
+
+```bash
+bash examples/extras/fsdp_qlora/train.sh
+```
+
+#### 计算 BLEU 和 ROUGE 分数
+
+```bash
+llamafactory-cli train examples/extras/nlg_eval/llama3_lora_predict.yaml
+```
--- a/LLaMA-Factory/examples/accelerate/fsdp_config.yaml
+++ b/LLaMA-Factory/examples/accelerate/fsdp_config.yaml
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_backward_prefetch: BACKWARD_PRE
+  fsdp_forward_prefetch: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: true # offload may affect training speed
+  fsdp_sharding_strategy: FULL_SHARD
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sync_module_states: true
+  fsdp_use_orig_params: true
+machine_rank: 0
+main_training_function: main
+mixed_precision: fp16 # or bf16
+num_machines: 1 # the number of nodes
+num_processes: 2 # the number of GPUs in all nodes
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/LLaMA-Factory/examples/deepspeed/ds_z0_config.json
+++ b/LLaMA-Factory/examples/deepspeed/ds_z0_config.json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 0,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients": true,
+    "round_robin_gradients": true
+  }
+}
--- a/LLaMA-Factory/examples/deepspeed/ds_z2_config.json
+++ b/LLaMA-Factory/examples/deepspeed/ds_z2_config.json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 2,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients": true,
+    "round_robin_gradients": true
+  }
+}
--- a/LLaMA-Factory/examples/deepspeed/ds_z2_offload_config.json
+++ b/LLaMA-Factory/examples/deepspeed/ds_z2_offload_config.json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 2,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients": true,
+    "round_robin_gradients": true
+  }
+}
--- a/LLaMA-Factory/examples/deepspeed/ds_z3_config.json
+++ b/LLaMA-Factory/examples/deepspeed/ds_z3_config.json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 1e9,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 1e9,
+    "stage3_max_reuse_distance": 1e9,
+    "stage3_gather_16bit_weights_on_model_save": true
+  }
+}
--- a/LLaMA-Factory/examples/deepspeed/ds_z3_offload_config.json
+++ b/LLaMA-Factory/examples/deepspeed/ds_z3_offload_config.json
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 1e9,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 1e9,
+    "stage3_max_reuse_distance": 1e9,
+    "stage3_gather_16bit_weights_on_model_save": true
+  }
+}
--- a/LLaMA-Factory/examples/extras/adam_mini/qwen2_full_sft.yaml
+++ b/LLaMA-Factory/examples/extras/adam_mini/qwen2_full_sft.yaml
+### model
+model_name_or_path: Qwen/Qwen2-1.5B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_adam_mini: true
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: qwen
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/qwen2-1_5b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+eval_strategy: steps
+eval_steps: 500
--- a/LLaMA-Factory/examples/extras/badam/llama3_full_sft.yaml
+++ b/LLaMA-Factory/examples/extras/badam/llama3_full_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_badam: true
+badam_mode: layer
+badam_switch_mode: ascending
+badam_switch_interval: 50
+badam_verbose: 2
+# deepspeed: examples/deepspeed/ds_z3_config.json
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+eval_strategy: steps
+eval_steps: 500
--- a/LLaMA-Factory/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/LLaMA-Factory/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+quantization_bit: 4
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: all
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+eval_strategy: steps
+eval_steps: 500
--- a/LLaMA-Factory/examples/extras/fsdp_qlora/train.sh
+++ b/LLaMA-Factory/examples/extras/fsdp_qlora/train.sh
+#!/bin/bash
+# DO NOT use GPTQ/AWQ models with FSDP+QLoRA
+
+CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
+    --config_file examples/accelerate/fsdp_config.yaml \
+    src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
--- a/LLaMA-Factory/examples/extras/galore/llama3_full_sft.yaml
+++ b/LLaMA-Factory/examples/extras/galore/llama3_full_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_galore: true
+galore_layerwise: true
+galore_target: mlp,self_attn
+galore_rank: 128
+galore_scale: 2.0
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 1
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+pure_bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+eval_strategy: steps
+eval_steps: 500
--- a/LLaMA-Factory/examples/extras/llama_pro/expand.sh
+++ b/LLaMA-Factory/examples/extras/llama_pro/expand.sh
+#!/bin/bash
+
+python scripts/llama_pro.py \
+    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
+    --output_dir models/llama3-8b-pro \
+    --num_expand 8
--- a/LLaMA-Factory/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/LLaMA-Factory/examples/extras/llama_pro/llama3_freeze_sft.yaml
+### model
+model_name_or_path: models/llama3-8b-pro
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: freeze
+freeze_trainable_layers: 8
+freeze_trainable_modules: all
+use_llama_pro: true
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b-pro/freeze/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+eval_strategy: steps
+eval_steps: 500
--- a/LLaMA-Factory/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/LLaMA-Factory/examples/extras/loraplus/llama3_lora_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: all
+loraplus_lr_ratio: 16.0
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+eval_strategy: steps
+eval_steps: 500
--- a/LLaMA-Factory/examples/extras/mod/llama3_full_sft.yaml
+++ b/LLaMA-Factory/examples/extras/mod/llama3_full_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+mixture_of_depths: convert
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b-mod/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+optim: paged_adamw_8bit
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+pure_bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+eval_strategy: steps
+eval_steps: 500