update to 0.9.2.dev0

8293100a · luopl · 2778a3d0 · 8293100a · 8293100a · 2778a3d0
Commit 8293100a authored Jan 16, 2025 by luopl
20 changed files
--- a/.env.local
+++ b/.env.local
@@ -12,6 +12,7 @@ FORCE_CHECK_IMPORTS=
 LLAMAFACTORY_VERBOSITY=
 USE_MODELSCOPE_HUB=
 USE_OPENMIND_HUB=
+USE_RAY=
 RECORD_VRAM=
 # torchrun
 FORCE_TORCHRUN=

--- a/.gitignore
+++ b/.gitignore
@@ -171,3 +171,5 @@ config/
 saves/
 output/
 wandb/
+swanlog/
+generated_predictions.jsonl
--- a/assets/wechat.jpg
+++ b/assets/wechat.jpg
--- a/assets/wechat_npu.jpg
+++ b/assets/wechat_npu.jpg
--- a/data/dataset_info.json
+++ b/data/dataset_info.json
@@ -296,6 +296,14 @@
      "response": "answer"
    }
  },
+  "openo1_sft": {
+    "hf_hub_url": "llamafactory/OpenO1-SFT",
+    "ms_hub_url": "llamafactory/OpenO1-SFT",
+    "columns": {
+      "prompt": "prompt",
+      "response": "response"
+    }
+  },
  "llava_1k_en": {
    "hf_hub_url": "BUAADreamer/llava-en-zh-2k",
    "subset": "en",
@@ -426,7 +434,7 @@
    }
  },
  "dpo_mix_en": {
-    "hf_hub_url": "hiyouga/DPO-En-Zh-20k",
+    "hf_hub_url": "llamafactory/DPO-En-Zh-20k",
    "subset": "en",
    "ranking": true,
    "formatting": "sharegpt",
@@ -437,7 +445,7 @@
    }
  },
  "dpo_mix_zh": {
-    "hf_hub_url": "hiyouga/DPO-En-Zh-20k",
+    "hf_hub_url": "llamafactory/DPO-En-Zh-20k",
    "subset": "zh",
    "ranking": true,
    "formatting": "sharegpt",

--- a/examples/README.md
+++ b/examples/README.md
@@ -13,6 +13,8 @@ Make sure to execute these commands in the `LLaMA-Factory` directory.

 Use `CUDA_VISIBLE_DEVICES` (GPU) or `ASCEND_RT_VISIBLE_DEVICES` (NPU) to choose computing devices.

+By default, LLaMA-Factory uses all visible computing devices.
+
 ## Examples

 ### LoRA Fine-Tuning
@@ -80,12 +82,6 @@ llamafactory-cli train examples/train_lora/llama3_preprocess.yaml
 llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml
 ```

-#### Batch Predicting and Computing BLEU and ROUGE Scores
-
-```bash
-llamafactory-cli train examples/train_lora/llama3_lora_predict.yaml
-```
-
 #### Supervised Fine-Tuning on Multiple Nodes

 ```bash
@@ -99,6 +95,12 @@ FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500
 FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
 ```

+#### Supervised Fine-Tuning with Ray on 4 GPUs
+
+```bash
+USE_RAY=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ray.yaml
+```
+
 ### QLoRA Fine-Tuning

 #### Supervised Fine-Tuning with 4/8-bit Bitsandbytes/HQQ/EETQ Quantization (Recommended)
@@ -107,6 +109,12 @@ FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.
 llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml
 ```

+#### Supervised Fine-Tuning with 4-bit Bitsandbytes Quantization on Ascend NPU
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+```
+
 #### Supervised Fine-Tuning with 4/8-bit GPTQ Quantization

 ```bash
@@ -130,14 +138,14 @@ llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
 #### Supervised Fine-Tuning on Single Node

 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
 ```

 #### Supervised Fine-Tuning on Multiple Nodes

 ```bash
-FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
-FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
 ```

 #### Multimodal Supervised Fine-Tuning
@@ -146,12 +154,6 @@ FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llama
 FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
 ```

-#### Batch Predicting and Computing BLEU and ROUGE Scores
-
-```bash
-llamafactory-cli train examples/train_full/llama3_full_predict.yaml
-```
-
 ### Merging LoRA Adapters and Quantization

 #### Merge LoRA Adapters
@@ -170,13 +172,19 @@ llamafactory-cli export examples/merge_lora/llama3_gptq.yaml

 ### Inferring LoRA Fine-Tuned Models

-#### Use CLI
+#### Batch Generation using vLLM Tensor Parallel
+
+```bash
+python scripts/vllm_infer.py --model_name_or_path path_to_merged_model --dataset alpaca_en_demo
+```
+
+#### Use CLI ChatBox

 ```bash
 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
 ```

-#### Use Web UI
+#### Use Web UI ChatBox

 ```bash
 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
@@ -196,6 +204,12 @@ llamafactory-cli api examples/inference/llama3_lora_sft.yaml
 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
 ```

+#### Full-Parameter Fine-Tuning using APOLLO
+
+```bash
+llamafactory-cli train examples/extras/apollo/llama3_full_sft.yaml
+```
+
 #### Full-Parameter Fine-Tuning using BAdam

 ```bash
@@ -238,3 +252,9 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
 ```bash
 bash examples/extras/fsdp_qlora/train.sh
 ```
+
+#### Computing BLEU and ROUGE Scores
+
+```bash
+llamafactory-cli train examples/extras/nlg_eval/llama3_lora_predict.yaml
+```
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -13,6 +13,8 @@

 使用 `CUDA_VISIBLE_DEVICES`（GPU）或 `ASCEND_RT_VISIBLE_DEVICES`（NPU）选择计算设备。

+LLaMA-Factory 默认使用所有可见的计算设备。
+
 ## 示例

 ### LoRA 微调
@@ -80,12 +82,6 @@ llamafactory-cli train examples/train_lora/llama3_preprocess.yaml
 llamafactory-cli eval examples/train_lora/llama3_lora_eval.yaml
 ```

-#### 批量预测并计算 BLEU 和 ROUGE 分数
-
-```bash
-llamafactory-cli train examples/train_lora/llama3_lora_predict.yaml
-```
-
 #### 多机指令监督微调

 ```bash
@@ -99,6 +95,12 @@ FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500
 FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.yaml
 ```

+#### 使用 Ray 在 4 张 GPU 上微调
+
+```bash
+USE_RAY=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ray.yaml
+```
+
 ### QLoRA 微调

 #### 基于 4/8 比特 Bitsandbytes/HQQ/EETQ 量化进行指令监督微调（推荐）
@@ -107,6 +109,12 @@ FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.
 llamafactory-cli train examples/train_qlora/llama3_lora_sft_otfq.yaml
 ```

+#### 在 NPU 上基于 4 比特 Bitsandbytes 量化进行指令监督微调
+
+```bash
+llamafactory-cli train examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+```
+
 #### 基于 4/8 比特 GPTQ 量化进行指令监督微调

 ```bash
@@ -130,14 +138,14 @@ llamafactory-cli train examples/train_qlora/llama3_lora_sft_aqlm.yaml
 #### 在单机上进行指令监督微调

 ```bash
-FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
+FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
 ```

 #### 在多机上进行指令监督微调

 ```bash
-FORCE_TORCHRUN=1 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
-FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft_ds3.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
+FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/train_full/llama3_full_sft.yaml
 ```

 #### 多模态指令监督微调
@@ -146,12 +154,6 @@ FORCE_TORCHRUN=1 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llama
 FORCE_TORCHRUN=1 llamafactory-cli train examples/train_full/qwen2vl_full_sft.yaml
 ```

-#### 批量预测并计算 BLEU 和 ROUGE 分数
-
-```bash
-llamafactory-cli train examples/train_full/llama3_full_predict.yaml
-```
-
 ### 合并 LoRA 适配器与模型量化

 #### 合并 LoRA 适配器
@@ -170,13 +172,19 @@ llamafactory-cli export examples/merge_lora/llama3_gptq.yaml

 ### 推理 LoRA 模型

-#### 使用命令行接口
+#### 使用 vLLM+TP 批量推理
+
+```bash
+python scripts/vllm_infer.py --model_name_or_path path_to_merged_model --dataset alpaca_en_demo
+```
+
+#### 使用命令行对话框

 ```bash
 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
 ```

-#### 使用浏览器界面
+#### 使用浏览器对话框

 ```bash
 llamafactory-cli webchat examples/inference/llama3_lora_sft.yaml
@@ -196,6 +204,12 @@ llamafactory-cli api examples/inference/llama3_lora_sft.yaml
 llamafactory-cli train examples/extras/galore/llama3_full_sft.yaml
 ```

+#### 使用 APOLLO 进行全参数训练
+
+```bash
+llamafactory-cli train examples/extras/apollo/llama3_full_sft.yaml
+```
+
 #### 使用 BAdam 进行全参数训练

 ```bash
@@ -238,3 +252,9 @@ llamafactory-cli train examples/extras/llama_pro/llama3_freeze_sft.yaml
 ```bash
 bash examples/extras/fsdp_qlora/train.sh
 ```
+
+#### 计算 BLEU 和 ROUGE 分数
+
+```bash
+llamafactory-cli train examples/extras/nlg_eval/llama3_lora_predict.yaml
+```
--- a/examples/extras/adam_mini/qwen2_full_sft.yaml
+++ b/examples/extras/adam_mini/qwen2_full_sft.yaml
 ### model
 model_name_or_path: Qwen/Qwen2-1.5B-Instruct
+trust_remote_code: true

 ### method
 stage: sft

--- a/examples/extras/apollo/llama3_full_sft.yaml
+++ b/examples/extras/apollo/llama3_full_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_apollo: true
+apollo_layerwise: true  # choices: [true, false], use false for DDP training
+apollo_target: all
+apollo_rank: 128
+apollo_scale: 32.0
+apollo_scale_type: channel
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 1  # use 1 for layerwise apollo
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+pure_bf16: true
+ddp_timeout: 180000000
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+eval_strategy: steps
+eval_steps: 500
--- a/examples/extras/badam/llama3_full_sft.yaml
+++ b/examples/extras/badam/llama3_full_sft.yaml
 ### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true

 ### method
 stage: sft

--- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
 ### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 quantization_bit: 4
+trust_remote_code: true

 ### method
 stage: sft

--- a/examples/extras/galore/llama3_full_sft.yaml
+++ b/examples/extras/galore/llama3_full_sft.yaml
 ### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true

 ### method
 stage: sft
 do_train: true
 finetuning_type: full
 use_galore: true
-galore_layerwise: true
-galore_target: mlp,self_attn
+galore_layerwise: true  # choices: [true, false], use false for DDP training
+galore_target: all
 galore_rank: 128
 galore_scale: 2.0

@@ -28,7 +29,7 @@ overwrite_output_dir: true

 ### train
 per_device_train_batch_size: 1
-gradient_accumulation_steps: 1
+gradient_accumulation_steps: 1  # use 1 for layerwise galore
 learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine

--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
 ### model
 model_name_or_path: models/llama3-8b-pro
+trust_remote_code: true

 ### method
 stage: sft

--- a/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
 ### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true

 ### method
 stage: sft

--- a/examples/extras/mod/llama3_full_sft.yaml
+++ b/examples/extras/mod/llama3_full_sft.yaml
 ### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true

 ### method
 stage: sft

--- a/examples/extras/nlg_eval/llama3_lora_predict.yaml
+++ b/examples/extras/nlg_eval/llama3_lora_predict.yaml
+# The batch generation can be SLOW using this config.
+# For faster inference, we recommend using `scripts/vllm_infer.py`.
+
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+adapter_name_or_path: saves/llama3-8b/lora/sft
+trust_remote_code: true
+
+### method
+stage: sft
+do_predict: true
+finetuning_type: lora
+
+### dataset
+eval_dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 50
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/lora/predict
+overwrite_output_dir: true
+
+### eval
+per_device_eval_batch_size: 1
+predict_with_generate: true
+ddp_timeout: 180000000
--- a/examples/extras/pissa/llama3_lora_sft.yaml
+++ b/examples/extras/pissa/llama3_lora_sft.yaml
 ### model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true

 ### method
 stage: sft

--- a/examples/inference/llama3.yaml
+++ b/examples/inference/llama3.yaml
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 template: llama3
+infer_backend: huggingface  # choices: [huggingface, vllm]
+trust_remote_code: true
--- a/examples/inference/llama3_full_sft.yaml
+++ b/examples/inference/llama3_full_sft.yaml
+model_name_or_path: saves/llama3-8b/full/sft
+template: llama3
+infer_backend: huggingface  # choices: [huggingface, vllm]
+trust_remote_code: true
--- a/examples/inference/llama3_lora_sft.yaml
+++ b/examples/inference/llama3_lora_sft.yaml
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft
 template: llama3
-finetuning_type: lora
+infer_backend: huggingface  # choices: [huggingface, vllm]
+trust_remote_code: true