Add QWQ-32B

317a82e2 · chenych · 37b0ad9f · 317a82e2 · 317a82e2 · 317a82e2
Commit 317a82e2 authored Mar 07, 2025 by chenych
20 changed files
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -98,7 +98,7 @@ FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ds3.
 #### 使用 Ray 在 4 张 GPU 上微调

 ```bash
-USE_RAY=1 llamafactory-cli train examples/train_full/llama3_lora_sft_ray.yaml
+USE_RAY=1 llamafactory-cli train examples/train_lora/llama3_lora_sft_ray.yaml
 ```

 ### QLoRA 微调
@@ -170,6 +170,12 @@ llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 llamafactory-cli export examples/merge_lora/llama3_gptq.yaml
 ```

+### 保存 Ollama 配置文件
+
+```bash
+llamafactory-cli export examples/merge_lora/llama3_full_sft.yaml
+```
+
 ### 推理 LoRA 模型

 #### 使用 vLLM+TP 批量推理

--- a/examples/extras/adam_mini/qwen2_full_sft.yaml
+++ b/examples/extras/adam_mini/qwen2_full_sft.yaml
@@ -34,7 +34,7 @@ bf16: true
 ddp_timeout: 180000000

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/apollo/llama3_full_sft.yaml
+++ b/examples/extras/apollo/llama3_full_sft.yaml
@@ -39,7 +39,7 @@ pure_bf16: true
 ddp_timeout: 180000000

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/badam/llama3_full_sft.yaml
+++ b/examples/extras/badam/llama3_full_sft.yaml
@@ -37,7 +37,7 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/badam/llama3_full_sft_ds3.yaml
+++ b/examples/extras/badam/llama3_full_sft_ds3.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: full
-use_badam: true
-badam_mode: layer
-badam_switch_mode: ascending
-badam_switch_interval: 50
-badam_verbose: 2
-deepspeed: examples/deepspeed/ds_z3_config.json
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/full/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/extras/badam/llama3_lora_sft.yaml
+++ b/examples/extras/badam/llama3_lora_sft.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: full
-use_badam: true
-badam_switch_mode: ascending
-badam_switch_interval: 50
-badam_verbose: 2
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/full/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-pure_bf16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+++ b/examples/extras/fsdp_qlora/llama3_lora_sft.yaml
@@ -7,6 +7,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all

 ### dataset
@@ -35,7 +36,7 @@ bf16: true
 ddp_timeout: 180000000

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/fsdp_qlora/single_node.sh
+++ b/examples/extras/fsdp_qlora/single_node.sh
-#!/bin/bash
-# DO NOT use GPTQ/AWQ model in FSDP+QLoRA
-
-CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
-    --config_file examples/accelerate/fsdp_config.yaml \
-    src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
--- a/examples/extras/galore/llama3_full_sft.yaml
+++ b/examples/extras/galore/llama3_full_sft.yaml
@@ -38,7 +38,7 @@ pure_bf16: true
 ddp_timeout: 180000000

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -36,7 +36,7 @@ bf16: true
 ddp_timeout: 180000000

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/loraplus/llama3_lora_sft.yaml
+++ b/examples/extras/loraplus/llama3_lora_sft.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 loraplus_lr_ratio: 16.0

@@ -35,7 +36,7 @@ bf16: true
 ddp_timeout: 180000000

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/mod/llama3_full_sft.yaml
+++ b/examples/extras/mod/llama3_full_sft.yaml
@@ -35,7 +35,7 @@ pure_bf16: true
 ddp_timeout: 180000000

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/pissa/llama3_lora_sft.yaml
+++ b/examples/extras/pissa/llama3_lora_sft.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 pissa_init: true
 pissa_iter: 16
@@ -37,7 +38,7 @@ bf16: true
 ddp_timeout: 180000000

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/full_multi_gpu/llama3_full_predict.yaml
+++ b/examples/full_multi_gpu/llama3_full_predict.yaml
-### model
-model_name_or_path: saves/llama3-8b/full/sft
-
-### method
-stage: sft
-do_predict: true
-finetuning_type: full
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 50
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/full/predict
-overwrite_output_dir: true
-
-### eval
-per_device_eval_batch_size: 1
-predict_with_generate: true
--- a/examples/full_multi_gpu/llama3_full_sft.yaml
+++ b/examples/full_multi_gpu/llama3_full_sft.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: full
-
-### ddp
-ddp_timeout: 180000000
-deepspeed: examples/deepspeed/ds_z3_config.json
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/full/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 2
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/lora_multi_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### ddp
-ddp_timeout: 180000000
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 2
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
+++ b/examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### ddp
-ddp_timeout: 180000000
-deepspeed: examples/deepspeed/ds_z3_config.json
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 2
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/lora_single_gpu/llama3_lora_dpo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_dpo.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: dpo
-do_train: true
-finetuning_type: lora
-lora_target: all
-pref_beta: 0.1
-pref_loss: sigmoid  # [sigmoid (dpo), orpo, simpo]
-
-### dataset
-dataset: dpo_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/dpo
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 5.0e-6
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/lora_single_gpu/llama3_lora_eval.yaml
+++ b/examples/lora_single_gpu/llama3_lora_eval.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-adapter_name_or_path: saves/llama3-8b/lora/sft
-
-### method
-finetuning_type: lora
-
-### dataset
-task: mmlu
-split: test
-template: fewshot
-lang: en
-n_shot: 5
-
-### output
-save_dir: saves/llama3-8b/lora/eval
-
-### eval
-batch_size: 4
--- a/examples/lora_single_gpu/llama3_lora_kto.yaml
+++ b/examples/lora_single_gpu/llama3_lora_kto.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: kto
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: kto_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/kto
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 5.0e-6
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500