Add QwQ-32B

317a82e2 · chenych · 37b0ad9f
Commit 317a82e2 authored Mar 07, 2025 by chenych
20 changed files
--- a/examples/train_lora/llama3_lora_ppo.yaml
+++ b/examples/train_lora/llama3_lora_ppo.yaml
@@ -7,6 +7,7 @@ trust_remote_code: true
 stage: ppo
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset

--- a/examples/train_lora/llama3_lora_predict.yaml
+++ b/examples/train_lora/llama3_lora_predict.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-adapter_name_or_path: saves/llama3-8b/lora/sft
-### method
-stage: sft
-do_predict: true
-finetuning_type: lora
-### dataset
-eval_dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 2048
-max_samples: 50
-overwrite_cache: true
-preprocessing_num_workers: 16
-### output
-output_dir: saves/llama3-8b/lora/predict
-overwrite_output_dir: true
-### eval
-per_device_eval_batch_size: 1
-predict_with_generate: true
-ddp_timeout: 180000000
--- a/examples/train_lora/llama3_lora_pretrain.yaml
+++ b/examples/train_lora/llama3_lora_pretrain.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: pt
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset
@@ -14,6 +15,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 ### output
 output_dir: saves/llama3-8b/lora/pretrain
@@ -21,6 +23,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 ### train
 per_device_train_batch_size: 1
@@ -31,9 +34,11 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 ### eval
-val_size: 0.1
+# eval_dataset: c4_demo
-per_device_eval_batch_size: 1
+# val_size: 0.1
-eval_strategy: steps
+# per_device_eval_batch_size: 1
-eval_steps: 500
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_reward.yaml
+++ b/examples/train_lora/llama3_lora_reward.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: rm
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset
@@ -15,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 ### output
 output_dir: saves/llama3-8b/lora/reward
@@ -22,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 ### train
 per_device_train_batch_size: 1
@@ -32,9 +35,11 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 ### eval
-val_size: 0.1
+# eval_dataset: dpo_en_demo
-per_device_eval_batch_size: 1
+# val_size: 0.1
-eval_strategy: steps
+# per_device_eval_batch_size: 1
-eval_steps: 500
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_sft.yaml
+++ b/examples/train_lora/llama3_lora_sft.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset
@@ -15,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 ### output
 output_dir: saves/llama3-8b/lora/sft
@@ -22,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 ### train
 per_device_train_batch_size: 1
@@ -32,9 +35,11 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 ### eval
-val_size: 0.1
+# eval_dataset: alpaca_en_demo
-per_device_eval_batch_size: 1
+# val_size: 0.1
-eval_strategy: steps
+# per_device_eval_batch_size: 1
-eval_steps: 500
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_sft_ds0.yaml
+++ b/examples/train_lora/llama3_lora_sft_ds0.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-deepspeed: examples/deepspeed/ds_z0_config.json
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 2048
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-### output
-output_dir: saves/llama3-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 2
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/train_lora/llama3_lora_sft_ds3.yaml
+++ b/examples/train_lora/llama3_lora_sft_ds3.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
@@ -16,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 ### output
 output_dir: saves/llama3-8b/lora/sft
@@ -23,6 +25,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 ### train
 per_device_train_batch_size: 1
@@ -33,9 +36,11 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 ### eval
-val_size: 0.1
+# eval_dataset: alpaca_en_demo
-per_device_eval_batch_size: 1
+# val_size: 0.1
-eval_strategy: steps
+# per_device_eval_batch_size: 1
-eval_steps: 500
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_sft_ray.yaml
+++ b/examples/train_lora/llama3_lora_sft_ray.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset
@@ -16,6 +17,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 ### output
 output_dir: tmp_dir
@@ -23,6 +25,15 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
+### ray
+ray_run_name: llama3_8b_sft_lora
+ray_storage_path: ./saves
+ray_num_workers: 4  # number of GPUs to use
+resources_per_worker:
+  GPU: 1
+placement_strategy: PACK
 ### train
 per_device_train_batch_size: 1
@@ -33,16 +44,11 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 ### eval
-val_size: 0.1
+# eval_dataset: alpaca_en_demo
-per_device_eval_batch_size: 1
+# val_size: 0.1
-eval_strategy: steps
+# per_device_eval_batch_size: 1
-eval_steps: 500
+# eval_strategy: steps
+# eval_steps: 500
-### ray
-ray_run_name: llama3_8b_sft_lora
-ray_num_workers: 4  # number of GPUs to use
-resources_per_worker:
-  GPU: 1
-placement_strategy: PACK
--- a/examples/train_lora/llama3_preprocess.yaml
+++ b/examples/train_lora/llama3_preprocess.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset

--- a/examples/train_lora/llava1_5_lora_sft.yaml
+++ b/examples/train_lora/llava1_5_lora_sft.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset
@@ -15,6 +16,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 ### output
 output_dir: saves/llava1_5-7b/lora/sft
@@ -22,6 +24,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 ### train
 per_device_train_batch_size: 1
@@ -32,9 +35,10 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_lora/qwen2vl_lora_dpo.yaml
+++ b/examples/train_lora/qwen2vl_lora_dpo.yaml
 ### model
 model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
+image_max_pixels: 262144
+video_max_pixels: 16384
 trust_remote_code: true
 ### method
 stage: dpo
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 pref_beta: 0.1
 pref_loss: sigmoid  # choices: [sigmoid (dpo), orpo, simpo]
@@ -17,6 +20,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 ### output
 output_dir: saves/qwen2_vl-7b/lora/dpo
@@ -24,6 +28,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 ### train
 per_device_train_batch_size: 1
@@ -34,9 +39,10 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_lora/qwen2vl_lora_sft.yaml
+++ b/examples/train_lora/qwen2vl_lora_sft.yaml
 ### model
 model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
+image_max_pixels: 262144
+video_max_pixels: 16384
 trust_remote_code: true
 ### method
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset
@@ -15,6 +18,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4
 ### output
 output_dir: saves/qwen2_vl-7b/lora/sft
@@ -22,6 +26,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false
 ### train
 per_device_train_batch_size: 1
@@ -32,9 +37,10 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_aqlm.yaml
+++ b/examples/train_qlora/llama3_lora_sft_aqlm.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset
@@ -34,7 +35,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_awq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_awq.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset
@@ -34,7 +35,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml
+++ b/examples/train_qlora/llama3_lora_sft_bitsandbytes.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-quantization_bit: 4
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-### output
-output_dir: saves/llama3-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-ddp_timeout: 180000000
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
+++ b/examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
@@ -9,6 +9,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset
@@ -37,7 +38,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_gptq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_gptq.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset
@@ -34,7 +35,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/examples/train_qlora/llama3_lora_sft_otfq.yaml
+++ b/examples/train_qlora/llama3_lora_sft_otfq.yaml
@@ -8,6 +8,7 @@ trust_remote_code: true
 stage: sft
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 ### dataset
@@ -36,7 +37,7 @@ bf16: true
 ddp_timeout: 180000000
 ### eval
-val_size: 0.1
+# val_size: 0.1
-per_device_eval_batch_size: 1
+# per_device_eval_batch_size: 1
-eval_strategy: steps
+# eval_strategy: steps
-eval_steps: 500
+# eval_steps: 500
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,6 +2,22 @@
 requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
+[project]
+name = "llamafactory"
+dynamic = [
+    "version",
+    "dependencies",
+    "optional-dependencies",
+    "requires-python",
+    "scripts",
+    "authors",
+    "description",
+    "readme",
+    "license",
+    "keywords",
+    "classifiers"
+]
 [tool.ruff]
 target-version = "py38"
 line-length = 119
@@ -31,3 +47,19 @@ indent-style = "space"
 docstring-code-format = true
 skip-magic-trailing-comma = false
 line-ending = "auto"
+[tool.uv]
+conflicts = [
+    [
+        { extra = "torch-npu" },
+        { extra = "aqlm" },
+    ],
+    [
+        { extra = "torch-npu" },
+        { extra = "liger-kernel" },
+    ],
+    [
+        { extra = "torch-npu" },
+        { extra = "vllm" },
+    ]
+]
--- a/requirements.txt
+++ b/requirements.txt
-transformers>=4.41.2,<=4.46.1
+transformers>=4.41.2,<=4.49.0,!=4.46.*,!=4.47.*,!=4.48.*;python_version<'3.10'
-datasets>=2.16.0,<=3.1.0
+transformers>=4.41.2,<=4.49.0,!=4.46.*,!=4.47.*,!=4.48.0;python_version>='3.10'
-accelerate>=0.34.0,<=1.0.1
+datasets>=2.16.0,<=3.2.0
+accelerate>=0.34.0,<=1.2.1
 peft>=0.11.1,<=0.12.0
 trl>=0.8.6,<=0.9.6
-tokenizers>=0.19.0,<0.20.4
+tokenizers>=0.19.0,<=0.21.0
-gradio>=4.0.0,<5.0.0
+gradio>=4.38.0,<=5.18.0
 pandas>=2.0.0
 scipy
 einops
@@ -21,4 +22,5 @@ packaging
 pyyaml
 numpy<2.0.0
 av
+librosa
 tyro<0.9.0