Add QWQ-32B

317a82e2 · chenych · 37b0ad9f
Commit 317a82e2 authored Mar 07, 2025 by chenych
20 changed files
--- a/examples/lora_single_gpu/llama3_lora_ppo.yaml
+++ b/examples/lora_single_gpu/llama3_lora_ppo.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-reward_model: saves/llama3-8b/lora/reward
-
-### method
-stage: ppo
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/ppo
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-5
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### generate
-max_new_tokens: 512
-top_k: 0
-top_p: 0.9
--- a/examples/lora_single_gpu/llama3_lora_predict.yaml
+++ b/examples/lora_single_gpu/llama3_lora_predict.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-adapter_name_or_path: saves/llama3-8b/lora/sft
-
-### method
-stage: sft
-do_predict: true
-finetuning_type: lora
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 50
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/predict
-overwrite_output_dir: true
-
-### eval
-per_device_eval_batch_size: 1
-predict_with_generate: true
--- a/examples/lora_single_gpu/llama3_lora_pretrain.yaml
+++ b/examples/lora_single_gpu/llama3_lora_pretrain.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: pt
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: c4_demo
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/lora_single_gpu/llama3_lora_reward.yaml
+++ b/examples/lora_single_gpu/llama3_lora_reward.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: rm
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: dpo_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/reward
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-5
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/lora_single_gpu/llama3_lora_sft.yaml
+++ b/examples/lora_single_gpu/llama3_lora_sft.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/lora_single_gpu/llama3_preprocess.yaml
+++ b/examples/lora_single_gpu/llama3_preprocess.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-tokenized_path: saves/llama3-8b/dataset/sft
-
-### output
-output_dir: saves/llama3-8b/lora/sft
-overwrite_output_dir: true
--- a/examples/lora_single_gpu/llava1_5_lora_sft.yaml
+++ b/examples/lora_single_gpu/llava1_5_lora_sft.yaml
-### model
-model_name_or_path: llava-hf/llava-1.5-7b-hf
-visual_inputs: true
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: mllm_demo
-template: vicuna
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llava1_5-7b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/merge_lora/llama3_full_sft.yaml
+++ b/examples/merge_lora/llama3_full_sft.yaml
+### model
+model_name_or_path: saves/llama3-8b/full/sft
+template: llama3
+trust_remote_code: true
+
+### export
+export_dir: output/llama3_full_sft
+export_size: 5
+export_device: cpu
+export_legacy_format: false
--- a/examples/merge_lora/llama3_gptq.yaml
+++ b/examples/merge_lora/llama3_gptq.yaml
@@ -4,9 +4,9 @@ template: llama3
 trust_remote_code: true

 ### export
-export_dir: models/llama3_gptq
+export_dir: output/llama3_gptq
 export_quantization_bit: 4
 export_quantization_dataset: data/c4_demo.json
-export_size: 2
+export_size: 5
 export_device: cpu
 export_legacy_format: false
--- a/examples/merge_lora/llama3_lora_sft.yaml
+++ b/examples/merge_lora/llama3_lora_sft.yaml
@@ -4,11 +4,10 @@
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
 adapter_name_or_path: saves/llama3-8b/lora/sft
 template: llama3
-finetuning_type: lora
 trust_remote_code: true

 ### export
-export_dir: models/llama3_lora_sft
-export_size: 2
+export_dir: output/llama3_lora_sft
+export_size: 5
 export_device: cpu
 export_legacy_format: false
--- a/examples/merge_lora/qwen2vl_lora_sft.yaml
+++ b/examples/merge_lora/qwen2vl_lora_sft.yaml
@@ -4,11 +4,10 @@
 model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
 adapter_name_or_path: saves/qwen2_vl-7b/lora/sft
 template: qwen2_vl
-finetuning_type: lora
 trust_remote_code: true

 ### export
-export_dir: models/qwen2_vl_lora_sft
-export_size: 2
+export_dir: output/qwen2_vl_lora_sft
+export_size: 5
 export_device: cpu
 export_legacy_format: false
--- a/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_aqlm.yaml
-### model
-model_name_or_path: ISTA-DASLab/Meta-Llama-3-8B-Instruct-AQLM-2Bit-1x16
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_awq.yaml
-### model
-model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_bitsandbytes.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-quantization_bit: 4
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
+++ b/examples/qlora_single_gpu/llama3_lora_sft_gptq.yaml
-### model
-model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ
-
-### method
-stage: sft
-do_train: true
-finetuning_type: lora
-lora_target: all
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/lora/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-fp16: true
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/train_full/llama3_full_predict.yaml
+++ b/examples/train_full/llama3_full_predict.yaml
-### model
-model_name_or_path: saves/llama3-8b/full/sft
-
-### method
-stage: sft
-do_predict: true
-finetuning_type: full
-
-### dataset
-eval_dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 2048
-max_samples: 50
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/full/predict
-overwrite_output_dir: true
-
-### eval
-per_device_eval_batch_size: 1
-predict_with_generate: true
--- a/examples/train_full/llama3_full_sft.yaml
+++ b/examples/train_full/llama3_full_sft.yaml
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-trust_remote_code: true
-
-### method
-stage: sft
-do_train: true
-finetuning_type: full
-deepspeed: examples/deepspeed/ds_z3_config.json  # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 2048
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/full/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 2
-learning_rate: 1.0e-5
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-bf16: true
-ddp_timeout: 180000000
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
--- a/examples/train_full/qwen2vl_full_sft.yaml
+++ b/examples/train_full/qwen2vl_full_sft.yaml
 ### model
 model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
+image_max_pixels: 262144
+video_max_pixels: 16384
 trust_remote_code: true

 ### method
@@ -18,6 +20,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/qwen2_vl-7b/full/sft
@@ -25,6 +28,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -35,9 +39,10 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_dpo.yaml
+++ b/examples/train_lora/llama3_lora_dpo.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: dpo
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 pref_beta: 0.1
 pref_loss: sigmoid  # choices: [sigmoid (dpo), orpo, simpo]
@@ -17,6 +18,7 @@ cutoff_len: 2048
 max_samples: 1000
 overwrite_cache: true
 preprocessing_num_workers: 16
+dataloader_num_workers: 4

 ### output
 output_dir: saves/llama3-8b/lora/dpo
@@ -24,6 +26,7 @@ logging_steps: 10
 save_steps: 500
 plot_loss: true
 overwrite_output_dir: true
+save_only_model: false

 ### train
 per_device_train_batch_size: 1
@@ -34,9 +37,11 @@ lr_scheduler_type: cosine
 warmup_ratio: 0.1
 bf16: true
 ddp_timeout: 180000000
+resume_from_checkpoint: null

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# eval_dataset: dpo_en_demo
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/train_lora/llama3_lora_kto.yaml
+++ b/examples/train_lora/llama3_lora_kto.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 stage: kto
 do_train: true
 finetuning_type: lora
+lora_rank: 8
 lora_target: all
 pref_beta: 0.1

@@ -35,7 +36,7 @@ bf16: true
 ddp_timeout: 180000000

 ### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500