Commit ca625f43 authored by shihm's avatar shihm
Browse files

update data

parent 7164651d
### model
model_name_or_path: Qwen/Qwen3-30B-A3B-Instruct-2507
# GPU memory: 8 * 78GB
### method
do_train: true
stage: sft
finetuning_type: full # only support full for now
### dataset
dataset: alpaca_en_demo
preprocessing_num_workers: 8
cutoff_len: 4096
template: qwen3_nothink
### train
# global batchsize = (8 // 2 // 4) * 8 = 8
# NOTE(review): the formula above reads as (num_gpus // ep // pp) * grad_accum
# given tp=1, pp=4, ep=2 below — confirm against the launcher's actual
# data-parallel layout (plain DP would be num_gpus // tp // pp = 2, giving 16).
output_dir: saves/mca/qwen3_moe_full
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 2
learning_rate: 3e-6
logging_steps: 1
save_steps: 100
lr_scheduler_type: constant
bf16: true
# mcore speed up
# 8 GPUs split as tp=1 x pp=4 x ep=2 (expert parallel for the MoE layers)
tensor_model_parallel_size: 1
sequence_parallel: false
pipeline_model_parallel_size: 4
bias_activation_fusion: true
apply_rope_fusion: true
use_distributed_optimizer: true
overlap_param_gather: true
overlap_grad_reduce: true
# MoE-specific kernels/routing; grouped GEMM batches the per-expert matmuls
moe_grouped_gemm: true
moe_token_dispatcher_type: alltoall
expert_model_parallel_size: 2
# full activation recomputation trades compute for memory
recompute_granularity: full
# Export a full-finetuned Qwen3-4B SFT checkpoint to a standalone model dir.
### model
model_name_or_path: saves/qwen3-4b/full/sft
template: qwen3_nothink
trust_remote_code: true
### export
export_dir: saves/qwen3_sft_merged
# presumably the max shard size in GB — TODO confirm against the exporter docs
export_size: 5
export_device: cpu # choices: [cpu, auto]
export_legacy_format: false
# Export Qwen3-4B as a GPTQ-quantized model (post-training quantization).
### model
model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
template: qwen3_nothink
trust_remote_code: true
### export
export_dir: saves/qwen3_gptq
# 4-bit GPTQ; the dataset below is used for quantization calibration
export_quantization_bit: 4
export_quantization_dataset: data/c4_demo.jsonl
# presumably the max shard size in GB — TODO confirm against the exporter docs
export_size: 5
export_device: cpu # choices: [cpu, auto]
export_legacy_format: false
# Merge a LoRA adapter into the Qwen3-4B base model and export the result.
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
### model
model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
adapter_name_or_path: saves/qwen3-4b/lora/sft
template: qwen3_nothink
trust_remote_code: true
### export
export_dir: saves/qwen3_sft_merged
# presumably the max shard size in GB — TODO confirm against the exporter docs
export_size: 5
export_device: cpu # choices: [cpu, auto]
export_legacy_format: false
# Merge a LoRA adapter into the Qwen3-VL-4B base model and export the result.
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
### model
model_name_or_path: Qwen/Qwen3-VL-4B-Instruct
adapter_name_or_path: saves/qwen3-vl-4b/lora/sft
template: qwen3_vl_nothink
trust_remote_code: true
### export
export_dir: saves/qwen3_vl_sft_merged
# presumably the max shard size in GB — TODO confirm against the exporter docs
export_size: 5
export_device: cpu # choices: [cpu, auto]
export_legacy_format: false
# Extra pip requirements (mcore / quantization / audio-vision / sglang extras).
transformer_engine[pytorch]>=2.0.0
accelerate>=1.10.0
torchao>=0.8.0
# NOTE(review): duplicate of the `accelerate>=1.10.0` pin above — this looks
# like a boundary between two concatenated requirements files in the commit;
# verify the file split before deduplicating.
accelerate>=1.10.0
optimum>=1.24.0
gptqmodel>=2.0.0
soundfile
torchvision
torchaudio
vector_quantize_pytorch
vocos
msgpack
referencing
jsonschema_specifications
sglang[srt]>=0.4.5
# NOTE(review): hard pin — confirm this transformers version is compatible
# with the sglang constraint above.
transformers==4.51.1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment