Commit ca625f43 authored by shihm's avatar shihm
Browse files

update data

parent 7164651d
### model
model_name_or_path: Qwen/Qwen3-30B-A3B-Instruct-2507
# GPU memory: 8 * 78GB
### method
do_train: true
stage: sft
finetuning_type: full # only support full for now
### dataset
dataset: alpaca_en_demo
preprocessing_num_workers: 8
cutoff_len: 4096
template: qwen3_nothink
### train
# global batchsize = (8 // 2 // 4) * 8 = 8
# NOTE(review): the formula above reads as (num_gpus // ep // pp) * grad_accum
# given tp=1, pp=4, ep=2 below — confirm against the launcher's actual
# data-parallel layout (plain DP would be num_gpus // tp // pp = 2, giving 16).
output_dir: saves/mca/qwen3_moe_full
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
num_train_epochs: 2
learning_rate: 3e-6
logging_steps: 1
save_steps: 100
lr_scheduler_type: constant
bf16: true
# mcore speed up
# 8 GPUs split as tp=1 x pp=4 x ep=2 (expert parallel for the MoE layers)
tensor_model_parallel_size: 1
sequence_parallel: false
pipeline_model_parallel_size: 4
bias_activation_fusion: true
apply_rope_fusion: true
use_distributed_optimizer: true
overlap_param_gather: true
overlap_grad_reduce: true
# MoE-specific kernels/routing; grouped GEMM batches the per-expert matmuls
moe_grouped_gemm: true
moe_token_dispatcher_type: alltoall
expert_model_parallel_size: 2
# full activation recomputation trades compute for memory
recompute_granularity: full
# Export a full-finetuned Qwen3-4B SFT checkpoint to a standalone model dir.
### model
model_name_or_path: saves/qwen3-4b/full/sft
template: qwen3_nothink
trust_remote_code: true
### export
export_dir: saves/qwen3_sft_merged
# presumably the max shard size in GB — TODO confirm against the exporter docs
export_size: 5
export_device: cpu # choices: [cpu, auto]
export_legacy_format: false
# Export Qwen3-4B as a GPTQ-quantized model (post-training quantization).
### model
model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
template: qwen3_nothink
trust_remote_code: true
### export
export_dir: saves/qwen3_gptq
# 4-bit GPTQ; the dataset below is used for quantization calibration
export_quantization_bit: 4
export_quantization_dataset: data/c4_demo.jsonl
# presumably the max shard size in GB — TODO confirm against the exporter docs
export_size: 5
export_device: cpu # choices: [cpu, auto]
export_legacy_format: false
# Merge a LoRA adapter into the Qwen3-4B base model and export the result.
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
### model
model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
adapter_name_or_path: saves/qwen3-4b/lora/sft
template: qwen3_nothink
trust_remote_code: true
### export
export_dir: saves/qwen3_sft_merged
# presumably the max shard size in GB — TODO confirm against the exporter docs
export_size: 5
export_device: cpu # choices: [cpu, auto]
export_legacy_format: false
# Merge a LoRA adapter into the Qwen3-VL-4B base model and export the result.
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
### model
model_name_or_path: Qwen/Qwen3-VL-4B-Instruct
adapter_name_or_path: saves/qwen3-vl-4b/lora/sft
template: qwen3_vl_nothink
trust_remote_code: true
### export
export_dir: saves/qwen3_vl_sft_merged
# presumably the max shard size in GB — TODO confirm against the exporter docs
export_size: 5
export_device: cpu # choices: [cpu, auto]
export_legacy_format: false
# Extra pip requirements (mcore / quantization / audio-vision / sglang extras).
transformer_engine[pytorch]>=2.0.0
accelerate>=1.10.0
torchao>=0.8.0
# NOTE(review): duplicate of the `accelerate>=1.10.0` pin above — this looks
# like a boundary between two concatenated requirements files in the commit;
# verify the file split before deduplicating.
accelerate>=1.10.0
optimum>=1.24.0
gptqmodel>=2.0.0
soundfile
torchvision
torchaudio
vector_quantize_pytorch
vocos
msgpack
referencing
jsonschema_specifications
sglang[srt]>=0.4.5
# NOTE(review): hard pin — confirm this transformers version is compatible
# with the sglang constraint above.
transformers==4.51.1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment