uodata

ca625f43 · shihm · 7164651d · ca625f43 · ca625f43 · ca625f43
Commit ca625f43 authored Mar 30, 2026 by shihm
20 changed files
--- a/examples/ascend/qwen3moe_full_sft_fsdp.yaml
+++ b/examples/ascend/qwen3moe_full_sft_fsdp.yaml
+# Start FSDP fine-tuning
+# accelerate launch \
+#     --config_file examples/accelerate/fsdp_config.yaml \
+#     src/train.py examples/ascend/qwen3moe_full_sft_fsdp.yaml
+# Change `num_processes` in fsdp_config.yaml to 16 on A3
+### model
+model_name_or_path: Qwen/Qwen3-30B-A3B-Instruct-2507
+trust_remote_code: true
+use_v1_kernels: true
+flash_attn: fa2
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+disable_gradient_checkpointing: false
+### dataset
+dataset: alpaca_zh
+template: qwen3
+cutoff_len: 1024
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+### output
+output_dir: saves/Qwen3-30B-A3B-Instruct-2507/full/sft
+logging_steps: 1
+save_steps: 500
+max_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: true
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+### train
+per_device_train_batch_size: 4
+gradient_accumulation_steps: 1
+learning_rate: 1.0e-4
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+seed: 1234
--- a/examples/ascend/qwen3vlmoe_full_sft_fsdp2.yaml
+++ b/examples/ascend/qwen3vlmoe_full_sft_fsdp2.yaml
+# Start FSDP2 fine-tuning
+# accelerate launch \
+#     --config_file examples/accelerate/fsdp2_config.yaml \
+#     src/train.py examples/ascend/qwen3vlmoe_full_sft_fsdp2.yaml
+# Change `num_processes` in fsdp2_config.yaml to 16 on A3
+### model
+model_name_or_path: Qwen/Qwen3-VL-30B-A3B-Instruct
+image_max_pixels: 262144
+video_max_pixels: 16384
+trust_remote_code: true
+use_v1_kernels: true
+flash_attn: fa2
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+disable_gradient_checkpointing: false
+### dataset
+dataset: llava_1k_en, llava_1k_zh
+template: qwen3_vl
+cutoff_len: 1024
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+### output
+output_dir: saves/Qwen3-VL-30B-A3B-Instruct/full/sft
+logging_steps: 1
+save_steps: 500
+max_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: true
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+### train
+per_device_train_batch_size: 2
+gradient_accumulation_steps: 1
+learning_rate: 1.0e-4
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+seed: 1234
--- a/examples/ascend/qwen3vlmoe_lora_sft_fsdp.yaml
+++ b/examples/ascend/qwen3vlmoe_lora_sft_fsdp.yaml
+### model
+model_name_or_path: Qwen/Qwen3-VL-30B-A3B-Instruct
+image_max_pixels: 262144
+video_max_pixels: 16384
+trust_remote_code: true
+use_v1_kernels: true  # replaced kernels: [NpuRMSNormKernel, NpuRoPEKernel, NpuQwen3VLMoEFusedMoEKernel]
+### method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_rank: 8
+lora_target: all
+disable_gradient_checkpointing: false
+flash_attn: disabled
+### dataset
+dataset: alpaca_zh_demo, alpaca_en_demo
+template: qwen3_vl
+cutoff_len: 1024
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+### output
+output_dir: saves/qwen3vlmoe/lora/sft
+logging_steps: 1
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: true
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+### train
+per_device_train_batch_size: 8
+gradient_accumulation_steps: 1
+learning_rate: 1.0e-4
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+seed: 1234
--- a/examples/deepspeed/ds_z2_autotp_config.json
+++ b/examples/deepspeed/ds_z2_autotp_config.json
+{
+  "_comment": "supported model list: https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/#supported-models",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 2,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients": true,
+    "round_robin_gradients": true
+  },
+  "tensor_parallel": {
+    "autotp_size": 2
+  }
+}
--- a/examples/deepspeed/ds_z3_fp8_config.json
+++ b/examples/deepspeed/ds_z3_fp8_config.json
+{
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "zero_force_ds_cpu_optimizer": true,
+  "fp16": {
+    "enabled": false,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": false
+    },
+    "overlap_comm": false,
+    "contiguous_gradients": true,
+    "sub_group_size": 1000000000,
+    "reduce_bucket_size": 12845056,
+    "stage3_prefetch_bucket_size": 11560550,
+    "stage3_param_persistence_threshold": 35840,
+    "stage3_max_live_parameters": 1000000000,
+    "stage3_max_reuse_distance": 1000000000,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "steps_per_print": 10000000,
+  "gradient_accumulation_steps": "auto",
+  "comms_config": {
+    "verbose": false
+  },
+  "monitor_config": {
+    "enabled": true,
+    "tag": "DeepSpeedMonitor",
+    "csv_monitor": {
+      "enabled": false
+    }
+  }
+}
--- a/examples/extras/asft/llama2_full_asft.yaml
+++ b/examples/extras/asft/llama2_full_asft.yaml
+### model
+model_name_or_path: models/Llama-2-7b
+trust_remote_code: true
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+deepspeed: examples/deepspeed/ds_z0_config.json
+use_asft_loss: true
+asft_alpha: 0.1
+### dataset
+dataset: med
+template: llama2
+cutoff_len: 2048
+max_samples: 10000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+### output
+output_dir: saves/llama2-7b/full/asft2
+logging_steps: 1
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+### train
+per_device_train_batch_size: 4
+gradient_accumulation_steps: 8
+learning_rate: 2.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/asft/qwen2_full_asft.yaml
+++ b/examples/extras/asft/qwen2_full_asft.yaml
+### model
+model_name_or_path: models/Qwen2.5-7B
+trust_remote_code: true
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+deepspeed: examples/deepspeed/ds_z0_config.json
+use_asft_loss: true
+asft_alpha: 0.05
+### dataset
+dataset: math
+template: qwen
+cutoff_len: 2048
+max_samples: 10000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+### output
+output_dir: saves/qwen2-7b/full/asft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+### train
+per_device_train_batch_size: 4
+gradient_accumulation_steps: 8
+learning_rate: 5.0e-5
+num_train_epochs: 1.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/dft/qwen2_full_sft.yaml
+++ b/examples/extras/dft/qwen2_full_sft.yaml
+### model
+model_name_or_path: Qwen/Qwen2-1.5B-Instruct
+trust_remote_code: true
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_dft_loss: true
+### dataset
+dataset: identity,alpaca_en_demo
+template: qwen
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+### output
+output_dir: saves/qwen2-1_5b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/eaft/qwen25_05b_eaft_full.yaml
+++ b/examples/extras/eaft/qwen25_05b_eaft_full.yaml
+### model
+model_name_or_path: Qwen/Qwen2.5-0.5B-Instruct
+trust_remote_code: true
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_eaft_loss: true
+### dataset
+dataset: identity,alpaca_en_demo
+template: qwen
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+### output
+output_dir: qwen2.5-0_5b/full/sft_eaft
+logging_steps: 1
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+### train
+per_device_train_batch_size: 2
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
--- a/examples/extras/fp8/llama3_fp8_deepspeed_sft.yaml
+++ b/examples/extras/fp8/llama3_fp8_deepspeed_sft.yaml
+# FP8 training example with DeepSpeed ZeRO-3
+# This config demonstrates FP8 mixed precision training using HuggingFace Accelerate
+# with DeepSpeed providing memory optimization (not FP8 handling)
+### Model configuration
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+### Method configuration
+stage: sft
+do_train: true
+finetuning_type: full
+### Dataset configuration
+dataset: identity
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+### Output configuration
+output_dir: saves/llama3-8b/fp8-deepspeed/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+### Training configuration
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 5.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+### FP8 configuration
+fp8: true
+fp8_backend: torchao  # Use TorchAO backend for FP8
+fp8_enable_fsdp_float8_all_gather: false  # Not used with DeepSpeed
+### DeepSpeed configuration
+deepspeed: examples/deepspeed/ds_z3_fp8_config.json
+### Logging configuration
+report_to: wandb
+run_name: llama3_fp8_deepspeed_sft
--- a/examples/extras/fp8/llama3_fp8_fsdp_sft.yaml
+++ b/examples/extras/fp8/llama3_fp8_fsdp_sft.yaml
+# FP8 training example with FSDP
+# This config demonstrates FP8 mixed precision training using HuggingFace Accelerate
+# with FSDP for distributed training and float8 all-gather optimization
+### Model configuration
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+### Method configuration
+stage: sft
+do_train: true
+finetuning_type: full
+### Dataset configuration
+dataset: identity
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+### Output configuration
+output_dir: saves/llama3-8b/fp8-fsdp/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+### Training configuration
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 5.0e-5
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+### FP8 configuration
+fp8: true
+fp8_backend: torchao  # Use TorchAO backend for FP8
+fp8_enable_fsdp_float8_all_gather: true  # Enable FSDP2 float8 all-gather optimization
+### FSDP configuration (using training arguments - no separate FSDP config file)
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
+### Logging configuration
+report_to: wandb
+run_name: llama3_fp8_fsdp_sft
--- a/examples/extras/multi_tokens/tokens_cfg.yaml
+++ b/examples/extras/multi_tokens/tokens_cfg.yaml
+# SVG Container Tags
+"<|START_OF_SVG|>": "Marks the beginning of an SVG document"
+"<|END_OF_SVG|>": "Marks the end of an SVG document"
+# SVG Group Tags
+"<|start_of_g|>": "Begins a group element in SVG for organizing related shapes"
+"<|end_of_g|>": "Ends a group element"
+# SVG Shape Tags
+"<|start_of_rect|>": "Begins a rectangle shape with width and height attributes"
+"<|end_of_rect|>": "Ends a rectangle shape definition"
+"<|start_of_circle|>": "Begins a circular shape with radius attribute"
+"<|end_of_circle|>": "Ends a circular shape definition"
+"<|start_of_path|>": "Begins a path element for drawing custom vector graphics"
+"<|end_of_path|>": "Ends a path element definition"
+"<|start_of_ellipse|>": "Begins an ellipse shape with x and y radii"
+"<|end_of_ellipse|>": "Ends an ellipse shape definition"
+# SVG Text Tags
+"<|start_of_text|>": "Begins a text element for rendering text content"
+"<|end_of_text|>": "Ends a text element"
+# SVG Style Tags
+"<|start_of_style|>": "Begins a style definition block for CSS styling"
+"<|end_of_style|>": "Ends a style definition block"
--- a/examples/extras/oft/llama3_oft_sft.yaml
+++ b/examples/extras/oft/llama3_oft_sft.yaml
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+trust_remote_code: true
+### method
+stage: sft
+do_train: true
+finetuning_type: oft
+oft_block_size: 32
+oft_target: all
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+### output
+output_dir: saves/llama3-8b/oft/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+### eval
+# eval_dataset: alpaca_en_demo
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/oft/qwen2_5vl_oft_sft.yaml
+++ b/examples/extras/oft/qwen2_5vl_oft_sft.yaml
+### model
+model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
+image_max_pixels: 262144
+video_max_pixels: 16384
+trust_remote_code: true
+### method
+stage: sft
+do_train: true
+finetuning_type: oft
+oft_block_size: 32
+oft_target: all
+### dataset
+dataset: mllm_demo,identity,alpaca_en_demo  # video: mllm_video_demo
+template: qwen2_vl
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+### output
+output_dir: saves/qwen2_5vl-7b/oft/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+resume_from_checkpoint: null
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/qoft/llama3_oft_sft_awq.yaml
+++ b/examples/extras/qoft/llama3_oft_sft_awq.yaml
+### model
+model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
+trust_remote_code: true
+### method
+stage: sft
+do_train: true
+finetuning_type: oft
+oft_block_size: 32
+oft_target: all
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 2048
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+dataloader_num_workers: 4
+### output
+output_dir: saves/llama3-8b/oft/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+save_only_model: false
+report_to: none  # choices: [none, wandb, tensorboard, swanlab, mlflow]
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-4
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+bf16: true
+ddp_timeout: 180000000
+### eval
+# val_size: 0.1
+# per_device_eval_batch_size: 1
+# eval_strategy: steps
+# eval_steps: 500
--- a/examples/extras/qoft/llama3_oft_sft_bnb_npu.yaml
+++ b/examples/extras/qoft/llama3_oft_sft_bnb_npu.yaml
--- a/examples/extras/qoft/llama3_oft_sft_gptq.yaml
+++ b/examples/extras/qoft/llama3_oft_sft_gptq.yaml
--- a/examples/inference/qwen3.yaml
+++ b/examples/inference/qwen3.yaml
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
+template: qwen3_nothink
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
+trust_remote_code: true
--- a/examples/inference/qwen3_full_sft.yaml
+++ b/examples/inference/qwen3_full_sft.yaml
+model_name_or_path: saves/qwen3-4b/full/sft
+template: qwen3_nothink
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
+trust_remote_code: true
--- a/examples/inference/qwen3_lora_sft.yaml
+++ b/examples/inference/qwen3_lora_sft.yaml
+model_name_or_path: Qwen/Qwen3-4B-Instruct-2507
+adapter_name_or_path: saves/qwen3-4b/lora/sft
+template: qwen3_nothink
+infer_backend: huggingface  # choices: [huggingface, vllm, sglang, ktransformers]
+trust_remote_code: true