Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
LLaMA-Factory
Commits
ca625f43
Commit
ca625f43
authored
Mar 30, 2026
by
shihm
Browse files
update
parent
7164651d
Changes
327
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
749 additions
and
0 deletions
+749
-0
examples/ascend/qwen3moe_full_sft_fsdp.yaml
examples/ascend/qwen3moe_full_sft_fsdp.yaml
+46
-0
examples/ascend/qwen3vlmoe_full_sft_fsdp2.yaml
examples/ascend/qwen3vlmoe_full_sft_fsdp2.yaml
+48
-0
examples/ascend/qwen3vlmoe_lora_sft_fsdp.yaml
examples/ascend/qwen3vlmoe_lora_sft_fsdp.yaml
+42
-0
examples/deepspeed/ds_z2_autotp_config.json
examples/deepspeed/ds_z2_autotp_config.json
+32
-0
examples/deepspeed/ds_z3_fp8_config.json
examples/deepspeed/ds_z3_fp8_config.json
+45
-0
examples/extras/asft/llama2_full_asft.yaml
examples/extras/asft/llama2_full_asft.yaml
+45
-0
examples/extras/asft/qwen2_full_asft.yaml
examples/extras/asft/qwen2_full_asft.yaml
+45
-0
examples/extras/dft/qwen2_full_sft.yaml
examples/extras/dft/qwen2_full_sft.yaml
+43
-0
examples/extras/eaft/qwen25_05b_eaft_full.yaml
examples/extras/eaft/qwen25_05b_eaft_full.yaml
+38
-0
examples/extras/fp8/llama3_fp8_deepspeed_sft.yaml
examples/extras/fp8/llama3_fp8_deepspeed_sft.yaml
+48
-0
examples/extras/fp8/llama3_fp8_fsdp_sft.yaml
examples/extras/fp8/llama3_fp8_fsdp_sft.yaml
+51
-0
examples/extras/multi_tokens/tokens_cfg.yaml
examples/extras/multi_tokens/tokens_cfg.yaml
+25
-0
examples/extras/oft/llama3_oft_sft.yaml
examples/extras/oft/llama3_oft_sft.yaml
+46
-0
examples/extras/oft/qwen2_5vl_oft_sft.yaml
examples/extras/oft/qwen2_5vl_oft_sft.yaml
+47
-0
examples/extras/qoft/llama3_oft_sft_awq.yaml
examples/extras/qoft/llama3_oft_sft_awq.yaml
+44
-0
examples/extras/qoft/llama3_oft_sft_bnb_npu.yaml
examples/extras/qoft/llama3_oft_sft_bnb_npu.yaml
+47
-0
examples/extras/qoft/llama3_oft_sft_gptq.yaml
examples/extras/qoft/llama3_oft_sft_gptq.yaml
+44
-0
examples/inference/qwen3.yaml
examples/inference/qwen3.yaml
+4
-0
examples/inference/qwen3_full_sft.yaml
examples/inference/qwen3_full_sft.yaml
+4
-0
examples/inference/qwen3_lora_sft.yaml
examples/inference/qwen3_lora_sft.yaml
+5
-0
No files found.
examples/ascend/qwen3moe_full_sft_fsdp.yaml
0 → 100644
View file @
ca625f43
# Start FSDP fine-tuning
# accelerate launch \
# --config_file examples/accelerate/fsdp_config.yaml \
# src/train.py examples/ascend/qwen3moe_full_sft_fsdp.yaml
# Set `num_processes` in fsdp_config.yaml to 16 when running on A3
### model
model_name_or_path
:
Qwen/Qwen3-30B-A3B-Instruct-2507
trust_remote_code
:
true
use_v1_kernels
:
true
flash_attn
:
fa2
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
disable_gradient_checkpointing
:
false
### dataset
dataset
:
alpaca_zh
template
:
qwen3
cutoff_len
:
1024
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/Qwen3-30B-A3B-Instruct-2507/full/sft
logging_steps
:
1
save_steps
:
500
max_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
true
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
4
gradient_accumulation_steps
:
1
learning_rate
:
1.0e-4
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
seed
:
1234
examples/ascend/qwen3vlmoe_full_sft_fsdp2.yaml
0 → 100644
View file @
ca625f43
# Start FSDP2 fine-tuning
# accelerate launch \
# --config_file examples/accelerate/fsdp2_config.yaml \
# src/train.py examples/ascend/qwen3vlmoe_full_sft_fsdp2.yaml
# Set `num_processes` in fsdp2_config.yaml to 16 when running on A3
### model
model_name_or_path
:
Qwen/Qwen3-VL-30B-A3B-Instruct
image_max_pixels
:
262144
video_max_pixels
:
16384
trust_remote_code
:
true
use_v1_kernels
:
true
flash_attn
:
fa2
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
disable_gradient_checkpointing
:
false
### dataset
dataset
:
llava_1k_en, llava_1k_zh
template
:
qwen3_vl
cutoff_len
:
1024
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/Qwen3-VL-30B-A3B-Instruct/full/sft
logging_steps
:
1
save_steps
:
500
max_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
true
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
2
gradient_accumulation_steps
:
1
learning_rate
:
1.0e-4
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
seed
:
1234
examples/ascend/qwen3vlmoe_lora_sft_fsdp.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-VL-30B-A3B-Instruct
image_max_pixels
:
262144
video_max_pixels
:
16384
trust_remote_code
:
true
use_v1_kernels
:
true
# replaced kernels: [NpuRMSNormKernel, NpuRoPEKernel, NpuQwen3VLMoEFusedMoEKernel]
### method
stage
:
sft
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
disable_gradient_checkpointing
:
false
flash_attn
:
disabled
### dataset
dataset
:
alpaca_zh_demo, alpaca_en_demo
template
:
qwen3_vl
cutoff_len
:
1024
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3vlmoe/lora/sft
logging_steps
:
1
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
true
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
8
gradient_accumulation_steps
:
1
learning_rate
:
1.0e-4
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
seed
:
1234
examples/deepspeed/ds_z2_autotp_config.json
0 → 100644
View file @
ca625f43
{
"_comment"
:
"supported model list: https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/#supported-models"
,
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"zero_allow_untested_optimizer"
:
true
,
"fp16"
:
{
"enabled"
:
"auto"
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"initial_scale_power"
:
16
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"zero_optimization"
:
{
"stage"
:
2
,
"allgather_partitions"
:
true
,
"allgather_bucket_size"
:
5e8
,
"overlap_comm"
:
false
,
"reduce_scatter"
:
true
,
"reduce_bucket_size"
:
5e8
,
"contiguous_gradients"
:
true
,
"round_robin_gradients"
:
true
},
"tensor_parallel"
:
{
"autotp_size"
:
2
}
}
examples/deepspeed/ds_z3_fp8_config.json
0 → 100644
View file @
ca625f43
{
"train_micro_batch_size_per_gpu"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"zero_allow_untested_optimizer"
:
true
,
"zero_force_ds_cpu_optimizer"
:
true
,
"fp16"
:
{
"enabled"
:
false
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"initial_scale_power"
:
16
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"zero_optimization"
:
{
"stage"
:
3
,
"offload_optimizer"
:
{
"device"
:
"cpu"
,
"pin_memory"
:
false
},
"overlap_comm"
:
false
,
"contiguous_gradients"
:
true
,
"sub_group_size"
:
1000000000
,
"reduce_bucket_size"
:
12845056
,
"stage3_prefetch_bucket_size"
:
11560550
,
"stage3_param_persistence_threshold"
:
35840
,
"stage3_max_live_parameters"
:
1000000000
,
"stage3_max_reuse_distance"
:
1000000000
,
"stage3_gather_16bit_weights_on_model_save"
:
true
},
"steps_per_print"
:
10000000
,
"gradient_accumulation_steps"
:
"auto"
,
"comms_config"
:
{
"verbose"
:
false
},
"monitor_config"
:
{
"enabled"
:
true
,
"tag"
:
"DeepSpeedMonitor"
,
"csv_monitor"
:
{
"enabled"
:
false
}
}
}
examples/extras/asft/llama2_full_asft.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
models/Llama-2-7b
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
deepspeed
:
examples/deepspeed/ds_z0_config.json
use_asft_loss
:
true
asft_alpha
:
0.1
### dataset
dataset
:
med
template
:
llama2
cutoff_len
:
2048
max_samples
:
10000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama2-7b/full/asft2
logging_steps
:
1
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
4
gradient_accumulation_steps
:
8
learning_rate
:
2.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/asft/qwen2_full_asft.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
models/Qwen2.5-7B
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
deepspeed
:
examples/deepspeed/ds_z0_config.json
use_asft_loss
:
true
asft_alpha
:
0.05
### dataset
dataset
:
math
template
:
qwen
cutoff_len
:
2048
max_samples
:
10000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen2-7b/full/asft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
4
gradient_accumulation_steps
:
8
learning_rate
:
5.0e-5
num_train_epochs
:
1.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/dft/qwen2_full_sft.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen2-1.5B-Instruct
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
use_dft_loss
:
true
### dataset
dataset
:
identity,alpaca_en_demo
template
:
qwen
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen2-1_5b/full/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/eaft/qwen25_05b_eaft_full.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen2.5-0.5B-Instruct
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
use_eaft_loss
:
true
### dataset
dataset
:
identity,alpaca_en_demo
template
:
qwen
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
qwen2.5-0_5b/full/sft_eaft
logging_steps
:
1
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
2
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
examples/extras/fp8/llama3_fp8_deepspeed_sft.yaml
0 → 100644
View file @
ca625f43
# FP8 training example with DeepSpeed ZeRO-3
# This config demonstrates FP8 mixed precision training using HuggingFace Accelerate
# with DeepSpeed providing memory optimization (not FP8 handling)
### Model configuration
model_name_or_path
:
meta-llama/Meta-Llama-3-8B-Instruct
trust_remote_code
:
true
### Method configuration
stage
:
sft
do_train
:
true
finetuning_type
:
full
### Dataset configuration
dataset
:
identity
template
:
llama3
cutoff_len
:
1024
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
### Output configuration
output_dir
:
saves/llama3-8b/fp8-deepspeed/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
### Training configuration
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
5.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
### FP8 configuration
fp8
:
true
fp8_backend
:
torchao
# Use TorchAO backend for FP8
fp8_enable_fsdp_float8_all_gather
:
false
# Not used with DeepSpeed
### DeepSpeed configuration
deepspeed
:
examples/deepspeed/ds_z3_fp8_config.json
### Logging configuration
report_to
:
wandb
run_name
:
llama3_fp8_deepspeed_sft
examples/extras/fp8/llama3_fp8_fsdp_sft.yaml
0 → 100644
View file @
ca625f43
# FP8 training example with FSDP
# This config demonstrates FP8 mixed precision training using HuggingFace Accelerate
# with FSDP for distributed training and float8 all-gather optimization
### Model configuration
model_name_or_path
:
meta-llama/Meta-Llama-3-8B-Instruct
trust_remote_code
:
true
### Method configuration
stage
:
sft
do_train
:
true
finetuning_type
:
full
### Dataset configuration
dataset
:
identity
template
:
llama3
cutoff_len
:
1024
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
### Output configuration
output_dir
:
saves/llama3-8b/fp8-fsdp/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
### Training configuration
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
5.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
### FP8 configuration
fp8
:
true
fp8_backend
:
torchao
# Use TorchAO backend for FP8
fp8_enable_fsdp_float8_all_gather
:
true
# Enable FSDP2 float8 all-gather optimization
### FSDP configuration (using training arguments - no separate FSDP config file)
fsdp
:
-
full_shard
-
auto_wrap
fsdp_transformer_layer_cls_to_wrap
:
LlamaDecoderLayer
### Logging configuration
report_to
:
wandb
run_name
:
llama3_fp8_fsdp_sft
examples/extras/multi_tokens/tokens_cfg.yaml
0 → 100644
View file @
ca625f43
# SVG Container Tags
"
<|START_OF_SVG|>"
:
"
Marks
the
beginning
of
an
SVG
document"
"
<|END_OF_SVG|>"
:
"
Marks
the
end
of
an
SVG
document"
# SVG Group Tags
"
<|start_of_g|>"
:
"
Begins
a
group
element
in
SVG
for
organizing
related
shapes"
"
<|end_of_g|>"
:
"
Ends
a
group
element"
# SVG Shape Tags
"
<|start_of_rect|>"
:
"
Begins
a
rectangle
shape
with
width
and
height
attributes"
"
<|end_of_rect|>"
:
"
Ends
a
rectangle
shape
definition"
"
<|start_of_circle|>"
:
"
Begins
a
circular
shape
with
radius
attribute"
"
<|end_of_circle|>"
:
"
Ends
a
circular
shape
definition"
"
<|start_of_path|>"
:
"
Begins
a
path
element
for
drawing
custom
vector
graphics"
"
<|end_of_path|>"
:
"
Ends
a
path
element
definition"
"
<|start_of_ellipse|>"
:
"
Begins
an
ellipse
shape
with
x
and
y
radii"
"
<|end_of_ellipse|>"
:
"
Ends
an
ellipse
shape
definition"
# SVG Text Tags
"
<|start_of_text|>"
:
"
Begins
a
text
element
for
rendering
text
content"
"
<|end_of_text|>"
:
"
Ends
a
text
element"
# SVG Style Tags
"
<|start_of_style|>"
:
"
Begins
a
style
definition
block
for
CSS
styling"
"
<|end_of_style|>"
:
"
Ends
a
style
definition
block"
examples/extras/oft/llama3_oft_sft.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
meta-llama/Meta-Llama-3-8B-Instruct
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
oft
oft_block_size
:
32
oft_target
:
all
### dataset
dataset
:
identity,alpaca_en_demo
template
:
llama3
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama3-8b/oft/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# eval_dataset: alpaca_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/oft/qwen2_5vl_oft_sft.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen2.5-VL-7B-Instruct
image_max_pixels
:
262144
video_max_pixels
:
16384
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
oft
oft_block_size
:
32
oft_target
:
all
### dataset
dataset
:
mllm_demo,identity,alpaca_en_demo
# video: mllm_video_demo
template
:
qwen2_vl
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen2_5vl-7b/oft/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/qoft/llama3_oft_sft_awq.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
oft
oft_block_size
:
32
oft_target
:
all
### dataset
dataset
:
identity,alpaca_en_demo
template
:
llama3
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama3-8b/oft/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/qoft/llama3_oft_sft_bnb_npu.yaml
0 → 100644
View file @
ca625f43
This diff is collapsed.
Click to expand it.
examples/extras/qoft/llama3_oft_sft_gptq.yaml
0 → 100644
View file @
ca625f43
This diff is collapsed.
Click to expand it.
examples/inference/qwen3.yaml
0 → 100644
View file @
ca625f43
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
template
:
qwen3_nothink
infer_backend
:
huggingface
# choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code
:
true
examples/inference/qwen3_full_sft.yaml
0 → 100644
View file @
ca625f43
model_name_or_path
:
saves/qwen3-4b/full/sft
template
:
qwen3_nothink
infer_backend
:
huggingface
# choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code
:
true
examples/inference/qwen3_lora_sft.yaml
0 → 100644
View file @
ca625f43
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
adapter_name_or_path
:
saves/qwen3-4b/lora/sft
template
:
qwen3_nothink
infer_backend
:
huggingface
# choices: [huggingface, vllm, sglang, ktransformers]
trust_remote_code
:
true
Prev
1
2
3
4
5
6
7
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment