Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
LLaMA-Factory
Commits
ca625f43
Commit
ca625f43
authored
Mar 30, 2026
by
shihm
Browse files
update
parent
7164651d
Changes
327
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
666 additions
and
3 deletions
+666
-3
examples/requirements/swanlab.txt
examples/requirements/swanlab.txt
+1
-0
examples/requirements/vllm.txt
examples/requirements/vllm.txt
+1
-0
examples/train_full/qwen3_full_sft.yaml
examples/train_full/qwen3_full_sft.yaml
+44
-0
examples/train_full/qwen3vl_full_sft.yaml
examples/train_full/qwen3vl_full_sft.yaml
+48
-0
examples/train_lora/qwen3_lora_dpo.yaml
examples/train_lora/qwen3_lora_dpo.yaml
+47
-0
examples/train_lora/qwen3_lora_kto.yaml
examples/train_lora/qwen3_lora_kto.yaml
+43
-0
examples/train_lora/qwen3_lora_pretrain.yaml
examples/train_lora/qwen3_lora_pretrain.yaml
+44
-0
examples/train_lora/qwen3_lora_reward.yaml
examples/train_lora/qwen3_lora_reward.yaml
+45
-0
examples/train_lora/qwen3_lora_sft.sh
examples/train_lora/qwen3_lora_sft.sh
+35
-0
examples/train_lora/qwen3_lora_sft.yaml
examples/train_lora/qwen3_lora_sft.yaml
+45
-0
examples/train_lora/qwen3_lora_sft_ds3.yaml
examples/train_lora/qwen3_lora_sft_ds3.yaml
+46
-0
examples/train_lora/qwen3_lora_sft_ray.yaml
examples/train_lora/qwen3_lora_sft_ray.yaml
+60
-0
examples/train_lora/qwen3_preprocess.yaml
examples/train_lora/qwen3_preprocess.yaml
+22
-0
examples/train_lora/qwen3vl_lora_dpo.yaml
examples/train_lora/qwen3vl_lora_dpo.yaml
+48
-0
examples/train_lora/qwen3vl_lora_sft.yaml
examples/train_lora/qwen3vl_lora_sft.yaml
+46
-0
examples/train_qlora/llama3_lora_sft_aqlm.yaml
examples/train_qlora/llama3_lora_sft_aqlm.yaml
+0
-1
examples/train_qlora/llama3_lora_sft_awq.yaml
examples/train_qlora/llama3_lora_sft_awq.yaml
+0
-1
examples/train_qlora/llama3_lora_sft_gptq.yaml
examples/train_qlora/llama3_lora_sft_gptq.yaml
+0
-1
examples/train_qlora/qwen3_lora_sft_bnb_npu.yaml
examples/train_qlora/qwen3_lora_sft_bnb_npu.yaml
+46
-0
examples/train_qlora/qwen3_lora_sft_otfq.yaml
examples/train_qlora/qwen3_lora_sft_otfq.yaml
+45
-0
No files found.
examples/requirements/swanlab.txt
0 → 100644
View file @
ca625f43
swanlab
examples/requirements/vllm.txt
0 → 100644
View file @
ca625f43
vllm>=0.4.3,<=0.11.0
examples/train_full/qwen3_full_sft.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
deepspeed
:
examples/deepspeed/ds_z3_config.json
# choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
### dataset
dataset
:
identity,alpaca_en_demo
template
:
qwen3_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-4b/full/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
2
learning_rate
:
1.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# eval_dataset: alpaca_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_full/qwen3vl_full_sft.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-VL-4B-Instruct
image_max_pixels
:
262144
video_max_pixels
:
16384
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
freeze_vision_tower
:
true
freeze_multi_modal_projector
:
true
freeze_language_model
:
false
deepspeed
:
examples/deepspeed/ds_z3_config.json
### dataset
dataset
:
mllm_demo,identity,alpaca_en_demo
template
:
qwen3_vl_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-vl-4b/full/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
2
learning_rate
:
1.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_lora/qwen3_lora_dpo.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
trust_remote_code
:
true
### method
stage
:
dpo
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
pref_beta
:
0.1
pref_loss
:
sigmoid
# choices: [sigmoid (dpo), orpo, simpo]
### dataset
dataset
:
dpo_en_demo
template
:
qwen3_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-4b/lora/dpo
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
5.0e-6
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# eval_dataset: dpo_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_lora/qwen3_lora_kto.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
trust_remote_code
:
true
### method
stage
:
kto
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
pref_beta
:
0.1
### dataset
dataset
:
kto_en_demo
template
:
qwen3_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-4b/lora/kto
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
5.0e-6
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_lora/qwen3_lora_pretrain.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
trust_remote_code
:
true
### method
stage
:
pt
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
### dataset
dataset
:
c4_demo
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-4b/lora/pretrain
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# eval_dataset: c4_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_lora/qwen3_lora_reward.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
trust_remote_code
:
true
### method
stage
:
rm
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
### dataset
dataset
:
dpo_en_demo
template
:
qwen3_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-4b/lora/reward
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# eval_dataset: dpo_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_lora/qwen3_lora_sft.sh
0 → 100644
View file @
ca625f43
#!/bin/bash
set -x

MODEL_PATH=Qwen/Qwen3-4B-Instruct-2507

llamafactory-cli train \
    --model_name_or_path ${MODEL_PATH} \
    --trust_remote_code \
    --stage sft \
    --do_train \
    --finetuning_type lora \
    --lora_rank 8 \
    --lora_target all \
    --dataset identity,alpaca_en_demo \
    --template qwen3_nothink \
    --cutoff_len 2048 \
    --max_samples 1000 \
    --preprocessing_num_workers 16 \
    --dataloader_num_workers 4 \
    --output_dir saves/qwen3-4b/lora/sft \
    --logging_steps 10 \
    --save_steps 500 \
    --plot_loss \
    --overwrite_output_dir \
    --save_only_model false \
    --report_to none \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --learning_rate 1e-4 \
    --num_train_epochs 3.0 \
    --lr_scheduler_type cosine \
    --warmup_ratio 0.1 \
    --bf16 \
    --ddp_timeout 180000000
examples/train_lora/qwen3_lora_sft.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
### dataset
dataset
:
identity,alpaca_en_demo
template
:
qwen3_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-4b/lora/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# eval_dataset: alpaca_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_lora/qwen3_lora_sft_ds3.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
deepspeed
:
examples/deepspeed/ds_z3_config.json
# choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
### dataset
dataset
:
identity,alpaca_en_demo
template
:
qwen3_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-4b/lora/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
2
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# eval_dataset: alpaca_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_lora/qwen3_lora_sft_ray.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
# or use local absolute path
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
### dataset
dataset
:
identity,alpaca_en_demo
dataset_dir
:
REMOTE:llamafactory/demo_data
# or use local absolute path
template
:
qwen3_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
tmp_dir
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### ray
ray_run_name
:
qwen3_4b_sft_lora
ray_storage_path
:
./saves
ray_num_workers
:
4
# Number of GPUs to use.
placement_strategy
:
PACK
resources_per_worker:
  GPU: 1
# ray_init_kwargs:
#   runtime_env:
#     env_vars:
#       <YOUR-ENV-VAR-HERE>: "<YOUR-ENV-VAR-HERE>"
#     pip:
#       - emoji
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# eval_dataset: alpaca_en_demo
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_lora/qwen3_preprocess.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
### dataset
dataset
:
identity,alpaca_en_demo
template
:
qwen3_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
tokenized_path
:
saves/qwen3-4b/dataset/sft
### output (not used)
output_dir
:
saves/qwen3-4b/lora/sft
overwrite_output_dir
:
true
examples/train_lora/qwen3vl_lora_dpo.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-VL-4B-Instruct
image_max_pixels
:
262144
video_max_pixels
:
16384
trust_remote_code
:
true
### method
stage
:
dpo
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
pref_beta
:
0.1
pref_loss
:
sigmoid
# choices: [sigmoid (dpo), orpo, simpo]
### dataset
dataset
:
rlhf_v
template
:
qwen3_vl_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-vl-4b/lora/dpo
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
5.0e-6
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_lora/qwen3vl_lora_sft.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-VL-4B-Instruct
image_max_pixels
:
262144
video_max_pixels
:
16384
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
### dataset
dataset
:
mllm_demo,identity,alpaca_en_demo
# video: mllm_video_demo
template
:
qwen3_vl_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-vl-4b/lora/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
resume_from_checkpoint
:
null
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_qlora/llama3_lora_sft_aqlm.yaml
View file @
ca625f43
...
@@ -14,7 +14,6 @@ dataset: identity,alpaca_en_demo
...
@@ -14,7 +14,6 @@ dataset: identity,alpaca_en_demo
template
:
llama3
template
:
llama3
cutoff_len
:
2048
cutoff_len
:
2048
max_samples
:
1000
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
dataloader_num_workers
:
4
...
...
examples/train_qlora/llama3_lora_sft_awq.yaml
View file @
ca625f43
...
@@ -14,7 +14,6 @@ dataset: identity,alpaca_en_demo
...
@@ -14,7 +14,6 @@ dataset: identity,alpaca_en_demo
template
:
llama3
template
:
llama3
cutoff_len
:
2048
cutoff_len
:
2048
max_samples
:
1000
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
dataloader_num_workers
:
4
...
...
examples/train_qlora/llama3_lora_sft_gptq.yaml
View file @
ca625f43
...
@@ -14,7 +14,6 @@ dataset: identity,alpaca_en_demo
...
@@ -14,7 +14,6 @@ dataset: identity,alpaca_en_demo
template
:
llama3
template
:
llama3
cutoff_len
:
2048
cutoff_len
:
2048
max_samples
:
1000
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
dataloader_num_workers
:
4
...
...
examples/train_qlora/qwen3_lora_sft_bnb_npu.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
quantization_bit
:
4
quantization_method
:
bnb
double_quantization
:
false
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
### dataset
dataset
:
identity,alpaca_en_demo
template
:
qwen3_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-4b/lora/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/train_qlora/qwen3_lora_sft_otfq.yaml
0 → 100644
View file @
ca625f43
### model
model_name_or_path
:
Qwen/Qwen3-4B-Instruct-2507
quantization_bit
:
4
# choices: [8 (bnb/hqq/eetq), 4 (bnb/hqq), 3 (hqq), 2 (hqq)]
quantization_method
:
bnb
# choices: [bnb, hqq, eetq]
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
### dataset
dataset
:
identity,alpaca_en_demo
template
:
qwen3_nothink
cutoff_len
:
2048
max_samples
:
1000
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen3-4b/lora/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
Prev
1
2
3
4
5
6
7
8
9
10
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment