Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenych
llama-grpo
Commits
c7c477c7
Commit
c7c477c7
authored
Sep 24, 2025
by
chenych
Browse files
add grpo
parents
Pipeline
#2942
failed with stages
in 0 seconds
Changes
282
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
682 additions
and
0 deletions
+682
-0
examples/accelerate/fsdp_config_offload.yaml
examples/accelerate/fsdp_config_offload.yaml
+25
-0
examples/deepspeed/ds_z0_config.json
examples/deepspeed/ds_z0_config.json
+28
-0
examples/deepspeed/ds_z2_config.json
examples/deepspeed/ds_z2_config.json
+28
-0
examples/deepspeed/ds_z2_offload_config.json
examples/deepspeed/ds_z2_offload_config.json
+32
-0
examples/deepspeed/ds_z3_config.json
examples/deepspeed/ds_z3_config.json
+30
-0
examples/deepspeed/ds_z3_offload_config.json
examples/deepspeed/ds_z3_offload_config.json
+38
-0
examples/extras/adam_mini/qwen2_full_sft.yaml
examples/extras/adam_mini/qwen2_full_sft.yaml
+43
-0
examples/extras/apollo/llama3_full_sft.yaml
examples/extras/apollo/llama3_full_sft.yaml
+48
-0
examples/extras/badam/llama3_full_sft.yaml
examples/extras/badam/llama3_full_sft.yaml
+46
-0
examples/extras/fsdp_qlora/llama3_lora_sft.yaml
examples/extras/fsdp_qlora/llama3_lora_sft.yaml
+45
-0
examples/extras/fsdp_qlora/train.sh
examples/extras/fsdp_qlora/train.sh
+6
-0
examples/extras/galore/llama3_full_sft.yaml
examples/extras/galore/llama3_full_sft.yaml
+47
-0
examples/extras/llama_pro/expand.sh
examples/extras/llama_pro/expand.sh
+6
-0
examples/extras/llama_pro/llama3_freeze_sft.yaml
examples/extras/llama_pro/llama3_freeze_sft.yaml
+45
-0
examples/extras/loraplus/llama3_lora_sft.yaml
examples/extras/loraplus/llama3_lora_sft.yaml
+45
-0
examples/extras/mod/llama3_full_sft.yaml
examples/extras/mod/llama3_full_sft.yaml
+44
-0
examples/extras/muon/qwen2_full_sft.yaml
examples/extras/muon/qwen2_full_sft.yaml
+43
-0
examples/extras/nlg_eval/llama3_lora_predict.yaml
examples/extras/nlg_eval/llama3_lora_predict.yaml
+31
-0
examples/extras/pissa/init.sh
examples/extras/pissa/init.sh
+5
-0
examples/extras/pissa/llama3_lora_sft.yaml
examples/extras/pissa/llama3_lora_sft.yaml
+47
-0
No files found.
examples/accelerate/fsdp_config_offload.yaml
0 → 100644
View file @
c7c477c7
compute_environment
:
LOCAL_MACHINE
debug
:
false
distributed_type
:
FSDP
downcast_bf16
:
'no'
fsdp_config
:
fsdp_auto_wrap_policy
:
TRANSFORMER_BASED_WRAP
fsdp_backward_prefetch
:
BACKWARD_PRE
fsdp_forward_prefetch
:
false
fsdp_cpu_ram_efficient_loading
:
true
fsdp_offload_params
:
true
# offload may affect training speed
fsdp_sharding_strategy
:
FULL_SHARD
fsdp_state_dict_type
:
FULL_STATE_DICT
fsdp_sync_module_states
:
true
fsdp_use_orig_params
:
true
machine_rank
:
0
main_training_function
:
main
mixed_precision
:
bf16
# or fp16
num_machines
:
1
# the number of nodes
num_processes
:
2
# the number of GPUs in all nodes
rdzv_backend
:
static
same_network
:
true
tpu_env
:
[]
tpu_use_cluster
:
false
tpu_use_sudo
:
false
use_cpu
:
false
examples/deepspeed/ds_z0_config.json
0 → 100644
View file @
c7c477c7
{
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"zero_allow_untested_optimizer"
:
true
,
"fp16"
:
{
"enabled"
:
"auto"
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"initial_scale_power"
:
16
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"zero_optimization"
:
{
"stage"
:
0
,
"allgather_partitions"
:
true
,
"allgather_bucket_size"
:
5e8
,
"overlap_comm"
:
false
,
"reduce_scatter"
:
true
,
"reduce_bucket_size"
:
5e8
,
"contiguous_gradients"
:
true
,
"round_robin_gradients"
:
true
}
}
examples/deepspeed/ds_z2_config.json
0 → 100644
View file @
c7c477c7
{
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"zero_allow_untested_optimizer"
:
true
,
"fp16"
:
{
"enabled"
:
"auto"
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"initial_scale_power"
:
16
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"zero_optimization"
:
{
"stage"
:
2
,
"allgather_partitions"
:
true
,
"allgather_bucket_size"
:
5e8
,
"overlap_comm"
:
false
,
"reduce_scatter"
:
true
,
"reduce_bucket_size"
:
5e8
,
"contiguous_gradients"
:
true
,
"round_robin_gradients"
:
true
}
}
examples/deepspeed/ds_z2_offload_config.json
0 → 100644
View file @
c7c477c7
{
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"zero_allow_untested_optimizer"
:
true
,
"fp16"
:
{
"enabled"
:
"auto"
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"initial_scale_power"
:
16
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"zero_optimization"
:
{
"stage"
:
2
,
"offload_optimizer"
:
{
"device"
:
"cpu"
,
"pin_memory"
:
true
},
"allgather_partitions"
:
true
,
"allgather_bucket_size"
:
5e8
,
"overlap_comm"
:
false
,
"reduce_scatter"
:
true
,
"reduce_bucket_size"
:
5e8
,
"contiguous_gradients"
:
true
,
"round_robin_gradients"
:
true
}
}
examples/deepspeed/ds_z3_config.json
0 → 100644
View file @
c7c477c7
{
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"zero_allow_untested_optimizer"
:
true
,
"fp16"
:
{
"enabled"
:
"auto"
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"initial_scale_power"
:
16
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"zero_optimization"
:
{
"stage"
:
3
,
"overlap_comm"
:
false
,
"contiguous_gradients"
:
true
,
"sub_group_size"
:
1e9
,
"reduce_bucket_size"
:
"auto"
,
"stage3_prefetch_bucket_size"
:
"auto"
,
"stage3_param_persistence_threshold"
:
"auto"
,
"stage3_max_live_parameters"
:
1e9
,
"stage3_max_reuse_distance"
:
1e9
,
"stage3_gather_16bit_weights_on_model_save"
:
true
}
}
examples/deepspeed/ds_z3_offload_config.json
0 → 100644
View file @
c7c477c7
{
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"zero_allow_untested_optimizer"
:
true
,
"fp16"
:
{
"enabled"
:
"auto"
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"initial_scale_power"
:
16
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"zero_optimization"
:
{
"stage"
:
3
,
"offload_optimizer"
:
{
"device"
:
"cpu"
,
"pin_memory"
:
true
},
"offload_param"
:
{
"device"
:
"cpu"
,
"pin_memory"
:
true
},
"overlap_comm"
:
false
,
"contiguous_gradients"
:
true
,
"sub_group_size"
:
1e9
,
"reduce_bucket_size"
:
"auto"
,
"stage3_prefetch_bucket_size"
:
"auto"
,
"stage3_param_persistence_threshold"
:
"auto"
,
"stage3_max_live_parameters"
:
1e9
,
"stage3_max_reuse_distance"
:
1e9
,
"stage3_gather_16bit_weights_on_model_save"
:
true
}
}
examples/extras/adam_mini/qwen2_full_sft.yaml
0 → 100644
View file @
c7c477c7
### model
model_name_or_path
:
Qwen/Qwen2-1.5B-Instruct
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
use_adam_mini
:
true
### dataset
dataset
:
identity,alpaca_en_demo
template
:
qwen
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen2-1_5b/full/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/apollo/llama3_full_sft.yaml
0 → 100644
View file @
c7c477c7
### model
model_name_or_path
:
meta-llama/Meta-Llama-3-8B-Instruct
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
use_apollo
:
true
apollo_layerwise
:
true
# choices: [true, false], use false for DDP training
apollo_target
:
all
apollo_rank
:
128
apollo_scale
:
32.0
apollo_scale_type
:
channel
### dataset
dataset
:
identity,alpaca_en_demo
template
:
llama3
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama3-8b/full/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
1
# use 1 for layerwise apollo
learning_rate
:
1.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
pure_bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/badam/llama3_full_sft.yaml
0 → 100644
View file @
c7c477c7
### model
model_name_or_path
:
meta-llama/Meta-Llama-3-8B-Instruct
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
use_badam
:
true
badam_mode
:
layer
badam_switch_mode
:
ascending
badam_switch_interval
:
50
badam_verbose
:
2
# deepspeed: examples/deepspeed/ds_z3_config.json
### dataset
dataset
:
identity,alpaca_en_demo
template
:
llama3
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama3-8b/full/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/fsdp_qlora/llama3_lora_sft.yaml
0 → 100644
View file @
c7c477c7
### model
model_name_or_path
:
meta-llama/Meta-Llama-3-8B-Instruct
quantization_bit
:
4
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
### dataset
dataset
:
identity,alpaca_en_demo
template
:
llama3
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama3-8b/lora/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/fsdp_qlora/train.sh
0 → 100644
View file @
c7c477c7
#!/bin/bash
# FSDP + QLoRA training launcher.
#
# Runs src/train.py through `accelerate launch`, using the accelerate config
# at examples/accelerate/fsdp_config.yaml and the training recipe at
# examples/extras/fsdp_qlora/llama3_lora_sft.yaml. CUDA_VISIBLE_DEVICES is set
# only for this one command, so the launched processes see GPUs 0 and 1.
#
# DO NOT use GPTQ/AWQ model in FSDP+QLoRA

CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
    --config_file examples/accelerate/fsdp_config.yaml \
    src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
examples/extras/galore/llama3_full_sft.yaml
0 → 100644
View file @
c7c477c7
### model
model_name_or_path
:
meta-llama/Meta-Llama-3-8B-Instruct
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
use_galore
:
true
galore_layerwise
:
true
# choices: [true, false], use false for DDP training
galore_target
:
all
galore_rank
:
128
galore_scale
:
2.0
### dataset
dataset
:
identity,alpaca_en_demo
template
:
llama3
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama3-8b/full/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
1
# use 1 for layerwise galore
learning_rate
:
1.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
pure_bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/llama_pro/expand.sh
0 → 100644
View file @
c7c477c7
#!/bin/bash
# Invokes scripts/llama_pro.py on meta-llama/Meta-Llama-3-8B-Instruct and
# directs its output to models/llama3-8b-pro (the model path consumed by
# examples/extras/llama_pro/llama3_freeze_sft.yaml).
# NOTE(review): --num_expand 8 presumably matches freeze_trainable_layers: 8
# in that recipe -- confirm against scripts/llama_pro.py.

python scripts/llama_pro.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --output_dir models/llama3-8b-pro \
    --num_expand 8
examples/extras/llama_pro/llama3_freeze_sft.yaml
0 → 100644
View file @
c7c477c7
### model
model_name_or_path
:
models/llama3-8b-pro
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
freeze
freeze_trainable_layers
:
8
freeze_trainable_modules
:
all
use_llama_pro
:
true
### dataset
dataset
:
identity,alpaca_en_demo
template
:
llama3
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama3-8b-pro/freeze/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/loraplus/llama3_lora_sft.yaml
0 → 100644
View file @
c7c477c7
### model
model_name_or_path
:
meta-llama/Meta-Llama-3-8B-Instruct
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
loraplus_lr_ratio
:
16.0
### dataset
dataset
:
identity,alpaca_en_demo
template
:
llama3
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama3-8b/lora/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/mod/llama3_full_sft.yaml
0 → 100644
View file @
c7c477c7
### model
model_name_or_path
:
meta-llama/Meta-Llama-3-8B-Instruct
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
mixture_of_depths
:
convert
### dataset
dataset
:
identity,alpaca_en_demo
template
:
llama3
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama3-8b-mod/full/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
optim
:
paged_adamw_8bit
learning_rate
:
1.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
pure_bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/muon/qwen2_full_sft.yaml
0 → 100644
View file @
c7c477c7
### model
model_name_or_path
:
Qwen/Qwen2-1.5B-Instruct
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
full
use_muon
:
true
### dataset
dataset
:
identity,alpaca_en_demo
template
:
qwen
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/qwen2-1_5b/full/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-5
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
examples/extras/nlg_eval/llama3_lora_predict.yaml
0 → 100644
View file @
c7c477c7
# The batch generation can be SLOW using this config.
# For faster inference, we recommend using `scripts/vllm_infer.py`.
### model
model_name_or_path
:
meta-llama/Meta-Llama-3-8B-Instruct
adapter_name_or_path
:
saves/llama3-8b/lora/sft
trust_remote_code
:
true
### method
stage
:
sft
do_predict
:
true
finetuning_type
:
lora
### dataset
eval_dataset
:
identity,alpaca_en_demo
template
:
llama3
cutoff_len
:
2048
max_samples
:
50
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama3-8b/lora/predict
overwrite_output_dir
:
true
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### eval
per_device_eval_batch_size
:
1
predict_with_generate
:
true
ddp_timeout
:
180000000
examples/extras/pissa/init.sh
0 → 100644
View file @
c7c477c7
#!/bin/bash
# Invokes scripts/pissa_init.py on meta-llama/Meta-Llama-3-8B-Instruct and
# directs its output to models/llama3-8b-pissa.
# NOTE(review): presumably a one-off PiSSA initialization step to run before
# training -- confirm against scripts/pissa_init.py.

python scripts/pissa_init.py \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --output_dir models/llama3-8b-pissa
examples/extras/pissa/llama3_lora_sft.yaml
0 → 100644
View file @
c7c477c7
### model
model_name_or_path
:
meta-llama/Meta-Llama-3-8B-Instruct
trust_remote_code
:
true
### method
stage
:
sft
do_train
:
true
finetuning_type
:
lora
lora_rank
:
8
lora_target
:
all
pissa_init
:
true
pissa_iter
:
16
pissa_convert
:
true
### dataset
dataset
:
identity,alpaca_en_demo
template
:
llama3
cutoff_len
:
2048
max_samples
:
1000
overwrite_cache
:
true
preprocessing_num_workers
:
16
dataloader_num_workers
:
4
### output
output_dir
:
saves/llama3-8b/lora/sft
logging_steps
:
10
save_steps
:
500
plot_loss
:
true
overwrite_output_dir
:
true
save_only_model
:
false
report_to
:
none
# choices: [none, wandb, tensorboard, swanlab, mlflow]
### train
per_device_train_batch_size
:
1
gradient_accumulation_steps
:
8
learning_rate
:
1.0e-4
num_train_epochs
:
3.0
lr_scheduler_type
:
cosine
warmup_ratio
:
0.1
bf16
:
true
ddp_timeout
:
180000000
### eval
# val_size: 0.1
# per_device_eval_batch_size: 1
# eval_strategy: steps
# eval_steps: 500
Prev
1
2
3
4
5
6
7
8
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment