ModelZoo / Firefly-Llama3_unsloth · Commits

Commit d74a64c4, authored Jul 31, 2024 by chenzk

    v1.0

Pipeline #1450 canceled. 98 files changed in total (paginated across 5 pages); the 20 files on this page are listed below.
Showing 20 changed files with 692 additions and 0 deletions.
train_args/dpo/full/qwen1.5-7b-dpo-full.json (+36, -0)
train_args/dpo/lora/minicpm-2b-dpo-lora.json (+36, -0)
train_args/dpo/lora/qwen1.5-7b-dpo-lora.json (+36, -0)
train_args/dpo/qlora/minicpm-2b-dpo-qlora.json (+36, -0)
train_args/dpo/qlora/qwen1.5-7b-dpo-qlora.json (+36, -0)
train_args/ds_z3_config.json (+55, -0)
train_args/pretrain/full/bloom-1b1-pretrain-full.json (+34, -0)
train_args/pretrain/full/bloom-3b-pretrain-full.json (+34, -0)
train_args/pretrain/full/qwen-7b-pretrain-full.json (+34, -0)
train_args/pretrain/qlora/bloom-3b-pretrain-qlora.json (+32, -0)
train_args/pretrain/qlora/qwen-7b-pretrain-qlora.json (+32, -0)
train_args/pretrain/qlora/yi-6b-pretrain-qlora.json (+32, -0)
train_args/sft/full/bloom-1b1-sft-full.json (+31, -0)
train_args/sft/full/bloom-3b-sft-full.json (+33, -0)
train_args/sft/full/minicpm-2b-sft-full.json (+31, -0)
train_args/sft/full/qwen-7b-sft-full.json (+33, -0)
train_args/sft/full/yi-6b-sft-full.json (+33, -0)
train_args/sft/lora/bloom-7b1-sft-lora.json (+33, -0)
train_args/sft/lora/qwen1.5-7b-sft-lora.json (+33, -0)
train_args/sft/qlora/baichuan-13b-sft-qlora.json (+32, -0)
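All 20 files share the same shape: a flat JSON object that mixes standard Hugging Face TrainingArguments fields (learning_rate, optim, fp16, save_strategy, ...) with Firefly-specific fields (model_name_or_path, train_file, template_name, train_mode, task_type, and the lora_* hyperparameters). Every config sets per_device_train_batch_size to 1 with gradient_accumulation_steps of 16, i.e. an effective batch of 16 per device. Below is a minimal sketch of how such a file might be parsed, assuming an HfArgumentParser-based entry point as is typical for Firefly-style training scripts; the CustomArgs dataclass is hypothetical and lists only a subset of the keys:

from dataclasses import dataclass
from transformers import HfArgumentParser, TrainingArguments

@dataclass
class CustomArgs:
    # Hypothetical container for the non-TrainingArguments keys in these files.
    model_name_or_path: str = ""
    train_file: str = ""
    template_name: str = "default"
    train_mode: str = "full"   # "full" | "lora" | "qlora"
    task_type: str = "sft"     # "pretrain" | "sft" | "dpo"

parser = HfArgumentParser((CustomArgs, TrainingArguments))
# allow_extra_keys lets keys like lora_rank pass through without a dataclass field.
custom_args, training_args = parser.parse_json_file(
    "train_args/sft/lora/qwen1.5-7b-sft-lora.json", allow_extra_keys=True
)
print(training_args.learning_rate)  # 2e-4 for the LoRA SFT configs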
train_args/dpo/full/qwen1.5-7b-dpo-full.json (new file, mode 100644)

{
    "output_dir": "output/firefly-qwen1.5-7b-dpo-full",
    "model_name_or_path": "Qwen/Qwen1.5-7B-Chat",
    "train_file": "./data/dummy_dpo.jsonl",
    "template_name": "qwen",
    "train_mode": "full",
    "task_type": "dpo",
    "beta": 0.1,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 5e-7,
    "max_seq_length": 1024,
    "max_prompt_length": 300,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_steps": 100,
    "lora_rank": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "paged_adamw_32bit",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 0.3,
    "remove_unused_columns": false
}
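All five DPO configs read preference pairs from ./data/dummy_dpo.jsonl, with max_prompt_length of 300 capping the prompt portion of each 1024-token sequence. The authoritative record schema is defined by the repo's data loader, which is not part of this commit; the record below is a hypothetical prompt/chosen/rejected example, shown only to illustrate the general shape of DPO training data:

import json

# Hypothetical preference record; the real schema is whatever the repo's
# DPO loader expects, which this commit does not show.
record = {
    "prompt": "What is the capital of France?",
    "chosen": "The capital of France is Paris.",
    "rejected": "France does not have a capital.",
}
with open("data/dummy_dpo.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")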
train_args/dpo/lora/minicpm-2b-dpo-lora.json (new file, mode 100644)

{
    "output_dir": "output/firefly-minicpm-2b-dpo-lora",
    "model_name_or_path": "openbmb/MiniCPM-2B-dpo-fp16",
    "train_file": "./data/dummy_dpo.jsonl",
    "template_name": "minicpm",
    "train_mode": "lora",
    "task_type": "dpo",
    "beta": 0.1,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-4,
    "max_seq_length": 1024,
    "max_prompt_length": 300,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_steps": 100,
    "lora_rank": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "paged_adamw_32bit",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 0.3,
    "remove_unused_columns": false
}
train_args/dpo/lora/qwen1.5-7b-dpo-lora.json (new file, mode 100644)

{
    "output_dir": "output/firefly-qwen1.5-7b-dpo-lora",
    "model_name_or_path": "Qwen/Qwen1.5-7B-Chat",
    "train_file": "./data/dummy_dpo.jsonl",
    "template_name": "qwen",
    "train_mode": "lora",
    "task_type": "dpo",
    "beta": 0.1,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-4,
    "max_seq_length": 1024,
    "max_prompt_length": 300,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_steps": 100,
    "lora_rank": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "paged_adamw_32bit",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 0.3,
    "remove_unused_columns": false
}
train_args/dpo/qlora/minicpm-2b-dpo-qlora.json (new file, mode 100644)

{
    "output_dir": "output/firefly-minicpm-2b-dpo-qlora",
    "model_name_or_path": "openbmb/MiniCPM-2B-dpo-fp16",
    "train_file": "./data/dummy_dpo.jsonl",
    "template_name": "minicpm",
    "train_mode": "qlora",
    "task_type": "dpo",
    "beta": 0.1,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-4,
    "max_seq_length": 1024,
    "max_prompt_length": 300,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_steps": 100,
    "lora_rank": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "paged_adamw_32bit",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 0.3,
    "remove_unused_columns": false
}
train_args/dpo/qlora/qwen1.5-7b-dpo-qlora.json (new file, mode 100644)

{
    "output_dir": "output/firefly-qwen1.5-7b-dpo-qlora",
    "model_name_or_path": "Qwen/Qwen1.5-7B-Chat",
    "train_file": "./data/dummy_dpo.jsonl",
    "template_name": "qwen",
    "train_mode": "qlora",
    "task_type": "dpo",
    "beta": 0.1,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-4,
    "max_seq_length": 1024,
    "max_prompt_length": 300,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_steps": 100,
    "lora_rank": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "paged_adamw_32bit",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 0.3,
    "remove_unused_columns": false
}
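For reference, "beta": 0.1 in these configs is the temperature of the standard DPO objective: it scales the implicit reward margin between the chosen response y_w and the rejected response y_l, measured as log-probability ratios against a frozen reference policy:

\mathcal{L}_{\mathrm{DPO}} = -\,\mathbb{E}_{(x,\,y_w,\,y_l)}\left[\log \sigma\!\left(\beta \log \frac{\pi_\theta(y_w \mid x)}{\pi_{\mathrm{ref}}(y_w \mid x)} - \beta \log \frac{\pi_\theta(y_l \mid x)}{\pi_{\mathrm{ref}}(y_l \mid x)}\right)\right]

Note that the full-parameter DPO config uses a much smaller learning rate (5e-7) than the LoRA/QLoRA variants (2e-4). That is the usual pattern: full fine-tuning updates every weight of the base model, so it tolerates far less aggressive steps than adapter training.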
train_args/ds_z3_config.json (new file, mode 100644)

{
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 200,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false,
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    }
}
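This is a DeepSpeed ZeRO stage-3 config with both optimizer state and parameters offloaded to CPU. Every "auto" value is a placeholder that the Hugging Face Trainer resolves from the matching TrainingArguments at launch, so the learning rate, batch sizes, gradient clipping, and fp16 flag live in exactly one place: the per-model JSON files that reference this file via their "deepspeed" key. A minimal sketch of that wiring, using the values from the bloom-1b1 pretrain config below:

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="output/firefly-bloom-1b1-pretrain-full",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-5,
    max_grad_norm=1.0,
    fp16=True,
    # Each "auto" field in the DeepSpeed config resolves from the args above.
    deepspeed="./train_args/ds_z3_config.json",
)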
train_args/pretrain/full/bloom-1b1-pretrain-full.json (new file, mode 100644)

{
    "output_dir": "output/firefly-bloom-1b1-pretrain-full",
    "model_name_or_path": "bigscience/bloom-1b1",
    "deepspeed": "./train_args/ds_z3_config.json",
    "train_file": "./data/pretrain",
    "train_mode": "full",
    "task_type": "pretrain",
    "num_train_epochs": 1,
    "tokenize_num_workers": 10,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-5,
    "max_seq_length": 1024,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.01,
    "gradient_checkpointing": true,
    "logging_first_step": false,
    "disable_tqdm": false,
    "optim": "adamw_hf",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 1.0,
    "remove_unused_columns": false
}
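Unlike the SFT and DPO configs, the pretrain configs point "train_file" at a directory (./data/pretrain) rather than a single JSONL file, and add "tokenize_num_workers": 10 for parallel tokenization. A sketch of what that preprocessing plausibly looks like, assuming the directory holds raw text corpora (the actual loader is not part of this commit):

from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b1")
# Read every text file under the pretraining corpus directory.
raw = load_dataset("text", data_dir="./data/pretrain", split="train")
# "tokenize_num_workers": 10 -> ten parallel tokenization workers.
tokenized = raw.map(lambda batch: tokenizer(batch["text"]), batched=True, num_proc=10)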
train_args/pretrain/full/bloom-3b-pretrain-full.json (new file, mode 100644)

{
    "output_dir": "output/firefly-bloom-3b-pretrain-full",
    "model_name_or_path": "bigscience/bloom-3b",
    "deepspeed": "./train_args/ds_z3_config.json",
    "train_file": "./data/pretrain",
    "train_mode": "full",
    "task_type": "pretrain",
    "num_train_epochs": 1,
    "tokenize_num_workers": 10,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-5,
    "max_seq_length": 1024,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.01,
    "gradient_checkpointing": true,
    "logging_first_step": false,
    "disable_tqdm": false,
    "optim": "adamw_hf",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 1.0,
    "remove_unused_columns": false
}
train_args/pretrain/full/qwen-7b-pretrain-full.json (new file, mode 100644)

{
    "output_dir": "output/firefly-qwen-7b-pretrain-full",
    "model_name_or_path": "Qwen/Qwen-7B",
    "deepspeed": "./train_args/ds_z3_config.json",
    "train_file": "./data/pretrain",
    "train_mode": "full",
    "task_type": "pretrain",
    "num_train_epochs": 1,
    "tokenize_num_workers": 10,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-5,
    "max_seq_length": 2048,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.01,
    "gradient_checkpointing": true,
    "logging_first_step": false,
    "disable_tqdm": false,
    "optim": "adamw_hf",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 1.0,
    "remove_unused_columns": false
}
train_args/pretrain/qlora/bloom-3b-pretrain-qlora.json (new file, mode 100644)

{
    "output_dir": "output/firefly-bloom-3b-pretrain-qlora",
    "model_name_or_path": "bigscience/bloom-3b",
    "train_file": "./data/pretrain",
    "train_mode": "qlora",
    "task_type": "pretrain",
    "num_train_epochs": 1,
    "tokenize_num_workers": 10,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-5,
    "max_seq_length": 2048,
    "logging_steps": 100,
    "save_steps": 100,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.01,
    "gradient_checkpointing": true,
    "logging_first_step": false,
    "disable_tqdm": false,
    "optim": "adamw_hf",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 1.0,
    "remove_unused_columns": false
}
train_args/pretrain/qlora/qwen-7b-pretrain-qlora.json (new file, mode 100644)

{
    "output_dir": "output/firefly-qwen-7b-pretrain-qlora",
    "model_name_or_path": "Qwen/Qwen-7B",
    "train_file": "./data/pretrain",
    "train_mode": "qlora",
    "task_type": "pretrain",
    "num_train_epochs": 1,
    "tokenize_num_workers": 10,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-5,
    "max_seq_length": 2048,
    "logging_steps": 100,
    "save_steps": 100,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.01,
    "gradient_checkpointing": true,
    "logging_first_step": false,
    "disable_tqdm": false,
    "optim": "adamw_hf",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 1.0,
    "remove_unused_columns": false
}
train_args/pretrain/qlora/yi-6b-pretrain-qlora.json (new file, mode 100644)

{
    "output_dir": "output/firefly-yi-6b-pretrain-qlora",
    "model_name_or_path": "01-ai/Yi-6B",
    "train_file": "./data/pretrain",
    "train_mode": "qlora",
    "task_type": "pretrain",
    "num_train_epochs": 1,
    "tokenize_num_workers": 10,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-5,
    "max_seq_length": 2048,
    "logging_steps": 100,
    "save_steps": 100,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.01,
    "gradient_checkpointing": true,
    "logging_first_step": false,
    "disable_tqdm": false,
    "optim": "adamw_hf",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 1.0,
    "remove_unused_columns": false
}
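"train_mode": "qlora" in these files implies the standard QLoRA recipe: base weights quantized to 4-bit NF4, with LoRA adapters trained on top, which is what lets a 6B-7B pretrain run fit on a single GPU without the ZeRO-3 offload config. A sketch of the usual model-loading step under that assumption (the actual loading code is not in this commit):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # matches "fp16": true in the configs
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "01-ai/Yi-6B", quantization_config=bnb_config, trust_remote_code=True
)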
train_args/sft/full/bloom-1b1-sft-full.json (new file, mode 100644)

{
    "output_dir": "output/firefly-bloom-1b1-sft-full",
    "model_name_or_path": "bigscience/bloom-1b1",
    "deepspeed": "./train_args/ds_z3_config.json",
    "train_file": "./data/dummy_data.jsonl",
    "template_name": "default",
    "train_mode": "full",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-5,
    "max_seq_length": 1024,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 100,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "adamw_hf",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 1.0,
    "remove_unused_columns": false
}
train_args/sft/full/bloom-3b-sft-full.json (new file, mode 100644)

{
    "output_dir": "output/firefly-bloom-3b-sft-full",
    "model_name_or_path": "bigscience/bloom-3b",
    "deepspeed": "./train_args/ds_z3_config.json",
    "train_file": "./data/dummy_data.jsonl",
    "template_name": "default",
    "train_mode": "full",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-5,
    "max_seq_length": 1024,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 100,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "adamw_hf",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 1.0,
    "remove_unused_columns": false
}
train_args/sft/full/minicpm-2b-sft-full.json (new file, mode 100644)

{
    "output_dir": "output/firefly-minicpm-2b-sft-full",
    "model_name_or_path": "openbmb/MiniCPM-2B-dpo-fp16",
    "deepspeed": "./train_args/ds_z3_config.json",
    "train_file": "./data/dummy_data.jsonl",
    "template_name": "minicpm",
    "train_mode": "full",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-5,
    "max_seq_length": 1024,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 100,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "adamw_hf",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 1.0,
    "remove_unused_columns": false
}
train_args/sft/full/qwen-7b-sft-full.json (new file, mode 100644)

{
    "output_dir": "output/firefly-qwen-7b-sft-full",
    "model_name_or_path": "Qwen/Qwen-7B-Chat",
    "deepspeed": "./train_args/ds_z3_config.json",
    "train_file": "./data/dummy_data.jsonl",
    "template_name": "qwen",
    "train_mode": "full",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-5,
    "max_seq_length": 1024,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 100,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "adamw_hf",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 1.0,
    "remove_unused_columns": false
}
train_args/sft/full/yi-6b-sft-full.json (new file, mode 100644)

{
    "output_dir": "output/firefly-yi-6b-sft-full",
    "model_name_or_path": "01-ai/Yi-6B-Chat",
    "deepspeed": "./train_args/ds_z3_config.json",
    "train_file": "./data/dummy_data.jsonl",
    "template_name": "yi",
    "train_mode": "full",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-5,
    "max_seq_length": 1024,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 100,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "adamw_hf",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 1.0,
    "remove_unused_columns": false
}
train_args/sft/lora/bloom-7b1-sft-lora.json (new file, mode 100644)

{
    "output_dir": "output/firefly-bloom-7b1-sft-lora",
    "model_name_or_path": "bigscience/bloom-7b1",
    "train_file": "./data/dummy_data.jsonl",
    "template_name": "default",
    "train_mode": "lora",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-4,
    "max_seq_length": 1024,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_steps": 100,
    "lora_rank": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "paged_adamw_32bit",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 0.3,
    "remove_unused_columns": false
}
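The lora_rank / lora_alpha / lora_dropout keys in the LoRA and QLoRA configs map naturally onto PEFT's LoraConfig; with these values the adapter scaling factor is alpha / r = 16 / 64 = 0.25. A sketch of that mapping, assuming the repo wraps the model with PEFT (the wrapping code itself is not shown in this commit):

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-7b1")
lora_config = LoraConfig(
    r=64,             # "lora_rank"
    lora_alpha=16,    # adapter scaling: alpha / r = 0.25
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # adapters are a small fraction of 7B weights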
train_args/sft/lora/qwen1.5-7b-sft-lora.json (new file, mode 100644)

{
    "output_dir": "output/firefly-qwen1.5-7b-sft-lora",
    "model_name_or_path": "Qwen/Qwen1.5-7B-Chat",
    "train_file": "./data/dummy_data.jsonl",
    "template_name": "qwen",
    "train_mode": "lora",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-4,
    "max_seq_length": 1024,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_steps": 100,
    "lora_rank": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "paged_adamw_32bit",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 0,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 0.3,
    "remove_unused_columns": false
}
train_args/sft/qlora/baichuan-13b-sft-qlora.json (new file, mode 100644)

{
    "output_dir": "output/firefly-baichuan-13b-sft-qlora",
    "model_name_or_path": "baichuan-inc/Baichuan-13B-Chat",
    "train_file": "./data/dummy_data.jsonl",
    "template_name": "baichuan",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-4,
    "max_seq_length": 1024,
    "logging_steps": 100,
    "save_steps": 100,
    "save_total_limit": 1,
    "lr_scheduler_type": "constant_with_warmup",
    "warmup_steps": 100,
    "lora_rank": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "gradient_checkpointing": true,
    "disable_tqdm": false,
    "optim": "paged_adamw_32bit",
    "seed": 42,
    "fp16": true,
    "report_to": "tensorboard",
    "dataloader_num_workers": 5,
    "save_strategy": "steps",
    "weight_decay": 0,
    "max_grad_norm": 0.3,
    "remove_unused_columns": false
}
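Finally, a hypothetical launch example, under the assumption that the repo exposes a train.py entry point taking a --train_args_file flag (the entry point itself is not part of this commit, so adjust names to match the actual script):

import subprocess

# Hypothetical: single-GPU launch of the Baichuan-13B QLoRA SFT config.
subprocess.run(
    [
        "torchrun", "--nproc_per_node=1", "train.py",
        "--train_args_file", "train_args/sft/qlora/baichuan-13b-sft-qlora.json",
    ],
    check=True,
)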