Commit d74a64c4 authored by chenzk (v1.0)
{
"output_dir": "output/firefly-qwen1.5-7b-dpo-full",
"model_name_or_path": "Qwen/Qwen1.5-7B-Chat",
"train_file": "./data/dummy_dpo.jsonl",
"template_name": "qwen",
"train_mode": "full",
"task_type": "dpo",
"beta": 0.1,
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 5e-7,
"max_seq_length": 1024,
"max_prompt_length": 300,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
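The config above runs full-parameter DPO on Qwen1.5-7B-Chat. The "beta": 0.1 field is the temperature of the DPO objective: it scales how strongly the policy's chosen/rejected log-ratios are pushed apart relative to the frozen reference model. A minimal sketch of the standard DPO loss (variable names are illustrative, not Firefly's internals):

import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    # Implicit rewards: beta-scaled log-ratios against the reference model.
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    # Maximize the margin between chosen and rejected responses.
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()

Note the much smaller learning rate here (5e-7) than in the LoRA variants below (2e-4): with all 7B parameters trainable, DPO typically stays in the e-7 range.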
{
"output_dir": "output/firefly-minicpm-2b-dpo-lora",
"model_name_or_path": "openbmb/MiniCPM-2B-dpo-fp16",
"train_file": "./data/dummy_dpo.jsonl",
"template_name": "minicpm",
"train_mode": "lora",
"task_type": "dpo",
"beta": 0.1,
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"max_prompt_length": 300,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-qwen1.5-7b-dpo-lora",
"model_name_or_path": "Qwen/Qwen1.5-7B-Chat",
"train_file": "./data/dummy_dpo.jsonl",
"template_name": "qwen",
"train_mode": "lora",
"task_type": "dpo",
"beta": 0.1,
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"max_prompt_length": 300,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
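The two LoRA configs above train low-rank adapters instead of the full model, which is why the learning rate jumps to 2e-4. With lora_rank 64 and lora_alpha 16, the adapter update is scaled by alpha / rank = 0.25. A hedged sketch of how these fields map onto peft's LoraConfig (target modules are model-specific and omitted here):

from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=64,               # "lora_rank"
    lora_alpha=16,      # update scaled by lora_alpha / r = 0.25
    lora_dropout=0.05,  # "lora_dropout"
    task_type="CAUSAL_LM",
)
# model = get_peft_model(base_model, lora_config)  # base_model: a loaded causal LM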
{
"output_dir": "output/firefly-minicpm-2b-dpo-qlora",
"model_name_or_path": "openbmb/MiniCPM-2B-dpo-fp16",
"train_file": "./data/dummy_dpo.jsonl",
"template_name": "minicpm",
"train_mode": "qlora",
"task_type": "dpo",
"beta": 0.1,
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"max_prompt_length": 300,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-qwen1.5-7b-dpo-qlora",
"model_name_or_path": "Qwen/Qwen1.5-7B-Chat",
"train_file": "./data/dummy_dpo.jsonl",
"template_name": "qwen",
"train_mode": "qlora",
"task_type": "dpo",
"beta": 0.1,
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"max_prompt_length": 300,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
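The qlora variants quantize the frozen base weights to 4-bit and train LoRA adapters on top; this also pairs with "optim": "paged_adamw_32bit", whose paged optimizer states guard against GPU memory spikes. A typical 4-bit setup, assuming the common NF4 recipe rather than Firefly's exact internals:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.float16,  # matches "fp16": true
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen1.5-7B-Chat",
    quantization_config=bnb_config,
    device_map="auto",
)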
{
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 200,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false,
"optimizer": {
"type": "Adam",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
}
}
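This is a DeepSpeed ZeRO stage-3 config with optimizer and parameter state offloaded to CPU. Every "auto" field is resolved by the HuggingFace Trainer from the matching training argument at launch, which keeps the JSON model-agnostic. A sketch of that hand-off, with values taken from the pretrain configs below:

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="output/firefly-bloom-1b1-pretrain-full",
    deepspeed="./train_args/ds_z3_config.json",  # "auto" fields filled from here
    per_device_train_batch_size=1,   # -> train_micro_batch_size_per_gpu
    gradient_accumulation_steps=16,  # -> gradient_accumulation_steps
    learning_rate=1e-5,              # -> optimizer lr / warmup_max_lr
    fp16=True,                       # -> fp16.enabled
)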
{
"output_dir": "output/firefly-bloom-1b1-pretrain-full",
"model_name_or_path": "bigscience/bloom-1b1",
"deepspeed": "./train_args/ds_z3_config.json",
"train_file": "./data/pretrain",
"train_mode": "full",
"task_type": "pretrain",
"num_train_epochs": 1,
"tokenize_num_workers": 10,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-5,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "cosine",
"warmup_ratio": 0.01,
"gradient_checkpointing": true,
"logging_first_step": false,
"disable_tqdm": false,
"optim": "adamw_hf",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 1.0,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-bloom-3b-pretrain-full",
"model_name_or_path": "bigscience/bloom-3b",
"deepspeed": "./train_args/ds_z3_config.json",
"train_file": "./data/pretrain",
"train_mode": "full",
"task_type": "pretrain",
"num_train_epochs": 1,
"tokenize_num_workers": 10,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-5,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "cosine",
"warmup_ratio": 0.01,
"gradient_checkpointing": true,
"logging_first_step": false,
"disable_tqdm": false,
"optim": "adamw_hf",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 1.0,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-qwen-7b-pretrain-full",
"model_name_or_path": "Qwen/Qwen-7B",
"deepspeed": "./train_args/ds_z3_config.json",
"train_file": "./data/pretrain",
"train_mode": "full",
"task_type": "pretrain",
"num_train_epochs": 1,
"tokenize_num_workers": 10,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-5,
"max_seq_length": 2048,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "cosine",
"warmup_ratio": 0.01,
"gradient_checkpointing": true,
"logging_first_step": false,
"disable_tqdm": false,
"optim": "adamw_hf",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 1.0,
"remove_unused_columns": false
}
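The three full pretrain configs above share the same global batch recipe; only the model and sequence length vary. The effective batch per optimizer step is per_device_train_batch_size x gradient_accumulation_steps x number of GPUs. A quick check, assuming a hypothetical 8-GPU node and the Qwen-7B settings:

per_device_train_batch_size = 1
gradient_accumulation_steps = 16
world_size = 8          # hypothetical GPU count, not part of the config
max_seq_length = 2048

sequences_per_step = (per_device_train_batch_size
                      * gradient_accumulation_steps
                      * world_size)
tokens_per_step = sequences_per_step * max_seq_length
print(sequences_per_step, tokens_per_step)  # 128 sequences, 262144 tokens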
{
"output_dir": "output/firefly-bloom-3b-pretrain-qlora",
"model_name_or_path": "bigscience/bloom-3b",
"train_file": "./data/pretrain",
"train_mode": "qlora",
"task_type": "pretrain",
"num_train_epochs": 1,
"tokenize_num_workers": 10,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-5,
"max_seq_length": 2048,
"logging_steps": 100,
"save_steps": 100,
"lr_scheduler_type": "cosine",
"warmup_ratio": 0.01,
"gradient_checkpointing": true,
"logging_first_step": false,
"disable_tqdm": false,
"optim": "adamw_hf",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 1.0,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-qwen-7b-pretrain-qlora",
"model_name_or_path": "Qwen/Qwen-7B",
"train_file": "./data/pretrain",
"train_mode": "qlora",
"task_type": "pretrain",
"num_train_epochs": 1,
"tokenize_num_workers": 10,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-5,
"max_seq_length": 2048,
"logging_steps": 100,
"save_steps": 100,
"lr_scheduler_type": "cosine",
"warmup_ratio": 0.01,
"gradient_checkpointing": true,
"logging_first_step": false,
"disable_tqdm": false,
"optim": "adamw_hf",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 1.0,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-yi-6b-pretrain-qlora",
"model_name_or_path": "01-ai/Yi-6B",
"train_file": "./data/pretrain",
"train_mode": "qlora",
"task_type": "pretrain",
"num_train_epochs": 1,
"tokenize_num_workers": 10,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-5,
"max_seq_length": 2048,
"logging_steps": 100,
"save_steps": 100,
"lr_scheduler_type": "cosine",
"warmup_ratio": 0.01,
"gradient_checkpointing": true,
"logging_first_step": false,
"disable_tqdm": false,
"optim": "adamw_hf",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 1.0,
"remove_unused_columns": false
}
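For the pretrain task, raw text under ./data/pretrain is tokenized in parallel (see tokenize_num_workers) and packed into fixed-length blocks of max_seq_length tokens. A common packing recipe, sketched under the assumption that Firefly follows the standard causal-LM pattern:

def group_texts(examples, block_size=2048):
    # Concatenate all tokenized documents, then split into fixed blocks;
    # a trailing remainder shorter than block_size is dropped.
    concatenated = sum(examples["input_ids"], [])
    total_len = (len(concatenated) // block_size) * block_size
    input_ids = [concatenated[i:i + block_size]
                 for i in range(0, total_len, block_size)]
    return {"input_ids": input_ids, "labels": [ids[:] for ids in input_ids]}

With HuggingFace datasets this would be applied via dataset.map(group_texts, batched=True, num_proc=10), mirroring "tokenize_num_workers": 10.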
{
"output_dir": "output/firefly-bloom-1b1-sft-full",
"model_name_or_path": "bigscience/bloom-1b1",
"deepspeed": "./train_args/ds_z3_config.json",
"train_file": "./data/dummy_data.jsonl",
"template_name": "default",
"train_mode": "full",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-5,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "cosine",
"warmup_steps": 100,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "adamw_hf",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 1.0,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-bloom-3b-sft-full",
"model_name_or_path": "bigscience/bloom-3b",
"deepspeed": "./train_args/ds_z3_config.json",
"train_file": "./data/dummy_data.jsonl",
"template_name": "default",
"train_mode": "full",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-5,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "cosine",
"warmup_steps": 100,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "adamw_hf",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 1.0,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-minicpm-2b-sft-full",
"model_name_or_path": "openbmb/MiniCPM-2B-dpo-fp16",
"deepspeed": "./train_args/ds_z3_config.json",
"train_file": "./data/dummy_data.jsonl",
"template_name": "minicpm",
"train_mode": "full",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-5,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "cosine",
"warmup_steps": 100,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "adamw_hf",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 1.0,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-qwen-7b-sft-full",
"model_name_or_path": "Qwen/Qwen-7B-Chat",
"deepspeed": "./train_args/ds_z3_config.json",
"train_file": "./data/dummy_data.jsonl",
"template_name": "qwen",
"train_mode": "full",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-5,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "cosine",
"warmup_steps": 100,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "adamw_hf",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 1.0,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-yi-6b-sft-full",
"model_name_or_path": "01-ai/Yi-6B-Chat",
"deepspeed": "./train_args/ds_z3_config.json",
"train_file": "./data/dummy_data.jsonl",
"template_name": "yi",
"train_mode": "full",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-5,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "cosine",
"warmup_steps": 100,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "adamw_hf",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 1.0,
"remove_unused_columns": false
}
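All SFT configs read ./data/dummy_data.jsonl, one JSON object per line. Based on Firefly's documented multi-turn format (field names may vary between versions), a record looks roughly like this:

import json

record = {
    "conversation_id": 1,  # illustrative id
    "conversation": [
        {"human": "Hello, who are you?",
         "assistant": "I am an assistant fine-tuned with Firefly."}
    ],
}
print(json.dumps(record, ensure_ascii=False))

The template_name field ("default", "minicpm", "qwen", "yi") selects how these turns are rendered into each model's chat prompt format.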
{
"output_dir": "output/firefly-bloom-7b1-sft-lora",
"model_name_or_path": "bigscience/bloom-7b1",
"train_file": "./data/dummy_data.jsonl",
"template_name": "default",
"train_mode": "lora",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-qwen1.5-7b-sft-lora",
"model_name_or_path": "Qwen/Qwen1.5-7B-Chat",
"train_file": "./data/dummy_data.jsonl",
"template_name": "qwen",
"train_mode": "lora",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
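After LoRA SFT finishes, only adapter weights are saved under output_dir; for deployment they are usually merged back into the base model. A hedged sketch with peft (the adapter path below is illustrative, since checkpoints actually land in checkpoint-* subdirectories):

from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-7B-Chat")
model = PeftModel.from_pretrained(base, "output/firefly-qwen1.5-7b-sft-lora")
model = model.merge_and_unload()  # fold the low-rank deltas into the base weights
model.save_pretrained("output/firefly-qwen1.5-7b-sft-lora-merged")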
{
"output_dir": "output/firefly-baichuan-13b-sft-qlora",
"model_name_or_path": "baichuan-inc/Baichuan-13B-Chat",
"train_file": "./data/dummy_data.jsonl",
"template_name": "baichuan",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 5,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
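Any of these configs is launched by pointing the training entry script at the JSON file. Firefly's README uses torchrun with train.py; the config path below is an assumed repo-relative location, since file names are not shown in this commit view:

import subprocess

subprocess.run([
    "torchrun", "--nproc_per_node=1",  # set to your GPU count
    "train.py",
    "--train_args_file", "train_args/sft/qlora/baichuan-13b-sft-qlora.json",
], check=True)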