Commit d3e0fa63 authored by chenzk

v1.0.3
{
"output_dir": "output/firefly-baichuan-13b-sft-qlora",
"model_name_or_path": "baichuan-inc/Baichuan-13B-Chat",
"train_file": "./data/dummy_data.jsonl",
"template_name": "baichuan",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 5,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
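Each JSON block in this commit is a standalone QLoRA SFT train-args file for one model family; only the model path, template name, and a few hyperparameters vary between them. As a minimal sketch of how a trainer might consume such a file, the snippet below splits the flat JSON across two dataclasses with transformers' HfArgumentParser. The QLoRAArguments layout and the file path are illustrative assumptions, not Firefly's exact API.

# A minimal sketch, assuming the trainer routes each JSON key to whichever
# dataclass declares it. The field layout and path here are illustrative.
from dataclasses import dataclass, field
from transformers import HfArgumentParser, TrainingArguments

@dataclass
class QLoRAArguments:
    # Keys used in these configs that TrainingArguments does not define.
    model_name_or_path: str = field(default=None)
    train_file: str = field(default=None)
    template_name: str = field(default="default")
    max_seq_length: int = field(default=1024)
    lora_rank: int = field(default=64)
    lora_alpha: int = field(default=16)
    lora_dropout: float = field(default=0.05)
    use_unsloth: bool = field(default=False)  # set only in the Gemma configs below

parser = HfArgumentParser((QLoRAArguments, TrainingArguments))
# parse_json_file assigns each key to the dataclass that declares it.
qlora_args, training_args = parser.parse_json_file("baichuan-13b-sft-qlora.json")

All remaining keys (output_dir, optim, fp16, ...) land in TrainingArguments unchanged.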
{
"output_dir": "output/firefly-baichuan-7b-sft-qlora",
"model_name_or_path": "baichuan-inc/baichuan-7B",
"train_file": "./data/dummy_data.jsonl",
"template_name": "baichuan",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
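Note the effective batch size shared by every config here: per_device_train_batch_size 1 × gradient_accumulation_steps 16 = 16 sequences per GPU per optimizer step (times the number of GPUs under data parallelism). The larger models (13B and up) also drop the learning rate from 2e-4 to 1e-4 relative to the 7B-scale configs.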
{
"output_dir": "output/firefly-baichuan2-13b-sft-qlora",
"model_name_or_path": "baichuan-inc/Baichuan2-13B-Chat",
"train_file": "./data/dummy_data.jsonl",
"template_name": "baichuan2",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-baichuan2-7b-sft-qlora",
"model_name_or_path": "baichuan-inc/Baichuan2-7B-Chat",
"train_file": "./data/dummy_data.jsonl",
"template_name": "baichuan2",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-bloom-7b1-sft-qlora",
"model_name_or_path": "bigscience/bloom-7b1",
"train_file": "./data/dummy_data.jsonl",
"template_name": "default",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
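template_name selects the chat markup used to serialize each conversation turn; bloom has no chat format of its own, so it falls back to "default". A purely illustrative sketch of what such a template registry can look like — the names mirror the template_name values in these configs, but the format strings are assumptions, not Firefly's real definitions:

# Purely illustrative template registry; format strings are assumptions.
TEMPLATES = {
    "default": "{query}\n{response}",
    "llama2": "[INST] {query} [/INST] {response}",
}

def render_turn(template_name: str, query: str, response: str) -> str:
    # Wrap one user/assistant exchange in the model's expected chat markup.
    return TEMPLATES[template_name].format(query=query, response=response)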
{
"output_dir": "output/firefly-chatglm2-6b-sft-qlora",
"model_name_or_path": "THUDM/chatglm2-6b",
"train_file": "./data/dummy_data.jsonl",
"template_name": "chatglm2",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-chatglm3-6b-sft-qlora",
"model_name_or_path": "THUDM/chatglm3-6b",
"train_file": "./data/dummy_data_chatglm3.jsonl",
"template_name": "chatglm3",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
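Every config points train_file at a JSONL of multi-turn conversations; chatglm3 alone gets a dedicated dummy_data_chatglm3.jsonl, presumably because its template serializes turns differently. A minimal loader sketch follows; the "conversation"/"human"/"assistant" field names are assumptions for illustration, not a guaranteed schema.

# Flattens a JSONL train_file into (prompt, reply) pairs.
# The record field names are assumed, not taken from the repo.
import json

def load_pairs(path: str) -> list[tuple[str, str]]:
    pairs = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            for turn in record["conversation"]:
                pairs.append((turn["human"], turn["assistant"]))
    return pairs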
{
"output_dir": "output/firefly-deepseek-7b-sft-qlora",
"model_name_or_path": "deepseek-ai/deepseek-llm-7b-chat",
"train_file": "./data/dummy_data.jsonl",
"template_name": "deepseek",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-deepseek-moe-16b-sft-qlora",
"model_name_or_path": "deepseek-ai/deepseek-moe-16b-chat",
"train_file": "./data/dummy_data.jsonl",
"template_name": "deepseek",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-gemma-2b-sft-qlora",
"model_name_or_path": "google/gemma-2b-it",
"train_file": "./data/dummy_data.jsonl",
"template_name": "gemma",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"use_unsloth": true,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
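The two Gemma configs are the only ones that set use_unsloth, switching to Unsloth's patched loader for faster, lower-memory QLoRA training. A minimal sketch of that path, assuming Unsloth's public FastLanguageModel API (exact kwargs may differ by version); target_modules is an illustrative subset.

# Sketch of the use_unsloth path; hyperparameters mirror the config above.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="google/gemma-2b-it",
    max_seq_length=1024,   # max_seq_length above
    load_in_4bit=True,     # QLoRA: 4-bit base weights
)
model = FastLanguageModel.get_peft_model(
    model,
    r=64,                  # lora_rank
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # illustrative
)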
{
"output_dir": "output/firefly-gemma-7b-sft-qlora",
"model_name_or_path": "google/gemma-7b-it",
"train_file": "./data/dummy_data.jsonl",
"template_name": "gemma",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"use_unsloth": true,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-internlm-20b-sft-qlora",
"model_name_or_path": "internlm/internlm-chat-20b",
"train_file": "./data/dummy_data.jsonl",
"template_name": "internlm",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-internlm-7b-sft-qlora",
"model_name_or_path": "internlm/internlm-chat-7b",
"train_file": "./data/dummy_data.jsonl",
"template_name": "internlm",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-internlm2-20b-sft-qlora",
"model_name_or_path": "internlm/internlm2-chat-20b",
"train_file": "./data/dummy_data.jsonl",
"template_name": "internlm2",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-internlm2-7b-sft-qlora",
"model_name_or_path": "internlm/internlm2-chat-7b",
"train_file": "./data/dummy_data.jsonl",
"template_name": "internlm2",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
{
"output_dir": "output/firefly-llama2-13b-sft-qlora",
"model_name_or_path": "NousResearch/Llama-2-13b-chat-hf",
"train_file": "./data/dummy_data.jsonl",
"template_name": "llama2",
"num_train_epochs": 1,
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"learning_rate": 1e-4,
"max_seq_length": 1024,
"logging_steps": 100,
"save_steps": 100,
"save_total_limit": 1,
"lr_scheduler_type": "constant_with_warmup",
"warmup_steps": 100,
"lora_rank": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"gradient_checkpointing": true,
"disable_tqdm": false,
"optim": "paged_adamw_32bit",
"seed": 42,
"fp16": true,
"report_to": "tensorboard",
"dataloader_num_workers": 0,
"save_strategy": "steps",
"weight_decay": 0,
"max_grad_norm": 0.3,
"remove_unused_columns": false
}
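Every config here drives the same underlying QLoRA recipe: load the base model 4-bit quantized, prepare it for k-bit training, and attach LoRA adapters with the rank/alpha/dropout given above. A minimal sketch with transformers + peft; the nf4/double-quant settings are common QLoRA defaults assumed here, and target_modules is illustrative.

# QLoRA setup matching the hyperparameters shared by these configs.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # assumed; standard QLoRA choice
    bnb_4bit_use_double_quant=True,        # assumed; standard QLoRA choice
    bnb_4bit_compute_dtype=torch.float16,  # matches "fp16": true
)
model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-13b-chat-hf",
    quantization_config=bnb_config,
    trust_remote_code=True,
)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
peft_config = LoraConfig(
    r=64,                  # lora_rank
    lora_alpha=16,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # illustrative
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # adapters are a small fraction of 13B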