# We include a simple full-parameter finetuning & inference script here. Our V0.1 chat model was finetuned with this script.
# The FT dataset we use is openassistant-guanaco. For finetuning with less than 4GB RAM, we refer you to the QLoRA and bitsandbytes repos;
# a rough, commented QLoRA-style sketch follows this header, and a minimal inference sketch is at the end of this script.
# We did not do extensive hyperparameter tuning, nor did we search for more performant FT datasets.
# We hope the community will explore finetuning TinyLlama and come up with better chat models; we will include community-finetuned models in this repo.

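# A rough, low-memory QLoRA-style sketch (commented out). It assumes sft/finetune.py exposes the
# 4-bit quantization and LoRA flags of the upstream QLoRA trainer (--bits, --quant_type, --lora_r,
# --lora_alpha, --lora_dropout); if it does not, run qlora.py from the QLoRA repo with the same
# arguments. Hyperparameters below are illustrative, not tuned.
# CUDA_VISIBLE_DEVICES=0 python sft/finetune.py \
#     --model_name_or_path PY007/TinyLlama-1.1B-intermediate-step-240k-503b \
#     --output_dir ./output/503B_QLoRA_sketch \
#     --dataset oasst1 \
#     --source_max_len 16 \
#     --target_max_len 512 \
#     --bits 4 \
#     --quant_type nf4 \
#     --lora_r 64 \
#     --lora_alpha 16 \
#     --lora_dropout 0.05 \
#     --per_device_train_batch_size 1 \
#     --gradient_accumulation_steps 16 \
#     --learning_rate 2e-4 \
#     --num_train_epochs 5 \
#     --do_train
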
# V0.1
# CUDA_VISIBLE_DEVICES=0 accelerate launch --main_process_port 1234 sft/finetune.py \
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --multi_gpu --num_processes 4 --main_process_port 1234 sft/finetune.py \
    --model_name_or_path PY007/TinyLlama-1.1B-intermediate-step-240k-503b \
    --output_dir ./output/503B_FT_lr1e-5_ep5 \
    --logging_steps 10 \
    --save_strategy epoch \
    --data_seed 42 \
    --save_total_limit 6 \
    --evaluation_strategy epoch \
    --eval_dataset_size 512 \
    --max_eval_samples 1000 \
    --per_device_eval_batch_size 1 \
    --max_new_tokens 32 \
    --dataloader_num_workers 3 \
    --group_by_length=False \
    --logging_strategy steps \
    --remove_unused_columns False \
    --do_train \
    --do_eval \
    --warmup_ratio 0.05 \
    --lr_scheduler_type constant \
    --dataset oasst1 \
    --source_max_len 16 \
    --target_max_len 512 \
    --per_device_train_batch_size 4 \
    --max_steps 0 \
    --num_train_epochs 5 \
    --learning_rate 1e-5 \
    --adam_beta2 0.999 \
    --max_grad_norm 1.0 \
    --weight_decay 0.0 \
    --seed 0 \
    --trust_remote_code \
    --report_to wandb 


# # V0.2
# CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --multi_gpu --num_processes 4 --main_process_port 1234 sft/finetune.py \
#     --model_name_or_path PY007/TinyLlama-1.1B-intermediate-step-480k-1T \
#     --output_dir ./output/503B_FT_lr1e-5_ep5_top1_2023-08-25 \
#     --logging_steps 10 \
#     --save_strategy epoch \
#     --data_seed 42 \
#     --save_total_limit 6 \
#     --evaluation_strategy epoch \
#     --eval_dataset_size 512 \
#     --max_eval_samples 1000 \
#     --per_device_eval_batch_size 1 \
#     --max_new_tokens 32 \
#     --dataloader_num_workers 3 \
#     --group_by_length=False \
#     --logging_strategy steps \
#     --remove_unused_columns False \
#     --do_train \
#     --do_eval \
#     --warmup_ratio 0.05 \
#     --lr_scheduler_type constant \
#     --dataset OpenAssistant/oasst_top1_2023-08-25 \
#     --dataset_format oasst1 \
#     --source_max_len 16 \
#     --target_max_len 512 \
#     --per_device_train_batch_size 4 \
#     --max_steps 0 \
#     --num_train_epochs 5 \
#     --learning_rate 1e-5 \
#     --adam_beta2 0.999 \
#     --max_grad_norm 1.0 \
#     --weight_decay 0.0 \
#     --seed 0 \
#     --trust_remote_code \
#     --report_to wandb
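
# A minimal inference sketch (commented out) for chatting with a finetuned checkpoint. It assumes
# the model saved under --output_dir loads with transformers' AutoModelForCausalLM and that the
# oasst1/guanaco "### Human: ... ### Assistant:" prompt template applies; adjust the checkpoint
# path and the template to match your run.
# python - <<'EOF'
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
#
# ckpt = "./output/503B_FT_lr1e-5_ep5"  # point this at the checkpoint you want to test
# tokenizer = AutoTokenizer.from_pretrained(ckpt)
# model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, device_map="auto")
#
# prompt = "### Human: What is TinyLlama?### Assistant:"
# inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, top_p=0.9, temperature=0.7)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# EOF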