deepspeed --num_gpus=8 src/train_bash.py \
    --stage sft \
    --model_name_or_path ../baichuan-13b-chat \
    --do_train \
    --template baichuan \
    --dataset alpaca_gpt4_en,alpaca_gpt4_zh,self_cognition,oaast_sft,sharegpt_zh,lima \
    --finetuning_type full \
    --output_dir output \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 2 \
    --preprocessing_num_workers 16 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 100 \
    --eval_steps 100 \
    --learning_rate 5e-5 \
    --max_grad_norm 0.5 \
    --num_train_epochs 2.0 \
    --val_size 0.01 \
    --evaluation_strategy steps \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --max_source_length 2000 \
    --max_target_length 1200 \
    --plot_loss \
    --fp16 \
    --deepspeed deepspeed.json
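The command points DeepSpeed at a `deepspeed.json` whose contents are not shown here. Below is a minimal sketch of what such a file might look like, assuming ZeRO stage 2 with the Hugging Face Trainer integration (which lets DeepSpeed inherit batch size, gradient accumulation, gradient clipping, and fp16 settings from the command-line flags via `"auto"` values); the actual file used may differ.

    {
      "train_micro_batch_size_per_gpu": "auto",
      "gradient_accumulation_steps": "auto",
      "gradient_clipping": "auto",
      "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
      },
      "zero_optimization": {
        "stage": 2,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "allgather_bucket_size": 5e8,
        "reduce_bucket_size": 5e8
      }
    }

With `--finetuning_type full` on a 13B model across 8 GPUs, stage 2 partitions optimizer states and gradients across ranks; if GPU memory is still insufficient, raising the stage to 3 (which also partitions the parameters) is the usual next step.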