#!/bin/bash

HIP_VISIBLE_DEVICES=4,5,6,7 GPUS_PER_NODE=4 torchrun $DISTRIBUTED_ARGS finetune.py \
    --model_name_or_path $MODEL \
    --llm_type $LLM_TYPE \
    --data_path $DATA \
    --eval_data_path $EVAL_DATA \
    --remove_unused_columns false \
    --label_names "labels" \
    --prediction_loss_only false \
    --bf16 false \
    --bf16_full_eval false \
    --fp16 true \
    --fp16_full_eval true \
    --do_train \
    --do_eval \
    --tune_vision true \
    --tune_llm false \
    --use_lora true \
    --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj)" \
    --model_max_length 2048 \
    --max_slice_nums 9 \
    --max_steps 100 \
    --eval_steps 10 \
    --output_dir /home/wanglch/projects/saves/MiniCPM-Llama3-V-2_5/lora_train_dtk \
    --logging_dir /home/wanglch/projects/saves/MiniCPM-Llama3-V-2_5/lora_train_dtk \
    --logging_strategy "steps" \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --save_steps 100 \
    --save_total_limit 10 \
    --learning_rate 1e-6 \
    --weight_decay 0.1 \
    --adam_beta2 0.95 \
    --warmup_ratio 0.01 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --gradient_checkpointing true \
    --deepspeed ds_config_zero2.json \
    --report_to "tensorboard" # wandb

Final training and evaluation metrics reported by the run:

{'train_runtime': 305.4486, 'train_samples_per_second': 2.619, 'train_steps_per_second': 0.327, 'train_loss': 0.06511555195844267, 'epoch': 100.0}
{'eval_loss': 0.389315664768219, 'eval_runtime': 0.2161, 'eval_samples_per_second': 4.628, 'eval_steps_per_second': 4.628, 'epoch': 100.0}
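The torchrun command references $DISTRIBUTED_ARGS, $MODEL, $LLM_TYPE, $DATA, and $EVAL_DATA without defining them, so they are assumed to be set earlier in the script (outside this excerpt). The sketch below shows one typical single-node layout for those definitions, modeled on the upstream MiniCPM-V finetune_lora.sh; the model ID, port, and data paths are placeholders, not the values used for this run:

# Sketch only: hypothetical single-node values, adjust to your environment.
GPUS_PER_NODE=4                        # should match the number of visible HIP devices
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost                  # placeholder: single-node training
MASTER_PORT=6001                       # placeholder: any free port

MODEL="openbmb/MiniCPM-Llama3-V-2_5"   # or a local checkpoint directory
LLM_TYPE="llama3"                      # backbone type expected by finetune.py
DATA="path/to/train.json"              # placeholder training data path
EVAL_DATA="path/to/eval.json"          # placeholder evaluation data path

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

With max_steps 100 and save_steps 100, a single LoRA checkpoint is written to --output_dir at the end of the run, and per-step losses (logging_steps 1) go to --logging_dir for tensorboard.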