set -x

GPUS=${GPUS:-8}
BATCH_SIZE=${BATCH_SIZE:-16}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2}
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))

export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3
export LAUNCHER=pytorch

OUTPUT_DIR='/InternVL/saves/internvl2-40b/finetune_multi_dcu'

if [ ! -d "$OUTPUT_DIR" ]; then
  mkdir -p "$OUTPUT_DIR"
fi

# number of gpus: 8
# batch size per gpu: 2
# gradient accumulation steps: 1
# total batch size: 16
# epoch: 1
torchrun \
  --nnodes=1 \
  --node_rank=0 \
  --master_addr=127.0.0.1 \
  --nproc_per_node=${GPUS} \
  --master_port=${MASTER_PORT} \
  internvl/train/internvl_chat_finetune.py \
  --model_name_or_path "/InternVL/InternVL2-40B" \
  --conv_style "internlm2-chat" \
  --output_dir ${OUTPUT_DIR} \
  --meta_path "/internvl_chat/shell/data/internvl_1_2_finetune_custom.json" \
  --overwrite_output_dir True \
  --force_image_size 448 \
  --max_dynamic_patch 12 \
  --down_sample_ratio 0.5 \
  --drop_path_rate 0.0 \
  --freeze_llm True \
  --freeze_mlp True \
  --freeze_backbone True \
  --use_llm_lora 16 \
  --vision_select_layer -1 \
  --dataloader_num_workers 8 \
  --fp16 True \
  --num_train_epochs 1 \
  --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \
  --gradient_accumulation_steps ${GRADIENT_ACC} \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 200 \
  --save_total_limit 1 \
  --learning_rate 2e-5 \
  --weight_decay 0.05 \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --max_seq_length 4096 \
  --do_train True \
  --grad_checkpoint True \
  --group_by_length True \
  --dynamic_image_size True \
  --use_thumbnail True \
  --ps_version 'v2' \
  --deepspeed "./internvl_chat/zero_stage3_config_34b.json" \
  2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
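
# The file passed to --meta_path is expected to follow the InternVL custom-data
# meta format (field names root/annotation/data_augment/repeat_time/length per
# the InternVL fine-tuning docs). The commented-out snippet below is only a
# minimal sketch for reference: the dataset name, image root, annotation path,
# and sample count are hypothetical placeholders, not the actual contents of
# internvl_1_2_finetune_custom.json used above.
#
# cat > /internvl_chat/shell/data/internvl_1_2_finetune_custom.json <<'EOF'
# {
#   "my_custom_dataset": {
#     "root": "/path/to/images/",
#     "annotation": "/path/to/annotations.jsonl",
#     "data_augment": false,
#     "repeat_time": 1,
#     "length": 10000
#   }
# }
# EOF
#
# The GPUS / BATCH_SIZE / PER_DEVICE_BATCH_SIZE defaults at the top can be
# overridden from the environment when launching the script, e.g. (script
# filename is a placeholder):
#
# GPUS=4 PER_DEVICE_BATCH_SIZE=4 bash finetune_internvl2_40b_lora_dcu.sh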