#!/usr/bin/env bash
set -ex

# RUN_NAME and LR are expected to be set by the caller; both are folded
# into the output directory name below.
DATESTR=$(date +%Y%m%d-%H%M%S)
OUTPUT_DIR=output/${RUN_NAME}-${DATESTR}-${LR}
CACHE_DIR=cache
MASTER_PORT=$(shuf -n 1 -i 10000-65535)

# ROCm/HIP and NCCL environment for a single-node, 8-GPU run.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_LAUNCH_MODE=GROUP
# export NCCL_DEBUG=INFO
export NCCL_P2P_DISABLE=0
# export MASTER_ADDR="127.0.0.1"
# export MASTER_PORT=59992
export LLAMA_NN=1
export TORCH_NCCL_TIMEOUT=3600000
export TORCH_DISTRIBUTED_DEFAULT_TIMEOUT=1800
# Constrain NCCL to 16-20 channels (min must not exceed max).
export NCCL_MAX_NCHANNELS=20
export NCCL_MIN_NCHANNELS=16
export NCCL_P2P_LEVEL=SYS
export ROCBLAS_COMPUTETYPE_FP16R=0
export LD_LIBRARY_PATH=/home/rocblas-install/lib:$LD_LIBRARY_PATH
export TOKENIZERS_PARALLELISM=false
# Temporarily suppress Python warnings via this environment variable.
export PYTHONWARNINGS="ignore"

mkdir -p "$OUTPUT_DIR"

deepspeed --num_gpus 8 --num_nodes 1 --master_port=$MASTER_PORT src/train.py \
    --stage sft \
    --do_train \
    --lora_rank 8 \
    --lora_alpha 8 \
    --lora_target all \
    --resize_vocab True \
    --optim adamw_torch \
    --model_name_or_path /workspace/DL_DATA/llm-models/qwen2.5/Qwen2.5-VL-7B-Instruct \
    --dataset chartqa \
    --template qwen2_vl \
    --finetuning_type lora \
    --output_dir "$OUTPUT_DIR" \
    --overwrite_cache \
    --overwrite_output_dir True \
    --warmup_steps 100 \
    --max_grad_norm 1.0 \
    --weight_decay 0.1 \
    --ddp_timeout 120000000 \
    --per_device_train_batch_size 16 \
    --gradient_accumulation_steps 16 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --learning_rate 1e-4 \
    --num_train_epochs 2 \
    --max_samples 1000 \
    --plot_loss \
    --bf16 \
    --logging_dir /home/project/nanwang/qwen2.5_vl_prof/z_logs \
    --deepspeed examples/deepspeed/ds_z2_config.json \
    --dataloader_num_workers 32 2>&1 | tee -a "${OUTPUT_DIR}/train_dcu.log"
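
# Example invocation (a minimal sketch: the filename "train_dcu.sh" is an
# assumption inferred from the log name above; RUN_NAME and LR are whatever
# run label and learning rate you want encoded in OUTPUT_DIR):
#
#   RUN_NAME=qwen25vl-chartqa-lora LR=1e-4 bash train_dcu.sh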