#!/bin/bash
# Throughput benchmark driver.
#
# Reads model definitions from $MODELS_CONFIG (one semicolon-separated line per
# model) and runs benchmark_throughput_0.7.2.py for each, saving a JSON result
# and a full log under $RESULTS_DIR/<model_name>/.
#
# Config line format (fields separated by ';', lists within a field by ','):
#   name;path;tp;batch_list;prompt_tok_list;completion_tok_list;dtype;max_model_len;gpu_mem_util

# Fail on use of unset variables (deliberately NOT -e: one model failing its
# benchmark should not abort the remaining models in the config).
set -u

# --- GPU / ROCm / RCCL / NCCL environment ---
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_ALGO=Ring
export NCCL_LAUNCH_MODE=GROUP
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export SENDRECV_STREAM_WITH_COMPUTE=1
export NCCL_TOPO_FILE="/workspace/test/topo.xml"
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
export ALLREDUCE_STREAM_WITH_COMPUTE=1

# --- vLLM NUMA pinning (rank -> NUMA node) ---
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=3
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=1
export VLLM_RANK3_NUMA=0
export VLLM_RANK4_NUMA=7
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=5
export VLLM_RANK7_NUMA=4
export VLLM_RPC_TIMEOUT=100000

# Path to the model configuration file
MODELS_CONFIG="/workspace/test/models-to-test.cfg"
# Directory where per-model results are written
RESULTS_DIR="/workspace/test/results"

# Read the config on fd 3 rather than stdin: the python benchmark launched
# inside the loop inherits stdin, and on fd 0 it could swallow the remaining
# config lines, silently skipping models. The `|| [[ -n "$line" ]]` clause
# still processes a final line that lacks a trailing newline.
while IFS= read -r -u 3 line || [[ -n "$line" ]]; do
  # Skip comment lines and blank lines
  if [[ "$line" =~ ^# ]] || [[ -z "$line" ]]; then
    continue
  fi

  # Parse one config line into its semicolon-separated fields
  IFS=';' read -ra CONFIG <<< "$line"
  model_name="${CONFIG[0]}"
  model_path="${CONFIG[1]}"
  tp="${CONFIG[2]}"
  batch="${CONFIG[3]//,/ }"              # comma list -> space list
  prompt_tokens="${CONFIG[4]//,/ }"      # comma list -> space list
  completion_tokens="${CONFIG[5]//,/ }"  # comma list -> space list
  dtype="${CONFIG[6]}"
  max_model_len="${CONFIG[7]}"
  gpu_memory_utilization="${CONFIG[8]}"

  echo "开始测试模型: $model_name"
  echo "模型路径: $model_path"
  echo "参数配置:"
  echo "  tensor_parallel_size: $tp"
  echo "  batch_sizes: $batch"
  echo "  prompt_tokens: $prompt_tokens"
  echo "  completion_tokens: $completion_tokens"
  echo "  dtype: $dtype"
  echo "  max_model_len: $max_model_len"
  echo "  gpu_memory_utilization: $gpu_memory_utilization"

  # Create a per-model result directory (-p also creates $RESULTS_DIR itself)
  model_result_dir="${RESULTS_DIR}/${model_name}"
  mkdir -p "$model_result_dir"

  # Run the benchmark. $batch/$prompt_tokens/$completion_tokens are left
  # unquoted on purpose so each comma-derived value becomes its own argument.
  # shellcheck disable=SC2086
  python /workspace/test/benchmark_throughput_0.7.2.py \
    --model "$model_path" \
    --tensor-parallel-size "$tp" \
    --num-prompts $batch \
    --input-len $prompt_tokens \
    --output-len $completion_tokens \
    --dtype "$dtype" \
    --trust-remote-code \
    --max-model-len "$max_model_len" \
    --gpu-memory-utilization "$gpu_memory_utilization" \
    --output-json "${model_result_dir}/${model_name}_tp${tp}.txt" \
    2>&1 | tee "${model_result_dir}/${model_name}_tp${tp}.log"

  echo "完成测试模型: $model_name"
  echo "结果保存在: $model_result_dir"
  echo "----------------------------------------"
done 3< "$MODELS_CONFIG"