#!/bin/bash
#
# Quick benchmark sweep for vLLM offline throughput (benchmark_throughput.py).
# Reads auto_quick_check_config.json and, for every model item, runs each
# (tensor_parallel x batch_size x seqlen_tuple) combination, appending one
# CSV row per run to <dst_path>/<model_name>/output.csv.
#
# NOTE(review): `set -e` is intentionally NOT enabled — a single failing
# benchmark run should not abort the rest of the sweep; failed runs simply
# produce empty metric fields in the CSV.

## Default vLLM 0.9.2 environment variables provided by the product team,
## for testing non-DeepSeek-671B models.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
# export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1  # required on K100-AI

## NUMA binding according to the hardware topology.
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=0
export VLLM_RANK2_NUMA=0
export VLLM_RANK3_NUMA=0
export VLLM_RANK4_NUMA=0
export VLLM_RANK5_NUMA=0
export VLLM_RANK6_NUMA=0
export VLLM_RANK7_NUMA=0

## Environment variables reported by field projects as possible performance
## improvements; enable case by case.
# export HSA_FORCE_FINE_GRAIN_PCIE=1
# export NCCL_P2P_LEVEL=SYS
# export NCCL_LAUNCH_MODE=GROUP
# export VLLM_RPC_TIMEOUT=1800000
# export VLLM_SPEC_DECODE_EAGER=1
# export VLLM_ENFORCE_EAGER_BS_THRESHOLD=44  # set to 44 on K100-AI; remove this variable on BW1000
# export VLLM_MLA_DISABLE=0
# export VLLM_USE_FLASH_MLA=1
# export VLLM_ZERO_OVERHEAD=1  # clear gain for some models but misbehaves in some environments — use with care; also disables the Qwen3 "thinking" feature
# export W8A8_SUPPORT_METHODS=3  # improves W8A8-quantized models
# export ROCBLAS_INT8_ENABLE=0   # improves W8A8-quantized models
# export VLLM_USE_FLASH_ATTN_PA=0  # fixes garbled-output / accuracy issues

# Do not generate core files.
ulimit -c 0

die() { printf '%s\n' "$*" >&2; exit 1; }

# Fail early with a clear message instead of cryptic jq/cat errors mid-sweep.
command -v jq >/dev/null || die "jq is required but not installed"

config_file=auto_quick_check_config.json
[[ -r "$config_file" ]] || die "config file not found or unreadable: $config_file"
json_data=$(cat "$config_file")

# Global settings from the config file.
DCU=$(jq -r '.DCU' <<<"$json_data")
vllm_version=$(jq -r '.vllm_version' <<<"$json_data")  # read for traceability; not in the CSV
pkg_version=$(jq -r '.pkg_version' <<<"$json_data")
dst_path=$(jq -r '.dst_path' <<<"$json_data")
items=$(jq -c '.items[]' <<<"$json_data")

while read -r item; do
  model_name=$(jq -r '.model_name' <<<"$item")
  model_path=$(jq -r '.model_path' <<<"$item")
  dtype=$(jq -r '.dtype' <<<"$item")
  tensor_parallel=$(jq -r '.tensor_parallel' <<<"$item")
  batch_size=$(jq -r '.batch_size' <<<"$item")
  seqlen_tuple=$(jq -r '.seqlen_tuple' <<<"$item")

  result_path=${dst_path}/${model_name}/
  # BUGFIX: original tested `[ ! -f ... ]`, which is always true for a
  # directory, so it never detected an existing result dir. `-d` is the
  # correct test (and `mkdir -p` is idempotent regardless).
  if [[ ! -d "$result_path" ]]; then
    mkdir -p "$result_path"
  fi

  # Write the CSV header only when the file is missing or empty
  # (`-s` already implies existence, so a single test suffices).
  if [[ ! -s "${result_path}output.csv" ]]; then
    echo "model_name,DCU,DCU nums,precision,input_len,output_len,bs,TTFT_mean(s),TPOT_mean(s),GenerateThroughput(tokens/s),total_time(s),version" > "${result_path}output.csv"
  fi

  while read -r tp; do
    while read -r bs; do
      while read -r sq; do
        # seqlen_tuple entries look like "\"128 128\"" — strip the quotes,
        # then split into input/output lengths.
        IFS=' ' read -r input_len output_len <<<"${sq//\"/}"

        # BUGFIX: reset `extra` on every iteration and add a default arm,
        # so an unrecognized dtype cannot silently inherit the previous
        # item's flags. An array avoids unquoted word-splitting.
        extra=()
        case "$dtype" in
          "float16" | "bfloat16") extra=(--dtype "$dtype") ;;
          "gptq")                 extra=(--quantization gptq) ;;
          "awq")                  extra=(--quantization awq) ;;
          *) echo "WARN: unknown dtype '$dtype' for $model_name, passing no dtype flags" >&2 ;;
        esac

        log_file=${result_path}vllm_${model_name}_bs_${bs}_inputlen_${input_len}_outputlen_${output_len}_tp_${tp}.log

        # The current offline test script only supports the V0 engine;
        # the online test method is recommended instead.
        VLLM_USE_V1=0 python benchmarks/benchmark_throughput.py \
          --model "$model_path" --tensor-parallel-size "$tp" \
          --num-prompts "$bs" --input-len "$input_len" --output-len "$output_len" \
          "${extra[@]}" --trust-remote-code --max-model-len 32768 \
          2>&1 | tee "$log_file"

        # Scrape metrics from the run log; empty fields indicate a failed run.
        throughput=$(grep -a "^Generate Throughput:" "$log_file" | awk '{print $3}')
        TTFT_mean=$(grep -a "^TTFT mean:" "$log_file" | awk '{print $3}')
        TPOT_mean=$(grep -a "^TPOT mean:" "$log_file" | awk '{print $3}')
        total_time=$(grep -a "^Elapsed_time:" "$log_file" | awk '{print $2}')

        echo "$model_name,$DCU,$tp,$dtype,$input_len,$output_len,$bs,$TTFT_mean,$TPOT_mean,$throughput,$total_time,$pkg_version" >> "${result_path}output.csv"
      done < <(jq -c '.[]' <<<"$seqlen_tuple")
    done < <(jq -c '.[]' <<<"$batch_size")
  done < <(jq -c '.[]' <<<"$tensor_parallel")
done <<<"$items"