#!/bin/bash

## Default vLLM 0.9.2 environment variables supplied by the product team,
## used for benchmarking models other than DeepSeek 671B.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
# export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1 # required on K100-AI

## NUMA binding per the hardware topology (TOPO): pin every rank to node 0.
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=0
export VLLM_RANK2_NUMA=0
export VLLM_RANK3_NUMA=0
export VLLM_RANK4_NUMA=0
export VLLM_RANK5_NUMA=0
export VLLM_RANK6_NUMA=0
export VLLM_RANK7_NUMA=0

## Variables reported from field deployments that may improve performance;
## enable selectively as appropriate for the environment.
# export HSA_FORCE_FINE_GRAIN_PCIE=1
# export NCCL_P2P_LEVEL=SYS
# export NCCL_LAUNCH_MODE=GROUP
# export VLLM_RPC_TIMEOUT=1800000
# export VLLM_SPEC_DECODE_EAGER=1
# export VLLM_ENFORCE_EAGER_BS_THRESHOLD=44   # set to 44 on K100AI; remove this variable on BW1000

# export VLLM_MLA_DISABLE=0
# export VLLM_USE_FLASH_MLA=1
# export VLLM_ZERO_OVERHEAD=1     # clearly speeds up some models, but misbehaves in some environments -- use judiciously; also disables the "thinking" feature of Qwen3 models

# export W8A8_SUPPORT_METHODS=3 # improves W8A8-quantized models
# export ROCBLAS_INT8_ENABLE=0 # improves W8A8-quantized models

# export VLLM_USE_FLASH_ATTN_PA=0 # workaround for accuracy / garbled-output issues

# Disable core dump generation.
ulimit -c 0

# Read the JSON benchmark configuration (must live in the current directory;
# requires the jq binary).
json_data=$(cat auto_quick_check_config.json)

# Extract top-level settings. Feed jq via quoted here-strings: the original
# unquoted `echo $json_data | jq` word-split and glob-expanded the JSON,
# mangling any string value containing runs of whitespace or `*`.
DCU=$(jq -r '.DCU' <<< "$json_data")
vllm_version=$(jq -r '.vllm_version' <<< "$json_data")
pkg_version=$(jq -r '.pkg_version' <<< "$json_data")
dst_path=$(jq -r '.dst_path' <<< "$json_data")

# One compact JSON object per line; consumed by the main loop below.
items=$(jq -c '.items[]' <<< "$json_data")
# Main loop: one iteration per benchmark item (one compact JSON object per
# line of $items). For every (tensor_parallel, batch_size, seqlen) combination
# it runs the offline throughput benchmark and appends one CSV row.
while read -r item; do
    model_name=$(jq -r '.model_name' <<< "$item")
    model_path=$(jq -r '.model_path' <<< "$item")
    dtype=$(jq -r '.dtype' <<< "$item")
    tensor_parallel=$(jq -r '.tensor_parallel' <<< "$item")
    batch_size=$(jq -r '.batch_size' <<< "$item")
    seqlen_tuple=$(jq -r '.seqlen_tuple' <<< "$item")

    result_path=${dst_path}/${model_name}/
    # mkdir -p is idempotent, so no existence check is needed (the original
    # guarded with `[ ! -f ]`, which tests for a regular *file*, so the guard
    # never fired for an existing directory anyway).
    mkdir -p "${result_path}"

    # Write the CSV header only when output.csv is missing or empty
    # (-s already implies existence).
    if [ ! -s "${result_path}output.csv" ]; then
        echo "model_name,DCU,DCU nums,precision,input_len,output_len,bs,TTFT_mean(s),TPOT_mean(s),GenerateThroughput(tokens/s),total_time(s),version" > "${result_path}output.csv"
    fi

    jq -c '.[]' <<< "$tensor_parallel" | while read -r tp; do
        jq -c '.[]' <<< "$batch_size" | while read -r bs; do
            jq -c '.[]' <<< "$seqlen_tuple" | while read -r sq; do
                # Each seqlen entry is a quoted "input output" pair: strip the
                # quotes, then split on the space.
                IFS=' ' read -r input_len output_len <<< "${sq//\"/}"

                # Map dtype to vLLM CLI flags. Reset first so a value from a
                # previous item cannot leak into an item whose dtype matches
                # no case (the original never cleared $extra).
                extra=""
                case $dtype in
                    "float16" | "bfloat16")
                        extra="--dtype ${dtype}" ;;
                    "gptq")
                        extra="--quantization gptq" ;;
                    "awq")
                        extra="--quantization awq" ;;
                esac

                log_file=${result_path}vllm_${model_name}_bs_${bs}_inputlen_${input_len}_outputlen_${output_len}_tp_${tp}.log

                # The offline test script only supports the V0 engine; the
                # online test method is recommended.
                # stdin is redirected from /dev/null so python cannot drain
                # the jq pipe feeding this loop and silently skip iterations.
                # ${extra} is intentionally unquoted: it holds zero or more
                # whitespace-separated flags.
                VLLM_USE_V1=0 python benchmarks/benchmark_throughput.py --model "${model_path}" --tensor-parallel-size "${tp}" \
                    --num-prompts "${bs}" --input-len "${input_len}" --output-len "${output_len}" ${extra} --trust-remote-code --max-model-len 32768 \
                    < /dev/null 2>&1 | tee "${log_file}"

                # Scrape the metrics printed by the benchmark from its log.
                throughput=$(grep -a "^Generate Throughput:" "${log_file}" | awk '{print $3}')
                TTFT_mean=$(grep -a "^TTFT mean:" "${log_file}" | awk '{print $3}')
                TPOT_mean=$(grep -a "^TPOT mean:" "${log_file}" | awk '{print $3}')
                total_time=$(grep -a "^Elapsed_time:" "${log_file}" | awk '{print $2}')
                echo "$model_name,$DCU,$tp,$dtype,$input_len,$output_len,$bs,$TTFT_mean,$TPOT_mean,$throughput,$total_time,$pkg_version" >> "${result_path}output.csv"
            done
        done
    done

done <<< "$items"