Commit a3c89b8c authored by liuxu3's avatar liuxu3
Browse files

added vllm 0.11.0 auto test scripts

parent fba2e3b5
# vllm-auto-test
# vLLM 0.11.0 Management
vLLM-0.11.0的软件版本管理及脚本程序管理
## 当前版本信息
1. 最新镜像
docker pull harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.11.0-ubuntu22.04-dtk25.04.2-1226-das1.7-py3.10-20251226
该镜像默认使用V1 Engine,并且默认开启了Prefix Caching功能
2. 常见模型部署方法
参考智算产品部提供的部署手册(定期更新):https://r0ddbu55vzx.feishu.cn/docx/LL7KdYsWeoch7PxaS7wcBR5OnLe?from=from_copylink
3. 常见模型性能摸测结果
【金山文档 | WPS云文档】 大模型推理性能记录表-2026:https://www.kdocs.cn/l/cg98BKZula49(by 刘玉升/刘煦)
4. 通用小参数量的大模型推荐的w8a8精度的量化方法是compressed-tensors、quark、w8a8-dynamic?(by 王凯雄)
DCU推荐使用: compressed-tensors
compressed-tensors 量化方法见链接: https://sw4sldkryl8.feishu.cn/docx/RJqldrez2o477Cxyo40cg3Ven7h?from=from_copylink
w8a8-dynamic 是华为卡上模型量化后的格式
## 代码/脚本更新日志
2026/01/27:启动方式改用vllm serve/vllm bench指令;新增FP8精度支持(by 刘煦)
2026/01/27:新增vllm0.11.0在线测试的自动化测试脚本,使用相关的环境变量;新增单并发吞吐指标记录(by 刘煦)
# vLLM 0.11.0 Management
离线环境安装包
api server测试方法需要jq和netcat依赖包,在离线环境内需要自行安装
jq_depends.zip和netcat_depends.zip是deb包,在ubuntu环境下可以直接安装
```
unzip jq_depends.zip
unzip netcat_depends.zip
dpkg -i *.deb
```
jq-1.5.tar.gz和netcat-0.7.1.tar.gz是源码,可以直接源码编译
# Online API Server Benchmark Scripts
vLLM 0.11.0 在线评测测试的代码/脚本管理
## 更新日志
2026/01/27:启动方式改用vllm serve/vllm bench指令;新增FP8精度支持(by 刘煦)
2026/01/27:新增vllm0.11.0在线测试的自动化测试脚本,使用相关的环境变量;新增单并发吞吐指标记录(by 刘煦)
## 使用方法
1. 需要安装jq和netcat
```
apt-get install jq
apt-get install netcat
```
2. 文件说明:
1)auto_quick_check_config.json:配置文件
2)online_quick_check.sh:自动化启动脚本
3)run_apiserver.sh:运行服务的脚本
```
# 配置文件说明 auto_quick_check_config.json
{
"DCU": "BW1000", # DCU型号
"vllm_version": "0.11.0", # vllm版本号
"pkg_version": "dtk25.04.2", # DTK版本号
"dst_path": "./result/", # 结果/日志存放目录
"items":[
{
"model_name": "Qwen3-32B", # 模型名称
"model_path": "/data/models/Qwen3-32B/", # 模型路径
"dtype": "float16", # 数据类型,可选范围:float16/bfloat16/gptq-int8/gptq-int4/float8/w8a8/awq
"tensor_parallel": [2, 4], # 使用卡数
"batch_size": [1, 4, 8, 16], # 并发数
"seqlen_tuple": ["512 512", "4096 1024"] # 输入输出序列组合
}
]
}
```
3. 运行及逻辑:
指令:bash online_quick_check.sh
逻辑:调用run_apiserver.sh启动服务 -> 监听端口是否启动 -> 启动后运行评测指令评测 -> 汇总数据
{
"DCU": "BW1000",
"vllm_version": "0.11.0",
"pkg_version": "dtk25.04.2",
"dst_path": "./result/",
"items":[
{
"model_name": "Qwen3-32B",
"model_path": "/data/models/Qwen3-32B/",
"dtype": "float16",
"tensor_parallel": [2, 4],
"batch_size": [1, 4, 8, 16, 32, 64, 128],
"seqlen_tuple": ["512 512", "4096 1024", "16384 1024"]
}
]
}
\ No newline at end of file
#!/bin/bash
# online_quick_check.sh -- automated online benchmark driver for vLLM 0.11.0.
#
# Reads auto_quick_check_config.json, then for every configured model and
# tensor-parallel size: launches the API server via run_apiserver.sh, waits
# for the port to open, runs `vllm bench serve` for every (seqlen, batch)
# combination, scrapes the summary metrics from the benchmark log, and
# appends one CSV row per run to <dst_path>/<model_name>/output.csv.
#
# Dependencies: jq, netcat (nc), python, vllm, run_apiserver.sh.
HOST=127.0.0.1
PORT=8081
TIMEOUT=1200 # server-launch watchdog timeout (seconds)
INTERVAL=60  # port-polling interval (seconds)
ENDPOINT=/v1/completions

# Read the JSON configuration file.
json_data=$(cat auto_quick_check_config.json)
DCU=$(echo "$json_data" | jq -r '.DCU')
vllm_version=$(echo "$json_data" | jq -r '.vllm_version')
pkg_version=$(echo "$json_data" | jq -r '.pkg_version')
dst_path=$(echo "$json_data" | jq -r '.dst_path')
items=$(echo "$json_data" | jq -c '.items[]')

while read -r item; do
    model_name=$(echo "$item" | jq -r '.model_name')
    model_path=$(echo "$item" | jq -r '.model_path')
    dtype=$(echo "$item" | jq -r '.dtype')
    tensor_parallel=$(echo "$item" | jq -r '.tensor_parallel')
    batch_size=$(echo "$item" | jq -r '.batch_size')
    seqlen_tuple=$(echo "$item" | jq -r '.seqlen_tuple')
    result_path=${dst_path}/${model_name}/
    # BUGFIX: the original tested `! -f`, which is never true for a
    # directory; use -d so an existing result directory is detected.
    if [ ! -d "${result_path}" ]; then
        mkdir -p "${result_path}"
    fi
    # Write the CSV header only if output.csv is missing or empty.
    if [ -e "${result_path}output.csv" ] && [ -s "${result_path}output.csv" ]; then
        :
    else
        echo "model_name,DCU,DCU nums,precision,input_len,output_len,bs,TTFT_mean(ms),TPOT_mean(ms),ITL_mean(ms),GenerateThroughput(tokens/s),TotalThroughput(tokens/s),Duration(s),OutputThroughputPerBS(tokens/s),DecodeThroughputPerBS(tokens/s),version" > "${result_path}output.csv"
    fi
    # BUGFIX: feed the tp loop via process substitution instead of a pipe,
    # so the `exit 1` on launch timeout aborts the whole script rather than
    # only a pipeline subshell.
    while read -r tp; do
        # Launch the API server in the background; detach its stdin so it
        # cannot swallow this loop's input.
        nohup bash run_apiserver.sh "$model_name" "$model_path" "$tp" "$dtype" "$HOST" "$PORT" "$result_path" < /dev/null &
        start_time=$(date +%s)
        while true; do
            if nc -zv "$HOST" "$PORT"; then # server port is open
                while read -r sq; do
                    while read -r bs; do
                        # jq -c emits strings with their quotes (e.g. "512 512"
                        # including the double quotes); strip them and split
                        # into input/output lengths.
                        IFS=' ' read -r input_len output_len <<< "${sq//\"/}"
                        log_file=${result_path}/${model_name}-tp-${tp}-input_len-${input_len}-output_len-${output_len}-bs-${bs}.log
                        # Run the benchmark; </dev/null keeps it off the
                        # loop's stdin.
                        vllm bench serve --model "${model_name}" \
                            --dataset-name random \
                            --tokenizer "${model_path}" \
                            --trust-remote-code \
                            --port "${PORT}" \
                            --endpoint "${ENDPOINT}" \
                            --random-input-len "${input_len}" \
                            --random-output-len "${output_len}" \
                            --ignore_eos \
                            --num-prompts "${bs}" \
                            --max-concurrency "${bs}" < /dev/null 2>&1 | tee "${log_file}"
                        # Scrape the summary metrics from the benchmark log.
                        output_throughput=$(grep -a "^Output token throughput (tok/s):" "${log_file}" | awk '{print $5}')
                        total_throughput=$(grep -a "^Total Token throughput (tok/s):" "${log_file}" | awk '{print $5}')
                        TTFT_mean=$(grep -a "^Mean TTFT (ms):" "${log_file}" | awk '{print $4}')
                        TPOT_mean=$(grep -a "^Mean TPOT (ms):" "${log_file}" | awk '{print $4}')
                        ITL_mean=$(grep -a "^Mean ITL (ms):" "${log_file}" | awk '{print $4}')
                        duration=$(grep -a "^Benchmark duration (s):" "${log_file}" | awk '{print $4}')
                        # Derived per-concurrency metrics.
                        output_throughput_per_bs=$(python -c "print(round(${output_throughput} / ${bs}, 2))")
                        decode_throughput_per_bs=$(python -c "print(round(1000.0 / ${TPOT_mean}, 2))")
                        echo "$model_name,$DCU,$tp,$dtype,$input_len,$output_len,$bs,$TTFT_mean,$TPOT_mean,$ITL_mean,$output_throughput,$total_throughput,$duration,$output_throughput_per_bs,$decode_throughput_per_bs,$pkg_version" >> "${result_path}output.csv"
                        sleep 10
                    done < <(echo "$batch_size" | jq -c '.[]')
                done < <(echo "$seqlen_tuple" | jq -c '.[]')
                break
            else
                current_time=$(date +%s)
                elapsed_time=$((current_time - start_time))
                if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
                    echo "ERR:PORT ${PORT} launch time out, exit!!!。"
                    exit 1
                fi
                echo "PORT ${PORT} has not been launched yet, please wait...."
                sleep "$INTERVAL"
            fi
        done
        # Tear the server down before the next tensor-parallel configuration.
        pkill -f vllm
        sleep 60
    done < <(echo "$tensor_parallel" | jq -c '.[]')
done <<< "$items"
#!/bin/bash
# run_apiserver.sh -- launch a vLLM 0.11.0 OpenAI-compatible API server.
#
# Usage: run_apiserver.sh <model_name> <model_path> <tp> <dtype> <host> <port> <dst_path>
#   dtype must be one of: float16 bfloat16 gptq-int8 gptq-int4 float8 w8a8 awq
# Server stdout/stderr is teed to <dst_path>/launcher_<model>_tp_<tp>_dtype_<dtype>_<ts>.log

# NUMA binding derived from the BW1000 topology.
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=3
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=1
export VLLM_RANK3_NUMA=0
export VLLM_RANK4_NUMA=7
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=5
export VLLM_RANK7_NUMA=4
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export NCCL_P2P_LEVEL=SYS
export NCCL_LAUNCH_MODE=GROUP
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_RPC_TIMEOUT=1800000
export VLLM_ZERO_OVERHEAD=1
export VLLM_ZERO_OPT_ZEROS=1
# Environment variables required when testing Qwen3-30B-A3B.
export VLLM_USE_FUSED_RMS_ROPE=1
export VLLM_USE_MARLIN_W16A16_MOE=1
# Environment variables required when testing Qwen3-Next.
export VLLM_USE_NN=0
export TRITON_MOVE_LOAD_TOFRONT_DOT=0
# Disable core-file generation.
ulimit -c 0

# Positional parameters.
if [ $# -lt 7 ]; then
    echo "Usage: $0 <model_name> <model_path> <tp> <dtype> <host> <port> <dst_path>" >&2
    exit 1
fi
model_name=$1
model_path=$2
tp=$3
dtype=$4
host=$5
port=$6
dst_path=$7

# Map the requested precision to vllm serve flags.
case "$dtype" in
    "float16" | "bfloat16")
        extra="--dtype ${dtype}" ;;
    "gptq-int8" | "gptq-int4")
        export GPTQ_CK_GEMMBS=15000
        extra="--quantization gptq" ;;
    "float8")
        extra="--quantization fp8" ;;
    "w8a8")
        extra="--quantization compressed-tensors" ;;
    "awq")
        export AWQ_CK_GEMMBS=15000
        extra="--quantization awq" ;;
    *)
        # Fail fast on a typo instead of silently launching with no
        # precision/quantization flag at all.
        echo "ERR: unsupported dtype '${dtype}'" >&2
        exit 1 ;;
esac

# vllm 0.11.0 drops the V0 engine and uses the V1 engine by default.
# vllm 0.11.0 enables Prefix Caching by default (recommended); toggle with
# --enable-prefix-caching / --no-enable-prefix-caching if a test requires it.
timestamp=$(date +"%Y-%m-%d-%H-%M-%S")
# $extra is intentionally unquoted: it may expand to two words
# (e.g. "--quantization gptq").
vllm serve "$model_path" \
    --served-model-name "$model_name" \
    --tensor-parallel-size "$tp" \
    --host "$host" --port "$port" \
    --gpu-memory-utilization 0.95 $extra \
    --max-model-len 32768 \
    --trust-remote-code \
    --no-enable-prefix-caching \
    --enable-chunked-prefill \
    --disable-cascade-attn \
    --disable-log-stats \
    2>&1 | tee "${dst_path}/launcher_${model_name}_tp_${tp}_dtype_${dtype}_${timestamp}.log"
# Online API Server Benchmark Scripts
vLLM 0.11.0 在线评测测试的代码/脚本管理
## 更新日志
2026/01/27:启动方式改用vllm serve/vllm bench指令;新增FP8精度支持(by 刘煦)
2026/01/27:新增vllm0.11.0在线测试的自动化测试脚本,使用相关的环境变量;新增单并发吞吐指标记录(by 刘煦)
## 使用方法
1. 需要安装jq和netcat
```
apt-get install jq
apt-get install netcat
```
2. 文件说明:
1)auto_quick_check_config.json:配置文件
2)online_quick_check_maxbs.sh:自动化启动脚本
3)run_apiserver.sh:运行服务的脚本
```
# 配置文件说明 auto_quick_check_config.json
{
"DCU": "BW1000", # DCU型号
"vllm_version": "0.11.0", # vllm版本号
"pkg_version": "dtk25.04.2", # DTK版本号
"dst_path": "./result/", # 结果/日志存放目录
"items":[
{
"model_name": "Qwen3-32B", # 模型名称
"model_path": "/data/models/Qwen3-32B/", # 模型路径
"dtype": "float16", # 数据类型,可选范围:float16/bfloat16/gptq-int8/gptq-int4/float8/w8a8/awq
"tensor_parallel": [4], # 使用卡数
"batch_size_start": 4, # 起始并发数
"batch_size_interval": 4, # 并发数增加间隔
"seqlen_tuple": ["512 512", "1024 1024"], # 输入输出序列组合
"ttft_thres": [3000, 3000], # ttft阈值,单位毫秒("inf"表示无限制)
"tpot_thres": [100, "inf"] # tpot阈值,单位毫秒("inf"表示无限制)
}
]
}
注:
1. 【输入输出序列组合】、【ttft阈值】、【tpot阈值】是一一对应的关系,数量需要确保一致
2. 【ttft阈值】、【tpot阈值】不可均设置成"inf"
```
3. 运行及逻辑:
指令:bash online_quick_check_maxbs.sh
逻辑:调用run_apiserver.sh启动服务 -> 监听端口是否启动 -> 启动后运行评测指令评测 -> 汇总数据
最大并发数测试逻辑:从batch_size_start开始测试,并且逐步增加batch_size_interval,直到某一个并发数的ttft/tpot超过了设置的阈值停止测试;从output.csv中可以获取最大并发数
{
"DCU": "BW1000",
"vllm_version": "0.11.0",
"pkg_version": "dtk25.04.2",
"dst_path": "./result/",
"items":[
{
"model_name": "Qwen3-32B",
"model_path": "/data/models/Qwen3-32B/",
"dtype": "float16",
"tensor_parallel": [4],
"batch_size_start": 4,
"batch_size_interval": 4,
"seqlen_tuple": ["512 512", "1024 1024"],
"ttft_thres": [3000, 3000],
"tpot_thres": [100, "inf"]
}
]
}
\ No newline at end of file
#!/bin/bash
# online_quick_check_maxbs.sh -- max-concurrency probe for vLLM 0.11.0.
#
# For every configured model / tensor-parallel size / seqlen combination,
# starts at batch_size_start and raises concurrency by batch_size_interval
# until the measured mean TTFT or TPOT exceeds its configured threshold
# ("inf" means unconstrained). Each run appends one CSV row to
# <dst_path>/<model_name>/output.csv; the last passing row gives the
# maximum sustainable concurrency.
#
# Dependencies: jq, netcat (nc), python, vllm, run_apiserver.sh.
HOST=127.0.0.1
PORT=8081
TIMEOUT=1200 # server-launch watchdog timeout (seconds)
INTERVAL=60  # port-polling interval (seconds)
ENDPOINT=/v1/completions

# Read the JSON configuration file.
json_data=$(cat auto_quick_check_config.json)
DCU=$(echo "$json_data" | jq -r '.DCU')
vllm_version=$(echo "$json_data" | jq -r '.vllm_version')
pkg_version=$(echo "$json_data" | jq -r '.pkg_version')
dst_path=$(echo "$json_data" | jq -r '.dst_path')
items=$(echo "$json_data" | jq -c '.items[]')

while read -r item; do
    model_name=$(echo "$item" | jq -r '.model_name')
    model_path=$(echo "$item" | jq -r '.model_path')
    dtype=$(echo "$item" | jq -r '.dtype')
    tensor_parallel=$(echo "$item" | jq -r '.tensor_parallel')
    seqlen_len=$(echo "$item" | jq -r '.seqlen_tuple | length')
    ttft_len=$(echo "$item" | jq -r '.ttft_thres | length')
    tpot_len=$(echo "$item" | jq -r '.tpot_thres | length')
    # BUGFIX: the original compared seqlen_len against tpot_len twice, so a
    # mismatched ttft_thres list was never detected.
    if ! [[ $seqlen_len -eq $ttft_len && $seqlen_len -eq $tpot_len ]]; then
        echo "***********************************"
        echo "测试项:模型 ${model_name} "
        echo "输入输出序列、ttft阈值、tpot阈值数量存在不一致,无法测试最大并发量"
        echo "跳过该测试项"
        echo "***********************************"
        continue
    fi
    result_path=${dst_path}/${model_name}/
    # BUGFIX: the original tested `! -f`, which is never true for a
    # directory; use -d so an existing result directory is detected.
    if [ ! -d "${result_path}" ]; then
        mkdir -p "${result_path}"
    fi
    # Write the CSV header only if output.csv is missing or empty.
    if [ -e "${result_path}output.csv" ] && [ -s "${result_path}output.csv" ]; then
        :
    else
        echo "model_name,DCU,DCU nums,precision,input_len,output_len,bs,TTFT_mean(ms),TPOT_mean(ms),ITL_mean(ms),GenerateThroughput(tokens/s),TotalThroughput(tokens/s),Duration(s),OutputThroughputPerBS(tokens/s),DecodeThroughputPerBS(tokens/s),version" > "${result_path}output.csv"
    fi
    # BUGFIX: feed the tp loop via process substitution instead of a pipe,
    # so the `exit 1` on launch timeout aborts the whole script rather than
    # only a pipeline subshell.
    while read -r tp; do
        # Launch the API server in the background; detach its stdin so it
        # cannot swallow this loop's input.
        nohup bash run_apiserver.sh "$model_name" "$model_path" "$tp" "$dtype" "$HOST" "$PORT" "$result_path" < /dev/null &
        start_time=$(date +%s)
        while true; do
            # Probe $HOST (was hard-coded `localhost`) so the check matches
            # the address the server was told to bind to.
            if nc -zv "$HOST" "$PORT"; then # server port is open
                for ((i = 0; i < seqlen_len; i++)); do
                    seqlen=$(echo "$item" | jq -r ".seqlen_tuple[$i]")
                    ttft_thre=$(echo "$item" | jq -r ".ttft_thres[$i]")
                    tpot_thre=$(echo "$item" | jq -r ".tpot_thres[$i]")
                    IFS=' ' read -ra seq_parts <<< "$seqlen"
                    input_len=${seq_parts[0]}
                    output_len=${seq_parts[1]}
                    # With both thresholds unbounded the ramp would never
                    # stop; skip such entries.
                    if [[ "$ttft_thre" == "inf" && "$tpot_thre" == "inf" ]]; then
                        echo "***********************************"
                        echo "测试项:模型 ${model_name} 输入 ${input_len} 输出 ${output_len}"
                        echo "需要设置ttft或tpot阈值(不能都为inf),否则无法测试最大并发量"
                        echo "跳过该测试项"
                        echo "***********************************"
                        continue
                    fi
                    bs=$(echo "$item" | jq -r '.batch_size_start')
                    bs_interval=$(echo "$item" | jq -r '.batch_size_interval')
                    # Ramp concurrency until a threshold is exceeded.
                    while true; do
                        log_file=${result_path}/${model_name}-tp-${tp}-input_len-${input_len}-output_len-${output_len}-ttft_thre-${ttft_thre}-tpot_thre-${tpot_thre}-bs-${bs}.log
                        # Run the benchmark; </dev/null keeps it off the
                        # loop's stdin.
                        vllm bench serve --model "${model_name}" \
                            --dataset-name random \
                            --tokenizer "${model_path}" \
                            --trust-remote-code \
                            --port "${PORT}" \
                            --endpoint "${ENDPOINT}" \
                            --random-input-len "${input_len}" \
                            --random-output-len "${output_len}" \
                            --ignore_eos \
                            --num-prompts "${bs}" \
                            --max-concurrency "${bs}" < /dev/null 2>&1 | tee "${log_file}"
                        # Scrape the summary metrics from the benchmark log.
                        output_throughput=$(grep -a "^Output token throughput (tok/s):" "${log_file}" | awk '{print $5}')
                        total_throughput=$(grep -a "^Total Token throughput (tok/s):" "${log_file}" | awk '{print $5}')
                        TTFT_mean=$(grep -a "^Mean TTFT (ms):" "${log_file}" | awk '{print $4}')
                        TPOT_mean=$(grep -a "^Mean TPOT (ms):" "${log_file}" | awk '{print $4}')
                        ITL_mean=$(grep -a "^Mean ITL (ms):" "${log_file}" | awk '{print $4}')
                        duration=$(grep -a "^Benchmark duration (s):" "${log_file}" | awk '{print $4}')
                        # Derived per-concurrency metrics.
                        output_throughput_per_bs=$(python -c "print(round(${output_throughput} / ${bs}, 2))")
                        decode_throughput_per_bs=$(python -c "print(round(1000.0 / ${TPOT_mean}, 2))")
                        echo "$model_name,$DCU,$tp,$dtype,$input_len,$output_len,$bs,$TTFT_mean,$TPOT_mean,$ITL_mean,$output_throughput,$total_throughput,$duration,$output_throughput_per_bs,$decode_throughput_per_bs,$pkg_version" >> "${result_path}output.csv"
                        sleep 10
                        # 1 when both means are within their thresholds
                        # ("inf" parses as float infinity). Values are passed
                        # via argv so garbled log output cannot break the
                        # interpolated python source (the original heredoc
                        # inlined them into the code).
                        condition=$(python -c '
import sys
try:
    ttft, tpot, ttft_thre, tpot_thre = (float(v) for v in sys.argv[1:5])
    print(1 if (ttft <= ttft_thre and tpot <= tpot_thre) else 0)
except Exception:
    print(0)
' "$TTFT_mean" "$TPOT_mean" "$ttft_thre" "$tpot_thre")
                        if [ "$condition" -eq 1 ]; then
                            bs=$((bs + bs_interval))
                        else
                            break
                        fi
                    done
                done
                break
            else
                current_time=$(date +%s)
                elapsed_time=$((current_time - start_time))
                if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
                    echo "ERR:PORT ${PORT} launch time out, exit!!!。"
                    exit 1
                fi
                echo "PORT ${PORT} has not been launched yet, please wait...."
                sleep "$INTERVAL"
            fi
        done
        # Tear the server down before the next tensor-parallel configuration.
        pkill -f vllm
        sleep 60
    done < <(echo "$tensor_parallel" | jq -c '.[]')
done <<< "$items"
#!/bin/bash
# run_apiserver.sh -- launch a vLLM 0.11.0 OpenAI-compatible API server.
#
# Usage: run_apiserver.sh <model_name> <model_path> <tp> <dtype> <host> <port> <dst_path>
#   dtype must be one of: float16 bfloat16 gptq-int8 gptq-int4 float8 w8a8 awq
# Server stdout/stderr is teed to <dst_path>/launcher_<model>_tp_<tp>_dtype_<dtype>_<ts>.log

# NUMA binding derived from the BW1000 topology.
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=3
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=1
export VLLM_RANK3_NUMA=0
export VLLM_RANK4_NUMA=7
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=5
export VLLM_RANK7_NUMA=4
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export NCCL_P2P_LEVEL=SYS
export NCCL_LAUNCH_MODE=GROUP
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_RPC_TIMEOUT=1800000
export VLLM_ZERO_OVERHEAD=1
export VLLM_ZERO_OPT_ZEROS=1
# Environment variables required when testing Qwen3-30B-A3B.
export VLLM_USE_FUSED_RMS_ROPE=1
export VLLM_USE_MARLIN_W16A16_MOE=1
# Environment variables required when testing Qwen3-Next.
export VLLM_USE_NN=0
export TRITON_MOVE_LOAD_TOFRONT_DOT=0
# Disable core-file generation.
ulimit -c 0

# Positional parameters.
if [ $# -lt 7 ]; then
    echo "Usage: $0 <model_name> <model_path> <tp> <dtype> <host> <port> <dst_path>" >&2
    exit 1
fi
model_name=$1
model_path=$2
tp=$3
dtype=$4
host=$5
port=$6
dst_path=$7

# Map the requested precision to vllm serve flags.
case "$dtype" in
    "float16" | "bfloat16")
        extra="--dtype ${dtype}" ;;
    "gptq-int8" | "gptq-int4")
        export GPTQ_CK_GEMMBS=15000
        extra="--quantization gptq" ;;
    "float8")
        extra="--quantization fp8" ;;
    "w8a8")
        extra="--quantization compressed-tensors" ;;
    "awq")
        export AWQ_CK_GEMMBS=15000
        extra="--quantization awq" ;;
    *)
        # Fail fast on a typo instead of silently launching with no
        # precision/quantization flag at all.
        echo "ERR: unsupported dtype '${dtype}'" >&2
        exit 1 ;;
esac

# vllm 0.11.0 drops the V0 engine and uses the V1 engine by default.
# vllm 0.11.0 enables Prefix Caching by default (recommended); toggle with
# --enable-prefix-caching / --no-enable-prefix-caching if a test requires it.
timestamp=$(date +"%Y-%m-%d-%H-%M-%S")
# $extra is intentionally unquoted: it may expand to two words
# (e.g. "--quantization gptq").
vllm serve "$model_path" \
    --served-model-name "$model_name" \
    --tensor-parallel-size "$tp" \
    --host "$host" --port "$port" \
    --gpu-memory-utilization 0.95 $extra \
    --max-model-len 32768 \
    --trust-remote-code \
    --no-enable-prefix-caching \
    --enable-chunked-prefill \
    --disable-cascade-attn \
    --disable-log-stats \
    2>&1 | tee "${dst_path}/launcher_${model_name}_tp_${tp}_dtype_${dtype}_${timestamp}.log"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment