#!/bin/bash
#
# Quick serving-performance sweep for vLLM.
#
# For every entry of `.items` in auto_quick_check_config.json this script
# starts an API server through run_apiserver.sh once per tensor-parallel size,
# waits until the port accepts connections, runs `vllm bench serve` for every
# (sequence-length pair, batch size) combination and appends one CSV row per
# run to <dst_path>/<model_name>/output.csv.
#
# Needs jq, nc, python and vllm on PATH; run_apiserver.sh and the config file
# are looked up relative to the current directory.

HOST=127.0.0.1
PORT=8081
TIMEOUT=1200   # maximum number of seconds to wait for the server port
INTERVAL=60    # seconds between two port probes
ENDPOINT=/v1/completions
CONFIG_FILE=auto_quick_check_config.json
CSV_HEADER="model_name,DCU,DCU nums,precision,input_len,output_len,bs,TTFT_mean(ms),TPOT_mean(ms),ITL_mean(ms),GenerateThroughput(tokens/s),TotalThroughput(tokens/s),Duration(s),OutputThroughputPerBS(tokens/s),DecodeThroughputPerBS(tokens/s),version"

# Print a message on stderr and abort the whole script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the elements of list field $2 of JSON object $1, one per line.
# The field may be a JSON array or a string that itself contains a JSON array.
json_list() {
  jq -r --arg key "$2" \
    '.[$key] | if type == "string" then fromjson else . end | .[]' <<<"$1"
}

# Print whitespace-separated field $2 of the first line in log file $1 that
# matches grep pattern $3. Any further arguments are passed to grep as options.
# -a is required because the log is a raw capture of the benchmark output and
# grep could otherwise classify it as binary.
metric_value() {
  local log_file=$1 field=$2 pattern=$3
  shift 3
  grep -a -m 1 "$@" -- "$pattern" "$log_file" | awk -v n="$field" '{print $n}'
}

# Print round($1 / $2, 2). Prints nothing when an operand is not a number or
# the divisor is zero, so a failed benchmark leaves the CSV field empty.
# Operands travel through argv rather than being spliced into Python source.
ratio() {
  python -c 'import sys
try:
    print(round(float(sys.argv[1]) / float(sys.argv[2]), 2))
except (ValueError, ZeroDivisionError):
    pass' "$1" "$2"
}

# Kill every process whose command line contains "vllm".
# NOTE(review): this broad match is what the script has always used to take
# the API server down; it would also hit unrelated processes (or this script
# itself) if their command line contains "vllm" - confirm this is acceptable.
stop_server() {
  pkill -f vllm
}

# Poll HOST:PORT until it accepts a TCP connection.
# Arguments: $1 - epoch seconds at which the server was launched
# Returns:   0 once the port is open, 1 if TIMEOUT seconds have elapsed
wait_for_server() {
  local start_time=$1 now
  until nc -zv "$HOST" "$PORT"; do
    now=$(date +%s)
    if ((now - start_time >= TIMEOUT)); then
      return 1
    fi
    echo "PORT ${PORT} has not been launched yet, please wait...."
    sleep "$INTERVAL"
  done
  return 0
}

# Run one `vllm bench serve` measurement and append its CSV row.
# Globals:   PORT, ENDPOINT, DCU, pkg_version (read)
# Arguments: $1 model_name  $2 model_path  $3 dtype  $4 tp
#            $5 input_len   $6 output_len  $7 bs     $8 result_path (ends in /)
run_benchmark() {
  local model_name=$1 model_path=$2 dtype=$3 tp=$4
  local input_len=$5 output_len=$6 bs=$7 result_path=$8
  local log_file bench_status
  local output_throughput total_throughput ttft_mean tpot_mean itl_mean duration
  local output_throughput_per_bs decode_throughput_per_bs

  log_file="${result_path}${model_name}-tp-${tp}-input_len-${input_len}-output_len-${output_len}-bs-${bs}.log"
  # A model name containing "/" places the log in a sub-directory.
  mkdir -p -- "${log_file%/*}"

  vllm bench serve --model "${model_name}" \
    --dataset-name random \
    --tokenizer "${model_path}" \
    --trust-remote-code \
    --port "${PORT}" \
    --endpoint "${ENDPOINT}" \
    --random-input-len "${input_len}" \
    --random-output-len "${output_len}" \
    --ignore_eos \
    --num-prompts "${bs}" \
    --max-concurrency "${bs}" 2>&1 | tee "$log_file"
  bench_status=${PIPESTATUS[0]}
  if ((bench_status != 0)); then
    printf 'WARN: vllm bench serve exited with status %s, see %s\n' \
      "$bench_status" "$log_file" >&2
  fi

  output_throughput=$(metric_value "$log_file" 5 "^Output token throughput (tok/s):")
  total_throughput=$(metric_value "$log_file" 5 "^Total Token throughput (tok/s):" -i)
  ttft_mean=$(metric_value "$log_file" 4 "^Mean TTFT (ms):")
  tpot_mean=$(metric_value "$log_file" 4 "^Mean TPOT (ms):")
  itl_mean=$(metric_value "$log_file" 4 "^Mean ITL (ms):")
  duration=$(metric_value "$log_file" 4 "^Benchmark duration (s):")

  output_throughput_per_bs=$(ratio "$output_throughput" "$bs")
  decode_throughput_per_bs=$(ratio 1000.0 "$tpot_mean")

  # The row is written even when metrics are missing so that a failed run
  # stays visible in the report.
  printf '%s\n' "$model_name,$DCU,$tp,$dtype,$input_len,$output_len,$bs,$ttft_mean,$tpot_mean,$itl_mean,$output_throughput,$total_throughput,$duration,$output_throughput_per_bs,$decode_throughput_per_bs,$pkg_version" \
    >>"${result_path}output.csv"
}

# Benchmark one `.items[]` entry across all of its tensor-parallel sizes.
# Globals:   HOST, PORT, CSV_HEADER, dst_path (read)
# Arguments: $1 - the item as a compact JSON object
# Exits the script with status 1 when the server does not come up in time.
benchmark_item() {
  local item=$1
  local model_name model_path dtype result_path
  local tp sq bs input_len output_len start_time
  local -a tp_list seqlen_list bs_list

  model_name=$(jq -r '.model_name' <<<"$item")
  model_path=$(jq -r '.model_path' <<<"$item")
  dtype=$(jq -r '.dtype' <<<"$item")
  # Lists are loaded into arrays so the loops below run in this shell (not in
  # pipeline subshells) and child processes cannot eat the loop input.
  mapfile -t tp_list < <(json_list "$item" tensor_parallel)
  mapfile -t bs_list < <(json_list "$item" batch_size)
  mapfile -t seqlen_list < <(json_list "$item" seqlen_tuple)

  result_path="${dst_path}/${model_name}/"
  mkdir -p -- "$result_path" || die "ERR: cannot create ${result_path}"

  # Write the header only when the report is missing or empty.
  if [[ ! -s "${result_path}output.csv" ]]; then
    printf '%s\n' "$CSV_HEADER" >"${result_path}output.csv"
  fi

  for tp in "${tp_list[@]}"; do
    # Launch the server start-up script in the background.
    nohup bash run_apiserver.sh "$model_name" "$model_path" "$tp" "$dtype" \
      "$HOST" "$PORT" "$result_path" &
    start_time=$(date +%s)

    # Wait until the port is open before benchmarking.
    if ! wait_for_server "$start_time"; then
      echo "ERR:PORT ${PORT} launch time out, exit!!!。"
      stop_server
      exit 1
    fi

    for sq in "${seqlen_list[@]}"; do
      # Each entry is "<input_len> <output_len>".
      IFS=' ' read -r input_len output_len <<<"$sq"
      for bs in "${bs_list[@]}"; do
        run_benchmark "$model_name" "$model_path" "$dtype" "$tp" \
          "$input_len" "$output_len" "$bs" "$result_path"
        sleep 10
      done
    done

    stop_server
    sleep 60
  done
}

# Entry point: validate the configuration, read the global settings and
# benchmark every configured item in order.
main() {
  local item
  local -a items

  command -v jq >/dev/null || die "ERR: jq is required but was not found in PATH"
  [[ -r "$CONFIG_FILE" ]] || die "ERR: cannot read ${CONFIG_FILE}"
  jq empty "$CONFIG_FILE" || die "ERR: ${CONFIG_FILE} is not valid JSON"

  # Read the JSON configuration file.
  DCU=$(jq -r '.DCU' "$CONFIG_FILE")
  vllm_version=$(jq -r '.vllm_version' "$CONFIG_FILE") # not used in this script
  pkg_version=$(jq -r '.pkg_version' "$CONFIG_FILE")
  dst_path=$(jq -r '.dst_path' "$CONFIG_FILE")

  mapfile -t items < <(jq -c '.items[]' "$CONFIG_FILE")
  for item in "${items[@]}"; do
    benchmark_item "$item"
  done
}

main "$@"