Commit 23d4b8c8 authored by sunzhq2's avatar sunzhq2
Browse files

init aisbench-tools

parents
# AISBench benchmark 链接
- https://gitee.com/aisbench/benchmark
- git clone https://gitee.com/aisbench/benchmark.git
- AISBench benchmark安装:具体安装可以参考 AISBench 对应的README
# aisbench-tools工具
- monitor_gpu.sh 用于记录DCU的使用状态
- bench-test.sh 性能+精度连续测试脚本(run.sh 为其调用示例),用法如下
- 随机输入输出
```
bash ./bench-test.sh --gpus 3 \
--model vllm_api_stream_chat \
--dataset synthetic_gen \
--model-name test_model \
--batch-size "8 16 32" \
--input-len 512 \
--max-out-len 512 \
--ais-model /data2/models/qwen3-8B \
--model-path /data2/models/qwen3-8B \
--host-port 23456
```
- 对应数据集
```
bash ./bench-test.sh --gpus 3 \
--model vllm_api_stream_chat \
--dataset aime2025_gen \
--model-name test_model \
--batch-size "16" \
--max-out-len 512 \
--ais-model /data2/models/qwen3-8B \
--model-path /data2/models/qwen3-8B \
--host-port 23456
```
- 仅跑eval指令
```
ais_bench --models vllm_api_stream_chat \
--datasets aime2025_gen --mode eval \
--debug --reuse xxxxxx
```
\ No newline at end of file
#!/bin/bash
# perf+eval chained runner:
# runs the perf benchmark first, auto-detects the --reuse output path from
# the perf log, then runs eval against the same results.
set -e
# ============ Configuration (defaults; overridable via CLI flags below) ============
MONITOR_SCRIPT="./monitor_gpu.sh" # path to the GPU monitoring wrapper script
TARGET_GPUS="3" # target GPU indices
AISBENCH_BASE_DIR="./"
# Inference parameters
MODEL="vllm_api_stream_chat"
DATASET="aime2025_gen"
BATCH_SIZES="32" # space-separated for multiple batch sizes, e.g. "1 4 8"
MAX_OUT_LEN="512"
INPUT_LEN="512" # input length (only used by synthetic datasets)
AIS_MODEL="/data2/models/qwen3-8B" # model name actually served by vLLM, e.g. "Qwen/Qwen2.5-7B-Instruct"; auto-detected when empty
MODEL_PATH="/data2/models/qwen3-8B" # local model path, e.g. "/data/models/Qwen2.5-7B"; left unset when empty
HOST_PORT="23456" # vLLM service port
# Config file paths (rewritten in place per run; backed up first)
SYNTHETIC_CONFIG="${AISBENCH_BASE_DIR}ais_bench/datasets/synthetic/synthetic_config.py"
SYNTHETIC_CONFIG_BAK="${SYNTHETIC_CONFIG}.bak"
# NOTE(review): ${MODEL} is expanded here with its default value, BEFORE the
# CLI is parsed; a later --model override does not update this path — TODO confirm.
VLLM_CONFIG="${AISBENCH_BASE_DIR}ais_bench/benchmark/configs/models/vllm_api/${MODEL}.py"
VLLM_CONFIG_BAK="${VLLM_CONFIG}.bak"
# Output directory naming parameter
MODEL_NAME="test_model"
# ---- Command-line parsing ----
# Overrides the defaults defined above. Wrapped in a function so it can be
# exercised in isolation; the assignments are intentionally global (no `local`).
parse_args() {
    while [[ $# -gt 0 ]]; do
        case $1 in
            --gpus)           TARGET_GPUS="$2";     shift 2 ;;
            --model)          MODEL="$2";           shift 2 ;;
            --dataset)        DATASET="$2";         shift 2 ;;
            --output-dir)     BASE_OUTPUT_DIR="$2"; shift 2 ;;
            --monitor-script) MONITOR_SCRIPT="$2";  shift 2 ;;
            --batch-size)     BATCH_SIZES="$2";     shift 2 ;;
            --max-out-len)    MAX_OUT_LEN="$2";     shift 2 ;;
            --input-len)      INPUT_LEN="$2";       shift 2 ;;
            --ais-model)      AIS_MODEL="$2";       shift 2 ;;
            --model-path)     MODEL_PATH="$2";      shift 2 ;;
            --host-port)      HOST_PORT="$2";       shift 2 ;;
            --model-name)     MODEL_NAME="$2";      shift 2 ;;
            *)
                echo "未知参数: $1"
                # Fix: the usage text previously omitted --output-dir and
                # --monitor-script even though both are accepted above.
                echo "用法: $0 [--gpus 4,5,6,7] \
[--model vllm_api_stream_chat] \
[--dataset aime2025_gen] \
[--model-name test_model] \
[--batch-size \"1 4 8\"] \
[--max-out-len 512] \
[--input-len 512] \
[--ais-model Qwen/Qwen2.5-7B-Instruct] \
[--model-path /data/models/Qwen2.5-7B] \
[--host-port 8080] \
[--output-dir ./test_outputs] \
[--monitor-script ./monitor_gpu.sh]"
                exit 1
                ;;
        esac
    done
}
parse_args "$@"
# Derive the output sub-directory from the dataset unless the caller already
# provided one: synthetic runs encode the input/output lengths in the name.
if [ -z "$SUB_DIR" ]; then
  case "$DATASET" in
    synthetic_gen) SUB_DIR="synthetic_gen/input-${INPUT_LEN}-output-${MAX_OUT_LEN}" ;;
    *)             SUB_DIR="${DATASET}" ;;
  esac
fi

# Run banner.
echo "=========================================="
echo "Perf + Eval 连续运行脚本"
echo "=========================================="
echo "目标GPU: $TARGET_GPUS"
echo "模型: $MODEL"
echo "模型名: $MODEL_NAME"
echo "数据集: $DATASET"
echo "子目录: $SUB_DIR"
echo "BatchSizes: $BATCH_SIZES"
echo "MaxOutLen: $MAX_OUT_LEN"
case "$DATASET" in
  *synthetic*) echo "InputLen: $INPUT_LEN" ;;
esac
echo "=========================================="
echo ""

# Synthetic datasets get their config rewritten per run and only the perf
# phase is executed for them.
OVERALL_EXIT=0
IS_SYNTHETIC=0
case "$DATASET" in
  *synthetic*)
    IS_SYNTHETIC=1
    echo ">>> 检测到synthetic数据集,将动态修改 synthetic_config.py 并仅运行Perf"
    ;;
esac
# Back up the vLLM model config once up front; it is rewritten for every
# batch size below and restored at the very end of the script.
echo "[Setup] 备份 vllm_api_stream_chat.py ..."
cp "$VLLM_CONFIG" "$VLLM_CONFIG_BAK"
# One perf pass (plus one eval pass for non-synthetic datasets) per batch size.
for BS in $BATCH_SIZES; do
# Export everything the embedded `python3 -c` config writers below read.
export BS="$BS"
export INPUT_LEN="$INPUT_LEN"
export MAX_OUT_LEN="$MAX_OUT_LEN"
export AIS_MODEL="$AIS_MODEL"
export MODEL_PATH="$MODEL_PATH"
export HOST_PORT="$HOST_PORT"
export SYNTHETIC_CONFIG="$SYNTHETIC_CONFIG"
export VLLM_CONFIG="$VLLM_CONFIG"
# Synthetic runs ignore EOS so generated outputs reach the requested length.
if [ $IS_SYNTHETIC -eq 1 ]; then
export IGNORE_EOS="True"
else
export IGNORE_EOS="False"
fi
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
# NOTE(review): this unconditionally overwrites any value passed via
# --output-dir, so that CLI option is effectively ignored — TODO confirm.
BASE_OUTPUT_DIR="./test_outputs/${MODEL_NAME}/${SUB_DIR}/bs_${BS}_bench_results_${TIMESTAMP}"
mkdir -p "$BASE_OUTPUT_DIR"
# Absolutize the path for use in logs and child processes.
BASE_OUTPUT_DIR=$(cd "$BASE_OUTPUT_DIR" && pwd)
echo ""
echo "###############################################"
echo "### BatchSize=$BS"
echo "### 输出目录: $BASE_OUTPUT_DIR"
echo "###############################################"
echo ""
# ============ Rewrite vllm_api_stream_chat.py for this run ============
echo "[BS=$BS] 重写 vllm_api_stream_chat.py: batch_size=$BS, max_out_len=$MAX_OUT_LEN, model=$AIS_MODEL, model_path=$MODEL_PATH, host_port=$HOST_PORT, ignore_eos=$IGNORE_EOS"
python3 -c "
import os
bs = int(os.environ.get('BS', '1'))
max_out_len = int(os.environ.get('MAX_OUT_LEN', '512'))
model = os.environ.get('AIS_MODEL', '')
model_path = os.environ.get('MODEL_PATH', '')
host_port = int(os.environ.get('HOST_PORT', '8080'))
ignore_eos = os.environ.get('IGNORE_EOS', 'False')
config_path = os.environ.get('VLLM_CONFIG', '')
content = '''from ais_bench.benchmark.models import VLLMCustomAPIChatStream
from ais_bench.benchmark.utils.model_postprocessors import extract_non_reasoning_content
models = [
dict(
attr=\"service\",
type=VLLMCustomAPIChatStream,
abbr='vllm-api-stream-chat',
path=\"%s\",
model=\"%s\",
request_rate = 0,
retry = 2,
host_ip = \"localhost\",
host_port = %d,
max_out_len = %d,
batch_size = %d,
trust_remote_code=True,
generation_kwargs = dict(
temperature = 0.5,
top_k = 10,
top_p = 0.95,
seed = None,
repetition_penalty = 1.03,
ignore_eos = %s,
),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]
''' % (model_path, model, host_port, max_out_len, bs, ignore_eos)
with open(config_path, 'w') as f:
f.write(content)
print('vllm_api_stream_chat.py 已更新')
"
echo ""
# ============ Synthetic dataset: rewrite its config on the fly ============
if [ $IS_SYNTHETIC -eq 1 ]; then
echo "[BS=$BS] 修改 synthetic_config.py: Type=string, RequestCount=$BS, InputLen=$INPUT_LEN, OutputLen=$MAX_OUT_LEN"
cp "$SYNTHETIC_CONFIG" "$SYNTHETIC_CONFIG_BAK"
# NOTE(review): input_min and input_max below are both input_len-8, so the
# generated input length is effectively fixed, not a range — TODO confirm.
python3 -c "
import os
bs = int(os.environ.get('BS', '1'))
input_len = int(os.environ.get('INPUT_LEN', '512'))
max_out_len = int(os.environ.get('MAX_OUT_LEN', '512'))
config_path = os.environ.get('SYNTHETIC_CONFIG', '')
input_min = max(1, input_len - 8)
input_max = max(1, input_len - 8)
content = '''synthetic_config = {
\"Type\": \"string\",
\"RequestCount\": %d,
\"TrustRemoteCode\": False,
\"StringConfig\": {
\"Input\": {
\"Method\": \"uniform\",
\"Params\": {\"MinValue\": %d, \"MaxValue\": %d}
},
\"Output\": {
\"Method\": \"gaussian\",
\"Params\": {\"Mean\": 100, \"Var\": 200, \"MinValue\": %d, \"MaxValue\": %d}
}
},
\"TokenIdConfig\": {
\"RequestSize\": 10
}
}
''' % (bs, input_min, input_max, max_out_len, max_out_len)
with open(config_path, 'w') as f:
f.write(content)
print('synthetic_config.py 已更新')
"
echo ""
fi
# ============ Step 1: run perf (with GPU monitoring) ============
echo "=========================================="
echo "[BS=$BS] 第一步: 运行 Perf 测试(带GPU监控)"
echo "=========================================="
PERF_DIR="$BASE_OUTPUT_DIR/perf"
mkdir -p "$PERF_DIR"
# The wrapper's exit code is inspected manually, so suspend -e around it.
set +e
bash "$MONITOR_SCRIPT" \
--gpus "$TARGET_GPUS" \
--log-name perf_test.log \
--output-dir "$PERF_DIR" \
--bench-dir "$AISBENCH_BASE_DIR" \
ais_bench \
--models "$MODEL" \
--datasets "$DATASET" \
--mode perf \
--debug
PERF_EXIT_CODE=$?
set -e
if [ $PERF_EXIT_CODE -ne 0 ]; then
echo ""
echo "❌ [BS=$BS] Perf 测试失败 (退出码: $PERF_EXIT_CODE),跳过eval"
OVERALL_EXIT=1
# Restore the synthetic config before moving to the next batch size.
if [ $IS_SYNTHETIC -eq 1 ] && [ -f "$SYNTHETIC_CONFIG_BAK" ]; then
mv "$SYNTHETIC_CONFIG_BAK" "$SYNTHETIC_CONFIG"
fi
continue
fi
echo ""
echo "✓ [BS=$BS] Perf 测试完成"
# ============ Synthetic dataset: restore config and skip eval ============
if [ $IS_SYNTHETIC -eq 1 ]; then
if [ -f "$SYNTHETIC_CONFIG_BAK" ]; then
mv "$SYNTHETIC_CONFIG_BAK" "$SYNTHETIC_CONFIG"
echo "[BS=$BS] 已恢复 synthetic_config.py,跳过Eval"
fi
echo ""
continue
fi
# ============ Locate the perf output directory for --reuse ============
echo ""
echo "=========================================="
echo "[BS=$BS] 获取 ais_bench 输出路径用于 --reuse"
echo "=========================================="
# NOTE(review): if perf_test.log contains no outputs/default/<timestamp>
# match, DIR_NAME ends up empty and `basename` errors — TODO harden.
DIR_NAME=$(grep -oP 'outputs/default/\d{8}_\d{6}' "$PERF_DIR/perf_test.log" | head -1 | xargs basename)
REUSE_DIR="$PERF_DIR/aisbench_output_${DIR_NAME}"
if [ ! -d "$REUSE_DIR" ]; then
echo "❌ [BS=$BS] 错误: --reuse 路径不存在: $REUSE_DIR"
echo ""
echo "调试信息:"
echo "提取到的时间戳: $DIR_NAME"
echo ""
echo "perf 目录内容:"
ls -la "$PERF_DIR/"
OVERALL_EXIT=1
continue
fi
echo "[BS=$BS] 获取到 --reuse 路径: $REUSE_DIR"
echo ""
# ============ Step 2: run eval (no GPU monitoring) ============
echo "=========================================="
echo "[BS=$BS] 第二步: 运行 Eval 测试(无GPU监控)"
echo "=========================================="
EVAL_DIR="$BASE_OUTPUT_DIR/eval"
mkdir -p "$EVAL_DIR"
echo "等待GPU资源释放..."
sleep 10
EVAL_LOG="$EVAL_DIR/eval_test.log"
set +e
ais_bench \
--models "$MODEL" \
--datasets "$DATASET" \
--mode eval \
--debug \
--reuse "$REUSE_DIR" \
> "$EVAL_LOG" 2>&1
EVAL_EXIT_CODE=$?
set -e
if [ $EVAL_EXIT_CODE -eq 0 ]; then
echo "✓ [BS=$BS] Eval 测试完成"
else
echo "⚠ [BS=$BS] Eval 测试退出码: $EVAL_EXIT_CODE"
OVERALL_EXIT=1
fi
done
# Restore the vLLM model config that was backed up before the loop, then
# report the overall result and propagate it as the exit code.
echo "[Cleanup] 恢复 vllm_api_stream_chat.py ..."
if [[ -f "$VLLM_CONFIG_BAK" ]]; then
  mv "$VLLM_CONFIG_BAK" "$VLLM_CONFIG"
  echo "[Cleanup] vllm_api_stream_chat.py 已恢复"
fi

echo ""
echo "=========================================="
echo "全部运行完成"
echo "=========================================="

if [[ "$OVERALL_EXIT" -eq 0 ]]; then
  echo "✓ 所有BatchSize测试完成"
else
  echo "⚠ 部分BatchSize测试出现问题,请检查输出"
fi
exit "$OVERALL_EXIT"
\ No newline at end of file
#!/bin/bash
# GPU monitoring wrapper:
# watches GPU state continuously from inference start to finish and records it.

# ---- Tunables (all overridable via the CLI flags parsed below) ----
CHECK_INTERVAL=2       # seconds between "has inference started?" polls
UTIL_THRESHOLD=5       # GPU utilization (%) above which inference counts as started
MONITOR_INTERVAL=1     # sampling period (seconds) while monitoring
TARGET_GPUS=""         # comma-separated GPU ids to aggregate; empty = all cards
BENCH_LOG_NAME=""      # file name for the inference stdout/stderr log
OUTPUT_DIR=""          # user-chosen output directory (auto-generated when empty)
AISBENCH_BASE_DIR="./" # base directory of the ais_bench checkout

# ---- CLI parsing: the first unrecognized token starts the benchmark command ----
while [[ $# -gt 0 ]]; do
  case $1 in
    --gpus)           TARGET_GPUS="$2";       shift 2 ;;
    --threshold)      UTIL_THRESHOLD="$2";    shift 2 ;;
    --interval)       MONITOR_INTERVAL="$2";  shift 2 ;;
    --check-interval) CHECK_INTERVAL="$2";    shift 2 ;;
    --log-name)       BENCH_LOG_NAME="$2";    shift 2 ;;
    --output-dir)     OUTPUT_DIR="$2";        shift 2 ;;
    --bench-dir)      AISBENCH_BASE_DIR="$2"; shift 2 ;;
    *)                break ;;
  esac
done
# ============ Resolve the output directory (after all flags are parsed) ============
# Default: a timestamped directory under the current working directory.
if [ -z "$OUTPUT_DIR" ]; then
  OUTPUT_DIR="./gpu_monitor_$(date +"%Y%m%d_%H%M%S")"
fi
# Create it (including missing parents) and bail out if that failed.
mkdir -p "$OUTPUT_DIR"
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "错误: 无法创建输出目录 $OUTPUT_DIR"
  exit 1
fi
# Work with an absolute path from here on.
OUTPUT_DIR=$(cd "$OUTPUT_DIR" && pwd)

MONITOR_LOG="$OUTPUT_DIR/monitor.log"
BENCH_LOG_NAME_FILE="$OUTPUT_DIR/.bench_log_name" # log file name, readable by external scripts
DETAIL_LOG="$OUTPUT_DIR/gpu_detail.log"
SUMMARY_LOG="$OUTPUT_DIR/gpu_summary.log"

# Inference output log (falls back to a default name when none was given).
BENCH_LOG="$OUTPUT_DIR/${BENCH_LOG_NAME:-benchmark_output.log}"

# Persist key locations as dot-files so external scripts can pick them up.
echo "$OUTPUT_DIR" > "$OUTPUT_DIR/.output_dir"
echo "$BENCH_LOG" > "$OUTPUT_DIR/.bench_log_path"
echo "$BENCH_LOG_NAME" > "$BENCH_LOG_NAME_FILE"
if [ -n "$TARGET_GPUS" ]; then
  echo "$TARGET_GPUS" > "$OUTPUT_DIR/.target_gpus"
fi

# Startup banner.
echo "=========================================="
echo "GPU监控包装脚本启动"
echo "输出目录: $OUTPUT_DIR"
echo "推理日志: $BENCH_LOG"
if [ -n "$TARGET_GPUS" ]; then
  echo "统计GPU: $TARGET_GPUS"
else
  echo "统计GPU: 所有可用GPU"
fi
echo "记录GPU: 所有GPU(含未统计的)"
echo "监控模式: 持续监控直到推理任务结束"
echo "=========================================="
echo ""
# ============ Keep only the stats-target GPU rows ============
# Input: normalized hy-smi rows (GPU id in column 1, one card per line).
# With TARGET_GPUS empty, every row passes through unchanged; otherwise only
# rows whose first field appears in the comma-separated TARGET_GPUS list.
filter_for_stats() {
  local smi_rows="$1"
  if [ -z "$TARGET_GPUS" ]; then
    printf '%s\n' "$smi_rows"
    return
  fi
  printf '%s\n' "$smi_rows" | awk -v gpus="$TARGET_GPUS" '
    BEGIN {
      n = split(gpus, ids, ",")
      for (i = 1; i <= n; i++) wanted[ids[i]] = 1
    }
    $1 in wanted
  '
}
# ============ List the GPU ids used for aggregated statistics ============
# Prints one id per line: the TARGET_GPUS selection when given, otherwise
# every card id reported by hy-smi.
get_stats_gpu_list() {
  if [ -n "$TARGET_GPUS" ]; then
    printf '%s\n' "$TARGET_GPUS" | tr ',' '\n'
  else
    hy-smi 2>/dev/null | grep -E '^[[:space:]]*[0-9]+' | sed 's/ */ /g' | awk '{print $1}'
  fi
}
# ============ List every GPU id hy-smi reports ============
# Card rows start with (optionally indented) digits; print their first field.
get_all_gpu_list() {
  hy-smi 2>/dev/null | awk '/^[[:space:]]*[0-9]+/ { print $1 }'
}
# ============ Average utilization (%) of the stats-target GPUs ============
# Samples hy-smi once, keeps the target cards, and averages column 7 (the
# utilization column) with the '%' sign stripped. Prints "0" when no rows match.
check_gpu_utilization() {
  local raw_rows target_rows avg
  raw_rows=$(hy-smi 2>/dev/null | grep -E '^[[:space:]]*[0-9]+' | sed 's/ */ /g')
  target_rows=$(filter_for_stats "$raw_rows")
  avg=$(echo "$target_rows" | awk '{
    gsub(/%/,"",$7)
    util+=$7; count++
  } END {
    if(count>0) printf "%.1f", util/count
    else print "0"
  }')
  echo "$avg"
}
# ============ Extract the ais_bench output path from a log file ============
# Looks for an outputs/default/YYYYMMDD_HHMMSS token (relative first, then as
# part of an absolute path), strips quotes/commas/backslashes, and echoes the
# result. Echoes an empty string when the log is missing or has no match.
extract_aisbench_output_path() {
  local log="$1"
  local found=""
  if [ -f "$log" ]; then
    found=$(grep -oP 'outputs/default/\d{8}_\d{6}' "$log" | head -1)
    if [ -z "$found" ]; then
      # Fall back to matching inside an absolute path.
      found=$(grep -oP '/\S*outputs/default/\d{8}_\d{6}' "$log" | head -1)
    fi
    # Clean up stray punctuation picked up from JSON-ish log lines.
    found=$(printf '%s\n' "$found" | sed 's/["\",]//g')
  fi
  echo "$found"
}
# ============ Require a benchmark command; otherwise print usage and exit ============
# Everything left in "$@" after option parsing is the command to run under
# monitoring; with nothing left there is nothing to monitor.
if [ $# -eq 0 ]; then
echo "用法: $0 [选项] <要执行的命令>"
echo ""
echo "选项:"
echo " --gpus <序号> 统计目标GPU序号,逗号分隔 (默认: 所有GPU)"
echo " --threshold <百分比> 触发监控的GPU利用率阈值 (默认: 5)"
echo " --interval <秒> 采样间隔 (默认: 1)"
echo " --check-interval <秒> 检测间隔 (默认: 2)"
echo " --log-name <文件名> 推理输出日志文件名 (默认: benchmark_output.log)"
echo " --output-dir <目录> 输出目录路径 (默认: ./gpu_monitor_时间戳)"
echo " --bench-dir <目录> ais_bench基础目录 (默认: .)"
echo ""
echo "说明:"
echo " - 详细日志记录所有GPU状态"
echo " - 统计计算仅针对 --gpus 指定的GPU"
echo " - 从GPU活动开始持续监控到推理任务结束"
echo " - 自动捕获并保存 ais_bench 输出目录到监控目录"
echo ""
echo "示例:"
echo " # 基础用法"
echo " $0 ais_bench --models vllm_api_stream_chat --datasets aime2025_gen --mode perf --debug"
echo ""
echo " # 指定GPU、日志名和输出目录"
echo " $0 --gpus 4,5,6,7 --log-name perf_test.log --output-dir /workspace/results/gpu_test \\"
echo " ais_bench --models vllm_api_stream_chat --datasets aime2025_gen --mode perf --debug"
exit 1
fi
# ---- Capture and launch the benchmark command ----
# Fix: keep the command as an array instead of a flat string so that
# arguments containing spaces survive re-expansion (the previous
# BENCHMARK_CMD="$@" + unquoted $BENCHMARK_CMD re-split every argument).
BENCHMARK_CMD=("$@")
echo "执行命令: ${BENCHMARK_CMD[*]}" | tee "$MONITOR_LOG"
if [ -n "$TARGET_GPUS" ]; then
echo "统计GPU: $TARGET_GPUS" | tee -a "$MONITOR_LOG"
else
echo "统计GPU: 所有可用GPU" | tee -a "$MONITOR_LOG"
fi
echo "" | tee -a "$MONITOR_LOG"
# Record the initial GPU state for reference.
echo "初始GPU状态:" | tee -a "$MONITOR_LOG"
hy-smi | tee -a "$MONITOR_LOG"
echo "" | tee -a "$MONITOR_LOG"
# Start the inference task in the background and remember its PID so the
# monitoring loop below can poll and eventually reap it.
echo "启动推理任务..." | tee -a "$MONITOR_LOG"
"${BENCHMARK_CMD[@]}" > "$BENCH_LOG" 2>&1 &
BENCH_PID=$!
echo "推理任务PID: $BENCH_PID" | tee -a "$MONITOR_LOG"
# ============ Wait for GPU activity ============
# Poll until target-GPU utilization crosses UTIL_THRESHOLD, the benchmark
# process dies, or MAX_WAIT polls elapse.
echo "等待GPU开始工作..." | tee -a "$MONITOR_LOG"
WAIT_COUNT=0
MAX_WAIT=300   # maximum polls (CHECK_INTERVAL seconds apart) before giving up
while [ $WAIT_COUNT -lt $MAX_WAIT ]; do
# Benchmark exited before any GPU activity was observed.
if ! kill -0 $BENCH_PID 2>/dev/null; then
echo "推理任务已结束,但未检测到GPU活动" | tee -a "$MONITOR_LOG"
# Try to salvage the ais_bench output directory anyway.
# NOTE(review): save_aisbench_output is defined much further down this
# file, so if this branch executes the call fails with "command not
# found"; the function definition should move above this loop — TODO confirm.
AISBENCH_DIR=$(extract_aisbench_output_path "$BENCH_LOG")
if [ -n "$AISBENCH_DIR" ]; then
save_aisbench_output "$AISBENCH_DIR"
fi
echo "" | tee -a "$MONITOR_LOG"
echo "推理输出(最后50行):" | tee -a "$MONITOR_LOG"
tail -50 "$BENCH_LOG" | tee -a "$MONITOR_LOG"
exit 1
fi
CURRENT_UTIL=$(check_gpu_utilization)
echo "$(date +"%H:%M:%S") 目标GPU利用率: ${CURRENT_UTIL}% 等待中... ($WAIT_COUNT/$MAX_WAIT)" | tee -a "$MONITOR_LOG"
# Compare only the integer part of the utilization against the threshold.
UTIL_INT=$(echo "$CURRENT_UTIL" | cut -d. -f1)
if [ -n "$UTIL_INT" ] && [ "$UTIL_INT" -gt "$UTIL_THRESHOLD" ] 2>/dev/null; then
echo "检测到GPU活动! 利用率: ${CURRENT_UTIL}%" | tee -a "$MONITOR_LOG"
break
fi
WAIT_COUNT=$(( WAIT_COUNT + 1 ))
sleep $CHECK_INTERVAL
done
# Timed out waiting for activity: stop the benchmark and bail out.
if [ $WAIT_COUNT -ge $MAX_WAIT ]; then
echo "超时: 等待 ${MAX_WAIT} 个周期后仍未检测到GPU活动" | tee -a "$MONITOR_LOG"
kill $BENCH_PID 2>/dev/null
exit 1
fi
# Give utilization a moment to settle before the sampling loop starts.
echo "等待GPU利用率稳定(5秒)..." | tee -a "$MONITOR_LOG"
sleep 5
# ============ Continuous monitoring setup ============
echo "" | tee -a "$MONITOR_LOG"
echo "==========================================" | tee -a "$MONITOR_LOG"
echo "开始持续GPU监控(直到推理任务结束)" | tee -a "$MONITOR_LOG"
echo "==========================================" | tee -a "$MONITOR_LOG"

# Scratch space for per-card sample history; cleaned up automatically on exit.
TMP_DIR=$(mktemp -d)
trap 'rm -rf "$TMP_DIR"' EXIT

# One utilization + one memory history file per card: "all_*" files cover
# every card, "stats_*" only the cards selected for aggregation.
get_all_gpu_list | while read -r card; do
  : > "$TMP_DIR/all_hcu_${card}_util.log"
  : > "$TMP_DIR/all_hcu_${card}_mem.log"
done
get_stats_gpu_list | while read -r card; do
  : > "$TMP_DIR/stats_hcu_${card}_util.log"
  : > "$TMP_DIR/stats_hcu_${card}_mem.log"
done

# Per-sample averages over the target cards.
UTIL_FILE="$TMP_DIR/util_avg.log"
MEM_FILE="$TMP_DIR/mem_avg.log"
: > "$UTIL_FILE"
: > "$MEM_FILE"

# Header of the detailed per-sample log.
{
  echo "============================================================"
  echo "GPU监控详细记录"
  echo "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
  echo "采样间隔: ${MONITOR_INTERVAL}秒"
  if [ -n "$TARGET_GPUS" ]; then
    echo "统计目标GPU: $TARGET_GPUS"
  else
    echo "统计目标: 所有GPU"
  fi
  echo "记录范围: 所有GPU"
  echo "============================================================"
  echo ""
} > "$DETAIL_LOG"
SAMPLE_COUNT=0
MONITOR_START=$(date +%s)
# Sample repeatedly until the benchmark process exits.
while kill -0 $BENCH_PID 2>/dev/null; do
SAMPLE_COUNT=$(( SAMPLE_COUNT + 1 ))
CURRENT_TIME=$(date +"%H:%M:%S")
# One normalized hy-smi snapshot: one card per line, single-space separated.
ALL_SMI=$(hy-smi 2>/dev/null | grep -E '^[[:space:]]*[0-9]+' | sed 's/ */ /g')
if [ -z "$ALL_SMI" ]; then
sleep $MONITOR_INTERVAL
continue
fi
# ====== Record every card into the detailed log ======
{
echo "[样本 #$SAMPLE_COUNT] 时间: $CURRENT_TIME"
echo "--------------------------------------------------------------------------------"
printf "%-6s %-10s %-10s %-10s %-10s %-10s %-10s\n" \
"HCU" "Temp" "AvgPwr" "VRAM%" "HCU%" "Dec%" "Enc%"
echo "--------------------------------------------------------------------------------"
} >> "$DETAIL_LOG"
# The pipeline body runs in a subshell, which is fine here: it only appends
# to files, no shell variables need to survive the loop.
echo "$ALL_SMI" | while read line; do
HCU_ID=$(echo "$line" | awk '{print $1}')
TEMP=$(echo "$line" | awk '{print $2}')
PWR=$(echo "$line" | awk '{print $3}')
VRAM=$(echo "$line" | awk '{gsub(/%/,"",$6); print $6}')
HCU_UTIL=$(echo "$line" | awk '{gsub(/%/,"",$7); print $7}')
DEC=$(echo "$line" | awk '{print $8}')
ENC=$(echo "$line" | awk '{print $9}')
printf "%-6s %-10s %-10s %-10s %-10s %-10s %-10s\n" \
"$HCU_ID" "$TEMP" "$PWR" "${VRAM}%" "${HCU_UTIL}%" "$DEC" "$ENC" >> "$DETAIL_LOG"
# Append to the all-cards history files
echo "$HCU_UTIL" >> "$TMP_DIR/all_hcu_${HCU_ID}_util.log"
echo "$VRAM" >> "$TMP_DIR/all_hcu_${HCU_ID}_mem.log"
done
# ====== Aggregate over the stats-target cards ======
STATS_SMI=$(filter_for_stats "$ALL_SMI")
# Per-card history for the target cards
echo "$STATS_SMI" | while read line; do
HCU_ID=$(echo "$line" | awk '{print $1}')
VRAM=$(echo "$line" | awk '{gsub(/%/,"",$6); print $6}')
HCU_UTIL=$(echo "$line" | awk '{gsub(/%/,"",$7); print $7}')
echo "$HCU_UTIL" >> "$TMP_DIR/stats_hcu_${HCU_ID}_util.log"
echo "$VRAM" >> "$TMP_DIR/stats_hcu_${HCU_ID}_mem.log"
done
# Target-card averages (field 6 = VRAM%, field 7 = HCU utilization%)
AVG_VRAM=$(echo "$STATS_SMI" | awk '{gsub(/%/,"",$6); vram+=$6; count++} END {if(count>0) printf "%.1f", vram/count; else print "0"}')
AVG_HCU=$(echo "$STATS_SMI" | awk '{gsub(/%/,"",$7); hcu+=$7; count++} END {if(count>0) printf "%.1f", hcu/count; else print "0"}')
{
echo "--------------------------------------------------------------------------------"
printf "%-6s %-10s %-10s %-10s %-10s %-10s %-10s\n" \
"目标均" "-" "-" "${AVG_VRAM}%" "${AVG_HCU}%" "-" "-"
echo ""
} >> "$DETAIL_LOG"
echo "$AVG_HCU" >> "$UTIL_FILE"
echo "$AVG_VRAM" >> "$MEM_FILE"
# Elapsed monitoring time so far
ELAPSED=$(( $(date +%s) - MONITOR_START ))
echo "$CURRENT_TIME 样本#$SAMPLE_COUNT [${ELAPSED}s] | 目标HCU: ${AVG_HCU}% | 目标VRAM: ${AVG_VRAM}%" | tee -a "$MONITOR_LOG"
sleep $MONITOR_INTERVAL
done
MONITOR_END=$(date +%s)
TOTAL_MONITOR_TIME=$(( MONITOR_END - MONITOR_START ))
echo "" | tee -a "$MONITOR_LOG"
echo "推理任务已结束,停止GPU监控" | tee -a "$MONITOR_LOG"
echo "总监控时长: ${TOTAL_MONITOR_TIME}秒, 总采样数: $SAMPLE_COUNT" | tee -a "$MONITOR_LOG"
# ============ Reap the benchmark process and capture its exit code ============
wait $BENCH_PID
BENCH_EXIT_CODE=$?
# ============ Copy the ais_bench output directory into $OUTPUT_DIR ============
# Copies the detected outputs/default/<timestamp> directory next to the
# monitor logs and records its location in dot-files for external scripts.
# Reads globals: OUTPUT_DIR, MONITOR_LOG, AISBENCH_BASE_DIR, MONITOR_START.
save_aisbench_output() {
  local detected="$1"
  # Nothing detected: silently do nothing.
  if [ -z "$detected" ]; then
    return
  fi
  echo "" | tee -a "$MONITOR_LOG"
  echo "保存 ais_bench 输出目录..." | tee -a "$MONITOR_LOG"
  echo "检测到路径: $detected" | tee -a "$MONITOR_LOG"
  local leaf dest
  leaf=$(basename "$detected")
  dest="$OUTPUT_DIR/aisbench_output_${leaf}"
  # The log may hold a relative or absolute path; probe likely candidates.
  local candidate
  for candidate in "$detected" "$AISBENCH_BASE_DIR/$detected" "$(pwd)/$detected"; do
    if [ -d "$candidate" ]; then
      echo "找到目录: $candidate" | tee -a "$MONITOR_LOG"
      if cp -r "$candidate" "$dest" 2>/dev/null; then
        echo "✓ 已复制到: $dest" | tee -a "$MONITOR_LOG"
        # Record the location for external scripts.
        echo "$dest" > "$OUTPUT_DIR/.aisbench_output_dir"
        echo "$leaf" > "$OUTPUT_DIR/.aisbench_dir_name"
        return
      fi
    fi
  done
  # Fallback: newest outputs/default/20* directory modified after monitoring began.
  local newest
  newest=$(ls -dt "$AISBENCH_BASE_DIR/outputs/default/20"* 2>/dev/null | head -1)
  if [ -n "$newest" ] && [ "$(stat -c %Y "$newest" 2>/dev/null || stat -f %m "$newest" 2>/dev/null)" -gt "$MONITOR_START" ]; then
    local newest_leaf
    newest_leaf=$(basename "$newest")
    dest="$OUTPUT_DIR/aisbench_output_${newest_leaf}"
    cp -r "$newest" "$dest" 2>/dev/null
    echo "✓ 找到最新输出目录并复制: $dest" | tee -a "$MONITOR_LOG"
    echo "$dest" > "$OUTPUT_DIR/.aisbench_output_dir"
    echo "$newest_leaf" > "$OUTPUT_DIR/.aisbench_dir_name"
    return
  fi
  echo "警告: 无法找到 ais_bench 输出目录" | tee -a "$MONITOR_LOG"
}
AISBENCH_DIR=$(extract_aisbench_output_path "$BENCH_LOG")
save_aisbench_output "$AISBENCH_DIR"
# ============ Generate the summary statistics ============
echo "" | tee -a "$MONITOR_LOG"
echo "生成统计报告..." | tee -a "$MONITOR_LOG"
TOTAL_SAMPLES=$(wc -l < "$UTIL_FILE")
if [ "$TOTAL_SAMPLES" -eq 0 ]; then
echo "警告: 没有采集到任何数据" | tee -a "$MONITOR_LOG"
else
# Stable phase: drop the first 25% of samples as warm-up.
SKIP_SAMPLES=$(( TOTAL_SAMPLES / 4 ))
STABLE_SAMPLES=$(( TOTAL_SAMPLES - SKIP_SAMPLES ))
[ $STABLE_SAMPLES -lt 1 ] && STABLE_SAMPLES=1
# Averages and min/max over the stable-phase samples (sort -n handles the
# one-decimal values these files contain).
AVG_HCU_ALL=$(tail -n $STABLE_SAMPLES "$UTIL_FILE" | awk '{sum+=$1; count++} END {printf "%.1f", sum/count}')
AVG_VRAM_ALL=$(tail -n $STABLE_SAMPLES "$MEM_FILE" | awk '{sum+=$1; count++} END {printf "%.1f", sum/count}')
MAX_HCU=$(tail -n $STABLE_SAMPLES "$UTIL_FILE" | sort -n | tail -1)
MIN_HCU=$(tail -n $STABLE_SAMPLES "$UTIL_FILE" | sort -n | head -1)
MAX_VRAM=$(tail -n $STABLE_SAMPLES "$MEM_FILE" | sort -n | tail -1)
MIN_VRAM=$(tail -n $STABLE_SAMPLES "$MEM_FILE" | sort -n | head -1)
# Overall section of the summary report.
{
echo "============================================================"
echo "GPU监控汇总统计"
echo "生成时间: $(date '+%Y-%m-%d %H:%M:%S')"
echo "============================================================"
echo ""
echo "监控概要:"
echo " - 总监控时长: ${TOTAL_MONITOR_TIME}秒"
echo " - 采样间隔: ${MONITOR_INTERVAL}秒"
echo " - 总采样数: $TOTAL_SAMPLES"
echo " - 稳定阶段采样数: $STABLE_SAMPLES (跳过前25%)"
if [ -n "$TARGET_GPUS" ]; then
echo " - 统计目标GPU: $TARGET_GPUS"
else
echo " - 统计目标: 所有GPU"
fi
echo ""
echo "============================================================"
echo "目标GPU整体统计 (稳定阶段)"
echo "============================================================"
echo " 平均GPU利用率: ${AVG_HCU_ALL}%"
echo " 利用率范围: ${MIN_HCU}% ~ ${MAX_HCU}%"
echo " 平均显存占用: ${AVG_VRAM_ALL}%"
echo " 显存占用范围: ${MIN_VRAM}% ~ ${MAX_VRAM}%"
echo ""
echo "============================================================"
echo "目标GPU逐卡统计 (稳定阶段)"
echo "============================================================"
printf "%-6s %-14s %-14s %-14s %-14s\n" "HCU" "平均HCU%" "最大HCU%" "平均VRAM%" "最大VRAM%"
echo "------------------------------------------------------------------------"
} > "$SUMMARY_LOG"
# Per-card rows: same stable-phase trimming, computed per history file.
get_stats_gpu_list | while read gpu_id; do
STATS_FILE="$TMP_DIR/stats_hcu_${gpu_id}_util.log"
MEMS_FILE="$TMP_DIR/stats_hcu_${gpu_id}_mem.log"
if [ -f "$STATS_FILE" ] && [ -s "$STATS_FILE" ]; then
CARD_TOTAL=$(wc -l < "$STATS_FILE")
CARD_SKIP=$(( CARD_TOTAL / 4 ))
CARD_STABLE=$(( CARD_TOTAL - CARD_SKIP ))
[ $CARD_STABLE -lt 1 ] && CARD_STABLE=1
AVG_UTIL=$(tail -n $CARD_STABLE "$STATS_FILE" | awk '{sum+=$1; count++} END {printf "%.1f", sum/count}')
MAX_UTIL=$(tail -n $CARD_STABLE "$STATS_FILE" | sort -n | tail -1)
AVG_MEM=$(tail -n $CARD_STABLE "$MEMS_FILE" | awk '{sum+=$1; count++} END {printf "%.1f", sum/count}')
MAX_MEM=$(tail -n $CARD_STABLE "$MEMS_FILE" | sort -n | tail -1)
else
AVG_UTIL="N/A"; MAX_UTIL="N/A"; AVG_MEM="N/A"; MAX_MEM="N/A"
fi
printf "%-6s %-14s %-14s %-14s %-14s\n" \
"$gpu_id" "${AVG_UTIL}%" "${MAX_UTIL}%" "${AVG_MEM}%" "${MAX_MEM}%" >> "$SUMMARY_LOG"
done
# Final snapshot of every card's current state.
{
echo ""
echo "============================================================"
echo "所有GPU最新状态"
echo "============================================================"
} >> "$SUMMARY_LOG"
hy-smi 2>/dev/null | grep -E '^[[:space:]]*[0-9]+' | sed 's/ */ /g' | awk '{
gsub(/%/,"")
printf " HCU %s : Temp=%s, Pwr=%s, VRAM=%s%%, HCU=%s%%\n", $1, $2, $3, $6, $7
}' >> "$SUMMARY_LOG"
# Console output of the headline numbers
echo "" | tee -a "$MONITOR_LOG"
echo "============================================================" | tee -a "$MONITOR_LOG"
echo " 监控统计结果" | tee -a "$MONITOR_LOG"
echo "============================================================" | tee -a "$MONITOR_LOG"
echo "监控时长: ${TOTAL_MONITOR_TIME}秒 ($SAMPLE_COUNT 个样本)" | tee -a "$MONITOR_LOG"
echo "" | tee -a "$MONITOR_LOG"
echo "目标GPU (稳定阶段):" | tee -a "$MONITOR_LOG"
echo " 平均利用率: ${AVG_HCU_ALL}% (${MIN_HCU}% ~ ${MAX_HCU}%)" | tee -a "$MONITOR_LOG"
echo " 平均显存: ${AVG_VRAM_ALL}% (${MIN_VRAM}% ~ ${MAX_VRAM}%)" | tee -a "$MONITOR_LOG"
fi
# Final report: one grouped pipeline sends every line to both the console
# and monitor.log, in the same order as before.
{
  echo ""
  echo "=========================================="
  echo "推理任务结束 (退出码: $BENCH_EXIT_CODE)"
  echo "=========================================="
  echo ""
  echo "最终GPU状态:"
  hy-smi
  echo ""
  echo "=========================================="
  echo "输出文件:"
  ls -lh "$OUTPUT_DIR/"
  echo "=========================================="
  echo "所有文件已保存到: $OUTPUT_DIR"
} | tee -a "$MONITOR_LOG"
\ No newline at end of file
# run.sh — example driver: launches one perf+eval pass of bench-test.sh
# against the aime2025_gen dataset on GPU 3 with batch size 16.
# (--input-len only affects synthetic datasets, so it is harmless here.)
bash ./bench-test.sh --gpus 3 \
--model vllm_api_stream_chat \
--dataset aime2025_gen \
--model-name test_model \
--batch-size "16" \
--input-len 512 \
--max-out-len 512 \
--ais-model /data2/models/qwen3-8B \
--model-path /data2/models/qwen3-8B \
--host-port 23456
# Alternative: synthetic random input/output run sweeping several batch
# sizes (kept commented out for reference).
# bash ./bench-test.sh --gpus 3 \
# --model vllm_api_stream_chat \
# --dataset synthetic_gen \
# --model-name test_model \
# --batch-size "8 16 32" \
# --input-len 512 \
# --max-out-len 512 \
# --ais-model /data2/models/qwen3-8B \
# --model-path /data2/models/qwen3-8B \
# --host-port 23456
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment