#!/bin/bash
#
# GPU monitoring wrapper script.
#
# Launches the given benchmark/inference command in the background, waits
# until the target GPUs become busy, then samples `hy-smi` until the command
# exits. Every sample of every GPU goes to a detail log; averages over the
# target GPUs are summarised in a report at the end.
#
# Usage: <script> [options] <command> [args...]   (run without arguments for help)

# ============ Default configuration (overridable via options) ============
CHECK_INTERVAL=2        # seconds between polls while waiting for GPU activity
UTIL_THRESHOLD=5        # utilisation (%) above which inference is considered started
MONITOR_INTERVAL=1      # seconds between monitoring samples
TARGET_GPUS=""          # comma-separated GPU indices used for statistics; empty = all
BENCH_LOG_NAME=""       # file name for the captured benchmark output
OUTPUT_DIR=""           # user-selected output directory
AISBENCH_BASE_DIR="./"  # base directory of ais_bench
MAX_WAIT=300            # maximum number of polls while waiting for GPU activity

# ============ Helper functions ============
# All functions are defined before the main flow so that every call site
# (including the early-exit path of the wait loop) can use them.

# Print the help text.
print_usage() {
  echo "用法: $0 [选项] <要执行的命令>"
  echo ""
  echo "选项:"
  echo " --gpus <序号> 统计目标GPU序号,逗号分隔 (默认: 所有GPU)"
  echo " --threshold <百分比> 触发监控的GPU利用率阈值 (默认: 5)"
  echo " --interval <秒> 采样间隔 (默认: 1)"
  echo " --check-interval <秒> 检测间隔 (默认: 2)"
  echo " --log-name <文件名> 推理输出日志文件名 (默认: benchmark_output.log)"
  echo " --output-dir <目录> 输出目录路径 (默认: ./gpu_monitor_时间戳)"
  echo " --bench-dir <目录> ais_bench基础目录 (默认: .)"
  echo ""
  echo "说明:"
  echo " - 详细日志记录所有GPU状态"
  echo " - 统计计算仅针对 --gpus 指定的GPU"
  echo " - 从GPU活动开始持续监控到推理任务结束"
  echo " - 自动捕获并保存 ais_bench 输出目录到监控目录"
  echo ""
  echo "示例:"
  echo " # 基础用法"
  echo " $0 ais_bench --models vllm_api_stream_chat --datasets aime2025_gen --mode perf --debug"
  echo ""
  echo " # 指定GPU、日志名和输出目录"
  echo " $0 --gpus 4,5,6,7 --log-name perf_test.log --output-dir /workspace/results/gpu_test \\"
  echo " ais_bench --models vllm_api_stream_chat --datasets aime2025_gen --mode perf --debug"
}

# Abort when an option that needs a value is the last argument.
# `shift 2` fails WITHOUT shifting in that case, which would loop forever.
#   $1 - option name, $2 - number of remaining arguments (option included)
need_value() {
  if [ "$2" -lt 2 ]; then
    echo "错误: 选项 $1 需要一个参数" >&2
    exit 1
  fi
}

# Print a message to the terminal and append it to the monitor log.
log() {
  printf '%s\n' "$*" | tee -a "$MONITOR_LOG"
}

# Print the per-device rows of hy-smi (lines starting with a device index),
# with runs of spaces squeezed into a single space.
read_smi_rows() {
  hy-smi 2>/dev/null | grep -E '^[[:space:]]*[0-9]+' | tr -s ' '
}

# Keep only the rows whose first column is listed in TARGET_GPUS.
# With an empty TARGET_GPUS the input is passed through unchanged.
#   $1 - rows as printed by read_smi_rows
filter_for_stats() {
  local input="$1"
  if [ -z "$TARGET_GPUS" ]; then
    printf '%s\n' "$input"
  else
    printf '%s\n' "$input" | awk -v gpus="$TARGET_GPUS" '
      BEGIN {
        split(gpus, gpu_list, ",")
        for (i in gpu_list) target[gpu_list[i]] = 1
      }
      { if ($1 in target) print }
    '
  fi
}

# Print the indices of the GPUs used for statistics, one per line.
get_stats_gpu_list() {
  if [ -z "$TARGET_GPUS" ]; then
    read_smi_rows | awk '{print $1}'
  else
    printf '%s\n' "$TARGET_GPUS" | tr ',' '\n'
  fi
}

# Print the indices of all GPUs reported by hy-smi, one per line.
get_all_gpu_list() {
  read_smi_rows | awk '{print $1}'
}

# Average a percentage column over the given rows.
# Prints the mean with one decimal, or "0" when there are no rows.
#   $1 - rows, $2 - column number
average_column() {
  printf '%s\n' "$1" | awk -v col="$2" '
    NF { v = $col; gsub(/%/, "", v); sum += v; n++ }
    END { if (n > 0) printf "%.1f", sum / n; else print "0" }
  '
}

# Print the current average utilisation (column 7) of the target GPUs.
check_gpu_utilization() {
  local rows
  rows=$(filter_for_stats "$(read_smi_rows)")
  average_column "$rows" 7
}

# Extract the ais_bench output path (outputs/default/YYYYMMDD_HHMMSS) from a
# log file. Prints an empty line when the file is missing or has no match.
#   $1 - log file
extract_aisbench_output_path() {
  local log_file="$1"
  local output_path=""

  if [ -f "$log_file" ]; then
    # Relative form first: outputs/default/YYYYMMDD_HHMMSS
    output_path=$(grep -oP 'outputs/default/\d{8}_\d{6}' "$log_file" | head -1)
    # Otherwise accept an absolute path ending in the same pattern
    if [ -z "$output_path" ]; then
      output_path=$(grep -oP '/\S*outputs/default/\d{8}_\d{6}' "$log_file" | head -1)
    fi
    # Strip quotes, backslashes and commas picked up from the log line
    output_path=$(printf '%s\n' "$output_path" | sed 's/["\",]//g')
  fi

  printf '%s\n' "$output_path"
}

# Copy the ais_bench output directory into OUTPUT_DIR and record its location
# in .aisbench_output_dir / .aisbench_dir_name. Does nothing for an empty path.
#   $1 - path extracted from the benchmark log
# Globals: OUTPUT_DIR, AISBENCH_BASE_DIR, MONITOR_LOG, BENCH_START (read)
save_aisbench_output() {
  local aisbench_dir="$1"

  if [ -z "$aisbench_dir" ]; then
    return
  fi

  log ""
  log "保存 ais_bench 输出目录..."
  log "检测到路径: $aisbench_dir"

  local dir_name dest_dir src_dir
  dir_name=$(basename "$aisbench_dir")
  dest_dir="$OUTPUT_DIR/aisbench_output_${dir_name}"

  # Try the path as-is, relative to the ais_bench base dir, and relative to cwd
  for src_dir in "$aisbench_dir" "$AISBENCH_BASE_DIR/$aisbench_dir" "$(pwd)/$aisbench_dir"; do
    [ -d "$src_dir" ] || continue
    log "找到目录: $src_dir"
    if cp -r "$src_dir" "$dest_dir" 2>/dev/null; then
      log "✓ 已复制到: $dest_dir"
      # Saved for external scripts
      printf '%s\n' "$dest_dir" > "$OUTPUT_DIR/.aisbench_output_dir"
      printf '%s\n' "$dir_name" > "$OUTPUT_DIR/.aisbench_dir_name"
      return
    fi
  done

  # Fall back to the most recently modified output directory, accepted only
  # if it was modified since the benchmark was launched. BENCH_START is used
  # because it is always set before this function can be called.
  local latest_dir latest_mtime latest_name
  latest_dir=$(ls -dt "$AISBENCH_BASE_DIR/outputs/default/20"* 2>/dev/null | head -1)
  if [ -n "$latest_dir" ]; then
    # GNU stat first, BSD stat as a fallback
    latest_mtime=$(stat -c %Y "$latest_dir" 2>/dev/null || stat -f %m "$latest_dir" 2>/dev/null)
    if [ -n "$latest_mtime" ] && [ "$latest_mtime" -ge "$BENCH_START" ] 2>/dev/null; then
      latest_name=$(basename "$latest_dir")
      dest_dir="$OUTPUT_DIR/aisbench_output_${latest_name}"
      if cp -r "$latest_dir" "$dest_dir" 2>/dev/null; then
        log "✓ 找到最新输出目录并复制: $dest_dir"
        printf '%s\n' "$dest_dir" > "$OUTPUT_DIR/.aisbench_output_dir"
        printf '%s\n' "$latest_name" > "$OUTPUT_DIR/.aisbench_dir_name"
        return
      fi
    fi
  fi

  log "警告: 无法找到 ais_bench 输出目录"
}

# Mean of the last $2 lines of file $1, one decimal ("0" when empty).
tail_avg() {
  tail -n "$2" "$1" | awk '{ sum += $1; n++ } END { if (n > 0) printf "%.1f", sum / n; else print "0" }'
}

# Largest / smallest value among the last $2 lines of file $1.
tail_max() { tail -n "$2" "$1" | sort -n | tail -1; }
tail_min() { tail -n "$2" "$1" | sort -n | head -1; }

# Remove the temporary sample directory (installed as EXIT trap).
cleanup() {
  [ -n "$TMP_DIR" ] && rm -rf -- "$TMP_DIR"
}

# ============ Parse command-line options ============
while [[ $# -gt 0 ]]; do
  case "$1" in
    --gpus)
      need_value "$1" "$#"
      TARGET_GPUS="${2//[[:space:]]/}"   # tolerate "4, 5, 6"
      shift 2
      ;;
    --threshold)
      need_value "$1" "$#"
      UTIL_THRESHOLD="$2"
      shift 2
      ;;
    --interval)
      need_value "$1" "$#"
      MONITOR_INTERVAL="$2"
      shift 2
      ;;
    --check-interval)
      need_value "$1" "$#"
      CHECK_INTERVAL="$2"
      shift 2
      ;;
    --log-name)
      need_value "$1" "$#"
      BENCH_LOG_NAME="$2"
      shift 2
      ;;
    --output-dir)
      need_value "$1" "$#"
      OUTPUT_DIR="$2"
      shift 2
      ;;
    --bench-dir)
      need_value "$1" "$#"
      AISBENCH_BASE_DIR="$2"
      shift 2
      ;;
    *)
      # First non-option word starts the command to run
      break
      ;;
  esac
done

# ============ Command to execute ============
# Checked before anything is created on disk, so a bare invocation leaves no
# stray output directory behind.
if [ $# -eq 0 ]; then
  print_usage
  exit 1
fi

# The threshold is compared with an integer test; reject anything else up
# front instead of silently never detecting GPU activity.
case "$UTIL_THRESHOLD" in
  '' | *[!0-9]*)
    echo "错误: --threshold 必须是非负整数: $UTIL_THRESHOLD" >&2
    exit 1
    ;;
esac

# Keep the command as an array so arguments containing spaces survive.
# Backward compatibility: a whole command line passed as ONE quoted string
# is still split into words, as the unquoted expansion used to do.
if [ $# -eq 1 ] && ! command -v "$1" >/dev/null 2>&1; then
  read -r -a BENCHMARK_CMD <<< "$1"
else
  BENCHMARK_CMD=("$@")
fi

# ============ Output directory (after all options are parsed) ============
if [ -z "$OUTPUT_DIR" ]; then
  TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
  OUTPUT_DIR="./gpu_monitor_${TIMESTAMP}"
fi

# Create the output directory, including missing parents
mkdir -p "$OUTPUT_DIR"
if [ ! -d "$OUTPUT_DIR" ]; then
  echo "错误: 无法创建输出目录 $OUTPUT_DIR"
  exit 1
fi

# Work with the absolute path from here on
OUTPUT_DIR=$(cd "$OUTPUT_DIR" && pwd)
MONITOR_LOG="$OUTPUT_DIR/monitor.log"
BENCH_LOG_NAME_FILE="$OUTPUT_DIR/.bench_log_name"  # log file name, read by external scripts
DETAIL_LOG="$OUTPUT_DIR/gpu_detail.log"
SUMMARY_LOG="$OUTPUT_DIR/gpu_summary.log"

# File receiving the benchmark's stdout/stderr
if [ -z "$BENCH_LOG_NAME" ]; then
  BENCH_LOG="$OUTPUT_DIR/benchmark_output.log"
else
  BENCH_LOG="$OUTPUT_DIR/${BENCH_LOG_NAME}"
fi

# Key information for external scripts
printf '%s\n' "$OUTPUT_DIR" > "$OUTPUT_DIR/.output_dir"
printf '%s\n' "$BENCH_LOG" > "$OUTPUT_DIR/.bench_log_path"
printf '%s\n' "$BENCH_LOG_NAME" > "$BENCH_LOG_NAME_FILE"
if [ -n "$TARGET_GPUS" ]; then
  printf '%s\n' "$TARGET_GPUS" > "$OUTPUT_DIR/.target_gpus"
fi

# Temporary directory for per-GPU sample histories. Created before the
# benchmark starts so a failure here cannot leave an orphaned background job.
TMP_DIR=$(mktemp -d) || {
  echo "错误: 无法创建临时目录" >&2
  exit 1
}
trap cleanup EXIT

echo "=========================================="
echo "GPU监控包装脚本启动"
echo "输出目录: $OUTPUT_DIR"
echo "推理日志: $BENCH_LOG"
if [ -n "$TARGET_GPUS" ]; then
  echo "统计GPU: $TARGET_GPUS"
else
  echo "统计GPU: 所有可用GPU"
fi
echo "记录GPU: 所有GPU(含未统计的)"
echo "监控模式: 持续监控直到推理任务结束"
echo "=========================================="
echo ""

# ============ Launch the benchmark ============
: > "$MONITOR_LOG"   # start a fresh monitor log
log "执行命令: ${BENCHMARK_CMD[*]}"
if [ -n "$TARGET_GPUS" ]; then
  log "统计GPU: $TARGET_GPUS"
else
  log "统计GPU: 所有可用GPU"
fi
log ""

# Record the initial GPU state
log "初始GPU状态:"
hy-smi | tee -a "$MONITOR_LOG"
log ""

# Start the benchmark in the background
log "启动推理任务..."
BENCH_START=$(date +%s)
"${BENCHMARK_CMD[@]}" > "$BENCH_LOG" 2>&1 &
BENCH_PID=$!
log "推理任务PID: $BENCH_PID"

# ============ Wait for GPU activity ============
log "等待GPU开始工作..."
WAIT_COUNT=0

while [ "$WAIT_COUNT" -lt "$MAX_WAIT" ]; do
  if ! kill -0 "$BENCH_PID" 2>/dev/null; then
    log "推理任务已结束,但未检测到GPU活动"

    # Still try to keep the ais_bench output directory
    AISBENCH_DIR=$(extract_aisbench_output_path "$BENCH_LOG")
    if [ -n "$AISBENCH_DIR" ]; then
      save_aisbench_output "$AISBENCH_DIR"
    fi

    log ""
    log "推理输出(最后50行):"
    tail -n 50 "$BENCH_LOG" | tee -a "$MONITOR_LOG"
    exit 1
  fi

  CURRENT_UTIL=$(check_gpu_utilization)
  log "$(date +"%H:%M:%S") 目标GPU利用率: ${CURRENT_UTIL}% 等待中... ($WAIT_COUNT/$MAX_WAIT)"

  # Compare on the integer part only
  UTIL_INT=${CURRENT_UTIL%%.*}
  if [ -n "$UTIL_INT" ] && [ "$UTIL_INT" -gt "$UTIL_THRESHOLD" ] 2>/dev/null; then
    log "检测到GPU活动! 利用率: ${CURRENT_UTIL}%"
    break
  fi

  WAIT_COUNT=$(( WAIT_COUNT + 1 ))
  sleep "$CHECK_INTERVAL"
done

if [ "$WAIT_COUNT" -ge "$MAX_WAIT" ]; then
  log "超时: 等待 ${MAX_WAIT} 个周期后仍未检测到GPU活动"
  kill "$BENCH_PID" 2>/dev/null
  exit 1
fi

# Let the utilisation settle
log "等待GPU利用率稳定(5秒)..."
sleep 5

# ============ Continuous monitoring ============
log ""
log "=========================================="
log "开始持续GPU监控(直到推理任务结束)"
log "=========================================="

# Create (empty) history files for all GPUs and for the statistics targets
get_all_gpu_list | while read -r gpu_id; do
  : > "$TMP_DIR/all_hcu_${gpu_id}_util.log"
  : > "$TMP_DIR/all_hcu_${gpu_id}_mem.log"
done

get_stats_gpu_list | while read -r gpu_id; do
  : > "$TMP_DIR/stats_hcu_${gpu_id}_util.log"
  : > "$TMP_DIR/stats_hcu_${gpu_id}_mem.log"
done

UTIL_FILE="$TMP_DIR/util_avg.log"
MEM_FILE="$TMP_DIR/mem_avg.log"
: > "$UTIL_FILE"
: > "$MEM_FILE"

# Header of the detail log
{
  echo "============================================================"
  echo "GPU监控详细记录"
  echo "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
  echo "采样间隔: ${MONITOR_INTERVAL}秒"
  if [ -n "$TARGET_GPUS" ]; then
    echo "统计目标GPU: $TARGET_GPUS"
  else
    echo "统计目标: 所有GPU"
  fi
  echo "记录范围: 所有GPU"
  echo "============================================================"
  echo ""
} > "$DETAIL_LOG"

SAMPLE_COUNT=0
MONITOR_START=$(date +%s)

while kill -0 "$BENCH_PID" 2>/dev/null; do
  CURRENT_TIME=$(date +"%H:%M:%S")

  # Data for all GPUs; skip this round if hy-smi returned nothing
  ALL_SMI=$(read_smi_rows)
  if [ -z "$ALL_SMI" ]; then
    sleep "$MONITOR_INTERVAL"
    continue
  fi

  # Only successful samples are counted, so the number matches the data files
  SAMPLE_COUNT=$(( SAMPLE_COUNT + 1 ))

  # ------ Record all GPUs in the detail log ------
  {
    echo "[样本 #$SAMPLE_COUNT] 时间: $CURRENT_TIME"
    echo "--------------------------------------------------------------------------------"
    printf "%-6s %-10s %-10s %-10s %-10s %-10s %-10s\n" \
      "HCU" "Temp" "AvgPwr" "VRAM%" "HCU%" "Dec%" "Enc%"
    echo "--------------------------------------------------------------------------------"
  } >> "$DETAIL_LOG"

  # Columns: 1=HCU 2=Temp 3=AvgPwr 6=VRAM% 7=HCU% 8=Dec 9=Enc (4, 5 unused)
  while read -r hcu_id temp pwr _ _ vram hcu_util dec enc _; do
    [ -n "$hcu_id" ] || continue
    vram=${vram//%/}
    hcu_util=${hcu_util//%/}

    printf "%-6s %-10s %-10s %-10s %-10s %-10s %-10s\n" \
      "$hcu_id" "$temp" "$pwr" "${vram}%" "${hcu_util}%" "$dec" "$enc" >> "$DETAIL_LOG"

    # History of every GPU
    printf '%s\n' "$hcu_util" >> "$TMP_DIR/all_hcu_${hcu_id}_util.log"
    printf '%s\n' "$vram" >> "$TMP_DIR/all_hcu_${hcu_id}_mem.log"
  done <<< "$ALL_SMI"

  # ------ Statistics over the target GPUs ------
  STATS_SMI=$(filter_for_stats "$ALL_SMI")

  # History of the target GPUs
  while read -r hcu_id _ _ _ _ vram hcu_util _; do
    [ -n "$hcu_id" ] || continue
    printf '%s\n' "${hcu_util//%/}" >> "$TMP_DIR/stats_hcu_${hcu_id}_util.log"
    printf '%s\n' "${vram//%/}" >> "$TMP_DIR/stats_hcu_${hcu_id}_mem.log"
  done <<< "$STATS_SMI"

  # Average across the target GPUs
  AVG_VRAM=$(average_column "$STATS_SMI" 6)
  AVG_HCU=$(average_column "$STATS_SMI" 7)

  {
    echo "--------------------------------------------------------------------------------"
    printf "%-6s %-10s %-10s %-10s %-10s %-10s %-10s\n" \
      "目标均" "-" "-" "${AVG_VRAM}%" "${AVG_HCU}%" "-" "-"
    echo ""
  } >> "$DETAIL_LOG"

  printf '%s\n' "$AVG_HCU" >> "$UTIL_FILE"
  printf '%s\n' "$AVG_VRAM" >> "$MEM_FILE"

  # Elapsed monitoring time
  ELAPSED=$(( $(date +%s) - MONITOR_START ))
  log "$CURRENT_TIME 样本#$SAMPLE_COUNT [${ELAPSED}s] | 目标HCU: ${AVG_HCU}% | 目标VRAM: ${AVG_VRAM}%"

  sleep "$MONITOR_INTERVAL"
done

MONITOR_END=$(date +%s)
TOTAL_MONITOR_TIME=$(( MONITOR_END - MONITOR_START ))

log ""
log "推理任务已结束,停止GPU监控"
log "总监控时长: ${TOTAL_MONITOR_TIME}秒, 总采样数: $SAMPLE_COUNT"

# ============ Collect the benchmark's exit status ============
wait "$BENCH_PID"
BENCH_EXIT_CODE=$?

# ============ Save the ais_bench output directory ============
AISBENCH_DIR=$(extract_aisbench_output_path "$BENCH_LOG")
save_aisbench_output "$AISBENCH_DIR"

# ============ Summary report ============
log ""
log "生成统计报告..."

TOTAL_SAMPLES=$(wc -l < "$UTIL_FILE")

if [ "$TOTAL_SAMPLES" -eq 0 ]; then
  log "警告: 没有采集到任何数据"
else
  # Stable phase: skip the first 25% of the samples
  SKIP_SAMPLES=$(( TOTAL_SAMPLES / 4 ))
  STABLE_SAMPLES=$(( TOTAL_SAMPLES - SKIP_SAMPLES ))
  if [ "$STABLE_SAMPLES" -lt 1 ]; then
    STABLE_SAMPLES=1
  fi

  AVG_HCU_ALL=$(tail_avg "$UTIL_FILE" "$STABLE_SAMPLES")
  AVG_VRAM_ALL=$(tail_avg "$MEM_FILE" "$STABLE_SAMPLES")
  MAX_HCU=$(tail_max "$UTIL_FILE" "$STABLE_SAMPLES")
  MIN_HCU=$(tail_min "$UTIL_FILE" "$STABLE_SAMPLES")
  MAX_VRAM=$(tail_max "$MEM_FILE" "$STABLE_SAMPLES")
  MIN_VRAM=$(tail_min "$MEM_FILE" "$STABLE_SAMPLES")

  {
    echo "============================================================"
    echo "GPU监控汇总统计"
    echo "生成时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "============================================================"
    echo ""
    echo "监控概要:"
    echo " - 总监控时长: ${TOTAL_MONITOR_TIME}秒"
    echo " - 采样间隔: ${MONITOR_INTERVAL}秒"
    echo " - 总采样数: $TOTAL_SAMPLES"
    echo " - 稳定阶段采样数: $STABLE_SAMPLES (跳过前25%)"
    if [ -n "$TARGET_GPUS" ]; then
      echo " - 统计目标GPU: $TARGET_GPUS"
    else
      echo " - 统计目标: 所有GPU"
    fi
    echo ""
    echo "============================================================"
    echo "目标GPU整体统计 (稳定阶段)"
    echo "============================================================"
    echo " 平均GPU利用率: ${AVG_HCU_ALL}%"
    echo " 利用率范围: ${MIN_HCU}% ~ ${MAX_HCU}%"
    echo " 平均显存占用: ${AVG_VRAM_ALL}%"
    echo " 显存占用范围: ${MIN_VRAM}% ~ ${MAX_VRAM}%"
    echo ""
    echo "============================================================"
    echo "目标GPU逐卡统计 (稳定阶段)"
    echo "============================================================"
    printf "%-6s %-14s %-14s %-14s %-14s\n" "HCU" "平均HCU%" "最大HCU%" "平均VRAM%" "最大VRAM%"
    echo "------------------------------------------------------------------------"
  } > "$SUMMARY_LOG"

  # Per-card statistics over the stable phase
  get_stats_gpu_list | while read -r gpu_id; do
    STATS_FILE="$TMP_DIR/stats_hcu_${gpu_id}_util.log"
    MEMS_FILE="$TMP_DIR/stats_hcu_${gpu_id}_mem.log"

    if [ -f "$STATS_FILE" ] && [ -s "$STATS_FILE" ]; then
      CARD_TOTAL=$(wc -l < "$STATS_FILE")
      CARD_SKIP=$(( CARD_TOTAL / 4 ))
      CARD_STABLE=$(( CARD_TOTAL - CARD_SKIP ))
      if [ "$CARD_STABLE" -lt 1 ]; then
        CARD_STABLE=1
      fi

      AVG_UTIL=$(tail_avg "$STATS_FILE" "$CARD_STABLE")
      MAX_UTIL=$(tail_max "$STATS_FILE" "$CARD_STABLE")
      AVG_MEM=$(tail_avg "$MEMS_FILE" "$CARD_STABLE")
      MAX_MEM=$(tail_max "$MEMS_FILE" "$CARD_STABLE")
      UNIT="%"
    else
      # No samples for this card: print plain N/A rather than "N/A%"
      AVG_UTIL="N/A"; MAX_UTIL="N/A"; AVG_MEM="N/A"; MAX_MEM="N/A"
      UNIT=""
    fi

    printf "%-6s %-14s %-14s %-14s %-14s\n" \
      "$gpu_id" "${AVG_UTIL}${UNIT}" "${MAX_UTIL}${UNIT}" "${AVG_MEM}${UNIT}" "${MAX_MEM}${UNIT}" >> "$SUMMARY_LOG"
  done

  {
    echo ""
    echo "============================================================"
    echo "所有GPU最新状态"
    echo "============================================================"
  } >> "$SUMMARY_LOG"

  read_smi_rows | awk '{
    gsub(/%/, "")
    printf " HCU %s : Temp=%s, Pwr=%s, VRAM=%s%%, HCU=%s%%\n", $1, $2, $3, $6, $7
  }' >> "$SUMMARY_LOG"

  # Terminal output
  log ""
  log "============================================================"
  log " 监控统计结果"
  log "============================================================"
  log "监控时长: ${TOTAL_MONITOR_TIME}秒 ($SAMPLE_COUNT 个样本)"
  log ""
  log "目标GPU (稳定阶段):"
  log " 平均利用率: ${AVG_HCU_ALL}% (${MIN_HCU}% ~ ${MAX_HCU}%)"
  log " 平均显存: ${AVG_VRAM_ALL}% (${MIN_VRAM}% ~ ${MAX_VRAM}%)"
fi

log ""
log "=========================================="
log "推理任务结束 (退出码: $BENCH_EXIT_CODE)"
log "=========================================="
log ""
log "最终GPU状态:"
hy-smi | tee -a "$MONITOR_LOG"
log ""
log "=========================================="
log "输出文件:"
ls -lh "$OUTPUT_DIR/" | tee -a "$MONITOR_LOG"
log "=========================================="
log "所有文件已保存到: $OUTPUT_DIR"