monitor_gpu.sh 20.3 KB
Newer Older
sunzhq2's avatar
sunzhq2 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
#!/bin/bash

# GPU监控包装脚本
# 从推理开始持续监控到结束,记录所有GPU状态

# Default configuration; every value below can be overridden by a command-line option.
CHECK_INTERVAL=2                     # Seconds between utilisation checks while waiting for inference to start
UTIL_THRESHOLD=5                     # GPU utilisation (%) above which inference is considered started
MONITOR_INTERVAL=1                   # Seconds between monitoring samples
TARGET_GPUS=""                       # Comma-separated GPU indices used for statistics; empty means all cards
BENCH_LOG_NAME=""                    # File name of the captured inference output
OUTPUT_DIR=""                        # Output directory chosen by the user
AISBENCH_BASE_DIR="./"               # Base directory of ais_bench

# Parse the leading options. Parsing stops at the first argument that is not a
# known option; everything from there on is left in "$@" as the command to run.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --gpus|--threshold|--interval|--check-interval|--log-name|--output-dir|--bench-dir)
            # Every option takes a value. Without this check, `shift 2` with a
            # single remaining argument fails WITHOUT shifting and the loop
            # would spin forever on the same option.
            if [[ $# -lt 2 ]]; then
                echo "错误: 选项 $1 缺少参数值" >&2
                exit 1
            fi
            case "$1" in
                --gpus)           TARGET_GPUS="$2" ;;
                --threshold)      UTIL_THRESHOLD="$2" ;;
                --interval)       MONITOR_INTERVAL="$2" ;;
                --check-interval) CHECK_INTERVAL="$2" ;;
                --log-name)       BENCH_LOG_NAME="$2" ;;
                --output-dir)     OUTPUT_DIR="$2" ;;
                --bench-dir)      AISBENCH_BASE_DIR="$2" ;;
            esac
            shift 2
            ;;
        *)
            break
            ;;
    esac
done

# ============ Set up the output directory (after all options are parsed) ============
# Without --output-dir, fall back to a timestamped directory under the current directory.
if [ -z "$OUTPUT_DIR" ]; then
    TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
    OUTPUT_DIR="./gpu_monitor_${TIMESTAMP}"
fi

# Create the output directory, including any missing parents.
mkdir -p "$OUTPUT_DIR"
if [ ! -d "$OUTPUT_DIR" ]; then
    echo "错误: 无法创建输出目录 $OUTPUT_DIR"
    exit 1
fi

# Switch to an absolute path. If the directory cannot be entered, stop here:
# an empty OUTPUT_DIR would make every path below resolve under "/".
ABS_OUTPUT_DIR=$(cd "$OUTPUT_DIR" && pwd)
if [ -z "$ABS_OUTPUT_DIR" ]; then
    echo "错误: 无法访问输出目录 $OUTPUT_DIR"
    exit 1
fi
OUTPUT_DIR="$ABS_OUTPUT_DIR"

MONITOR_LOG="$OUTPUT_DIR/monitor.log"
BENCH_LOG_NAME_FILE="$OUTPUT_DIR/.bench_log_name"  # Log file name, stored for external scripts
DETAIL_LOG="$OUTPUT_DIR/gpu_detail.log"
SUMMARY_LOG="$OUTPUT_DIR/gpu_summary.log"

# Resolve the inference log name. The default is applied to BENCH_LOG_NAME
# itself so that .bench_log_name holds the name actually used, not an empty line.
if [ -z "$BENCH_LOG_NAME" ]; then
    BENCH_LOG_NAME="benchmark_output.log"
fi
BENCH_LOG="$OUTPUT_DIR/${BENCH_LOG_NAME}"

# Persist key values for external scripts.
echo "$OUTPUT_DIR" > "$OUTPUT_DIR/.output_dir"
echo "$BENCH_LOG" > "$OUTPUT_DIR/.bench_log_path"
echo "$BENCH_LOG_NAME" > "$BENCH_LOG_NAME_FILE"
if [ -n "$TARGET_GPUS" ]; then
    echo "$TARGET_GPUS" > "$OUTPUT_DIR/.target_gpus"
fi

# Startup banner: show where output goes and which GPUs feed the statistics.
if [ -n "$TARGET_GPUS" ]; then
    BANNER_GPU_LINE="统计GPU: $TARGET_GPUS"
else
    BANNER_GPU_LINE="统计GPU: 所有可用GPU"
fi

printf '%s\n' \
    "==========================================" \
    "GPU监控包装脚本启动" \
    "输出目录: $OUTPUT_DIR" \
    "推理日志: $BENCH_LOG" \
    "$BANNER_GPU_LINE" \
    "记录GPU: 所有GPU(含未统计的)" \
    "监控模式: 持续监控直到推理任务结束" \
    "==========================================" \
    ""

# ============ Select the GPU rows used for statistics ============
# Arguments: $1 - text with one GPU per line, GPU index in the first column
# Globals:   TARGET_GPUS (read) - comma-separated indices; empty selects every row
# Outputs:   the rows whose first column is one of the selected indices
filter_for_stats() {
    local smi_rows="$1"

    # No selection configured: hand the input back unchanged.
    if [[ -z "$TARGET_GPUS" ]]; then
        echo "$smi_rows"
        return
    fi

    awk -v wanted="$TARGET_GPUS" '
        BEGIN {
            n = split(wanted, ids, ",")
            for (k = 1; k <= n; k++) keep[ids[k]] = 1
        }
        ($1 in keep)
    ' <<< "$smi_rows"
}

# ============ List the GPU indices used for statistics ============
# Globals: TARGET_GPUS (read)
# Outputs: one index per line - the entries of TARGET_GPUS when it is set,
#          otherwise the first column of every hy-smi row that starts with a digit
get_stats_gpu_list() {
    if [[ -n "$TARGET_GPUS" ]]; then
        tr ',' '\n' <<< "$TARGET_GPUS"
        return
    fi

    # awk splits on any run of blanks, so the first column needs no prior squeezing.
    hy-smi 2>/dev/null \
        | grep -E '^[[:space:]]*[0-9]+' \
        | awk '{print $1}'
}

# ============ List every GPU index ============
# Outputs: one index per line, taken from the first column of each hy-smi row
#          that starts (after optional whitespace) with a digit
get_all_gpu_list() {
    local smi_dump
    smi_dump=$(hy-smi 2>/dev/null)
    grep -E '^[[:space:]]*[0-9]+' <<< "$smi_dump" | awk '{print $1}'
}

# ============ Average utilisation of the statistics GPUs ============
# Takes one hy-smi snapshot, keeps the rows chosen by filter_for_stats and
# averages column 7 (with any '%' removed).
# Outputs: the average with one decimal place, or "0" when no row was read
check_gpu_utilization() {
    local smi_rows selected_rows
    smi_rows=$(hy-smi 2>/dev/null | grep -E '^[[:space:]]*[0-9]+' | sed 's/  */ /g')
    selected_rows=$(filter_for_stats "$smi_rows")

    awk '
        {
            pct = $7
            gsub(/%/, "", pct)
            total += pct
            rows++
        }
        END {
            if (rows > 0) printf "%.1f\n", total / rows
            else print "0"
        }
    ' <<< "$selected_rows"
}

# ============ Extract the ais_bench output path from a log ============
# Arguments: $1 - path of the inference log file
# Outputs:   the first path of the form outputs/default/YYYYMMDD_HHMMSS found in
#            the log, or an empty line when the file is missing or has no match
#
# An absolute path is preferred when the log contains one as a standalone token
# (at line start or after whitespace); otherwise the relative form is returned.
# The absolute pattern must be tried first: any text it matches also matches the
# relative pattern, so trying it second could never succeed.
extract_aisbench_output_path() {
    local log_file="$1"
    local output_path=""

    if [ -f "$log_file" ]; then
        # Absolute path, e.g. /data/run/outputs/default/20240102_030405.
        # The lookbehind keeps "./outputs/..." from being read as "/outputs/...".
        output_path=$(grep -oP '(?<!\S)/\S*outputs/default/\d{8}_\d{6}' "$log_file" | head -1)

        # Otherwise fall back to the relative outputs/default/YYYYMMDD_HHMMSS form.
        if [ -z "$output_path" ]; then
            output_path=$(grep -oP 'outputs/default/\d{8}_\d{6}' "$log_file" | head -1)
        fi

        # Strip double quotes, backslashes and commas picked up around the path.
        output_path=$(echo "$output_path" | sed 's/["\",]//g')
    fi

    echo "$output_path"
}

# ============ Require a command to run ============
# When nothing is left after option parsing, print the usage text and exit with status 1.
if (( $# == 0 )); then
    cat <<EOF
用法: $0 [选项] <要执行的命令>

选项:
  --gpus <序号>           统计目标GPU序号,逗号分隔 (默认: 所有GPU)
  --threshold <百分比>     触发监控的GPU利用率阈值 (默认: 5)
  --interval <秒>         采样间隔 (默认: 1)
  --check-interval <秒>    检测间隔 (默认: 2)
  --log-name <文件名>      推理输出日志文件名 (默认: benchmark_output.log)
  --output-dir <目录>      输出目录路径 (默认: ./gpu_monitor_时间戳)
  --bench-dir <目录>       ais_bench基础目录 (默认: .)

说明:
  - 详细日志记录所有GPU状态
  - 统计计算仅针对 --gpus 指定的GPU
  - 从GPU活动开始持续监控到推理任务结束
  - 自动捕获并保存 ais_bench 输出目录到监控目录

示例:
  # 基础用法
  $0 ais_bench --models vllm_api_stream_chat --datasets aime2025_gen --mode perf --debug

  # 指定GPU、日志名和输出目录
  $0 --gpus 4,5,6,7 --log-name perf_test.log --output-dir /workspace/results/gpu_test \\
     ais_bench --models vllm_api_stream_chat --datasets aime2025_gen --mode perf --debug
EOF
    exit 1
fi

# Keep the command in two forms: a flat string for logging and an argument
# vector for execution. Running the flat string unquoted would re-split and
# re-glob it, so an argument such as "a b" would arrive as two words.
BENCHMARK_CMD="$*"
BENCHMARK_ARGV=("$@")

# Compatibility: a whole command line passed as ONE quoted argument used to work
# through word splitting. Keep that working when the single argument contains
# whitespace and is not itself the name of an executable.
if [ "$#" -eq 1 ] && [[ "$1" == *[[:space:]]* ]] && ! command -v -- "$1" >/dev/null 2>&1; then
    read -r -a BENCHMARK_ARGV <<< "$1"
fi

echo "执行命令: $BENCHMARK_CMD" | tee "$MONITOR_LOG"
if [ -n "$TARGET_GPUS" ]; then
    echo "统计GPU: $TARGET_GPUS" | tee -a "$MONITOR_LOG"
else
    echo "统计GPU: 所有可用GPU" | tee -a "$MONITOR_LOG"
fi
echo "" | tee -a "$MONITOR_LOG"

# Record the GPU state before the workload starts.
echo "初始GPU状态:" | tee -a "$MONITOR_LOG"
hy-smi | tee -a "$MONITOR_LOG"
echo "" | tee -a "$MONITOR_LOG"

# Start the inference task in the background; stdout and stderr both go to BENCH_LOG.
echo "启动推理任务..." | tee -a "$MONITOR_LOG"
"${BENCHMARK_ARGV[@]}" > "$BENCH_LOG" 2>&1 &
BENCH_PID=$!
echo "推理任务PID: $BENCH_PID" | tee -a "$MONITOR_LOG"

# ============ Wait for GPU activity ============
# Poll every CHECK_INTERVAL seconds, for at most MAX_WAIT rounds, until the
# average utilisation of the statistics GPUs exceeds UTIL_THRESHOLD.
echo "等待GPU开始工作..." | tee -a "$MONITOR_LOG"
WAIT_COUNT=0
MAX_WAIT=300

while [ "$WAIT_COUNT" -lt "$MAX_WAIT" ]; do
    # The task ended before any GPU activity was seen: salvage what we can and stop.
    if ! kill -0 "$BENCH_PID" 2>/dev/null; then
        echo "推理任务已结束,但未检测到GPU活动" | tee -a "$MONITOR_LOG"

        # Try to keep the ais_bench output directory.
        AISBENCH_DIR=$(extract_aisbench_output_path "$BENCH_LOG")
        if [ -n "$AISBENCH_DIR" ]; then
            if declare -F save_aisbench_output >/dev/null; then
                save_aisbench_output "$AISBENCH_DIR"
            else
                # save_aisbench_output is defined further down the script, so it
                # does not exist yet at this point; calling it here would only
                # produce "command not found". Copy directly from the same
                # candidate locations instead.
                EARLY_NAME=$(basename "$AISBENCH_DIR")
                EARLY_DEST="$OUTPUT_DIR/aisbench_output_${EARLY_NAME}"
                for EARLY_SRC in "$AISBENCH_DIR" "$AISBENCH_BASE_DIR/$AISBENCH_DIR" "$(pwd)/$AISBENCH_DIR"; do
                    if [ -d "$EARLY_SRC" ] && cp -r "$EARLY_SRC" "$EARLY_DEST" 2>/dev/null; then
                        echo "✓ 已复制到: $EARLY_DEST" | tee -a "$MONITOR_LOG"
                        echo "$EARLY_DEST" > "$OUTPUT_DIR/.aisbench_output_dir"
                        echo "$EARLY_NAME" > "$OUTPUT_DIR/.aisbench_dir_name"
                        break
                    fi
                done
            fi
        fi

        echo "" | tee -a "$MONITOR_LOG"
        echo "推理输出(最后50行):" | tee -a "$MONITOR_LOG"
        tail -50 "$BENCH_LOG" | tee -a "$MONITOR_LOG"
        exit 1
    fi

    CURRENT_UTIL=$(check_gpu_utilization)
    echo "$(date +"%H:%M:%S")  目标GPU利用率: ${CURRENT_UTIL}%  等待中... ($WAIT_COUNT/$MAX_WAIT)" | tee -a "$MONITOR_LOG"

    # Compare the integer part only; a non-numeric value just fails the test quietly.
    UTIL_INT=$(echo "$CURRENT_UTIL" | cut -d. -f1)
    if [ -n "$UTIL_INT" ] && [ "$UTIL_INT" -gt "$UTIL_THRESHOLD" ] 2>/dev/null; then
        echo "检测到GPU活动! 利用率: ${CURRENT_UTIL}%" | tee -a "$MONITOR_LOG"
        break
    fi

    WAIT_COUNT=$(( WAIT_COUNT + 1 ))
    sleep "$CHECK_INTERVAL"
done

# All rounds used up without activity: stop the task and give up.
if [ "$WAIT_COUNT" -ge "$MAX_WAIT" ]; then
    echo "超时: 等待 ${MAX_WAIT} 个周期后仍未检测到GPU活动" | tee -a "$MONITOR_LOG"
    kill "$BENCH_PID" 2>/dev/null
    exit 1
fi

# Give the utilisation a moment to settle before sampling starts.
echo "等待GPU利用率稳定(5秒)..." | tee -a "$MONITOR_LOG"
sleep 5

# ============ Continuous monitoring ============
echo "" | tee -a "$MONITOR_LOG"
echo "==========================================" | tee -a "$MONITOR_LOG"
echo "开始持续GPU监控(直到推理任务结束)" | tee -a "$MONITOR_LOG"
echo "==========================================" | tee -a "$MONITOR_LOG"

# Scratch directory for the per-GPU sample histories. Without it the history
# files would be written relative to "/", so a failure here is fatal.
TMP_DIR=$(mktemp -d)
if [ -z "$TMP_DIR" ] || [ ! -d "$TMP_DIR" ]; then
    echo "错误: 无法创建临时目录" | tee -a "$MONITOR_LOG"
    kill "$BENCH_PID" 2>/dev/null
    exit 1
fi
# Single quotes: the path is expanded, quoted, when the trap fires, so a
# TMPDIR containing spaces is still removed correctly.
trap 'rm -rf -- "$TMP_DIR"' EXIT

# Create empty history files for every GPU and for the statistics GPUs.
get_all_gpu_list | while read -r gpu_id; do
    : > "$TMP_DIR/all_hcu_${gpu_id}_util.log"
    : > "$TMP_DIR/all_hcu_${gpu_id}_mem.log"
done

get_stats_gpu_list | while read -r gpu_id; do
    : > "$TMP_DIR/stats_hcu_${gpu_id}_util.log"
    : > "$TMP_DIR/stats_hcu_${gpu_id}_mem.log"
done

# Per-sample averages over the statistics GPUs.
UTIL_FILE="$TMP_DIR/util_avg.log"
MEM_FILE="$TMP_DIR/mem_avg.log"
: > "$UTIL_FILE"
: > "$MEM_FILE"

# Write the header of the detail log.
{
    echo "============================================================"
    echo "GPU监控详细记录"
    echo "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "采样间隔: ${MONITOR_INTERVAL}秒"
    if [ -n "$TARGET_GPUS" ]; then
        echo "统计目标GPU: $TARGET_GPUS"
    else
        echo "统计目标: 所有GPU"
    fi
    echo "记录范围: 所有GPU"
    echo "============================================================"
    echo ""
} > "$DETAIL_LOG"

SAMPLE_COUNT=0
MONITOR_START=$(date +%s)

# Sample hy-smi every MONITOR_INTERVAL seconds for as long as the inference
# process is alive. Each sample appends one table to DETAIL_LOG, one value per
# GPU to the history files under TMP_DIR, and one averaged line to MONITOR_LOG.
while kill -0 "$BENCH_PID" 2>/dev/null; do
    # Snapshot of all GPU rows (lines starting with a digit), blanks squeezed.
    ALL_SMI=$(hy-smi 2>/dev/null | grep -E '^[[:space:]]*[0-9]+' | sed 's/  */ /g')

    # hy-smi gave nothing usable: retry later without counting this as a sample.
    if [ -z "$ALL_SMI" ]; then
        sleep "$MONITOR_INTERVAL"
        continue
    fi

    SAMPLE_COUNT=$(( SAMPLE_COUNT + 1 ))
    CURRENT_TIME=$(date +"%H:%M:%S")

    # ====== Record every GPU in the detail log ======
    {
        echo "[样本 #$SAMPLE_COUNT] 时间: $CURRENT_TIME"
        echo "--------------------------------------------------------------------------------"
        printf "%-6s %-10s %-10s %-10s %-10s %-10s %-10s\n" \
            "HCU" "Temp" "AvgPwr" "VRAM%" "HCU%" "Dec%" "Enc%"
        echo "--------------------------------------------------------------------------------"
    } >> "$DETAIL_LOG"

    # Columns read from each row: 1=index 2=Temp 3=AvgPwr 6=VRAM% 7=HCU% 8=Dec 9=Enc.
    # One `read` splits the row, replacing a separate awk process per field.
    while read -r HCU_ID TEMP PWR _ _ VRAM HCU_UTIL DEC ENC _; do
        [ -n "$HCU_ID" ] || continue
        VRAM=${VRAM//%/}
        HCU_UTIL=${HCU_UTIL//%/}

        printf "%-6s %-10s %-10s %-10s %-10s %-10s %-10s\n" \
            "$HCU_ID" "$TEMP" "$PWR" "${VRAM}%" "${HCU_UTIL}%" "$DEC" "$ENC" >> "$DETAIL_LOG"

        # History for every GPU.
        echo "$HCU_UTIL" >> "$TMP_DIR/all_hcu_${HCU_ID}_util.log"
        echo "$VRAM" >> "$TMP_DIR/all_hcu_${HCU_ID}_mem.log"
    done <<< "$ALL_SMI"

    # ====== Statistics over the target GPUs ======
    STATS_SMI=$(filter_for_stats "$ALL_SMI")

    # History for the target GPUs. Blank rows are skipped, so an empty filter
    # result no longer creates a stray "stats_hcu__*.log" file.
    while read -r HCU_ID _ _ _ _ VRAM HCU_UTIL _; do
        [ -n "$HCU_ID" ] || continue
        VRAM=${VRAM//%/}
        HCU_UTIL=${HCU_UTIL//%/}

        echo "$HCU_UTIL" >> "$TMP_DIR/stats_hcu_${HCU_ID}_util.log"
        echo "$VRAM" >> "$TMP_DIR/stats_hcu_${HCU_ID}_mem.log"
    done <<< "$STATS_SMI"

    # Averages across the target GPUs for this sample.
    AVG_VRAM=$(echo "$STATS_SMI" | awk '{gsub(/%/,"",$6); vram+=$6; count++} END {if(count>0) printf "%.1f", vram/count; else print "0"}')
    AVG_HCU=$(echo "$STATS_SMI" | awk '{gsub(/%/,"",$7); hcu+=$7; count++} END {if(count>0) printf "%.1f", hcu/count; else print "0"}')

    {
        echo "--------------------------------------------------------------------------------"
        printf "%-6s %-10s %-10s %-10s %-10s %-10s %-10s\n" \
            "目标均" "-" "-" "${AVG_VRAM}%" "${AVG_HCU}%" "-" "-"
        echo ""
    } >> "$DETAIL_LOG"

    echo "$AVG_HCU" >> "$UTIL_FILE"
    echo "$AVG_VRAM" >> "$MEM_FILE"

    # Seconds elapsed since monitoring started.
    ELAPSED=$(( $(date +%s) - MONITOR_START ))
    echo "$CURRENT_TIME  样本#$SAMPLE_COUNT [${ELAPSED}s] | 目标HCU: ${AVG_HCU}% | 目标VRAM: ${AVG_VRAM}%" | tee -a "$MONITOR_LOG"

    sleep "$MONITOR_INTERVAL"
done

# Monitoring is over: work out how long it ran and report it.
MONITOR_END=$(date +%s)
TOTAL_MONITOR_TIME=$(( MONITOR_END - MONITOR_START ))

printf '%s\n' \
    "" \
    "推理任务已结束,停止GPU监控" \
    "总监控时长: ${TOTAL_MONITOR_TIME}秒, 总采样数: $SAMPLE_COUNT" \
    | tee -a "$MONITOR_LOG"

# ============ Reap the inference task and keep its exit status ============
wait $BENCH_PID
BENCH_EXIT_CODE=$?

# ============ Copy the ais_bench output directory into OUTPUT_DIR ============
# Arguments: $1 - output path taken from the inference log; empty means do nothing
# Globals:   OUTPUT_DIR, MONITOR_LOG, AISBENCH_BASE_DIR, MONITOR_START (all read)
# Effects:   copies the directory to $OUTPUT_DIR/aisbench_output_<name> and, on
#            success only, writes .aisbench_output_dir and .aisbench_dir_name
#            for external scripts
# Returns:   0 in all cases; failures are reported as warnings
save_aisbench_output() {
    local aisbench_dir="$1"
    local dir_name dest_dir src_dir
    local latest_dir latest_name latest_mtime

    if [ -z "$aisbench_dir" ]; then
        return 0
    fi

    echo "" | tee -a "$MONITOR_LOG"
    echo "保存 ais_bench 输出目录..." | tee -a "$MONITOR_LOG"
    echo "检测到路径: $aisbench_dir" | tee -a "$MONITOR_LOG"

    dir_name=$(basename "$aisbench_dir")
    dest_dir="$OUTPUT_DIR/aisbench_output_${dir_name}"

    # Try the path as given, then relative to the ais_bench base directory,
    # then relative to the current directory.
    for src_dir in "$aisbench_dir" "$AISBENCH_BASE_DIR/$aisbench_dir" "$(pwd)/$aisbench_dir"; do
        [ -d "$src_dir" ] || continue
        echo "找到目录: $src_dir" | tee -a "$MONITOR_LOG"

        if cp -r "$src_dir" "$dest_dir" 2>/dev/null; then
            echo "✓ 已复制到: $dest_dir" | tee -a "$MONITOR_LOG"
            # Store the location for external scripts.
            echo "$dest_dir" > "$OUTPUT_DIR/.aisbench_output_dir"
            echo "$dir_name" > "$OUTPUT_DIR/.aisbench_dir_name"
            return 0
        fi
    done

    # Fallback: the newest outputs/default/20* directory, accepted only when it
    # was modified after monitoring started.
    latest_dir=$(ls -dt "$AISBENCH_BASE_DIR/outputs/default/20"* 2>/dev/null | head -1)
    if [ -n "$latest_dir" ] && [ -n "${MONITOR_START:-}" ]; then
        # GNU stat first, BSD stat second.
        latest_mtime=$(stat -c %Y "$latest_dir" 2>/dev/null || stat -f %m "$latest_dir" 2>/dev/null)
        if [ "${latest_mtime:-0}" -gt "$MONITOR_START" ] 2>/dev/null; then
            latest_name=$(basename "$latest_dir")
            dest_dir="$OUTPUT_DIR/aisbench_output_${latest_name}"
            # Claim success and write the marker files only if the copy worked.
            if cp -r "$latest_dir" "$dest_dir" 2>/dev/null; then
                echo "✓ 找到最新输出目录并复制: $dest_dir" | tee -a "$MONITOR_LOG"
                echo "$dest_dir" > "$OUTPUT_DIR/.aisbench_output_dir"
                echo "$latest_name" > "$OUTPUT_DIR/.aisbench_dir_name"
                return 0
            fi
            echo "警告: 复制 ais_bench 输出目录失败: $latest_dir" | tee -a "$MONITOR_LOG"
            return 0
        fi
    fi

    echo "警告: 无法找到 ais_bench 输出目录" | tee -a "$MONITOR_LOG"
}

# Locate the ais_bench output directory named in the inference log and keep a copy.
AISBENCH_DIR=$(extract_aisbench_output_path "$BENCH_LOG")
save_aisbench_output "$AISBENCH_DIR"

# ============ Build the statistics summary ============
printf '%s\n' "" "生成统计报告..." | tee -a "$MONITOR_LOG"

# Small helpers over the last N lines of a history file.
# Usage: _stable_mean FILE N / _stable_max FILE N / _stable_min FILE N
_stable_mean() { tail -n $2 "$1" | awk '{sum+=$1; count++} END {printf "%.1f", sum/count}'; }
_stable_max()  { tail -n $2 "$1" | sort -n | tail -1; }
_stable_min()  { tail -n $2 "$1" | sort -n | head -1; }

TOTAL_SAMPLES=$(wc -l < "$UTIL_FILE")

if [ "$TOTAL_SAMPLES" -eq 0 ]; then
    echo "警告: 没有采集到任何数据" | tee -a "$MONITOR_LOG"
else
    # Stable phase: ignore the first quarter of the samples, keep at least one.
    SKIP_SAMPLES=$(( TOTAL_SAMPLES / 4 ))
    STABLE_SAMPLES=$(( TOTAL_SAMPLES - SKIP_SAMPLES ))
    if [ $STABLE_SAMPLES -lt 1 ]; then
        STABLE_SAMPLES=1
    fi

    AVG_HCU_ALL=$(_stable_mean "$UTIL_FILE" $STABLE_SAMPLES)
    AVG_VRAM_ALL=$(_stable_mean "$MEM_FILE" $STABLE_SAMPLES)
    MAX_HCU=$(_stable_max "$UTIL_FILE" $STABLE_SAMPLES)
    MIN_HCU=$(_stable_min "$UTIL_FILE" $STABLE_SAMPLES)
    MAX_VRAM=$(_stable_max "$MEM_FILE" $STABLE_SAMPLES)
    MIN_VRAM=$(_stable_min "$MEM_FILE" $STABLE_SAMPLES)

    if [ -n "$TARGET_GPUS" ]; then
        SUMMARY_TARGET_LINE="  - 统计目标GPU: $TARGET_GPUS"
    else
        SUMMARY_TARGET_LINE="  - 统计目标: 所有GPU"
    fi

    # Header and overall figures (this truncates any previous summary file).
    {
        printf '%s\n' \
            "============================================================" \
            "GPU监控汇总统计" \
            "生成时间: $(date '+%Y-%m-%d %H:%M:%S')" \
            "============================================================" \
            "" \
            "监控概要:" \
            "  - 总监控时长: ${TOTAL_MONITOR_TIME}秒" \
            "  - 采样间隔: ${MONITOR_INTERVAL}秒" \
            "  - 总采样数: $TOTAL_SAMPLES" \
            "  - 稳定阶段采样数: $STABLE_SAMPLES (跳过前25%)" \
            "$SUMMARY_TARGET_LINE" \
            "" \
            "============================================================" \
            "目标GPU整体统计 (稳定阶段)" \
            "============================================================" \
            "  平均GPU利用率: ${AVG_HCU_ALL}%" \
            "  利用率范围:    ${MIN_HCU}% ~ ${MAX_HCU}%" \
            "  平均显存占用: ${AVG_VRAM_ALL}%" \
            "  显存占用范围:  ${MIN_VRAM}% ~ ${MAX_VRAM}%" \
            "" \
            "============================================================" \
            "目标GPU逐卡统计 (稳定阶段)" \
            "============================================================"
        printf "%-6s %-14s %-14s %-14s %-14s\n" "HCU" "平均HCU%" "最大HCU%" "平均VRAM%" "最大VRAM%"
        printf '%s\n' "------------------------------------------------------------------------"
    } > "$SUMMARY_LOG"

    # One table row per statistics GPU, using the same skip-first-quarter rule per card.
    get_stats_gpu_list | while read card; do
        card_util_log="$TMP_DIR/stats_hcu_${card}_util.log"
        card_mem_log="$TMP_DIR/stats_hcu_${card}_mem.log"

        if [[ -f "$card_util_log" && -s "$card_util_log" ]]; then
            card_total=$(wc -l < "$card_util_log")
            card_stable=$(( card_total - card_total / 4 ))
            if [ $card_stable -lt 1 ]; then
                card_stable=1
            fi

            card_avg_util=$(_stable_mean "$card_util_log" $card_stable)
            card_max_util=$(_stable_max "$card_util_log" $card_stable)
            card_avg_mem=$(_stable_mean "$card_mem_log" $card_stable)
            card_max_mem=$(_stable_max "$card_mem_log" $card_stable)
        else
            # No samples were recorded for this card.
            card_avg_util="N/A"
            card_max_util="N/A"
            card_avg_mem="N/A"
            card_max_mem="N/A"
        fi

        printf "%-6s %-14s %-14s %-14s %-14s\n" \
            "$card" "${card_avg_util}%" "${card_max_util}%" "${card_avg_mem}%" "${card_max_mem}%" >> "$SUMMARY_LOG"
    done

    printf '%s\n' \
        "" \
        "============================================================" \
        "所有GPU最新状态" \
        "============================================================" \
        >> "$SUMMARY_LOG"

    # Current state of every GPU, with '%' signs stripped before formatting.
    hy-smi 2>/dev/null \
        | grep -E '^[[:space:]]*[0-9]+' \
        | sed 's/  */ /g' \
        | awk '{
            gsub(/%/,"")
            printf "  HCU %s : Temp=%s, Pwr=%s, VRAM=%s%%, HCU=%s%%\n", $1, $2, $3, $6, $7
        }' >> "$SUMMARY_LOG"

    # The same headline figures, on the terminal and in the monitor log.
    printf '%s\n' \
        "" \
        "============================================================" \
        "                    监控统计结果" \
        "============================================================" \
        "监控时长: ${TOTAL_MONITOR_TIME}秒 ($SAMPLE_COUNT 个样本)" \
        "" \
        "目标GPU (稳定阶段):" \
        "  平均利用率: ${AVG_HCU_ALL}%  (${MIN_HCU}% ~ ${MAX_HCU}%)" \
        "  平均显存:   ${AVG_VRAM_ALL}%  (${MIN_VRAM}% ~ ${MAX_VRAM}%)" \
        | tee -a "$MONITOR_LOG"
fi

# Closing report: exit status of the task, final GPU state and the files produced.
# Each external command keeps its own pipeline so that everything printed before
# it is already in MONITOR_LOG when it runs (the `ls` listing includes that file).
printf '%s\n' \
    "" \
    "==========================================" \
    "推理任务结束 (退出码: $BENCH_EXIT_CODE)" \
    "==========================================" \
    "" \
    "最终GPU状态:" \
    | tee -a "$MONITOR_LOG"
hy-smi | tee -a "$MONITOR_LOG"

printf '%s\n' \
    "" \
    "==========================================" \
    "输出文件:" \
    | tee -a "$MONITOR_LOG"
ls -lh "$OUTPUT_DIR/" | tee -a "$MONITOR_LOG"

printf '%s\n' \
    "==========================================" \
    "所有文件已保存到: $OUTPUT_DIR" \
    | tee -a "$MONITOR_LOG"