run_perf.sh 37.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Unified observability capture — starts services, collects ALL observability
# data (nsys, perf, BPF, system stats, Prometheus), runs aiperf load, then
# shuts down and exports results.
#
# Usage:
#   ./run_perf.sh --model Qwen/Qwen3-0.6B --concurrency 64 --num-requests 4096
#   ./run_perf.sh --skip-bpf --skip-nsys   # opt-out of heavy tools
#   sudo ./run_perf.sh --skip-nsys --skip-perf \       # run bpf tracing as root
#        --model Qwen/Qwen3-0.6B --concurrency 64 --num-requests 4096
#
# Output: artifacts/obs_YYYYMMDD_HHMMSS/ with subdirs for each data source.
#
# Prerequisites:
#   - dynamo.mocker and dynamo.frontend installed
#   - aiperf installed
#   - Optional: nsys, perf, bpftrace, flamegraph tools (auto-detected)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${SCRIPT_DIR}/../../.."

# Raise file descriptor limit — high concurrency runs open many sockets
# (aiperf connections + frontend + mocker workers + /proc polling + captures).
# The default 1024 is insufficient at concurrency >= 100.
ulimit -n 65536 2>/dev/null || ulimit -n 8192 2>/dev/null || true

# When running under sudo, preserve the invoking user's environment:
# - HF cache (so models downloaded as the regular user are visible)
# - PATH (so aiperf, python, etc. are found)
if [[ -n "${SUDO_USER:-}" ]]; then
    REAL_HOME=$(getent passwd "$SUDO_USER" | cut -d: -f6)
    export HF_HOME="${HF_HOME:-${REAL_HOME}/.cache/huggingface}"
    # Add common user binary paths that sudo strips
    for p in "${REAL_HOME}/.local/bin" "${REPO_ROOT}/dynamo/bin"; do
        [[ -d "$p" ]] && [[ ":$PATH:" != *":$p:"* ]] && export PATH="$p:$PATH"
    done
fi

# ─── Defaults ───────────────────────────────────────────────────────────────
MODEL="${MODEL:-nvidia/Llama-3.1-8B-Instruct-FP8}"
MODEL_NAME=""
NUM_WORKERS="${NUM_WORKERS:-2}"
SPEEDUP_RATIO="${SPEEDUP_RATIO:-1.0}"
CONCURRENCY="${CONCURRENCY:-64}"
NUM_REQUESTS="${NUM_REQUESTS:-}"
ISL="${ISL:-1024}"
OSL="${OSL:-256}"
REQUEST_PLANE="${REQUEST_PLANE:-tcp}"
EVENT_PLANE="${EVENT_PLANE:-nats}"
CAPTURE_DURATION="${CAPTURE_DURATION:-60}"
OUTPUT_DIR=""
DATA_PARALLEL_SIZE="${DATA_PARALLEL_SIZE:-1}"
FRONTEND_PORT="${FRONTEND_PORT:-8000}"
PLANNER_PROFILE="${PLANNER_PROFILE:-}"
TOKENIZER_BACKEND="${TOKENIZER_BACKEND:-}"
NUM_MODELS="${NUM_MODELS:-1}"                 # Number of model instances (each gets NUM_WORKERS workers)
AIPERF_TARGETS="${AIPERF_TARGETS:-first}"     # "first" = model-1 only, "all" = one aiperf run per model
BENCHMARK_DURATION="${BENCHMARK_DURATION:-}"  # aiperf --benchmark-duration (seconds)
REQUEST_RATE="${REQUEST_RATE:-}"              # aiperf --request-rate (requests/sec)
WARMUP_DURATION="${WARMUP_DURATION:-}"        # aiperf --warmup-duration (seconds)
WARMUP_COUNT="${WARMUP_COUNT:-}"              # aiperf --warmup-request-count

# Opt-out flags
SKIP_BPF=false
SKIP_NSYS=false
SKIP_FLAMEGRAPH=false
SKIP_PERF=false

# Optional captures
ENABLE_TCPDUMP=false
TCPDUMP_PORT=""

# Custom tool paths
NSYS_PATH=""

# ─── Argument parsing ───────────────────────────────────────────────────────
while [[ $# -gt 0 ]]; do
    case $1 in
        --model)                MODEL="$2"; shift 2 ;;
        --model-name)           MODEL_NAME="$2"; shift 2 ;;
        --workers)              NUM_WORKERS="$2"; shift 2 ;;
        --speedup-ratio)        SPEEDUP_RATIO="$2"; shift 2 ;;
        --concurrency)          CONCURRENCY="$2"; shift 2 ;;
        --num-requests)         NUM_REQUESTS="$2"; shift 2 ;;
        --isl)                  ISL="$2"; shift 2 ;;
        --osl)                  OSL="$2"; shift 2 ;;
        --request-plane)        REQUEST_PLANE="$2"; shift 2 ;;
        --event-plane)          EVENT_PLANE="$2"; shift 2 ;;
        --capture-duration)     CAPTURE_DURATION="$2"; shift 2 ;;
        --output-dir)           OUTPUT_DIR="$2"; shift 2 ;;
        --data-parallel-size)   DATA_PARALLEL_SIZE="$2"; shift 2 ;;
        --frontend-port)        FRONTEND_PORT="$2"; shift 2 ;;
        --planner-profile)      PLANNER_PROFILE="$2"; shift 2 ;;
        --tokenizer-backend)    TOKENIZER_BACKEND="$2"; shift 2 ;;
        --fast-tokens)          TOKENIZER_BACKEND="fast"; shift ;;
        --num-models)           NUM_MODELS="$2"; shift 2 ;;
        --aiperf-targets)       AIPERF_TARGETS="$2"; shift 2 ;;
        --benchmark-duration)   BENCHMARK_DURATION="$2"; shift 2 ;;
        --request-rate)         REQUEST_RATE="$2"; shift 2 ;;
        --warmup-duration)      WARMUP_DURATION="$2"; shift 2 ;;
        --warmup-count)         WARMUP_COUNT="$2"; shift 2 ;;
        --skip-bpf)             SKIP_BPF=true; shift ;;
        --skip-nsys)            SKIP_NSYS=true; shift ;;
        --skip-flamegraph)      SKIP_FLAMEGRAPH=true; shift ;;
        --skip-perf)            SKIP_PERF=true; shift ;;
        --tcpdump)              ENABLE_TCPDUMP=true; shift ;;
        --tcpdump-port)         TCPDUMP_PORT="$2"; shift 2 ;;
        --nsys-path)            NSYS_PATH="$2"; shift 2 ;;
        -h|--help)
            cat <<'USAGE'
Usage: run_perf.sh [OPTIONS]

Starts mocker + frontend, runs parallel observability capture, then aiperf load.

Service Options:
  --model PATH              Model path (default: nvidia/Llama-3.1-8B-Instruct-FP8)
  --model-name NAME         Served model name (default: same as --model)
  --workers N               Number of mocker workers (default: 2)
124
  --speedup-ratio RATIO     Mocker speedup ratio (default: 1.0; use large value for near-instant)
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
  --data-parallel-size N    Mocker DP workers (default: 1)
  --request-plane PLANE     nats|http|tcp (default: tcp)
  --event-plane PLANE       nats|zmq (default: nats)
  --frontend-port PORT      Frontend HTTP port (default: 8000)
  --planner-profile PATH    Planner profile data path
  --tokenizer-backend NAME  Tokenizer backend: "fast" or "hf" (default: unset, uses HF)
  --fast-tokens             Shorthand for --tokenizer-backend fast
  --num-models N            Number of model instances (default: 1). Each gets --workers workers
                            with names model-1, model-2, ...
  --aiperf-targets MODE     "first" (default): aiperf targets model-1 only.
                            "all": run aiperf sequentially for each model.
  --benchmark-duration N    aiperf run duration in seconds (default: use --num-requests)
  --request-rate N          Target requests per second (aiperf --request-rate)
  --warmup-duration N       aiperf warmup phase duration in seconds
  --warmup-count N          aiperf warmup request count (default: concurrency)

Load Options:
  --concurrency N           aiperf concurrency (default: 64)
  --num-requests N          Total requests (default: 640)
  --isl N                   Input sequence length (default: 1024)
  --osl N                   Output sequence length (default: 256)

Capture Options:
  --capture-duration N      Duration for parallel captures in seconds (default: 60)
  --output-dir DIR          Output directory (default: auto timestamped)
  --skip-bpf                Skip BPF tracing
  --skip-nsys               Skip Nsight Systems profiling
  --skip-flamegraph         Skip flamegraph generation
  --skip-perf               Skip perf record/stat
  --tcpdump                 Enable packet capture via tcpdump
  --tcpdump-port PORT       Port filter for tcpdump (default: --frontend-port)
  --nsys-path PATH          Path to nsys binary (default: auto-detected from PATH)
USAGE
            exit 0
            ;;
        *) echo "Unknown option: $1"; exit 1 ;;
    esac
done

# Default model-name to model if not set
[[ -z "$MODEL_NAME" ]] && MODEL_NAME="$MODEL"

# For multi-model: build array of model names.
# Each model group registers under its own served name (model-1, model-2, ...)
# but uses the same --model-path for weights. The mocker publishes source_path
# (the real HF model ID) in the ModelDeploymentCard so the frontend can resolve
# tokenizer/config even when the served name differs.
MODEL_NAMES=()
if [[ "$NUM_MODELS" -le 1 ]]; then
    MODEL_NAMES=("$MODEL_NAME")
else
    for m in $(seq 1 "$NUM_MODELS"); do
        MODEL_NAMES+=("model-${m}")
    done
fi

# Default tcpdump port to frontend port
[[ -z "$TCPDUMP_PORT" ]] && TCPDUMP_PORT="$FRONTEND_PORT"

# Auto-sync capture-duration to cover the full benchmark window.
# If benchmark-duration is set and capture-duration wasn't explicitly overridden,
# extend captures to benchmark-duration + warmup headroom + buffer.
if [[ -n "$BENCHMARK_DURATION" ]] && [[ "$CAPTURE_DURATION" -eq 60 ]]; then
    _WARMUP_HEADROOM=${WARMUP_DURATION:-10}
    CAPTURE_DURATION=$(( BENCHMARK_DURATION + _WARMUP_HEADROOM + 5 ))
    echo "  Auto-adjusted capture-duration to ${CAPTURE_DURATION}s (benchmark=${BENCHMARK_DURATION}s + warmup + 5s buffer)"
fi

# Resolve nsys binary path
if [[ -n "$NSYS_PATH" ]]; then
    if [[ ! -x "$NSYS_PATH" ]]; then
        echo "ERROR: --nsys-path '$NSYS_PATH' is not executable"; exit 1
    fi
    NSYS_CMD="$NSYS_PATH"
else
    NSYS_CMD="nsys"
fi

# ─── Output directory ────────────────────────────────────────────────────────
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
if [[ -z "$OUTPUT_DIR" ]]; then
    OUTPUT_DIR="${REPO_ROOT}/artifacts/obs_${TIMESTAMP}"
fi
mkdir -p "$OUTPUT_DIR"/{aiperf,nsys,perf,bpf,system,prometheus,logs}

echo "╔══════════════════════════════════════════════════════════════╗"
echo "║          Unified Observability Capture                      ║"
echo "╚══════════════════════════════════════════════════════════════╝"
echo ""
echo "Output:    $OUTPUT_DIR"
echo "Tokenizer: ${TOKENIZER_BACKEND:-hf (default)}"
echo ""

# ─── Pre-flight: detect available tools ──────────────────────────────────────
echo "--- Pre-flight checks ---"

HAS_NSYS=false
HAS_PERF=false
HAS_BPF=false
HAS_FLAMEGRAPH_RENDERER=false

if command -v "$NSYS_CMD" &>/dev/null && [[ "$SKIP_NSYS" == false ]]; then
    HAS_NSYS=true
    echo "  nsys:       $("$NSYS_CMD" --version 2>/dev/null | head -1)"
else
    echo "  nsys:       SKIP"
fi

if command -v perf &>/dev/null && [[ "$SKIP_PERF" == false ]]; then
    HAS_PERF=true
    echo "  perf:       $(perf version 2>/dev/null | head -1)"
else
    echo "  perf:       SKIP"
fi

if command -v bpftrace &>/dev/null && [[ "$SKIP_BPF" == false ]]; then
    # BPF needs root or both CAP_BPF and CAP_PERFMON
    if [[ $(id -u) -eq 0 ]]; then
        HAS_BPF=true
        echo "  bpftrace:   available (root)"
    elif command -v capsh &>/dev/null \
         && capsh --print 2>/dev/null | grep '^Current:' | grep -q cap_bpf \
         && capsh --print 2>/dev/null | grep '^Current:' | grep -q cap_perfmon; then
        HAS_BPF=true
        echo "  bpftrace:   available (CAP_BPF + CAP_PERFMON)"
    else
        echo "  bpftrace:   SKIP (needs root or CAP_BPF + CAP_PERFMON)"
    fi
else
    echo "  bpftrace:   SKIP"
fi

if command -v flamegraph.pl &>/dev/null || command -v inferno-flamegraph &>/dev/null; then
    HAS_FLAMEGRAPH_RENDERER=true
    echo "  flamegraph: available"
else
    echo "  flamegraph: SKIP (install inferno: cargo install inferno)"
fi

if ! command -v jq &>/dev/null; then
    echo "ERROR: jq not found. Install: apt-get install -y jq (or brew install jq)"
    exit 1
fi
echo "  jq:         available"

if ! command -v aiperf &>/dev/null && ! python3 -c "import aiperf" 2>/dev/null; then
    echo "ERROR: aiperf not found. Install: pip install git+https://github.com/ai-dynamo/aiperf.git"
    exit 1
fi
echo "  aiperf:     available"
echo ""

# ─── Tracked PIDs for cleanup ───────────────────────────────────────────────
ALL_PIDS=()      # everything we need to kill on exit
CAPTURE_PIDS=()  # capture processes we wait for
ETCD_PID=""      # etcd PID if we started it
ETCD_DATA_DIR=""
NATS_PID=""      # nats-server PID if we started it

cleanup() {
    echo ""
    echo "--- Cleaning up ---"
    # Stop capture processes with SIGINT first (bpftrace needs INT to flush maps)
    for pid in "${CAPTURE_PIDS[@]}"; do
        kill -INT "$pid" 2>/dev/null || true
    done
    # Give bpftrace a moment to flush aggregation output
    sleep 2
    # SIGTERM any captures still running
    for pid in "${CAPTURE_PIDS[@]}"; do
        kill "$pid" 2>/dev/null || true
    done
    # Then service processes
    for pid in "${ALL_PIDS[@]}"; do
        kill "$pid" 2>/dev/null || true
    done
    # Give processes a moment to exit, then force kill
    sleep 1
    for pid in "${ALL_PIDS[@]}" "${CAPTURE_PIDS[@]}"; do
        kill -9 "$pid" 2>/dev/null || true
    done
    # Stop infrastructure daemons if we started them
    if [[ -n "$ETCD_PID" ]]; then
        kill "$ETCD_PID" 2>/dev/null || true
        sleep 1
        kill -9 "$ETCD_PID" 2>/dev/null || true
        [[ -n "$ETCD_DATA_DIR" ]] && rm -rf "$ETCD_DATA_DIR"
    fi
    if [[ -n "$NATS_PID" ]]; then
        kill "$NATS_PID" 2>/dev/null || true
        sleep 1
        kill -9 "$NATS_PID" 2>/dev/null || true
    fi
    wait 2>/dev/null || true
    # Wait for the frontend port to be fully released (TIME_WAIT, etc.)
    _port_wait=0
    while ss -tlnp 2>/dev/null | grep -q ":${FRONTEND_PORT} "; do
        sleep 1
        _port_wait=$((_port_wait + 1))
        if [[ $_port_wait -ge 15 ]]; then
            break
        fi
    done
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM

# ─── Step 0: Ensure etcd + NATS are running ──────────────────────────────────
echo "--- Checking infrastructure (etcd + NATS) ---"

# etcd
if ! curl -sf http://localhost:2379/health >/dev/null 2>&1; then
    if command -v etcd &>/dev/null; then
        echo "  etcd not running — starting it..."
        ETCD_DATA_DIR=$(mktemp -d)
        etcd --data-dir="$ETCD_DATA_DIR" \
             --listen-client-urls=http://localhost:2379 \
             --advertise-client-urls=http://localhost:2379 \
             --listen-peer-urls=http://localhost:2380 \
             --initial-advertise-peer-urls=http://localhost:2380 \
             --initial-cluster=default=http://localhost:2380 \
             > /dev/null 2>&1 &
        ETCD_PID=$!
        for i in $(seq 1 30); do
            if curl -sf http://localhost:2379/health >/dev/null 2>&1; then
                echo "  etcd ready (PID $ETCD_PID)"
                break
            fi
            sleep 1
            if [[ $i -eq 30 ]]; then
                echo "ERROR: etcd failed to start after 30s"; exit 1
            fi
        done
    else
        echo "ERROR: etcd not running and not found in PATH."; exit 1
    fi
else
    echo "  etcd already running"
fi

# NATS
if ! nc -z localhost 4222 2>/dev/null; then
    if command -v nats-server &>/dev/null; then
        echo "  nats-server not running — starting it..."
        nats-server > /dev/null 2>&1 &
        NATS_PID=$!
        for i in $(seq 1 30); do
            if nc -z localhost 4222 2>/dev/null; then
                echo "  nats-server ready (PID $NATS_PID)"
                break
            fi
            sleep 1
            if [[ $i -eq 30 ]]; then
                echo "ERROR: nats-server failed to start after 30s"; exit 1
            fi
        done
    else
        echo "ERROR: nats-server not running and not found in PATH."; exit 1
    fi
else
    echo "  nats-server already running"
fi
echo ""

# ─── Step 1: Start mocker workers ───────────────────────────────────────────
TOTAL_WORKERS=$(( ${#MODEL_NAMES[@]} * NUM_WORKERS ))
echo "--- Starting $TOTAL_WORKERS mocker worker(s) (${#MODEL_NAMES[@]} model(s) x $NUM_WORKERS worker(s)) ---"
BASE_SYSTEM_PORT=8081
WORKER_IDX=0
for MN in "${MODEL_NAMES[@]}"; do
    for i in $(seq 1 "$NUM_WORKERS"); do
        WORKER_IDX=$((WORKER_IDX + 1))
        WORKER_PORT=$((BASE_SYSTEM_PORT + WORKER_IDX - 1))
        MOCKER_ARGS=(
            --model-path "$MODEL"
            --model-name "$MN"
            --speedup-ratio "$SPEEDUP_RATIO"
            --request-plane "$REQUEST_PLANE"
        )
        if [[ "$DATA_PARALLEL_SIZE" -gt 1 ]]; then
            MOCKER_ARGS+=(--data-parallel-size "$DATA_PARALLEL_SIZE")
        fi
        if [[ -n "$PLANNER_PROFILE" ]]; then
            MOCKER_ARGS+=(--planner-profile-data "$PLANNER_PROFILE")
        fi

        MN_SAFE="${MN//\//_}"
        HF_HUB_OFFLINE=1 DYN_SYSTEM_PORT=$WORKER_PORT DYN_EVENT_PLANE="$EVENT_PLANE" python -m dynamo.mocker "${MOCKER_ARGS[@]}" \
            > "$OUTPUT_DIR/logs/mocker_${MN_SAFE}_${i}.log" 2>&1 &
        ALL_PIDS+=($!)
        echo "  Worker $WORKER_IDX ($MN #$i): PID ${ALL_PIDS[-1]}, port $WORKER_PORT"
    done
done
# Update NUM_WORKERS to total for Prometheus scraping later
NUM_WORKERS_TOTAL=$WORKER_IDX

# ─── Step 2: Start frontend (optionally under nsys) ─────────────────────────
echo ""
echo "--- Starting frontend ---"

FRONTEND_ENV=(
    HF_HUB_OFFLINE=1
    DYN_HTTP_PORT="$FRONTEND_PORT"
    DYN_PERF_DIAG=1
    DYN_ENABLE_NVTX=1
    DYN_REQUEST_PLANE="$REQUEST_PLANE"
    DYN_EVENT_PLANE="$EVENT_PLANE"
)

if [[ -n "$TOKENIZER_BACKEND" ]]; then
    # Map human-readable CLI values to DYN_TOKENIZER env var values
    # (Rust model_card.rs reads DYN_TOKENIZER, expects "fastokens" or "default")
    case "$TOKENIZER_BACKEND" in
        fast|fastokens) _DYN_TOK_VAL="fastokens" ;;
        hf|default|"")  _DYN_TOK_VAL="default" ;;
        *)               _DYN_TOK_VAL="$TOKENIZER_BACKEND" ;;
    esac
    FRONTEND_ENV+=(DYN_TOKENIZER="$_DYN_TOK_VAL")
    echo "  Tokenizer backend: $TOKENIZER_BACKEND (DYN_TOKENIZER=$_DYN_TOK_VAL)"
fi

# Enable Rust NVTX annotations when nsys profiling is active.
# The Rust NVTX subsystem (lib/runtime/src/nvtx.rs) requires both the
# compile-time "nvtx" feature AND this runtime env var. We only set it
# when nsys is active to avoid the ~50ns/annotation overhead during
# clean throughput runs.
if [[ "$HAS_NSYS" == true ]]; then
    FRONTEND_ENV+=(DYN_ENABLE_RUST_NVTX=1)
    echo "  Rust NVTX annotations: enabled"
fi

if [[ "$HAS_NSYS" == true ]]; then
    echo "  (under nsys profiling)"
    env "${FRONTEND_ENV[@]}" \
        "$NSYS_CMD" profile \
        --trace=osrt,nvtx \
        --sample=cpu \
        --cpuctxsw=none \
        --output="${OUTPUT_DIR}/nsys/frontend" \
        --force-overwrite=true \
        python -m dynamo.frontend \
        > "$OUTPUT_DIR/logs/frontend.log" 2>&1 &
    NSYS_WRAPPER_PID=$!
    ALL_PIDS+=($NSYS_WRAPPER_PID)
    # Resolve the actual python child PID for perf stat / proc polling.
    # nsys spawns python as a child; we need the real PID for --pid attachments.
    FRONTEND_PID=""
    for _try in $(seq 1 30); do
        sleep 1
        _child=$(pgrep -P "$NSYS_WRAPPER_PID" -f "python.*dynamo.frontend" 2>/dev/null | head -1 || true)
        if [[ -n "$_child" ]]; then
            FRONTEND_PID="$_child"
            break
        fi
    done
    if [[ -z "$FRONTEND_PID" ]]; then
        echo "  WARNING: could not resolve python child PID under nsys; using nsys wrapper PID"
        echo "  (perf stat and /proc polling will attach to nsys, not the frontend process)"
        FRONTEND_PID="$NSYS_WRAPPER_PID"
    fi
    echo "  nsys wrapper PID: $NSYS_WRAPPER_PID"
    echo "  Frontend PID: $FRONTEND_PID"
else
    env "${FRONTEND_ENV[@]}" python -m dynamo.frontend \
        > "$OUTPUT_DIR/logs/frontend.log" 2>&1 &
    FRONTEND_PID=$!
    ALL_PIDS+=($FRONTEND_PID)
    echo "  Frontend PID: $FRONTEND_PID"
fi

# ─── Step 3: Wait for readiness ─────────────────────────────────────────────
echo ""
echo "Waiting for ${#MODEL_NAMES[@]} model(s) to be ready: ${MODEL_NAMES[*]}..."
MAX_WAIT=180
WAITED=0
_all_models_ready=false
while [[ "$_all_models_ready" == false ]]; do
    _all_models_ready=true
    for _mn in "${MODEL_NAMES[@]}"; do
        if ! curl -s --max-time 5 "http://127.0.0.1:$FRONTEND_PORT/v1/models" 2>/dev/null | \
              jq -e --arg model "$_mn" '.data[]? | select(.id == $model)' >/dev/null 2>&1; then
            _all_models_ready=false
            break
        fi
    done
    if [[ "$_all_models_ready" == true ]]; then
        break
    fi
    sleep 2
    WAITED=$((WAITED + 2))
    if [[ $WAITED -ge $MAX_WAIT ]]; then
        echo "ERROR: Models not ready after ${MAX_WAIT}s."
        echo "Last 20 lines of frontend log:"
        tail -20 "$OUTPUT_DIR/logs/frontend.log" 2>/dev/null || true
        exit 1
    fi
    if ! kill -0 "$FRONTEND_PID" 2>/dev/null; then
        echo "ERROR: Frontend process died."
        echo "Last 20 lines of frontend log:"
        tail -20 "$OUTPUT_DIR/logs/frontend.log" 2>/dev/null || true
        exit 1
    fi
done
echo "All ${#MODEL_NAMES[@]} model(s) ready (waited ${WAITED}s)"

# Capture initial Prometheus snapshot (baseline for histogram delta analysis)
echo "  Capturing initial Prometheus snapshot..."
{
    curl -s --max-time 5 "http://127.0.0.1:$FRONTEND_PORT/metrics" 2>/dev/null || true
    for wi in $(seq 1 "$NUM_WORKERS_TOTAL"); do
        WPORT=$((BASE_SYSTEM_PORT + wi - 1))
        curl -s --max-time 5 "http://127.0.0.1:$WPORT/metrics" 2>/dev/null || true
    done
} > "$OUTPUT_DIR/prometheus/initial_snapshot.txt"

# ─── Step 4: Start parallel captures ────────────────────────────────────────
echo ""
echo "--- Starting parallel captures (${CAPTURE_DURATION}s) ---"

# 4a. perf stat — with NMI watchdog fallback
if [[ "$HAS_PERF" == true ]]; then
    NMI_WATCHDOG=$(cat /proc/sys/kernel/nmi_watchdog 2>/dev/null || echo "0")
    if [[ "$NMI_WATCHDOG" == "1" ]]; then
        # NMI watchdog holds a HW PMU counter, causing <not counted> for
        # most hardware events. Fall back to software counters + userspace-
        # qualified cycles/instructions which work on the remaining PMUs.
        echo "  [perf stat] pid=$FRONTEND_PID (NMI watchdog active — using software + :u counters)"
        echo "  TIP: disable watchdog for full HW counters: sudo sysctl kernel.nmi_watchdog=0"
        perf stat --pid "$FRONTEND_PID" \
            -e task-clock,context-switches,cpu-migrations,page-faults,cycles:u,instructions:u \
            -o "$OUTPUT_DIR/perf/perf_stat.txt" \
            -- sleep "$CAPTURE_DURATION" &
        CAPTURE_PIDS+=($!)
    else
        echo "  [perf stat] pid=$FRONTEND_PID"
        perf stat --pid "$FRONTEND_PID" \
            -o "$OUTPUT_DIR/perf/perf_stat.txt" \
            -- sleep "$CAPTURE_DURATION" &
        CAPTURE_PIDS+=($!)
    fi
fi

# 4b. BPF scripts (if available) — delegate to bpf/run.sh
# run.sh --batch waits for its children internally and forwards TERM→INT
# so bpftrace flushes aggregation maps before exiting.
if [[ "$HAS_BPF" == true ]]; then
    "${SCRIPT_DIR}/bpf/run.sh" --batch \
        --pid "$FRONTEND_PID" \
        --output-dir "$OUTPUT_DIR/bpf" \
        --duration "$CAPTURE_DURATION" &
    CAPTURE_PIDS+=($!)
fi

# 4e. Flamegraph captures — delegate to flamegraph/ scripts
if [[ "$SKIP_FLAMEGRAPH" == false ]]; then
    if command -v perf &>/dev/null || command -v samply &>/dev/null; then
        echo "  [flamegraph] CPU, pid=$FRONTEND_PID"
        "${SCRIPT_DIR}/flamegraph/cpu_flamegraph.sh" \
            --pid "$FRONTEND_PID" \
            --duration "$CAPTURE_DURATION" \
            --output-dir "$OUTPUT_DIR/perf" \
            --output cpu_flamegraph &
        CAPTURE_PIDS+=($!)
    else
        echo "  [flamegraph] SKIP CPU — install perf or samply:"
        echo "    apt install linux-tools-\$(uname -r)  OR  cargo install samply"
    fi
    if [[ "$HAS_BPF" == true ]]; then
        echo "  [flamegraph] Off-CPU, pid=$FRONTEND_PID"
        "${SCRIPT_DIR}/flamegraph/offcpu_flamegraph.sh" \
            --pid "$FRONTEND_PID" \
            --duration "$CAPTURE_DURATION" \
            --output-dir "$OUTPUT_DIR/perf" \
            --output offcpu_flamegraph &
        CAPTURE_PIDS+=($!)
    else
        echo "  [flamegraph] SKIP Off-CPU — needs bpftrace with root or CAP_BPF"
    fi
fi

# 4c. System capture (/proc polling, thread/fd counts, socket stats)
echo "  [system] /proc stats, thread/fd count, ss"
(
    INTERVAL=1
    for _i in $(seq 1 "$CAPTURE_DURATION"); do
        TS=$(date -Iseconds)

        # /proc status
        echo "--- $TS ---" >> "$OUTPUT_DIR/system/proc_status.txt"
        cat "/proc/$FRONTEND_PID/status" >> "$OUTPUT_DIR/system/proc_status.txt" 2>/dev/null || true

        # /proc stat — raw scheduler/CPU time info
        echo "--- $TS ---" >> "$OUTPUT_DIR/system/proc_stat.txt"
        cat "/proc/$FRONTEND_PID/stat" >> "$OUTPUT_DIR/system/proc_stat.txt" 2>/dev/null || true

        # /proc statm — page-level memory info
        echo "--- $TS ---" >> "$OUTPUT_DIR/system/proc_statm.txt"
        cat "/proc/$FRONTEND_PID/statm" >> "$OUTPUT_DIR/system/proc_statm.txt" 2>/dev/null || true

        # Thread count
        THREADS=$(ls -1 "/proc/$FRONTEND_PID/task/" 2>/dev/null | wc -l)
        echo "$TS threads=$THREADS" >> "$OUTPUT_DIR/system/thread_count.txt"

        # FD count
        FDS=$(ls -1 "/proc/$FRONTEND_PID/fd/" 2>/dev/null | wc -l)
        echo "$TS fds=$FDS" >> "$OUTPUT_DIR/system/fd_count.txt"

        # Socket stats
        echo "--- $TS ---" >> "$OUTPUT_DIR/system/ss_stats.txt"
        ss -tin >> "$OUTPUT_DIR/system/ss_stats.txt" 2>/dev/null || true

        sleep "$INTERVAL"
    done
) &
CAPTURE_PIDS+=($!)

# 4c-2. tcpdump packet capture (optional)
if [[ "$ENABLE_TCPDUMP" == true ]]; then
    if command -v tcpdump &>/dev/null; then
        echo "  [tcpdump] port=$TCPDUMP_PORT"
        timeout "$CAPTURE_DURATION" tcpdump -i any \
            -w "$OUTPUT_DIR/system/capture.pcap" \
            "port $TCPDUMP_PORT" \
            -s 96 -c 100000 &>/dev/null &
        CAPTURE_PIDS+=($!)
    else
        echo "  [tcpdump] SKIP — tcpdump not found (apt install tcpdump)"
    fi
fi

# 4d. Periodic Prometheus /metrics scraping (frontend + all mocker workers)
echo "  [prometheus] scraping every 1s"
(
    for _i in $(seq 1 "$CAPTURE_DURATION"); do
        METRICS=$(curl -s --max-time 3 "http://127.0.0.1:$FRONTEND_PORT/metrics" 2>/dev/null || echo "")
        # Append mocker worker metrics (ports BASE_SYSTEM_PORT .. BASE_SYSTEM_PORT+NUM_WORKERS-1)
        for wi in $(seq 1 "$NUM_WORKERS_TOTAL"); do
            WPORT=$((BASE_SYSTEM_PORT + wi - 1))
            WMETRICS=$(curl -s --max-time 3 "http://127.0.0.1:$WPORT/metrics" 2>/dev/null || echo "")
            if [[ -n "$WMETRICS" ]]; then
                METRICS="${METRICS}"$'\n'"${WMETRICS}"
            fi
        done
        if [[ -n "$METRICS" ]]; then
            # JSONL: one line per scrape with timestamp
            TS=$(date -Iseconds)
            printf '{"ts":"%s","metrics":%s}\n' "$TS" "$(echo "$METRICS" | python3 -c '
import sys, json
lines = sys.stdin.read().strip().split("\n")
out = {}
for line in lines:
    if line.startswith("#") or not line.strip():
        continue
    parts = line.split()
    if len(parts) >= 2:
        out[parts[0]] = parts[1]
print(json.dumps(out))
' 2>/dev/null || echo '"{}"')" >> "$OUTPUT_DIR/prometheus/timeseries.jsonl"
        fi
        sleep 1
    done
) &
CAPTURE_PIDS+=($!)

# ─── Step 5: Run aiperf load ────────────────────────────────────────────────
echo ""
echo "--- Running aiperf load ---"
echo "  concurrency=$CONCURRENCY  requests=${NUM_REQUESTS:-auto}  isl=$ISL  osl=$OSL"
[[ -n "$BENCHMARK_DURATION" ]] && echo "  benchmark-duration=${BENCHMARK_DURATION}s"
[[ -n "$REQUEST_RATE" ]] && echo "  request-rate=${REQUEST_RATE} req/s"

# Build load-control args: prefer --benchmark-duration (time-based) over
# --request-count (count-based) so each run gets a consistent measurement
# window regardless of throughput.  This mirrors k8s_run_aiperf() in sweep.sh.
_LOAD_ARGS=()
_EFFECTIVE_REQUESTS="null"
if [[ -n "$BENCHMARK_DURATION" ]]; then
    _LOAD_ARGS+=(--benchmark-duration "$BENCHMARK_DURATION")
fi
if [[ -n "$REQUEST_RATE" ]]; then
    _LOAD_ARGS+=(--request-rate "$REQUEST_RATE")
fi
if [[ -n "$NUM_REQUESTS" ]]; then
    _LOAD_ARGS+=(--request-count "$NUM_REQUESTS")
    _EFFECTIVE_REQUESTS="$NUM_REQUESTS"
fi
# If neither was specified, default to concurrency * 20 (min 640)
if [[ ${#_LOAD_ARGS[@]} -eq 0 ]]; then
    _AUTO=$(( CONCURRENCY * 20 ))
    [[ "$_AUTO" -lt 640 ]] && _AUTO=640
    _LOAD_ARGS+=(--request-count "$_AUTO")
    _EFFECTIVE_REQUESTS="$_AUTO"
fi

_WARMUP_ARGS=()
if [[ -n "$WARMUP_DURATION" ]]; then
    _WARMUP_ARGS+=(--warmup-duration "$WARMUP_DURATION")
elif [[ -n "$WARMUP_COUNT" ]]; then
    _WARMUP_ARGS+=(--warmup-request-count "$WARMUP_COUNT")
else
    _WARMUP_ARGS+=(--warmup-request-count "$CONCURRENCY")
fi

# Build the list of models to target
_AIPERF_MODELS=()
if [[ "$AIPERF_TARGETS" == "all" && ${#MODEL_NAMES[@]} -gt 1 ]]; then
    _AIPERF_MODELS=("${MODEL_NAMES[@]}")
else
    _AIPERF_MODELS=("${MODEL_NAMES[0]}")
fi
echo "  aiperf targets: ${_AIPERF_MODELS[*]} (mode=$AIPERF_TARGETS)"

for _AIPERF_MODEL in "${_AIPERF_MODELS[@]}"; do
    if [[ ${#_AIPERF_MODELS[@]} -gt 1 ]]; then
        AIPERF_ARTIFACT_DIR="$OUTPUT_DIR/aiperf/${_AIPERF_MODEL}"
        echo ""
        echo "  --- aiperf: model=${_AIPERF_MODEL} ---"
    else
        AIPERF_ARTIFACT_DIR="$OUTPUT_DIR/aiperf"
    fi
    mkdir -p "$AIPERF_ARTIFACT_DIR"

    # When the served model name differs from the HF model path (multi-model),
    # tell aiperf where to find the tokenizer.
    _AIPERF_TOK_ARGS=()
    if [[ "$_AIPERF_MODEL" != "$MODEL" ]]; then
        _AIPERF_TOK_ARGS=(--tokenizer "$MODEL")
    fi

    HF_HUB_OFFLINE=1 aiperf profile --artifact-dir "$AIPERF_ARTIFACT_DIR" \
        --model "$_AIPERF_MODEL" \
        "${_AIPERF_TOK_ARGS[@]}" \
        --endpoint-type chat \
        --endpoint /v1/chat/completions \
        --streaming \
        --url "http://127.0.0.1:$FRONTEND_PORT" \
        --synthetic-input-tokens-mean "$ISL" \
        --synthetic-input-tokens-stddev 0 \
        --output-tokens-mean "$OSL" \
        --output-tokens-stddev 0 \
        --extra-inputs max_tokens:"$OSL" \
        --extra-inputs min_tokens:"$OSL" \
        --extra-inputs ignore_eos:true \
        --extra-inputs repetition_penalty:1.0 \
        --extra-inputs temperature:0.0 \
        --concurrency "$CONCURRENCY" \
        "${_LOAD_ARGS[@]}" \
        "${_WARMUP_ARGS[@]}" \
        --num-dataset-entries 12800 \
        --random-seed 100 \
        --workers-max "$CONCURRENCY" \
        --record-processors 32 \
        --ui simple || echo "WARNING: aiperf failed for model ${_AIPERF_MODEL}"
done

# Check for server_metrics_export.json in the primary aiperf dir
_PRIMARY_AIPERF_DIR="$OUTPUT_DIR/aiperf"
[[ ${#_AIPERF_MODELS[@]} -gt 1 ]] && _PRIMARY_AIPERF_DIR="$OUTPUT_DIR/aiperf/${_AIPERF_MODELS[0]}"
if [[ -f "$_PRIMARY_AIPERF_DIR/server_metrics_export.json" ]]; then
    echo "  Found server_metrics_export.json"
fi

# ─── Step 6: Wait for captures to finish ─────────────────────────────────────
echo ""
echo "--- Waiting for captures to finish ---"

# Give captures CAPTURE_DURATION + 15s grace period, then force-kill stragglers.
# Profiler processes (perf record, flamegraph, bpftrace) can hang if the target
# PID becomes idle or exits, or if hardware counters can't be released cleanly.
_CAPTURE_DEADLINE=$(( CAPTURE_DURATION + 15 ))
_capture_waited=0
_all_done=false
while [[ "$_all_done" == false && $_capture_waited -lt $_CAPTURE_DEADLINE ]]; do
    _all_done=true
    for pid in "${CAPTURE_PIDS[@]}"; do
        if kill -0 "$pid" 2>/dev/null; then
            _all_done=false
            break
        fi
    done
    if [[ "$_all_done" == false ]]; then
        sleep 1
        _capture_waited=$((_capture_waited + 1))
    fi
done

if [[ "$_all_done" == false ]]; then
    echo "  WARNING: captures still running after ${_CAPTURE_DEADLINE}s — sending SIGTERM"
    for pid in "${CAPTURE_PIDS[@]}"; do
        kill "$pid" 2>/dev/null || true
    done
    sleep 2
    # Force kill any remaining
    for pid in "${CAPTURE_PIDS[@]}"; do
        kill -9 "$pid" 2>/dev/null || true
    done
fi
# Reap all child statuses
for pid in "${CAPTURE_PIDS[@]}"; do
    wait "$pid" 2>/dev/null || true
done

# Capture final Prometheus snapshot AFTER all aiperf requests have completed.
# This must happen after aiperf (Step 5) and after captures (Step 6) to ensure
# all request metrics are reflected in the snapshot.
echo "  Capturing final Prometheus snapshot..."
{
    curl -s --max-time 5 "http://127.0.0.1:$FRONTEND_PORT/metrics" 2>/dev/null || true
    for wi in $(seq 1 "$NUM_WORKERS_TOTAL"); do
        WPORT=$((BASE_SYSTEM_PORT + wi - 1))
        curl -s --max-time 5 "http://127.0.0.1:$WPORT/metrics" 2>/dev/null || true
    done
} > "$OUTPUT_DIR/prometheus/final_snapshot.txt"
# Save individual worker snapshots for debugging
for wi in $(seq 1 "$NUM_WORKERS_TOTAL"); do
    WPORT=$((BASE_SYSTEM_PORT + wi - 1))
    curl -s --max-time 5 "http://127.0.0.1:$WPORT/metrics" \
        > "$OUTPUT_DIR/prometheus/mocker_${wi}_snapshot.txt" 2>/dev/null || true
done

# ─── Step 7: Stop frontend and post-process nsys ─────────────────────────────
echo ""
echo "--- Post-processing ---"

# 7a. Stop frontend so nsys can finalize the .nsys-rep file.
# When running under nsys, the .nsys-rep is only written after the wrapped
# process exits. We must stop the frontend before attempting the export.
if [[ "$HAS_NSYS" == true ]]; then
    echo "  Stopping frontend (PID $FRONTEND_PID) so nsys can finalize..."
    kill -INT "$FRONTEND_PID" 2>/dev/null || true
    # Wait for the nsys-wrapped frontend to exit and write the .nsys-rep
    NSYS_WAIT=0
    NSYS_WAIT_MAX=30
    while kill -0 "$FRONTEND_PID" 2>/dev/null; do
        sleep 1
        NSYS_WAIT=$((NSYS_WAIT + 1))
        if [[ $NSYS_WAIT -ge $NSYS_WAIT_MAX ]]; then
            echo "  WARNING: frontend did not exit after ${NSYS_WAIT_MAX}s, sending SIGTERM"
            kill "$FRONTEND_PID" 2>/dev/null || true
            sleep 2
            break
        fi
    done
    # Remove frontend from ALL_PIDS so cleanup() doesn't try to kill it again
    NEW_ALL_PIDS=()
    for pid in "${ALL_PIDS[@]}"; do
        [[ "$pid" != "$FRONTEND_PID" ]] && NEW_ALL_PIDS+=("$pid")
    done
    ALL_PIDS=("${NEW_ALL_PIDS[@]}")

    if [[ -f "$OUTPUT_DIR/nsys/frontend.nsys-rep" ]]; then
        echo "  Exporting nsys to SQLite..."
        "$NSYS_CMD" export --type sqlite \
            --output "$OUTPUT_DIR/nsys/frontend.sqlite" \
            "$OUTPUT_DIR/nsys/frontend.nsys-rep" 2>/dev/null || \
            echo "  WARNING: nsys sqlite export failed"
        if [[ -f "$OUTPUT_DIR/nsys/frontend.sqlite" ]]; then
            echo "  nsys SQLite: $OUTPUT_DIR/nsys/frontend.sqlite"
        fi
    else
        echo "  WARNING: $OUTPUT_DIR/nsys/frontend.nsys-rep not found after frontend exit"
    fi
else
    # No nsys — stop frontend normally (cleanup will handle it, but stop early
    # to avoid holding ports during config save)
    kill -INT "$FRONTEND_PID" 2>/dev/null || true
fi

# ─── Step 8: Save config ────────────────────────────────────────────────────
cat > "$OUTPUT_DIR/config.json" <<EOF
{
  "timestamp": "$TIMESTAMP",
  "model": "$MODEL",
  "model_name": "$MODEL_NAME",
  "model_names": $(printf '%s\n' "${MODEL_NAMES[@]}" | jq -R . | jq -s .),
  "num_models": $NUM_MODELS,
  "num_workers_per_model": $NUM_WORKERS,
  "num_workers_total": $NUM_WORKERS_TOTAL,
  "speedup_ratio": "$SPEEDUP_RATIO",
  "data_parallel_size": $DATA_PARALLEL_SIZE,
  "request_plane": "$REQUEST_PLANE",
  "event_plane": "$EVENT_PLANE",
  "frontend_port": $FRONTEND_PORT,
  "tokenizer_backend": "${TOKENIZER_BACKEND:-hf}",
  "concurrency": $CONCURRENCY,
  "num_requests": ${_EFFECTIVE_REQUESTS:-null},
  "benchmark_duration": ${BENCHMARK_DURATION:-null},
  "request_rate": ${REQUEST_RATE:-null},
  "isl": $ISL,
  "osl": $OSL,
  "capture_duration": $CAPTURE_DURATION,
  "frontend_pid": $FRONTEND_PID,
  "has_nsys": $HAS_NSYS,
  "has_perf": $HAS_PERF,
  "has_bpf": $HAS_BPF,
  "skip_bpf": $SKIP_BPF,
  "skip_nsys": $SKIP_NSYS,
  "skip_flamegraph": $SKIP_FLAMEGRAPH,
  "tcpdump": $ENABLE_TCPDUMP,
  "tcpdump_port": $TCPDUMP_PORT,
  "nsys_path": "${NSYS_PATH:-auto}"
}
EOF

# ─── Done ────────────────────────────────────────────────────────────────────
echo ""
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║          Capture Complete                                   ║"
echo "╚══════════════════════════════════════════════════════════════╝"
echo ""
echo "Output directory: $OUTPUT_DIR"
echo ""
echo "Contents:"
find "$OUTPUT_DIR" -type f | sort | while read -r f; do
    SIZE=$(stat -c%s "$f" 2>/dev/null || echo "?")
    REL="${f#$OUTPUT_DIR/}"
    printf "  %-50s %s bytes\n" "$REL" "$SIZE"
done
echo ""
echo "Run analysis:"
echo "  python3 ${SCRIPT_DIR}/analysis/create_report.py analyze $OUTPUT_DIR"