feat(perf): add frontend performance sweep runner with full observability (#6749)

01bf7170 · Biswa Panda · GitHub · 7389a369 · 01bf7170 · 01bf7170
Unverified Commit 01bf7170 authored Mar 24, 2026 by Biswa Panda Committed by GitHub Mar 24, 2026
8 changed files
--- a/benchmarks/frontend/scripts/README.md
+++ b/benchmarks/frontend/scripts/README.md
+# Frontend Performance Profiling
+Unified observability and benchmarking suite for Dynamo frontend performance.
+## Quick Start
+```bash
+cd ~/dev/dynamo
+source dynamo/bin/activate
+# Single run (mocker + frontend + aiperf + Prometheus)
+cd benchmarks/frontend/scripts
+./run_perf.sh --model Qwen/Qwen3-0.6B --concurrency 32 --num-requests 640 \
+    --speedup-ratio 0 --skip-bpf --skip-nsys --skip-flamegraph --skip-perf
+# Sweep (multiple config points)
+python3 sweep_runner.py --tokenizers hf --concurrency 32 --isl 512 \
+    --benchmark-duration 30 --speedup-ratio 0 \
+    -- --skip-bpf --skip-nsys --skip-flamegraph --skip-perf
+```
+## Architecture
+The benchmarking suite has two layers: a Python sweep orchestrator that builds a grid of configurations, and a shell harness that executes each individual run.
+```mermaid
+flowchart TB
+    subgraph Orchestrator ["sweep_runner.py (Python orchestrator)"]
+        direction TB
+        grid["Build sweep grid<br/>(tokenizers x concurrency x ISL x workers x models x rps)"]
+        loop["For each config point"]
+        collect["Collect results into CSV + summary.md"]
+        report["Generate per-run report.md"]
+        grid --> loop --> collect --> report
+    end
+    loop -- "invokes" --> run_perf
+    subgraph run_perf ["run_perf.sh (per-run harness)"]
+        direction TB
+        infra["Step 0: Ensure etcd + NATS"]
+        mockers["Step 1: Start mocker workers<br/>(N models x M workers)"]
+        frontend["Step 2: Start frontend<br/>(optionally under nsys)"]
+        ready["Step 3: Wait for /v1/models readiness"]
+        captures["Step 4: Start parallel captures<br/>(perf stat, BPF, flamegraph, /proc, Prometheus)"]
+        load["Step 5: aiperf load generation"]
+        wait["Step 6: Wait for captures to finish"]
+        export["Step 7: Final Prometheus snapshot + nsys export"]
+        save["Step 8: Save config.json"]
+        infra --> mockers --> frontend --> ready --> captures --> load --> wait --> export --> save
+    end
+```
+### Runtime topology
+During a benchmark run, the following processes are active. The frontend receives HTTP requests from aiperf, tokenizes the input, routes to a backend model via the request plane (TCP), and streams response tokens back to the client.
+```mermaid
+flowchart LR
+    aiperf["aiperf<br/>(load generator)"]
+    subgraph Frontend ["Frontend (Rust, port 8000)"]
+        direction TB
+        http["HTTP server<br/>/v1/chat/completions"]
+        preprocess["Preprocess<br/>(template + tokenize)"]
+        router["Router<br/>(model lookup)"]
+        transport["Transport<br/>(TCP request plane)"]
+        http --> preprocess --> router --> transport
+    end
+    subgraph Models ["Mocker Workers"]
+        direction TB
+        subgraph model1 ["model-1"]
+            w1a["worker 1<br/>port 8081"]
+            w1b["worker 2<br/>port 8082"]
+        end
+        subgraph model2 ["model-2"]
+            w2a["worker 1<br/>port 8083"]
+            w2b["worker 2<br/>port 8084"]
+        end
+    end
+    subgraph Infra ["Infrastructure"]
+        etcd["etcd<br/>(service discovery)"]
+        nats["NATS<br/>(event plane)"]
+    end
+    subgraph Observability ["Parallel Captures"]
+        prom["Prometheus<br/>(/metrics scraping)"]
+        perf["perf stat<br/>(HW counters)"]
+        nsys["nsys<br/>(NVTX + OS runtime)"]
+        flame["flamegraph<br/>(CPU + off-CPU)"]
+        bpf["BPF traces<br/>(kernel-level)"]
+    end
+    aiperf -- "HTTP/SSE" --> http
+    transport -- "TCP" --> w1a & w1b & w2a & w2b
+    Frontend -. "register/discover" .-> etcd
+    Models -. "register/discover" .-> etcd
+    Models -. "events" .-> nats
+    Frontend -. "events" .-> nats
+    prom -. "scrape" .-> Frontend & Models
+    perf -. "attach" .-> Frontend
+    nsys -. "profile" .-> Frontend
+    flame -. "sample" .-> Frontend
+    bpf -. "trace" .-> Frontend
+```
+### Multi-model naming
+When `--num-models` is 1, the served model name matches the HF model path (e.g., `Qwen/Qwen3-0.6B`). When `--num-models` is greater than 1, each model instance gets a synthetic name (`model-1`, `model-2`, ...) but all share the same underlying `--model-path` for weights and tokenizer config.
+## Prerequisites
+| Tool | Required | Install |
+|------|----------|---------|
+| etcd | Yes | `apt install etcd` or [releases](https://github.com/etcd-io/etcd/releases) |
+| nats-server | Yes | `apt install nats-server` or [nats.io](https://nats.io/download/) |
+| aiperf | Yes | `uv pip install "git+https://github.com/ai-dynamo/aiperf.git@main"` (in dynamo venv) |
+| jq | Yes | `apt install jq` |
+| perf | Optional | `apt install linux-tools-$(uname -r)` |
+| bpftrace | Optional | `apt install bpftrace` (needs root or CAP_BPF + CAP_PERFMON) |
+| inferno | Optional | `cargo install inferno` (for flamegraphs) |
+| nsys | Optional | NVIDIA Nsight Systems |
+## sweep_runner.py
+The main entry point for running performance sweeps. Iterates over a grid of configurations and delegates each point to `run_perf.sh`.
+### Basic Usage
+```bash
+# Smoke test (1 run)
+python3 sweep_runner.py --tokenizers hf --concurrency 32 --isl 512 \
+    --benchmark-duration 30 --speedup-ratio 0 \
+    -- --skip-bpf --skip-nsys --skip-flamegraph --skip-perf
+# Full tokenizer comparison
+python3 sweep_runner.py --tokenizers hf,fastokens \
+    --concurrency 32,64 --isl 512,1024,2048 \
+    --benchmark-duration 60 --speedup-ratio 0
+# Transport saturation (vary workers and request count)
+python3 sweep_runner.py --tokenizers hf --concurrency 4096 \
+    --num-requests 16384,32768 --workers 1,2,4,8 --speedup-ratio 0
+# Preview sweep plan without running
+python3 sweep_runner.py --dry-run --tokenizers hf,fastokens \
+    --concurrency 32,64 --isl 512,1024
+```
+### Multi-Model and Worker Sweeps
+The `--num-models` and `--workers` flags control how many model instances and backend workers per model are launched. These are the primary knobs for studying frontend scalability under multi-tenant and parallel-worker configurations.
+#### Scaling models (fixed workers per model)
+Useful for measuring how adding more served models affects frontend routing, transport fan-out, and per-model latency.
+```bash
+# Sweep across 1, 2, 3, 4 model instances, 1 worker each, at 75 rps
+for m in 1 2 3 4; do
+    python3 sweep_runner.py \
+        --tokenizers hf \
+        --concurrency 512 \
+        --isl 512 \
+        --workers 1 \
+        --num-models $m \
+        --rps 75 \
+        --benchmark-duration 60 \
+        --speedup-ratio 0 \
+        --output-dir artifacts/sweep_models/m${m} \
+        -- --skip-bpf
+done
+# Compare results
+for m in 1 2 3 4; do
+    echo "=== m=$m ==="
+    cat artifacts/sweep_models/m${m}/summary.md
+    echo
+done
+```
+#### Scaling workers per model (fixed model count)
+Useful for measuring whether adding more backend workers relieves transport bottlenecks for a single model under heavy load.
+```bash
+# Sweep across 1, 2, 4, 8 workers for a single model
+python3 sweep_runner.py \
+    --tokenizers hf \
+    --concurrency 512 \
+    --isl 512 \
+    --workers 1,2,4,8 \
+    --num-models 1 \
+    --rps 75 \
+    --benchmark-duration 60 \
+    --speedup-ratio 0 \
+    --output-dir artifacts/sweep_workers \
+    -- --skip-bpf
+```
+#### Combined model + worker grid
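+To study both dimensions together, fix `--num-models` and supply multiple `--workers` values; each worker count produces a separate run. Because `--num-models` takes a single integer, a full factorial sweep over model counts as well requires wrapping the command in an external loop.
+```bash
+# 1x3 grid: 2 models x (1, 2, 4 workers per model)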
+python3 sweep_runner.py \
+    --tokenizers hf \
+    --concurrency 256 \
+    --isl 512 \
+    --workers 1,2,4 \
+    --num-models 2 \
+    --rps 50 \
+    --benchmark-duration 60 \
+    --speedup-ratio 0 \
+    --output-dir artifacts/sweep_grid \
+    -- --skip-bpf
+```
+> **Note:** `--num-models` is a single integer (not comma-separated). To sweep across model counts, loop externally as shown in the "Scaling models" example above.
+#### What to look for in the results
+| Metric | Where to find it | What it tells you |
+|--------|-----------------|-------------------|
+| Req/s and Tok/s | `summary.md` | Whether the frontend can sustain the target load |
+| TTFT p50/p99 | `summary.md` | End-to-end first-token latency (includes preprocess + routing + transport) |
+| `transport_roundtrip` p50 | `report.md` section 4 | Time spent in the TCP request plane; grows when workers are saturated |
+| Tokio worker busy ratio | `report.md` section 7 | Fraction of time each async worker is busy; values above 0.95 indicate saturation |
+| Event loop stalls | `report.md` section 7 | How often the Tokio runtime stalled; high counts suggest blocking work on the async executor |
+| `preprocess.tokenize` | `report.md` section 5 (NVTX) | Per-request tokenization cost; varies by tokenizer backend |
+### With Profilers
+```bash
+# With perf stat + flamegraphs (no root needed)
+python3 sweep_runner.py --tokenizers hf --concurrency 64 --isl 1024 \
+    --benchmark-duration 60 --speedup-ratio 0
+# With everything including BPF (needs sudo)
+sudo -E python3 sweep_runner.py --tokenizers hf --concurrency 64 --isl 1024 \
+    --benchmark-duration 60 --speedup-ratio 0
+# nsys profiling (needs nsys in PATH)
+python3 sweep_runner.py --tokenizers hf --concurrency 64 --isl 1024 \
+    --benchmark-duration 60 --speedup-ratio 0 \
+    -- --nsys-path /opt/nvidia/nsight-systems/bin/nsys
+```
+Profiler controls are passed through to run_perf.sh after `--`:
+| Flag | Effect |
+|------|--------|
+| `--skip-bpf` | Skip BPF tracing |
+| `--skip-nsys` | Skip Nsight Systems |
+| `--skip-flamegraph` | Skip CPU/off-CPU flamegraphs |
+| `--skip-perf` | Skip perf stat hardware counters |
+### All Options
+| Option | Default | Description |
+|--------|---------|-------------|
+| `--model` | `Qwen/Qwen3-0.6B` | HF model path |
+| `--backend` | `mocker` | Engine: `mocker` (synthetic) or `vllm` |
+| `--tokenizers` | `hf,fastokens` | Comma-separated tokenizer backends |
+| `--concurrency` | `50,100,200` | Comma-separated concurrency levels |
+| `--isl` | `512,1024,2048` | Comma-separated input sequence lengths |
+| `--osl` | `256` | Output sequence length |
+| `--workers` | `2` | Comma-separated worker counts per model |
+| `--num-models` | `1` | Number of model instances (each gets `--workers` workers) |
+| `--rps` | - | Comma-separated target request rates (req/s) |
+| `--aiperf-targets` | `first` | `first`: model-1 only. `all`: run aiperf for each model |
+| `--speedup-ratio` | `1.0` | Mocker speedup (0 = infinite) |
+| `--benchmark-duration` | `60` | aiperf run duration (seconds) |
+| `--num-requests` | - | Comma-separated request counts (overrides duration) |
+| `--output-dir` | auto | Output directory |
+| `--max-consecutive-fails` | `2` | Skip remaining ISLs after N consecutive failures |
+| `--cooldown` | `3` | Seconds between runs |
+| `--dry-run` | - | Print plan without executing |
+| `--no-report` | - | Skip per-run report generation |
+## run_perf.sh
+Low-level per-run harness. Normally called by sweep_runner.py, but can be used directly for single runs.
+```bash
+# Minimal (no profilers)
+./run_perf.sh --model Qwen/Qwen3-0.6B --concurrency 32 --num-requests 640 \
+    --speedup-ratio 0 --skip-bpf --skip-nsys --skip-flamegraph --skip-perf
+# Full observability (needs sudo for BPF)
+sudo -E ./run_perf.sh --model Qwen/Qwen3-0.6B --concurrency 64 \
+    --benchmark-duration 60 --speedup-ratio 0
+# Multi-model with 2 workers each
+./run_perf.sh --model Qwen/Qwen3-0.6B --num-models 2 --workers 2 \
+    --concurrency 32 --benchmark-duration 30 --speedup-ratio 0 \
+    --skip-bpf --skip-nsys --skip-flamegraph --skip-perf
+# 4 models, 1 worker each, rate-limited to 75 rps
+./run_perf.sh --model Qwen/Qwen3-0.6B --num-models 4 --workers 1 \
+    --concurrency 512 --benchmark-duration 60 --request-rate 75 \
+    --speedup-ratio 0 --skip-bpf
+```
+## Analyzing Results
+```bash
+# Per-run report (generated automatically by sweep_runner.py)
+python3 analysis/create_report.py analyze artifacts/sweep_<ts>/hf_c32_isl512_w2
+# Auto-find latest run
+python3 analysis/create_report.py analyze
+# Prometheus delta (initial vs final snapshot)
+diff <(grep "^dynamo_frontend" artifacts/.../prometheus/initial_snapshot.txt | sort) \
+     <(grep "^dynamo_frontend" artifacts/.../prometheus/final_snapshot.txt | sort)
+# nsys SQLite queries (when nsys was enabled)
+sqlite3 artifacts/.../nsys/frontend.sqlite \
+    "SELECT text, COUNT(*), ROUND(AVG(end-start)/1e3,1) as avg_us
+     FROM NVTX_EVENTS WHERE text IS NOT NULL AND end > start GROUP BY text ORDER BY avg_us DESC"
+```
+## Output Structure
+```text
+artifacts/sweep_YYYYMMDD_HHMMSS/
+    results.csv                        Sweep results (all runs)
+    summary.md                         Comparison table
+    hf_c32_isl512_w2/                  Per-run directory
+        config.json                    Run parameters
+        report.md                      Analysis report
+        aiperf/
+            profile_export_aiperf.json aiperf metrics
+        prometheus/
+            initial_snapshot.txt        Pre-load metrics
+            final_snapshot.txt          Post-load metrics
+            timeseries.jsonl            Per-second scrapes
+        system/
+            thread_count.txt            Thread count over time
+            fd_count.txt                FD count over time
+            proc_status.txt             /proc/PID/status snapshots
+        logs/
+            frontend.log
+            mocker_*.log
+        perf/                           (unless --skip-perf / --skip-flamegraph)
+            perf_stat.txt
+            cpu_flamegraph.svg
+        bpf/                            (unless --skip-bpf, needs root)
+            runqlat.txt
+            syscall_latency.txt
+            ...
+        nsys/                           (unless --skip-nsys)
+            frontend.nsys-rep
+            frontend.sqlite
+```
--- a/benchmarks/frontend/scripts/analysis/__init__.py
+++ b/benchmarks/frontend/scripts/analysis/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
--- a/benchmarks/frontend/scripts/analysis/create_report.py
+++ b/benchmarks/frontend/scripts/analysis/create_report.py
--- a/benchmarks/frontend/scripts/analysis/frontend_perf_analysis.py
+++ b/benchmarks/frontend/scripts/analysis/frontend_perf_analysis.py
--- a/benchmarks/frontend/scripts/analysis/parsing_util.py
+++ b/benchmarks/frontend/scripts/analysis/parsing_util.py
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+data parsing utilities.
+Pure data-extraction functions for Prometheus histograms, nsys SQLite databases,
+perf stat output, bpftrace histograms, and timeseries files. Returns structured
+Python objects — no formatting or report generation.
+Used by create_report.py for report generation.
+"""
+import json
+import logging
+import re
+import sqlite3
+import sys
+from pathlib import Path
+from typing import Optional
+logger = logging.getLogger(__name__)
+# Reuse parsers from the existing analysis module (same directory)
+_SCRIPT_DIR = Path(__file__).resolve().parent
+sys.path.insert(0, str(_SCRIPT_DIR))
+from frontend_perf_analysis import (  # noqa: E402
+    AiperfResult,
+    PrometheusSnapshot,
+    _extract_aiperf_metrics,
+)
+# ─── Prometheus parsing ────────────────────────────────────────────────────
+def parse_prometheus_text(path: Path) -> Optional[PrometheusSnapshot]:
+    """Parse a Prometheus text-format file into a PrometheusSnapshot.
+    Extracts stage durations, request plane metrics, transport breakdown,
+    Tokio runtime metrics, and transport/compute gauges.
+    Args:
+        path: File holding Prometheus text exposition output.
+    Returns:
+        The populated snapshot, or None when the file is missing or empty.
+    """
+    if not path.exists() or path.stat().st_size == 0:
+        return None
+    text = path.read_text()
+    snap = PrometheusSnapshot()
+    inf = float("inf")
+    def get_gauge(name: str) -> Optional[float]:
+        # Matches only samples exposed without a label set ("<name> <value>").
+        m = re.search(rf"^{re.escape(name)}\s+(\S+)", text, re.MULTILINE)
+        return float(m.group(1)) if m else None
+    def get_gauge_by_label(name: str, label_key: str) -> dict:
+        # Maps each value of `label_key` to the sample value of that series.
+        pattern = rf'^{re.escape(name)}\{{[^}}]*{re.escape(label_key)}="([^"]+)"[^}}]*\}}\s+(\S+)'
+        return {
+            m.group(1): float(m.group(2))
+            for m in re.finditer(pattern, text, re.MULTILINE)
+        }
+    def histogram_count(name: str, filter_label: str = "") -> Optional[float]:
+        """Sum `<name>_count` over every series whose labels contain `filter_label`.
+        Returns None when no `_count` sample matches. The label set is optional
+        so unlabelled histograms are counted too, and the filter may appear at
+        any position inside the braces.
+        """
+        pattern = rf"^{re.escape(name)}_count(?:\{{([^}}]*)\}})?\s+(\S+)"
+        values = [
+            float(m.group(2))
+            for m in re.finditer(pattern, text, re.MULTILINE)
+            if not filter_label or filter_label in (m.group(1) or "")
+        ]
+        return sum(values) if values else None
+    def histogram_quantile(name: str, quantile: float, filter_label: str = "") -> float:
+        """Estimate a quantile from cumulative `<name>_bucket` samples.
+        Buckets of all matching series are summed per `le` bound (the
+        equivalent of `sum by (le)`), then linearly interpolated. Returns 0.0
+        when there are no buckets or no observations.
+        """
+        bucket_pattern = rf"^{re.escape(name)}_bucket\{{([^}}]*)\}}\s+(\S+)"
+        merged: dict = {}
+        for m in re.finditer(bucket_pattern, text, re.MULTILINE):
+            labels, count_str = m.group(1), m.group(2)
+            if filter_label and filter_label not in labels:
+                continue
+            # Anchor on a label boundary so labels merely ending in "le"
+            # (e.g. role="...") are never mistaken for the bucket bound.
+            le_m = re.search(r'(?:^|,)\s*le="([^"]+)"', labels)
+            if le_m is None:
+                continue
+            le_str = le_m.group(1)
+            le = inf if le_str == "+Inf" else float(le_str)
+            merged[le] = merged.get(le, 0.0) + float(count_str)
+        if not merged:
+            return 0.0
+        buckets = sorted(merged.items())
+        total = histogram_count(name, filter_label)
+        if total is None:
+            # No _count sample: the highest bucket is cumulative over everything.
+            total = buckets[-1][1]
+        if total == 0:
+            return 0.0
+        target = quantile * total
+        prev_le, prev_count = 0.0, 0.0
+        for le, count in buckets:
+            if count >= target:
+                # Inside the +Inf bucket there is no upper bound to interpolate
+                # towards; report the highest finite bound instead of inf/NaN.
+                if le == inf or count == prev_count:
+                    return prev_le
+                frac = (target - prev_count) / (count - prev_count)
+                return prev_le + frac * (le - prev_le)
+            prev_le, prev_count = le, count
+        finite_bounds = [le for le, _ in buckets if le != inf]
+        return finite_bounds[-1] if finite_bounds else 0.0
+    # Stage durations: only stages that actually recorded observations are kept.
+    stage_metric = "dynamo_frontend_stage_duration_seconds"
+    for stage in ["preprocess", "route", "transport_roundtrip", "postprocess"]:
+        label_filter = f'stage="{stage}"'
+        stage_count = histogram_count(stage_metric, label_filter)
+        if stage_count is not None and stage_count > 0:
+            snap.stage_durations[stage] = {
+                "p50": histogram_quantile(stage_metric, 0.50, label_filter),
+                "p95": histogram_quantile(stage_metric, 0.95, label_filter),
+                "p99": histogram_quantile(stage_metric, 0.99, label_filter),
+            }
+    snap.request_plane_queue_p50 = histogram_quantile(
+        "dynamo_request_plane_queue_seconds", 0.50
+    )
+    snap.request_plane_send_p50 = histogram_quantile(
+        "dynamo_request_plane_send_seconds", 0.50
+    )
+    snap.request_plane_roundtrip_ttft_p50 = histogram_quantile(
+        "dynamo_request_plane_roundtrip_ttft_seconds", 0.50
+    )
+    snap.request_plane_inflight = get_gauge("dynamo_request_plane_inflight") or 0
+    # Transport breakdown (backend-side metrics)
+    for metric_name, attr_name in [
+        ("dynamo_component_network_transit_seconds", "work_handler_network_transit"),
+        (
+            "dynamo_component_time_to_first_response_seconds",
+            "work_handler_time_to_first_response",
+        ),
+    ]:
+        p50 = histogram_quantile(metric_name, 0.50)
+        p95 = histogram_quantile(metric_name, 0.95)
+        p99 = histogram_quantile(metric_name, 0.99)
+        if p50 > 0 or p95 > 0 or p99 > 0:
+            setattr(snap, attr_name, {"p50": p50, "p95": p95, "p99": p99})
+    poll_times = get_gauge_by_label("dynamo_tokio_worker_mean_poll_time_ns", "worker")
+    snap.tokio_worker_mean_poll_time_ns = list(poll_times.values())
+    snap.tokio_event_loop_stall_total = (
+        get_gauge("dynamo_frontend_event_loop_stall_total") or 0
+    )
+    snap.tokio_global_queue_depth = get_gauge("dynamo_tokio_global_queue_depth") or 0
+    snap.tokio_budget_forced_yield_total = (
+        get_gauge("dynamo_tokio_budget_forced_yield_total") or 0
+    )
+    busy_ratios_raw = get_gauge_by_label("dynamo_tokio_worker_busy_ratio", "worker")
+    # NOTE(review): the raw gauge is divided by 1000, which presumes it is
+    # exported in permille - confirm against the metric definition.
+    snap.tokio_worker_busy_ratio = [v / 1000.0 for v in busy_ratios_raw.values()]
+    snap.tcp_pool_active = get_gauge("dynamo_transport_tcp_pool_active") or 0
+    snap.tcp_pool_idle = get_gauge("dynamo_transport_tcp_pool_idle") or 0
+    snap.compute_pool_active = (
+        get_gauge("dynamo_compute_compute_pool_active_tasks") or 0
+    )
+    return snap
+# ─── aiperf loading ────────────────────────────────────────────────────────
+def load_aiperf(obs_dir: Path) -> Optional[AiperfResult]:
+    """Return aiperf metrics found under `<obs_dir>/aiperf`, or None.
+    The two well-known export names are tried first; after that every
+    `*.json` file in the directory is scanned (in sorted order) for one that
+    carries a time-to-first-token key. Files that fail to decode, or whose
+    extraction raises KeyError, are skipped.
+    """
+    aiperf_dir = obs_dir / "aiperf"
+    def _read_json(json_path: Path):
+        with open(json_path) as fh:
+            return json.load(fh)
+    for filename in ("profile_export_aiperf.json", "profile_results.json"):
+        known_path = aiperf_dir / filename
+        if not known_path.exists():
+            continue
+        try:
+            return _extract_aiperf_metrics(_read_json(known_path))
+        except (json.JSONDecodeError, KeyError):
+            pass
+    # Nothing usable under the well-known names: probe any JSON file.
+    if not aiperf_dir.is_dir():
+        return None
+    for json_path in sorted(aiperf_dir.glob("*.json")):
+        try:
+            payload = _read_json(json_path)
+            has_ttft = "time_to_first_token" in payload or "ttft" in payload
+            if has_ttft:
+                return _extract_aiperf_metrics(payload)
+        except (json.JSONDecodeError, KeyError):
+            continue
+    return None
+def load_prometheus(obs_dir: Path) -> Optional[PrometheusSnapshot]:
+    """Load the run's Prometheus snapshot.
+    Prefers a non-empty `prometheus/final_snapshot.txt`; otherwise falls back
+    to `aiperf/prometheus_snapshot.txt` when that file exists. Returns None
+    if neither is available.
+    """
+    primary = obs_dir / "prometheus" / "final_snapshot.txt"
+    if primary.exists() and primary.stat().st_size > 0:
+        return parse_prometheus_text(primary)
+    # Secondary location: snapshot stored next to the aiperf output.
+    fallback = obs_dir / "aiperf" / "prometheus_snapshot.txt"
+    return parse_prometheus_text(fallback) if fallback.exists() else None
+# ─── perf stat parsing ─────────────────────────────────────────────────────
+def parse_perf_stat(obs_dir: Path) -> Optional[dict]:
+    """Read `<obs_dir>/perf/perf_stat.txt` and return counter name -> value.
+    Besides the raw counters, adds `ipc` when an "insn per cycle" figure is
+    present, plus `cache-miss-rate` and `branch-miss-rate` (percentages) when
+    their denominators are positive. Returns None if the file is missing or
+    no counter could be extracted.
+    """
+    stat_file = obs_dir / "perf" / "perf_stat.txt"
+    if not stat_file.exists():
+        return None
+    report = stat_file.read_text()
+    def _first_number(regex: str) -> Optional[float]:
+        # First match wins; thousands separators are stripped before parsing.
+        hit = re.search(regex, report)
+        if hit is None:
+            return None
+        return float(hit.group(1).replace(",", ""))
+    counter_patterns = (
+        ("task-clock", r"([\d,\.]+)\s+msec\s+task-clock"),
+        ("context-switches", r"([\d,\.]+)\s+context-switches"),
+        ("cpu-migrations", r"([\d,\.]+)\s+cpu-migrations"),
+        ("page-faults", r"([\d,\.]+)\s+page-faults"),
+        ("cycles", r"([\d,\.]+)\s+cycles"),
+        ("instructions", r"([\d,\.]+)\s+instructions"),
+        ("branches", r"([\d,\.]+)\s+branches"),
+        ("branch-misses", r"([\d,\.]+)\s+branch-misses"),
+        ("cache-references", r"([\d,\.]+)\s+cache-references"),
+        ("cache-misses", r"([\d,\.]+)\s+cache-misses"),
+    )
+    result = {}
+    for counter, regex in counter_patterns:
+        value = _first_number(regex)
+        if value is not None:
+            result[counter] = value
+    # Instructions-per-cycle as printed by perf itself, when available.
+    ipc = _first_number(r"([\d,\.]+)\s+insn per cycle")
+    if ipc is not None:
+        result["ipc"] = ipc
+    # Derived miss rates, expressed in percent.
+    for rate_key, miss_key, base_key in (
+        ("cache-miss-rate", "cache-misses", "cache-references"),
+        ("branch-miss-rate", "branch-misses", "branches"),
+    ):
+        base = result.get(base_key, 0)
+        misses = result.get(miss_key, 0)
+        if base > 0:
+            result[rate_key] = misses / base * 100
+    return result if result else None
+# ─── bpftrace histogram parsing ────────────────────────────────────────────
+def parse_bpftrace_histograms(text: str) -> list[dict]:
+    """Parse bpftrace histogram output blocks.
+    Each block looks like:
+    @label_name[key]:
+    [0]             7 |@                   |
+    [1]            12 |@@                  |
+    [2, 4)        123 |@@@@@@@@@@          |
+    [1K, 2K)      456 |@@@@@@@@@@@@@@@@@@@@|
+    Args:
+        text: Raw bpftrace output, possibly holding several histograms.
+    Returns:
+        One dict per histogram with at least one parsed bucket:
+        {"label": "name" or "name[key]", "buckets": [{"lo", "hi", "count"}, ...]}.
+        Single-value buckets such as "[1]" are reported as lo=1, hi=2.
+        Bounds with a K/M/G/T/P/E suffix are expanded using powers of 1024.
+        Lines that match neither form (e.g. the "(..., 0)" bucket) are ignored.
+    """
+    unit_mult = {
+        "": 1,
+        "K": 1024,
+        "M": 1024**2,
+        "G": 1024**3,
+        "T": 1024**4,
+        "P": 1024**5,
+        "E": 1024**6,
+    }
+    suffix = r"([KkMmGgTtPpEe])?"
+    label_re = re.compile(r"^@(\w+)(?:\[([^\]]*)\])?:")
+    # [lo, hi)  count |bars|
+    range_re = re.compile(rf"\s*\[(\d+){suffix}\s*,\s*(\d+){suffix}\)\s+(\d+)")
+    # [value]  count |bars|  (bpftrace prints the 0 and 1 buckets this way)
+    single_re = re.compile(rf"\s*\[(\d+){suffix}\]\s+(\d+)")
+    def scaled(digits: str, unit: Optional[str]) -> int:
+        return int(digits) * unit_mult[(unit or "").upper()]
+    histograms = []
+    current_label = None
+    current_buckets = []
+    for line in text.splitlines():
+        label_m = label_re.match(line)
+        if label_m:
+            # A new label closes the histogram collected so far.
+            if current_label and current_buckets:
+                histograms.append({"label": current_label, "buckets": current_buckets})
+            current_label = label_m.group(1)
+            if label_m.group(2):
+                current_label += f"[{label_m.group(2)}]"
+            current_buckets = []
+            continue
+        if not current_label:
+            continue
+        range_m = range_re.match(line)
+        if range_m:
+            current_buckets.append(
+                {
+                    "lo": scaled(range_m.group(1), range_m.group(2)),
+                    "hi": scaled(range_m.group(3), range_m.group(4)),
+                    "count": int(range_m.group(5)),
+                }
+            )
+            continue
+        single_m = single_re.match(line)
+        if single_m:
+            value = scaled(single_m.group(1), single_m.group(2))
+            current_buckets.append(
+                {"lo": value, "hi": value + 1, "count": int(single_m.group(3))}
+            )
+    if current_label and current_buckets:
+        histograms.append({"label": current_label, "buckets": current_buckets})
+    return histograms
+def summarize_histogram(buckets: list[dict]) -> dict:
+    """Compute basic stats (p50, p99, total, max_bucket) from histogram buckets.
+    Args:
+        buckets: Dicts with "lo", "hi" and "count" keys, walked in the order
+            given (expected to be ascending).
+    Returns:
+        {"total", "p50", "p99", "max_bucket"} where each percentile is the
+        midpoint of the first bucket at which the running count reaches that
+        share of the total, and max_bucket is the "hi" bound of the last
+        non-empty bucket. All values are 0 when there are no observations.
+    """
+    total = sum(b["count"] for b in buckets)
+    if total == 0:
+        return {"total": 0, "p50": 0, "p99": 0, "max_bucket": 0}
+    cumulative = 0
+    # None (not 0) marks "not found yet": a midpoint of 0 is a valid result
+    # and must not be overwritten by a later bucket.
+    p50 = p99 = None
+    max_bucket = 0
+    for b in buckets:
+        cumulative += b["count"]
+        mid = (b["lo"] + b["hi"]) / 2
+        if p50 is None and cumulative >= total * 0.50:
+            p50 = mid
+        if p99 is None and cumulative >= total * 0.99:
+            p99 = mid
+        if b["count"] > 0:
+            max_bucket = b["hi"]
+    return {
+        "total": total,
+        "p50": 0 if p50 is None else p50,
+        "p99": 0 if p99 is None else p99,
+        "max_bucket": max_bucket,
+    }
+# ─── timeseries parsing ────────────────────────────────────────────────────
+def parse_timeseries(path: Path, key: str) -> list[tuple[str, float]]:
+    """Parse lines like '2025-01-01T00:00:00+00:00 key=value'.
+    Args:
+        path: Timeseries file; a missing file yields an empty list.
+        key: Name that must directly follow the timestamp on a line.
+    Returns:
+        (timestamp, value) pairs in file order. Values may be integers,
+        decimals, signed, or in exponent notation; lines that do not match
+        are skipped.
+    """
+    if not path.exists():
+        return []
+    # Accept the full decimal syntax so "3.5" is not truncated to 3.
+    number = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?"
+    line_re = re.compile(rf"(\S+)\s+{re.escape(key)}=({number})")
+    points = []
+    for line in path.read_text().splitlines():
+        m = line_re.match(line)
+        if m:
+            points.append((m.group(1), float(m.group(2))))
+    return points
+# ─── nsys SQLite queries ───────────────────────────────────────────────────
+def parse_nvtx_stages(
+    obs_dir: Path,
+) -> Optional[list[dict]]:
+    """Parse NVTX_EVENTS from nsys SQLite, return list of stage dicts.
+    Reads `<obs_dir>/nsys/frontend.sqlite` and aggregates ranges by their
+    `text`, ordered by average duration (longest first).
+    Each dict has keys: name, count, avg_us, min_us, max_us.
+    Returns None when the database or table is missing, when no range
+    qualifies, or when SQLite reports an error.
+    """
+    sqlite_path = obs_dir / "nsys" / "frontend.sqlite"
+    if not sqlite_path.exists():
+        return None
+    conn = None
+    try:
+        conn = sqlite3.connect(str(sqlite_path))
+        tables = [
+            r[0]
+            for r in conn.execute(
+                "SELECT name FROM sqlite_master WHERE type='table'"
+            ).fetchall()
+        ]
+        if "NVTX_EVENTS" not in tables:
+            return None
+        rows = conn.execute(
+            """
+            SELECT text, COUNT(*) as cnt,
+                   AVG(end - start) as avg_ns,
+                   MIN(end - start) as min_ns,
+                   MAX(end - start) as max_ns
+            FROM NVTX_EVENTS
+            WHERE text IS NOT NULL AND end > start
+            GROUP BY text
+            ORDER BY avg_ns DESC
+        """
+        ).fetchall()
+        if not rows:
+            return None
+        return [
+            {
+                "name": text or "?",
+                "count": cnt,
+                "avg_us": avg_ns / 1000,
+                "min_us": min_ns / 1000,
+                "max_us": max_ns / 1000,
+            }
+            for text, cnt, avg_ns, min_ns, max_ns in rows
+        ]
+    except sqlite3.Error as e:
+        logger.debug("parse_nvtx_stages: sqlite error: %s", e)
+        return None
+    finally:
+        # Release the handle on every path, including failed queries.
+        if conn is not None:
+            conn.close()
+def parse_syscall_profile(
+    obs_dir: Path,
+) -> Optional[list[dict]]:
+    """Parse OSRT_API from nsys SQLite (OS runtime API calls).
+    Reads `<obs_dir>/nsys/frontend.sqlite` and returns the 20 entries with
+    the largest total time, largest first. Names are resolved through the
+    StringIds table when it exists; unresolved ids appear as "id=<n>".
+    Each dict has keys: name, count, avg_us, total_ms.
+    Returns None when the database or table is missing, when no call
+    qualifies, or when SQLite reports an error.
+    """
+    sqlite_path = obs_dir / "nsys" / "frontend.sqlite"
+    if not sqlite_path.exists():
+        return None
+    conn = None
+    try:
+        conn = sqlite3.connect(str(sqlite_path))
+        tables = [
+            r[0]
+            for r in conn.execute(
+                "SELECT name FROM sqlite_master WHERE type='table'"
+            ).fetchall()
+        ]
+        if "OSRT_API" not in tables:
+            return None
+        rows = conn.execute(
+            """
+            SELECT nameId, COUNT(*) as cnt,
+                   AVG(end - start) as avg_ns,
+                   SUM(end - start) as total_ns
+            FROM OSRT_API
+            WHERE end > start
+            GROUP BY nameId
+            ORDER BY total_ns DESC
+            LIMIT 20
+        """
+        ).fetchall()
+        if not rows:
+            return None
+        # Try to resolve names from StringIds table (only needed once there
+        # is something to report).
+        name_map = {}
+        if "StringIds" in tables:
+            for row in conn.execute("SELECT id, value FROM StringIds").fetchall():
+                name_map[row[0]] = row[1]
+        return [
+            {
+                "name": name_map.get(name_id, f"id={name_id}"),
+                "count": cnt,
+                "avg_us": avg_ns / 1000,
+                "total_ms": total_ns / 1e6,
+            }
+            for name_id, cnt, avg_ns, total_ns in rows
+        ]
+    except sqlite3.Error as e:
+        logger.debug("parse_syscall_profile: sqlite error: %s", e)
+        return None
+    finally:
+        # Release the handle on every path, including failed queries.
+        if conn is not None:
+            conn.close()
+def parse_nsys_context_switches(
+    obs_dir: Path,
+) -> Optional[dict]:
+    """Parse SCHED_EVENTS from nsys SQLite.
+    Reads `<obs_dir>/nsys/frontend.sqlite`.
+    Returns dict with keys: total (row count) and avg_duration (the average of
+    `endGlobalTid - startGlobalTid`, exactly as queried below).
+    Returns None when the database or table is missing, when the table is
+    empty, or when SQLite reports an error.
+    """
+    sqlite_path = obs_dir / "nsys" / "frontend.sqlite"
+    if not sqlite_path.exists():
+        return None
+    conn = None
+    try:
+        conn = sqlite3.connect(str(sqlite_path))
+        tables = [
+            r[0]
+            for r in conn.execute(
+                "SELECT name FROM sqlite_master WHERE type='table'"
+            ).fetchall()
+        ]
+        if "SCHED_EVENTS" not in tables:
+            return None
+        # NOTE(review): the column names read like thread identifiers rather
+        # than timestamps, so this may not be a duration - and if the export
+        # lacks these columns the query fails and None is returned. Confirm
+        # against the nsys SQLite export schema.
+        row = conn.execute(
+            """
+            SELECT COUNT(*) as total,
+                   AVG(endGlobalTid - startGlobalTid) as avg_duration
+            FROM SCHED_EVENTS
+        """
+        ).fetchone()
+        if not row or row[0] == 0:
+            return None
+        return {"total": row[0], "avg_duration": row[1]}
+    except sqlite3.Error as e:
+        logger.debug("parse_nsys_context_switches: sqlite error: %s", e)
+        return None
+    finally:
+        # Release the handle on every path, including failed queries.
+        if conn is not None:
+            conn.close()
+# ─── Directory utilities ───────────────────────────────────────────────────
+def find_latest_obs_dir(repo_root: Path) -> Optional[Path]:
+    """Find the most recent artifacts/obs_* directory.
+    "Most recent" means last in lexicographic path order. Entries matching
+    `obs_*` that are not directories (stray files, archives) are ignored.
+    Returns None when `<repo_root>/artifacts` does not exist or holds no
+    matching directory.
+    """
+    artifacts = repo_root / "artifacts"
+    if not artifacts.exists():
+        return None
+    return max((p for p in artifacts.glob("obs_*") if p.is_dir()), default=None)
--- a/benchmarks/frontend/scripts/run_perf.sh
+++ b/benchmarks/frontend/scripts/run_perf.sh
--- a/benchmarks/frontend/scripts/sweep_runner.py
+++ b/benchmarks/frontend/scripts/sweep_runner.py
--- a/lib/bindings/python/rust/llm/entrypoint.rs
+++ b/lib/bindings/python/rust/llm/entrypoint.rs
@@ -339,6 +339,11 @@ pub fn make_engine<'p>(
            } else {
                // Mocker only needs tokenizer, not weights
                let ignore_weights = matches!(args.engine_type, EngineType::Mocker);
+                // Preserve the original HF model ID as source_path so the
+                // frontend can resolve model metadata even when the served
+                // model name differs (e.g., --model-name model-1 --model-path
+                // Qwen/Qwen3-0.6B).
+                builder.source_path(model_path.clone());
                LocalModel::fetch(&model_path.display().to_string(), ignore_weights)
                    .await
                    .map_err(to_pyerr)?