feat(frontend): three-layer frontend perf sweep with local and k8s support (#7700)

273252e6 · Biswa Panda · GitHub · 023a299c · 273252e6 · 273252e6
Unverified Commit 273252e6 authored Mar 31, 2026 by Biswa Panda Committed by GitHub Mar 31, 2026
8 changed files
--- a/benchmarks/frontend/scripts/sweep_executors/local.py
+++ b/benchmarks/frontend/scripts/sweep_executors/local.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+LocalExecutor -- wraps run_perf.sh for local sweep execution.
+
+This executor delegates each run to run_perf.sh, which handles service
+lifecycle (mocker + frontend), observability captures, and aiperf load.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import signal
+import socket
+import subprocess
+import time
+from pathlib import Path
+from typing import Optional
+
+from sweep_core.models import DeployDimension, RunResult, RunSpec, SweepConfig
+
+SCRIPT_DIR = Path(__file__).resolve().parent.parent
+
+
+class LocalExecutor:
+    """Executor that delegates runs to run_perf.sh."""
+
+    def __init__(self) -> None:
+        self._config: Optional[SweepConfig] = None
+        self._frontend_port: int = 8000
+
+    def prepare(self, config: SweepConfig) -> None:
+        """Store config for use during runs."""
+        self._config = config
+        self._frontend_port = 8000  # local mode always uses 8000
+
+    def apply_deploy(
+        self,
+        deploy: DeployDimension,
+        prev: Optional[DeployDimension],
+    ) -> None:
+        """In local mode, run_perf.sh handles its own service lifecycle.
+
+        We just wait for the port to be free from the previous run.
+        """
+        _wait_port_free(self._frontend_port)
+
+    def execute_run(self, run_spec: RunSpec, run_dir: Path) -> RunResult:
+        """Execute a single run via run_perf.sh."""
+        if self._config is None:
+            raise RuntimeError("prepare() must be called before execute_run()")
+        config = self._config
+        deploy = run_spec.deploy
+        aiperf = run_spec.aiperf
+
+        result = RunResult(run_spec=run_spec, run_dir=str(run_dir))
+
+        cmd = [
+            str(SCRIPT_DIR / "run_perf.sh"),
+            "--model",
+            config.model,
+            "--isl",
+            str(aiperf.isl),
+            "--osl",
+            str(aiperf.osl),
+            "--concurrency",
+            str(aiperf.concurrency),
+            "--workers",
+            str(deploy.workers),
+            "--speedup-ratio",
+            str(config.speedup_ratio),
+            "--num-models",
+            str(deploy.num_models),
+            "--aiperf-targets",
+            config.aiperf_targets,
+            "--output-dir",
+            str(run_dir),
+        ]
+
+        if aiperf.benchmark_duration:
+            cmd.extend(["--benchmark-duration", str(aiperf.benchmark_duration)])
+        if aiperf.num_requests:
+            cmd.extend(["--num-requests", str(aiperf.num_requests)])
+        if aiperf.request_rate:
+            cmd.extend(["--request-rate", str(aiperf.request_rate)])
+        if deploy.tokenizer in ("fast", "fastokens"):
+            cmd.append("--fast-tokens")
+
+        # TODO: when run_perf.sh gains --backend vllm support, pass it here
+        if deploy.backend == "vllm":
+            print(
+                "    WARNING: vllm backend not yet supported by run_perf.sh; using mocker"
+            )
+
+        # Passthrough args (e.g., --skip-bpf --skip-nsys)
+        cmd.extend(config.passthrough_args)
+
+        print(f"    cmd: {' '.join(cmd[:6])}...")
+
+        try:
+            proc = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                text=True,
+                start_new_session=True,
+            )
+            stdout, _ = proc.communicate(timeout=600)
+
+            if proc.returncode == 0:
+                result.status = "ok"
+            else:
+                result.status = "fail"
+                print(f"    run_perf.sh failed (rc={proc.returncode})")
+                lines = (stdout or "").strip().split("\n")
+                for line in lines[-5:]:
+                    print(f"      {line}")
+
+        except subprocess.TimeoutExpired:
+            result.status = "fail"
+            print("    TIMEOUT after 600s")
+            try:
+                pgid = os.getpgid(proc.pid)
+                os.killpg(pgid, signal.SIGTERM)
+                time.sleep(2)
+                os.killpg(pgid, signal.SIGKILL)
+            except ProcessLookupError:
+                pass
+        except Exception as e:
+            result.status = "fail"
+            print(f"    ERROR: {e}")
+
+        # Parse aiperf results
+        _parse_aiperf_into_result(result, run_dir)
+
+        return result
+
+    def cleanup(self) -> None:
+        """No persistent state to clean up in local mode."""
+        pass
+
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+
+
+def _parse_aiperf_json(json_path: Path) -> dict:
+    """Parse aiperf profile_export_aiperf.json."""
+    if not json_path.exists():
+        return {}
+    try:
+        data = json.loads(json_path.read_text())
+        result = {}
+        rt = data.get("request_throughput", {})
+        result["req_per_sec"] = rt.get("avg", 0)
+        ot = data.get("output_token_throughput", {})
+        result["output_tok_per_sec"] = ot.get("avg", 0)
+        ttft = data.get("time_to_first_token", data.get("ttft", {}))
+        if isinstance(ttft, dict):
+            result["ttft_p50_ms"] = ttft.get("p50", 0) or 0
+            result["ttft_p99_ms"] = ttft.get("p99", 0) or 0
+        itl = data.get("inter_token_latency", data.get("itl", {}))
+        if isinstance(itl, dict):
+            result["itl_p50_ms"] = itl.get("p50", 0) or 0
+            result["itl_p99_ms"] = itl.get("p99", 0) or 0
+        bd = data.get("benchmark_duration", 0)
+        result["duration_sec"] = bd.get("avg", 0) if isinstance(bd, dict) else (bd or 0)
+        return result
+    except (json.JSONDecodeError, KeyError, TypeError):
+        return {}
+
+
+def _parse_aiperf_into_result(result: RunResult, run_dir: Path) -> None:
+    """Parse aiperf results from the run directory into the RunResult."""
+    aiperf_json = run_dir / "aiperf" / "profile_export_aiperf.json"
+    if not aiperf_json.exists():
+        # Multi-model: results are in aiperf/<model-name>/
+        for candidate in sorted(
+            (run_dir / "aiperf").glob("*/profile_export_aiperf.json")
+        ):
+            aiperf_json = candidate
+            break
+    metrics = _parse_aiperf_json(aiperf_json)
+    if metrics:
+        result.req_per_sec = metrics.get("req_per_sec", 0)
+        result.output_tok_per_sec = metrics.get("output_tok_per_sec", 0)
+        result.ttft_p50_ms = metrics.get("ttft_p50_ms", 0)
+        result.ttft_p99_ms = metrics.get("ttft_p99_ms", 0)
+        result.itl_p50_ms = metrics.get("itl_p50_ms", 0)
+        result.itl_p99_ms = metrics.get("itl_p99_ms", 0)
+        result.duration_sec = metrics.get("duration_sec", 0)
+
+
+def _port_free(port: int) -> bool:
+    """Check if a port is free."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        return s.connect_ex(("127.0.0.1", port)) != 0
+
+
+def _kill_port(port: int) -> None:
+    """Kill any process holding a port."""
+    subprocess.run(
+        f"fuser -k -TERM {port}/tcp", shell=True, capture_output=True, timeout=5
+    )
+    time.sleep(2)
+    subprocess.run(
+        f"fuser -k -KILL {port}/tcp", shell=True, capture_output=True, timeout=5
+    )
+
+
+def _wait_port_free(port: int, timeout: int = 30) -> None:
+    """Wait for a port to become free."""
+    for i in range(timeout):
+        if _port_free(port):
+            return
+        if i == 0:
+            print(f"  Waiting for port {port} to free...")
+        time.sleep(1)
+    print(f"  Forcing port {port} release...")
+    _kill_port(port)
+    time.sleep(2)
--- a/benchmarks/frontend/scripts/sweep_k8s/__init__.py
+++ b/benchmarks/frontend/scripts/sweep_k8s/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""sweep_k8s -- Kubernetes subprocess helpers for DGD-based sweeps."""
--- a/benchmarks/frontend/scripts/sweep_k8s/aiperf.py
+++ b/benchmarks/frontend/scripts/sweep_k8s/aiperf.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+K8s aiperf Job launcher.
+
+Runs aiperf as a k8s Job inside the same namespace as the DGD, using the
+in-cluster service DNS endpoint. Uses python:3.12-slim with pip-installed
+aiperf (same pattern as recipes/qwen3-235b-a22b-fp8/trtllm/agg/perf.yaml).
+
+Artifacts are written inside the pod, then copied back to the local host
+via kubectl cp.
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import time
+from pathlib import Path
+from typing import Optional
+
+from sweep_k8s.kubectl import run_kubectl
+
+DEFAULT_HF_TOKEN_SECRET_NAME = "hf-token-secret"
+
+
+def _build_aiperf_script(
+    model_name: str,
+    endpoint: str,
+    concurrency: int,
+    isl: int,
+    osl: int = 256,
+    benchmark_duration: Optional[int] = None,
+    num_requests: Optional[int] = None,
+    request_rate: Optional[int] = None,
+    warmup_duration: Optional[int] = None,
+    warmup_count: Optional[int] = None,
+    export_level: str = "summary",
+) -> str:
+    """Build the shell script that runs inside the Job container."""
+    # Build load-control args
+    load_args = ""
+    if benchmark_duration:
+        load_args += f" --benchmark-duration {benchmark_duration}"
+    if num_requests:
+        load_args += f" --request-count {num_requests}"
+    if request_rate:
+        load_args += f" --request-rate {request_rate}"
+    if not load_args.strip():
+        auto_count = max(concurrency * 20, 640)
+        load_args = f" --request-count {auto_count}"
+
+    # Warmup args
+    warmup_args = ""
+    if warmup_duration:
+        warmup_args = f" --warmup-duration {warmup_duration}"
+    elif warmup_count:
+        warmup_args = f" --warmup-request-count {warmup_count}"
+    else:
+        warmup_args = f" --warmup-request-count {concurrency}"
+
+    return f"""set -e
+apt-get update -qq && apt-get install -y -qq curl jq git procps 2>/dev/null
+pip install --quiet git+https://github.com/ai-dynamo/aiperf.git@54cd6dc820bff8bfebc875da104e59d745e14f75
+echo "aiperf installed"
+
+# Wait for model
+echo "Waiting for model '{model_name}' at http://{endpoint}/v1/models..."
+while ! curl -sf "http://{endpoint}/v1/models" 2>/dev/null | \\
+      jq -e --arg m "{model_name}" '.data[]? | select(.id == $m)' >/dev/null 2>&1; do
+    echo "  Model not ready, sleeping 5s..."
+    sleep 5
+done
+echo "Model ready!"
+
+# Write artifacts to PVC so they persist after pod completion
+ARTIFACT_DIR="${{ARTIFACT_PVC_DIR:-/model-cache/perf/${{JOB_NAME}}}}"
+mkdir -p "$ARTIFACT_DIR"
+echo "Running aiperf: c={concurrency} isl={isl} osl={osl}"
+echo "Artifact dir: $ARTIFACT_DIR"
+aiperf profile \\
+    --artifact-dir "$ARTIFACT_DIR" \\
+    --model "{model_name}" \\
+    --tokenizer "{model_name}" \\
+    --endpoint-type chat \\
+    --endpoint /v1/chat/completions \\
+    --streaming \\
+    --url "http://{endpoint}" \\
+    --synthetic-input-tokens-mean {isl} \\
+    --synthetic-input-tokens-stddev 0 \\
+    --output-tokens-mean {osl} \\
+    --output-tokens-stddev 0 \\
+    --extra-inputs "max_tokens:{osl}" \\
+    --extra-inputs "min_tokens:{osl}" \\
+    --extra-inputs "ignore_eos:true" \\
+    --extra-inputs "repetition_penalty:1.0" \\
+    --extra-inputs "temperature:0.0" \\
+    --concurrency {concurrency} \\
+    {load_args.strip()} \\
+    {warmup_args.strip()} \\
+    --num-dataset-entries 12800 \\
+    --random-seed 100 \\
+    --workers-max {concurrency} \\
+    --record-processors 32 \\
+    --export-level {export_level} \\
+    --ui simple
+
+echo "aiperf done. Artifacts:"
+ls -la "$ARTIFACT_DIR"/
+"""
+
+
+def _indent(text: str, spaces: int) -> str:
+    """Indent each line of text by N spaces."""
+    prefix = " " * spaces
+    return "\n".join(prefix + line for line in text.split("\n"))
+
+
+def _build_job_yaml(
+    job_name: str,
+    namespace: str,
+    script: str,
+    image_pull_secret: str = "",
+    hf_token_secret_name: str = DEFAULT_HF_TOKEN_SECRET_NAME,
+) -> str:
+    """Build the aiperf k8s Job YAML.
+
+    Uses python:3.12-slim with pip-installed aiperf (same pattern as
+    recipes/qwen3-235b-a22b-fp8/trtllm/agg/perf.yaml).
+    """
+    image_pull_secret_block = ""
+    if image_pull_secret:
+        image_pull_secret_block = f"""
+      imagePullSecrets:
+        - name: {image_pull_secret}"""
+
+    return f"""apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {job_name}
+  namespace: {namespace}
+  labels:
+    app: sweep-aiperf
+spec:
+  backoffLimit: 0
+  completions: 1
+  parallelism: 1
+  ttlSecondsAfterFinished: 600
+  template:
+    metadata:
+      labels:
+        app: sweep-aiperf
+        job-name: {job_name}
+    spec:
+      restartPolicy: Never
+{image_pull_secret_block}
+      securityContext:
+        sysctls:
+          - name: net.ipv4.ip_local_port_range
+            value: "1024 65000"
+      containers:
+        - name: aiperf
+          image: python:3.12-slim
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            allowPrivilegeEscalation: false
+          command:
+            - /bin/bash
+            - -c
+            - |
+{_indent(script, 14)}
+          env:
+            - name: HF_HOME
+              value: /model-cache
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: {hf_token_secret_name}
+                  key: HF_TOKEN
+            - name: PYTHONUNBUFFERED
+              value: "1"
+            - name: AIPERF_HTTP_CONNECTION_LIMIT
+              value: "512"
+            - name: JOB_NAME
+              value: {job_name}
+            - name: ARTIFACT_PVC_DIR
+              value: /model-cache/perf/{job_name}
+          volumeMounts:
+            - name: model-cache
+              mountPath: /model-cache
+      volumes:
+        - name: model-cache
+          persistentVolumeClaim:
+            claimName: model-cache
+"""
+
+
+def _wait_for_job(
+    job_name: str,
+    namespace: str,
+    timeout: int = 600,
+) -> bool:
+    """Poll for Job completion. Returns True if succeeded."""
+    waited = 0
+    while waited < timeout:
+        try:
+            result = run_kubectl(
+                ["get", "job", job_name, "-o", "json"],
+                namespace=namespace,
+                check=False,
+            )
+            if result.returncode != 0:
+                time.sleep(5)
+                waited += 5
+                continue
+
+            job_data = json.loads(result.stdout)
+            conditions = job_data.get("status", {}).get("conditions", [])
+            for cond in conditions:
+                if cond.get("type") == "Complete" and cond.get("status") == "True":
+                    print(f"  aiperf Job completed (waited {waited}s)")
+                    return True
+                if cond.get("type") == "Failed" and cond.get("status") == "True":
+                    print(f"  aiperf Job FAILED (waited {waited}s)")
+                    _print_job_logs(job_name, namespace)
+                    return False
+        except (json.JSONDecodeError, subprocess.SubprocessError, OSError) as e:
+            print(f"  Transient error polling job {job_name} in {namespace}: {e}")
+
+        time.sleep(5)
+        waited += 5
+        if waited % 30 == 0:
+            print(f"  aiperf Job running ({waited}s / {timeout}s)...")
+
+    print(f"  aiperf Job timed out after {timeout}s")
+    _print_job_logs(job_name, namespace)
+    return False
+
+
+def _print_job_logs(job_name: str, namespace: str, tail: int = 20) -> None:
+    """Print last N lines of the Job pod logs."""
+    result = run_kubectl(
+        ["logs", f"job/{job_name}", f"--tail={tail}"],
+        namespace=namespace,
+        check=False,
+    )
+    if result.stdout:
+        print(f"  --- Last {tail} lines of aiperf logs ---")
+        for line in result.stdout.strip().split("\n"):
+            print(f"    {line}")
+
+
+def _get_job_pod_name(job_name: str, namespace: str) -> Optional[str]:
+    """Get the pod name for a Job."""
+    result = run_kubectl(
+        [
+            "get",
+            "pods",
+            "-l",
+            f"job-name={job_name}",
+            "-o",
+            "jsonpath={.items[0].metadata.name}",
+        ],
+        namespace=namespace,
+        check=False,
+    )
+    name = result.stdout.strip()
+    return name if name else None
+
+
+def _copy_artifacts_from_pvc(
+    job_name: str,
+    namespace: str,
+    local_dir: Path,
+) -> bool:
+    """Copy aiperf artifacts from the model-cache PVC to the local filesystem.
+
+    Spins up a temporary busybox pod that mounts the PVC, uses kubectl cp
+    to extract the artifacts, then deletes the pod.
+
+    Returns True if artifacts were successfully copied and the expected
+    profile_export_aiperf.json exists, False otherwise.
+    """
+    local_dir.mkdir(parents=True, exist_ok=True)
+    artifacts_ok = False
+    helper_name = f"copy-{job_name[-20:]}"
+    pvc_path = f"/model-cache/perf/{job_name}"
+
+    try:
+        # Create a helper pod to access the PVC
+        helper_yaml = f"""apiVersion: v1
+kind: Pod
+metadata:
+  name: {helper_name}
+  namespace: {namespace}
+spec:
+  restartPolicy: Never
+  containers:
+    - name: copy
+      image: busybox:latest
+      command: ["sh", "-c", "echo ready && sleep 300"]
+      volumeMounts:
+        - name: model-cache
+          mountPath: /model-cache
+          readOnly: true
+  volumes:
+    - name: model-cache
+      persistentVolumeClaim:
+        claimName: model-cache
+"""
+        run_kubectl(["apply", "-f", "-"], namespace=namespace, input_data=helper_yaml)
+
+        # Wait for helper pod to be ready
+        for _ in range(30):
+            result = run_kubectl(
+                ["get", "pod", helper_name, "-o", "jsonpath={.status.phase}"],
+                namespace=namespace,
+                check=False,
+            )
+            if result.stdout.strip() == "Running":
+                break
+            time.sleep(2)
+
+        # List what's on the PVC
+        result = run_kubectl(
+            ["exec", helper_name, "--", "ls", "-la", pvc_path],
+            namespace=namespace,
+            check=False,
+        )
+        if result.stdout:
+            print(f"  PVC artifacts ({pvc_path}):")
+            for line in result.stdout.strip().split("\n")[:6]:
+                print(f"    {line}")
+
+        # Copy artifacts locally
+        subprocess.run(
+            [
+                "kubectl",
+                "cp",
+                f"{namespace}/{helper_name}:{pvc_path}/",
+                str(local_dir) + "/",
+            ],
+            capture_output=True,
+            text=True,
+            check=True,
+            timeout=120,
+        )
+        files = list(local_dir.glob("*"))
+        print(f"  Copied {len(files)} artifact files to local")
+        for f in sorted(files)[:5]:
+            print(f"    {f.name} ({f.stat().st_size} bytes)")
+
+        expected = local_dir / "profile_export_aiperf.json"
+        if expected.exists() and expected.stat().st_size > 0:
+            artifacts_ok = True
+        else:
+            print(f"  WARNING: expected artifact missing or empty: {expected.name}")
+
+    except Exception as e:
+        print(f"  WARNING: artifact copy failed: {e}")
+    finally:
+        # Cleanup helper pod
+        run_kubectl(
+            ["delete", "pod", helper_name, "--ignore-not-found", "--grace-period=0"],
+            namespace=namespace,
+            check=False,
+        )
+
+    return artifacts_ok
+
+
+def run_aiperf(
+    artifact_dir: Path,
+    endpoint: str,
+    model_name: str,
+    concurrency: int,
+    isl: int,
+    namespace: str,
+    image: str,
+    run_id: str,
+    osl: int = 256,
+    benchmark_duration: Optional[int] = None,
+    num_requests: Optional[int] = None,
+    request_rate: Optional[int] = None,
+    warmup_duration: Optional[int] = None,
+    warmup_count: Optional[int] = None,
+    export_level: str = "summary",
+    image_pull_secret: str = "",
+    hf_token_secret_name: str = DEFAULT_HF_TOKEN_SECRET_NAME,
+    timeout: int = 600,
+) -> bool:
+    """Run aiperf as a k8s Job inside the namespace.
+
+    Creates a Job with python:3.12-slim, installs aiperf via pip, runs the
+    benchmark against the in-cluster service endpoint, then copies artifacts
+    back to the local filesystem.
+
+    Args:
+        artifact_dir: Local directory for aiperf artifacts.
+        endpoint: In-cluster frontend endpoint (service:port).
+        model_name: Model name for aiperf --model.
+        concurrency: Concurrency level.
+        isl: Input sequence length.
+        namespace: K8s namespace.
+        image: Container image (unused -- uses python:3.12-slim).
+        run_id: Unique run identifier (used in Job name).
+        osl: Output sequence length.
+        benchmark_duration: Optional benchmark duration in seconds.
+        num_requests: Optional request count.
+        request_rate: Optional request rate limit.
+        warmup_duration: Optional warmup duration in seconds.
+        warmup_count: Optional warmup request count.
+        export_level: aiperf export level (summary, records, raw).
+        image_pull_secret: Optional image pull secret for the Job pod.
+        hf_token_secret_name: Secret name that stores HF_TOKEN.
+        timeout: Job timeout in seconds.
+
+    Returns:
+        True if aiperf succeeded, False otherwise.
+    """
+    # Sanitize run_id for k8s naming (lowercase, no underscores, max 63 chars)
+    safe_id = run_id.lower().replace("_", "-")[:40]
+    ts = str(int(time.time()))[-6:]
+    job_name = f"aiperf-{safe_id}-{ts}"
+
+    print(f"  Creating aiperf Job: {job_name} (c={concurrency} isl={isl})")
+
+    script = _build_aiperf_script(
+        model_name=model_name,
+        endpoint=endpoint,
+        concurrency=concurrency,
+        isl=isl,
+        osl=osl,
+        benchmark_duration=benchmark_duration,
+        num_requests=num_requests,
+        request_rate=request_rate,
+        warmup_duration=warmup_duration,
+        warmup_count=warmup_count,
+        export_level=export_level,
+    )
+
+    job_yaml = _build_job_yaml(
+        job_name=job_name,
+        namespace=namespace,
+        script=script,
+        image_pull_secret=image_pull_secret,
+        hf_token_secret_name=hf_token_secret_name,
+    )
+
+    # Create the Job
+    try:
+        run_kubectl(
+            ["apply", "-f", "-"],
+            namespace=namespace,
+            input_data=job_yaml,
+        )
+    except Exception as e:
+        print(f"  ERROR: Failed to create aiperf Job: {e}")
+        return False
+
+    # Wait for completion
+    success = _wait_for_job(job_name, namespace, timeout=timeout)
+
+    # Copy artifacts from PVC regardless of success (partial results may exist)
+    artifacts_ok = _copy_artifacts_from_pvc(job_name, namespace, artifact_dir)
+    if success and not artifacts_ok:
+        print("  Job succeeded but artifacts missing -- marking as failure")
+        success = False
+
+    # Print logs on failure
+    if not success:
+        _print_job_logs(job_name, namespace, tail=30)
+
+    # Clean up the Job
+    run_kubectl(
+        ["delete", "job", job_name, "--ignore-not-found"],
+        namespace=namespace,
+        check=False,
+    )
+
+    return success
--- a/benchmarks/frontend/scripts/sweep_k8s/dgd.py
+++ b/benchmarks/frontend/scripts/sweep_k8s/dgd.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+DynamoGraphDeployment helpers -- backend switch, restart, readiness.
+
+Ported from sweep.sh functions: dgd_switch_backend, dgd_restart_frontend,
+dgd_restart_graph, dgd_wait_all_ready.
+"""
+
+from __future__ import annotations
+
+import json
+import random
+import subprocess
+import time
+import urllib.error
+import urllib.request
+
+from sweep_k8s.kubectl import (
+    delete_pod,
+    get_json,
+    get_pod_name,
+    patch_json,
+    patch_merge,
+    run_kubectl,
+    wait_for_pod_deletion,
+    wait_pod,
+)
+
+# Tokenizer backend name mapping for DGD env vars
+TOKENIZER_BACKEND_MAP = {
+    "hf": "default",
+    "default": "default",
+    "fast": "fast",
+    "fastokens": "fast",
+}
+
+
+def dgd_label_selector(dgd_name: str, component_type: str) -> str:
+    """Build a label selector for DGD-managed pods."""
+    return (
+        f"nvidia.com/dynamo-graph-deployment-name={dgd_name},"
+        f"nvidia.com/dynamo-component-type={component_type}"
+    )
+
+
+def wait_model_ready(
+    endpoint: str,
+    model_name: str,
+    max_wait: int = 300,
+    namespace: str = "",
+) -> None:
+    """Wait for a model to be registered at the frontend /v1/models endpoint.
+
+    Tries direct HTTP first. If the endpoint is not reachable from localhost
+    (in-cluster DNS), falls back to kubectl run to check from inside the cluster.
+    """
+    print(f"  Waiting for model '{model_name}' at http://{endpoint}/v1/models...")
+    waited = 0
+    while True:
+        # Try direct HTTP (works if endpoint is port-forwarded or localhost)
+        try:
+            req = urllib.request.Request(
+                f"http://{endpoint}/v1/models",
+                headers={"Accept": "application/json"},
+            )
+            with urllib.request.urlopen(req, timeout=10) as resp:
+                data = json.loads(resp.read().decode())
+                models = data.get("data", [])
+                if any(m.get("id") == model_name for m in models):
+                    print(f"  Model ready (waited {waited}s)")
+                    return
+        except (urllib.error.URLError, json.JSONDecodeError, OSError, ValueError):
+            pass
+
+        # Fallback: kubectl-based check for in-cluster endpoints
+        if namespace and _check_model_via_kubectl(endpoint, model_name, namespace):
+            print(f"  Model ready via kubectl (waited {waited}s)")
+            return
+
+        time.sleep(5)
+        waited += 5
+        if waited >= max_wait:
+            print(f"ERROR: Model not ready after {max_wait}s")
+            raise TimeoutError(f"Model '{model_name}' not ready after {max_wait}s")
+        if waited % 15 == 0:
+            print(f"  Still waiting ({waited}s / {max_wait}s)...")
+
+
+def _check_model_via_kubectl(
+    endpoint: str,
+    model_name: str,
+    namespace: str,
+) -> bool:
+    """Check model readiness by running curl from inside the cluster."""
+    pod_name = f"model-check-{int(time.time())}-{random.randint(0, 9999)}"
+    try:
+        result = subprocess.run(
+            [
+                "kubectl",
+                "run",
+                pod_name,
+                "--rm",
+                "-i",
+                "--restart=Never",
+                "-n",
+                namespace,
+                "--quiet",
+                "--image=curlimages/curl:latest",
+                "--",
+                "-sf",
+                f"http://{endpoint}/v1/models",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=20,
+        )
+        if result.returncode == 0 and result.stdout.strip():
+            data = json.loads(result.stdout)
+            models = data.get("data", [])
+            return any(m.get("id") == model_name for m in models)
+    except (subprocess.SubprocessError, json.JSONDecodeError, OSError):
+        pass
+    return False
+
+
+def dgd_wait_all_ready(
+    dgd_name: str,
+    namespace: str,
+    endpoint: str,
+    model_name: str,
+    max_wait: int = 300,
+) -> None:
+    """Wait for all DGD worker pods to be Ready, then wait for model endpoint."""
+    print("  Waiting for all worker pods to be Ready...")
+    retries = 3
+    for attempt in range(retries):
+        try:
+            wait_pod(
+                dgd_label_selector(dgd_name, "worker"),
+                namespace,
+                timeout=max_wait,
+            )
+            break
+        except subprocess.TimeoutExpired:
+            raise
+        except subprocess.CalledProcessError as e:
+            if attempt < retries - 1:
+                print(f"  kubectl error (attempt {attempt + 1}/{retries}), retrying...")
+                time.sleep(5)
+            else:
+                raise RuntimeError(
+                    f"Worker pods not ready after {retries} retries: {e}"
+                ) from e
+
+    wait_model_ready(endpoint, model_name, max_wait, namespace=namespace)
+
+
+def dgd_switch_backend(
+    dgd_name: str,
+    namespace: str,
+    endpoint: str,
+    model_name: str,
+    backend: str,
+) -> None:
+    """Switch tokenizer backend on a DynamoGraphDeployment.
+
+    Patches the DGD spec to set DYN_TOKENIZER_BACKEND; the Grove operator
+    recreates the frontend pod automatically.
+    """
+    mapped_backend = TOKENIZER_BACKEND_MAP.get(backend, backend)
+    print(
+        f"\n--- Switching DGD tokenizer backend -> {mapped_backend} (dgd={dgd_name}) ---"
+    )
+
+    # Find the index of DYN_TOKENIZER_BACKEND in the Frontend env array
+    try:
+        dgd_json = get_json("dgd", dgd_name, namespace)
+        env_list = (
+            dgd_json.get("spec", {})
+            .get("services", {})
+            .get("Frontend", {})
+            .get("extraPodSpec", {})
+            .get("mainContainer", {})
+            .get("env", [])
+        )
+        idx = None
+        for i, env_var in enumerate(env_list):
+            if env_var.get("name") == "DYN_TOKENIZER_BACKEND":
+                idx = i
+                break
+    except Exception:
+        idx = None
+
+    # Capture the current frontend pod name BEFORE patching so we track
+    # the right pod for deletion (avoids racing with the operator).
+    old_pod = get_pod_name(
+        dgd_label_selector(dgd_name, "frontend"),
+        namespace,
+    )
+
+    if idx is not None:
+        patch_json(
+            "dgd",
+            dgd_name,
+            namespace,
+            [
+                {
+                    "op": "replace",
+                    "path": f"/spec/services/Frontend/extraPodSpec/mainContainer/env/{idx}/value",
+                    "value": mapped_backend,
+                }
+            ],
+        )
+    else:
+        patch_json(
+            "dgd",
+            dgd_name,
+            namespace,
+            [
+                {
+                    "op": "add",
+                    "path": "/spec/services/Frontend/extraPodSpec/mainContainer/env/-",
+                    "value": {"name": "DYN_TOKENIZER_BACKEND", "value": mapped_backend},
+                }
+            ],
+        )
+
+    print("  DGD patched -- waiting for frontend pod replacement...")
+    if old_pod:
+        print(f"  Waiting for old pod {old_pod} to terminate...")
+        wait_for_pod_deletion(old_pod, namespace, timeout=120)
+
+    # Wait for new frontend pod to be Ready
+    print("  Waiting for new frontend pod to be Ready...")
+    wait_pod(
+        dgd_label_selector(dgd_name, "frontend"),
+        namespace,
+        timeout=300,
+    )
+
+    dgd_wait_all_ready(dgd_name, namespace, endpoint, model_name)
+
+
+def dgd_restart_frontend(
+    dgd_name: str,
+    namespace: str,
+    endpoint: str,
+    model_name: str,
+) -> None:
+    """Restart only the frontend component to reset metrics counters."""
+    print("  Restarting frontend pod to reset metrics counters...")
+
+    old_pod = get_pod_name(
+        dgd_label_selector(dgd_name, "frontend"),
+        namespace,
+    )
+
+    if old_pod:
+        delete_pod(old_pod, namespace, grace_period=5)
+        print(f"  Waiting for old pod {old_pod} to terminate...")
+        # Wait for delete
+        try:
+            run_kubectl(
+                ["wait", "pod", old_pod, "--for=delete", "--timeout=90s"],
+                namespace=namespace,
+                check=False,
+            )
+        except Exception:
+            pass
+
+    print("  Waiting for new frontend pod to be Ready...")
+    wait_pod(
+        dgd_label_selector(dgd_name, "frontend"),
+        namespace,
+        timeout=300,
+    )
+
+    dgd_wait_all_ready(dgd_name, namespace, endpoint, model_name)
+
+
+def dgd_restart_graph(
+    dgd_name: str,
+    namespace: str,
+    endpoint: str,
+    model_name: str,
+) -> None:
+    """Trigger a full DGD restart through spec.restart.
+
+    Every run starts from a clean graph deployment state.
+    """
+    restart_id = f"bench-{time.strftime('%Y%m%d-%H%M%S')}-{random.randint(0, 9999)}"
+    print(f"  Restarting full DGD deployment (id={restart_id})...")
+
+    # Discover service names from the DGD spec so the restart order is correct
+    # for any backend (mocker, vllm, trtllm, etc.)
+    try:
+        dgd_spec = get_json("dgd", dgd_name, namespace, timeout=60)
+        services = list(dgd_spec.get("spec", {}).get("services", {}).keys())
+        # Put workers before frontend: restart workers first, then frontend
+        frontend_names = [s for s in services if s.lower() == "frontend"]
+        worker_names = [s for s in services if s.lower() != "frontend"]
+        restart_order = worker_names + frontend_names
+    except Exception:
+        restart_order = ["Frontend"]
+
+    print(f"  Restart order: {restart_order}")
+
+    patch_merge(
+        "dgd",
+        dgd_name,
+        namespace,
+        {
+            "spec": {
+                "restart": {
+                    "id": restart_id,
+                    "strategy": {
+                        "type": "Sequential",
+                        "order": restart_order,
+                    },
+                }
+            }
+        },
+    )
+
+    waited = 0
+    phase = "pending"
+    while True:
+        try:
+            state_json = get_json("dgd", dgd_name, namespace, timeout=60)
+            restart_status = state_json.get("status", {}).get("restart", {})
+            observed = restart_status.get("observedID", "")
+            phase = restart_status.get("phase", "")
+
+            if observed == restart_id:
+                if phase == "Completed":
+                    print(f"  DGD restart completed (waited {waited}s)")
+                    break
+                elif phase in ("Failed", "Superseded"):
+                    raise RuntimeError(
+                        f"DGD restart {restart_id} ended with phase={phase}"
+                    )
+        except (KeyError, TypeError):
+            pass
+        except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e:
+            # Transient kubectl timeout -- retry
+            print(f"  kubectl transient error, retrying... ({e.__class__.__name__})")
+
+        time.sleep(5)
+        waited += 5
+        if waited >= 600:
+            raise TimeoutError(f"Timed out waiting for DGD restart {restart_id}")
+        print(f"  Waiting for DGD restart ({waited}s / 600s)... phase={phase}")
+
+    dgd_wait_all_ready(dgd_name, namespace, endpoint, model_name)
--- a/benchmarks/frontend/scripts/sweep_k8s/kubectl.py
+++ b/benchmarks/frontend/scripts/sweep_k8s/kubectl.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Safe kubectl subprocess helpers.
+
+All k8s interactions go through this module for consistent error handling
+and namespace scoping.
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import time
+from typing import Any, Dict, List, Optional
+
+
+def run_kubectl(
+    args: List[str],
+    namespace: Optional[str] = None,
+    capture: bool = True,
+    check: bool = True,
+    timeout: int = 60,
+    input_data: Optional[str] = None,
+) -> subprocess.CompletedProcess:
+    """Run a kubectl command with namespace scoping and error handling.
+
+    Args:
+        args: kubectl arguments (e.g., ["get", "pods"]).
+        namespace: K8s namespace (prepended as -n <namespace>).
+        capture: Whether to capture stdout/stderr.
+        check: Whether to raise on non-zero exit.
+        timeout: Command timeout in seconds.
+        input_data: Optional stdin input.
+
+    Returns:
+        CompletedProcess result.
+    """
+    cmd = ["kubectl"]
+    if namespace:
+        cmd.extend(["-n", namespace])
+    cmd.extend(args)
+
+    result = subprocess.run(
+        cmd,
+        capture_output=capture,
+        text=True,
+        check=False,
+        timeout=timeout,
+        input=input_data,
+    )
+    if check and result.returncode != 0:
+        stderr = result.stderr.strip() if result.stderr else ""
+        print(f"  kubectl error (rc={result.returncode}): {' '.join(args[:4])}")
+        if stderr:
+            print(f"    {stderr}")
+        result.check_returncode()
+    return result
+
+
+def get_json(
+    resource: str,
+    name: str,
+    namespace: str,
+    timeout: int = 30,
+) -> Dict[str, Any]:
+    """Get a k8s resource as a parsed JSON dict."""
+    result = run_kubectl(
+        ["get", resource, name, "-o", "json"],
+        namespace=namespace,
+        timeout=timeout,
+    )
+    return json.loads(result.stdout)
+
+
+def patch_json(
+    resource: str,
+    name: str,
+    namespace: str,
+    patch: List[Dict[str, Any]],
+    timeout: int = 30,
+) -> None:
+    """Apply a JSON patch to a k8s resource."""
+    patch_str = json.dumps(patch)
+    run_kubectl(
+        ["patch", resource, name, "--type=json", f"-p={patch_str}"],
+        namespace=namespace,
+        timeout=timeout,
+    )
+
+
+def patch_merge(
+    resource: str,
+    name: str,
+    namespace: str,
+    patch: Dict[str, Any],
+    timeout: int = 30,
+) -> None:
+    """Apply a strategic merge patch to a k8s resource."""
+    patch_str = json.dumps(patch)
+    run_kubectl(
+        ["patch", resource, name, "--type=merge", f"-p={patch_str}"],
+        namespace=namespace,
+        timeout=timeout,
+    )
+
+
+def wait_pod(
+    label_selector: str,
+    namespace: str,
+    condition: str = "Ready",
+    timeout: int = 300,
+) -> None:
+    """Wait for pod(s) matching a label selector to reach a condition."""
+    run_kubectl(
+        [
+            "wait",
+            "pod",
+            "-l",
+            label_selector,
+            f"--for=condition={condition}",
+            f"--timeout={timeout}s",
+        ],
+        namespace=namespace,
+        timeout=timeout + 10,
+    )
+
+
+def delete_pod(
+    name: str,
+    namespace: str,
+    grace_period: int = 5,
+) -> None:
+    """Delete a pod by name."""
+    run_kubectl(
+        ["delete", "pod", name, f"--grace-period={grace_period}"],
+        namespace=namespace,
+        check=False,
+    )
+
+
+def get_pod_name(
+    label_selector: str,
+    namespace: str,
+) -> Optional[str]:
+    """Get the name of the first pod matching a label selector."""
+    result = run_kubectl(
+        [
+            "get",
+            "pod",
+            "-l",
+            label_selector,
+            "-o",
+            "jsonpath={.items[0].metadata.name}",
+        ],
+        namespace=namespace,
+        check=False,
+    )
+    name = result.stdout.strip()
+    return name if name else None
+
+
+def pod_exists(name: str, namespace: str) -> bool:
+    """Check if a pod exists."""
+    result = run_kubectl(
+        ["get", "pod", name],
+        namespace=namespace,
+        check=False,
+    )
+    return result.returncode == 0
+
+
+def apply_yaml(yaml_content: str, namespace: str) -> None:
+    """Apply YAML content via kubectl apply -f -."""
+    run_kubectl(
+        ["apply", "-f", "-"],
+        namespace=namespace,
+        input_data=yaml_content,
+    )
+
+
+def apply_secret_literal(name: str, namespace: str, key: str, value: str) -> None:
+    """Create or update an opaque Secret from a literal value."""
+    secret_yaml = f"""apiVersion: v1
+kind: Secret
+metadata:
+  name: {name}
+type: Opaque
+stringData:
+  {key}: {json.dumps(value)}
+"""
+    apply_yaml(secret_yaml, namespace)
+
+
+def wait_for_pod_deletion(
+    name: str,
+    namespace: str,
+    timeout: int = 120,
+) -> None:
+    """Wait for a pod to be deleted."""
+    waited = 0
+    while pod_exists(name, namespace):
+        time.sleep(5)
+        waited += 5
+        if waited >= timeout:
+            print(f"  WARNING: pod {name} still present after {timeout}s")
+            break
--- a/benchmarks/frontend/scripts/sweep_k8s/metrics.py
+++ b/benchmarks/frontend/scripts/sweep_k8s/metrics.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Prometheus metrics capture for k8s sweeps.
+
+Captures pre/post frontend /metrics snapshots for delta analysis.
+Supports both direct HTTP (when endpoint is reachable) and kubectl-exec
+(when only in-cluster DNS is available).
+"""
+
+from __future__ import annotations
+
+import shlex
+import subprocess
+import time
+import urllib.request
+from pathlib import Path
+from typing import Optional
+
+
+def capture_metrics(
+    endpoint: str,
+    dest: Path,
+    namespace: Optional[str] = None,
+    pod_label: Optional[str] = None,
+) -> None:
+    """Capture frontend /metrics to a file.
+
+    Tries direct HTTP first. If that fails and namespace + pod_label are
+    provided, falls back to kubectl exec curl from the frontend pod.
+
+    Args:
+        endpoint: Frontend endpoint (host:port) -- may be in-cluster DNS.
+        dest: Destination file path.
+        namespace: K8s namespace (for kubectl exec fallback).
+        pod_label: Pod label selector (for kubectl exec fallback).
+    """
+    dest.parent.mkdir(parents=True, exist_ok=True)
+
+    # Try direct HTTP first (works if port-forwarded or on same network)
+    body = _try_http(endpoint)
+
+    # Fallback: kubectl exec into the frontend pod to curl metrics
+    if body is None and namespace and pod_label:
+        body = _try_kubectl_exec(endpoint, namespace, pod_label)
+
+    # Fallback 2: kubectl run a temporary pod to curl
+    if body is None and namespace:
+        body = _try_kubectl_run(endpoint, namespace)
+
+    if body and body.strip():
+        dest.write_text(body)
+        line_count = len(body.strip().split("\n"))
+        print(f"  Metrics captured -> {dest.name} ({line_count} lines)")
+    else:
+        msg = f"# metrics capture failed at {time.strftime('%Y-%m-%dT%H:%M:%S')}\n"
+        dest.write_text(msg)
+        print(f"  WARNING: could not capture metrics from {endpoint}")
+
+
+def _try_http(endpoint: str) -> Optional[str]:
+    """Try fetching metrics via direct HTTP."""
+    try:
+        req = urllib.request.Request(f"http://{endpoint}/metrics")
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            return resp.read().decode()
+    except Exception:
+        return None
+
+
+def _try_kubectl_exec(
+    endpoint: str,
+    namespace: str,
+    pod_label: str,
+) -> Optional[str]:
+    """Fetch metrics by exec-ing curl inside a running pod."""
+    try:
+        # Get a pod name from the label selector
+        result = subprocess.run(
+            [
+                "kubectl",
+                "-n",
+                namespace,
+                "get",
+                "pod",
+                "-l",
+                pod_label,
+                "-o",
+                "jsonpath={.items[0].metadata.name}",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+        pod_name = result.stdout.strip()
+        if not pod_name:
+            return None
+
+        # Exec curl inside the pod (curl may not be available; try wget too)
+        safe_endpoint = shlex.quote(endpoint)
+        result = subprocess.run(
+            [
+                "kubectl",
+                "-n",
+                namespace,
+                "exec",
+                pod_name,
+                "--",
+                "sh",
+                "-c",
+                f"curl -sf http://{safe_endpoint}/metrics 2>/dev/null || "
+                f"wget -qO- http://{safe_endpoint}/metrics 2>/dev/null || "
+                f'python3 -c "import urllib.request,sys; print(urllib.request.urlopen(sys.argv[1]).read().decode())" http://{safe_endpoint}/metrics 2>/dev/null',
+            ],
+            capture_output=True,
+            text=True,
+            timeout=15,
+        )
+        if result.returncode == 0 and result.stdout.strip():
+            return result.stdout
+    except Exception:
+        pass
+    return None
+
+
+def _try_kubectl_run(endpoint: str, namespace: str) -> Optional[str]:
+    """Fetch metrics via a one-shot kubectl run --rm pod."""
+    try:
+        result = subprocess.run(
+            [
+                "kubectl",
+                "run",
+                "metrics-fetch",
+                "--rm",
+                "-i",
+                "--restart=Never",
+                "-n",
+                namespace,
+                "--image=curlimages/curl:latest",
+                "--",
+                "-sf",
+                f"http://{endpoint}/metrics",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+        if result.returncode == 0 and result.stdout.strip():
+            return result.stdout
+    except Exception:
+        pass
+    return None
--- a/benchmarks/frontend/scripts/sweep_k8s/template.py
+++ b/benchmarks/frontend/scripts/sweep_k8s/template.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Deploy YAML template rendering and application.
+
+Supports ${VARIABLE} placeholders using Python's string.Template.
+This enables arbitrary backend deployments (mocker, vLLM, TensorRT-LLM, etc.)
+without hardcoding DGD structures.
+"""
+
+from __future__ import annotations
+
+import string
+from pathlib import Path
+from typing import Dict
+
+from sweep_core.models import DeployDimension, SweepConfig
+from sweep_k8s.kubectl import apply_yaml
+
+# Tokenizer backend mapping for template substitution
+TOKENIZER_TEMPLATE_MAP = {
+    "hf": "default",
+    "default": "default",
+    "fast": "fast",
+    "fastokens": "fast",
+}
+
+DEFAULT_HF_TOKEN_SECRET_NAME = "hf-token-secret"
+
+
+def _indent_block(text: str, spaces: int) -> str:
+    prefix = " " * spaces
+    return "\n".join(f"{prefix}{line}" if line else "" for line in text.splitlines())
+
+
+def _build_image_pull_secrets_block(image_pull_secret: str) -> str:
+    if not image_pull_secret:
+        return ""
+    return _indent_block(
+        f"""imagePullSecrets:
+  - name: {image_pull_secret}""",
+        8,
+    )
+
+
+def build_substitution_dict(
+    deploy: DeployDimension,
+    config: SweepConfig,
+) -> Dict[str, str]:
+    """Build a variable substitution dictionary for template rendering.
+
+    Combines deploy dimensions, sweep config, and k8s config into a flat
+    dictionary suitable for string.Template substitution.
+    """
+    k8s = config.k8s
+    hf_token_secret_name = DEFAULT_HF_TOKEN_SECRET_NAME
+    variables: Dict[str, str] = {
+        # Deploy dimensions
+        "DYN_TOKENIZER_BACKEND": TOKENIZER_TEMPLATE_MAP.get(
+            deploy.tokenizer, deploy.tokenizer
+        ),
+        "NUM_WORKERS": str(deploy.workers),
+        # Model info
+        "MODEL": config.model,
+        "MODEL_NAME": config.model_name,
+        "MODEL_PATH": config.model,
+        # Image
+        "IMAGE": k8s.image,
+        # K8s config
+        "NAMESPACE": k8s.namespace,
+        "DGD_NAME": k8s.dgd_name,
+        "FRONTEND_PORT": str(k8s.frontend_port),
+        "WORKER_REPLICAS": str(k8s.worker_replicas),
+        "FRONTEND_REPLICAS": str(k8s.frontend_replicas),
+        "SPEEDUP_RATIO": str(config.speedup_ratio),
+        "REQUEST_PLANE": k8s.request_plane,
+        "EVENT_PLANE": k8s.event_plane,
+        "ROUTER_MODE": k8s.router_mode,
+        "HF_TOKEN_SECRET_NAME": hf_token_secret_name,
+        "FRONTEND_IMAGE_PULL_SECRETS_BLOCK": _build_image_pull_secrets_block(
+            k8s.image_pull_secret
+        ),
+        "WORKER_IMAGE_PULL_SECRETS_BLOCK": _build_image_pull_secrets_block(
+            k8s.image_pull_secret
+        ),
+    }
+
+    # Add any env_overrides from the deploy dimension
+    variables.update(deploy.env_overrides)
+
+    return variables
+
+
+def render_template(template_path: Path, variables: Dict[str, str]) -> str:
+    """Read a deploy YAML template and substitute ${VAR} placeholders.
+
+    Uses safe_substitute so missing variables are left as-is rather than
+    raising KeyError. This is important because DGD templates may contain
+    ${VARIABLE} references that are resolved by the k8s operator at runtime.
+    """
+    raw = template_path.read_text()
+    tmpl = string.Template(raw)
+    return tmpl.safe_substitute(variables)
+
+
+def apply_rendered_template(
+    template_path: Path,
+    deploy: DeployDimension,
+    config: SweepConfig,
+) -> None:
+    """Render a deploy template and apply it via kubectl."""
+    variables = build_substitution_dict(deploy, config)
+    rendered = render_template(template_path, variables)
+    print(f"  Applying rendered template: {template_path.name}")
+    apply_yaml(rendered, config.k8s.namespace)
--- a/benchmarks/frontend/scripts/sweep_runner.py
+++ b/benchmarks/frontend/scripts/sweep_runner.py
@@ -4,10 +4,12 @@
 """
 Frontend performance sweep runner.

-Standalone Python script that orchestrates performance sweeps by delegating
-each run to run_perf.sh.  Combines the sweep grid logic of sweep.sh with
-the saturation analysis of tasks/sweep.py, and the Prometheus/report
-integration of the analysis scripts.
+Thin CLI entry point that delegates to sweep_core (pure logic), sweep_executors
+(how runs execute), and sweep_k8s (k8s helpers).
+
+Supports two execution modes:
+  - local: delegates each run to run_perf.sh (mocker + frontend per run)
+  - k8s: DGD-based execution with aiperf against a k8s-deployed frontend

 Sweep dimensions (all configurable):
  - tokenizers (hf, fastokens)
@@ -15,644 +17,94 @@ Sweep dimensions (all configurable):
  - ISL values
  - worker counts

-Backends:
-  - mocker (default): fast synthetic backend, no real inference
-  - vllm: real vLLM inference server (produces TTFT/ITL metrics)
-
-Each (tokenizer, concurrency, ISL) point is a separate run_perf.sh invocation.
-Results are collected into CSV + summary.md + per-run reports.
-
 Usage:
-    # Smoke test (2 runs)
+    # Local smoke test (2 runs)
    python3 sweep_runner.py --tokenizers hf,fastokens --concurrency 32 --isl 512 \\
-        --benchmark-duration 30 --speedup-ratio 0
+        --benchmark-duration 30 --speedup-ratio 1000000

-    # Full sweep with mocker
+    # Full local sweep with mocker
    python3 sweep_runner.py --tokenizers hf,fastokens --concurrency 32,64 --isl 512,1024,2048

-    # vLLM backend (real inference)
-    python3 sweep_runner.py --backend vllm --tokenizers hf --concurrency 128 --isl 1024
+    # K8s sweep with DGD
+    python3 sweep_runner.py --mode k8s --dgd-name dynamo-bench-mocker \\
+        --tokenizers hf,fastokens --concurrency 50,100 --isl 512
+
+    # K8s with custom deploy template
+    python3 sweep_runner.py --mode k8s --deploy-template dgd/templates/vllm.yaml \\
+        --tokenizers hf --concurrency 128 --isl 1024

-    # Transport saturation sweep (tasks/sweep.py style)
+    # Transport saturation sweep
    python3 sweep_runner.py --tokenizers hf --concurrency 4096 \\
-        --num-requests 16384,32768 --workers 1,2,4,8 --speedup-ratio 0
+        --num-requests 16384,32768 --workers 1,2,4,8 --speedup-ratio 1000000

    # Dry run
    python3 sweep_runner.py --dry-run --tokenizers hf,fastokens --concurrency 32,64 --isl 512,1024
+
+    # Emit plan as JSON (for Argo or MCP)
+    python3 sweep_runner.py --emit-plan --tokenizers hf --concurrency 50 --isl 512
 """

-import argparse
-import csv
-import json
-import os
-import signal
-import subprocess
 import sys
-import time
-from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional

+# Ensure the scripts directory is on the path for package imports
 SCRIPT_DIR = Path(__file__).resolve().parent
-REPO_ROOT = SCRIPT_DIR.parent.parent.parent
-ANALYSIS_DIR = SCRIPT_DIR / "analysis"
-
-# ── Defaults ─────────────────────────────────────────────────────────────────
-
-DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
-DEFAULT_OSL = 256
-DEFAULT_SPEEDUP = 1.0
-DEFAULT_BENCHMARK_DURATION = 60
-DEFAULT_MAX_CONSECUTIVE_FAILS = 2
-DEFAULT_COOLDOWN = 3
-
-TOKENIZER_MAP = {
-    "fast": "fastokens",
-    "fastokens": "fastokens",
-    "hf": "default",
-    "default": "default",
-}
-
-
-# ── Data ─────────────────────────────────────────────────────────────────────
-
-
-@dataclass
-class RunConfig:
-    """Configuration for a single sweep point."""
-
-    backend: str  # "mocker" or "vllm"
-    tokenizer: str  # "hf" or "fastokens"
-    concurrency: int
-    isl: int
-    osl: int
-    workers: int
-    num_models: int
-    aiperf_targets: str  # "first" or "all"
-    speedup_ratio: float
-    model: str
-    benchmark_duration: Optional[int]
-    num_requests: Optional[int]
-    request_rate: Optional[int]
-
-    @property
-    def run_id(self) -> str:
-        base = f"{self.tokenizer}_c{self.concurrency}_isl{self.isl}_w{self.workers}"
-        if self.num_models > 1:
-            base += f"_m{self.num_models}"
-        if self.request_rate:
-            base += f"_rps{self.request_rate}"
-        return base
-
-
-@dataclass
-class RunResult:
-    """Result from a single sweep point."""
+if str(SCRIPT_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPT_DIR))

-    config: RunConfig
-    status: str = "pending"  # ok, fail, skipped
-    req_per_sec: float = 0.0
-    output_tok_per_sec: float = 0.0
-    ttft_p50_ms: float = 0.0
-    ttft_p99_ms: float = 0.0
-    itl_p50_ms: float = 0.0
-    itl_p99_ms: float = 0.0
-    duration_sec: float = 0.0
-    run_dir: str = ""
-
-
-# ── Helpers ──────────────────────────────────────────────────────────────────
-
-
-def _kill_port(port: int):
-    """Kill any process holding a port (SIGTERM first, then SIGKILL)."""
-    subprocess.run(
-        f"fuser -k -TERM {port}/tcp", shell=True, capture_output=True, timeout=5
-    )
-    time.sleep(2)
-    subprocess.run(
-        f"fuser -k -KILL {port}/tcp", shell=True, capture_output=True, timeout=5
-    )
-
-
-def _port_free(port: int) -> bool:
-    import socket
-
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    try:
-        return s.connect_ex(("127.0.0.1", port)) != 0
-    finally:
-        s.close()
-
-
-def _wait_port_free(port: int, timeout: int = 30):
-    """Wait for a port to become free."""
-    for i in range(timeout):
-        if _port_free(port):
-            return
-        if i == 0:
-            print(f"  Waiting for port {port} to free...")
-        time.sleep(1)
-    print(f"  Forcing port {port} release...")
-    _kill_port(port)
-    time.sleep(2)
-
-
-def _parse_aiperf_json(json_path: Path) -> dict:
-    """Parse aiperf profile_export_aiperf.json."""
-    if not json_path.exists():
-        return {}
-    try:
-        data = json.loads(json_path.read_text())
-        result = {}
-        # Request throughput
-        rt = data.get("request_throughput", {})
-        result["req_per_sec"] = rt.get("avg", 0)
-        # Output token throughput
-        ot = data.get("output_token_throughput", {})
-        result["output_tok_per_sec"] = ot.get("avg", 0)
-        # TTFT (aiperf exports in ms already)
-        ttft = data.get("time_to_first_token", data.get("ttft", {}))
-        if isinstance(ttft, dict):
-            result["ttft_p50_ms"] = ttft.get("p50", 0) or 0
-            result["ttft_p99_ms"] = ttft.get("p99", 0) or 0
-        # ITL
-        itl = data.get("inter_token_latency", data.get("itl", {}))
-        if isinstance(itl, dict):
-            result["itl_p50_ms"] = itl.get("p50", 0) or 0
-            result["itl_p99_ms"] = itl.get("p99", 0) or 0
-        # Duration (can be dict with .avg or raw float)
-        bd = data.get("benchmark_duration", 0)
-        result["duration_sec"] = bd.get("avg", 0) if isinstance(bd, dict) else (bd or 0)
-        return result
-    except (json.JSONDecodeError, KeyError, TypeError):
-        return {}
-
-
-def _run_single(
-    cfg: RunConfig,
-    run_dir: Path,
-    passthrough_args: list[str],
-) -> RunResult:
-    """Execute a single run_perf.sh invocation."""
-    result = RunResult(config=cfg, run_dir=str(run_dir))
-
-    cmd = [
-        str(SCRIPT_DIR / "run_perf.sh"),
-        "--model",
-        cfg.model,
-        "--isl",
-        str(cfg.isl),
-        "--osl",
-        str(cfg.osl),
-        "--concurrency",
-        str(cfg.concurrency),
-        "--workers",
-        str(cfg.workers),
-        "--speedup-ratio",
-        str(cfg.speedup_ratio),
-        "--num-models",
-        str(cfg.num_models),
-        "--aiperf-targets",
-        cfg.aiperf_targets,
-        "--output-dir",
-        str(run_dir),
-    ]
-    if cfg.benchmark_duration:
-        cmd.extend(["--benchmark-duration", str(cfg.benchmark_duration)])
-    if cfg.num_requests:
-        cmd.extend(["--num-requests", str(cfg.num_requests)])
-    if cfg.request_rate:
-        cmd.extend(["--request-rate", str(cfg.request_rate)])
-    if cfg.tokenizer in ("fast", "fastokens"):
-        cmd.append("--fast-tokens")
-    # TODO: when run_perf.sh gains --backend vllm support, pass it here
-    if cfg.backend == "vllm":
-        print(
-            "    WARNING: vllm backend not yet supported by run_perf.sh; using mocker"
-        )
-
-    cmd.extend(passthrough_args)
-
-    print(f"    cmd: {' '.join(cmd[:6])}...")
-
-    try:
-        proc = subprocess.Popen(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            text=True,
-            start_new_session=True,
-        )
-        stdout, _ = proc.communicate(timeout=600)
-
-        if proc.returncode == 0:
-            result.status = "ok"
-        else:
-            result.status = "fail"
-            print(f"    run_perf.sh failed (rc={proc.returncode})")
-            # Print last few lines of output for debugging
-            lines = (stdout or "").strip().split("\n")
-            for line in lines[-5:]:
-                print(f"      {line}")
-
-    except subprocess.TimeoutExpired:
-        result.status = "fail"
-        print("    TIMEOUT after 600s")
-        try:
-            pgid = os.getpgid(proc.pid)
-            os.killpg(pgid, signal.SIGTERM)
-            time.sleep(2)
-            os.killpg(pgid, signal.SIGKILL)
-        except ProcessLookupError:
-            pass  # already exited
-    except Exception as e:
-        result.status = "fail"
-        print(f"    ERROR: {e}")
-
-    # Parse aiperf results -- check both flat and multi-model layouts
-    aiperf_json = run_dir / "aiperf" / "profile_export_aiperf.json"
-    if not aiperf_json.exists():
-        # Multi-model: results are in aiperf/<model-name>/
-        for candidate in sorted(
-            (run_dir / "aiperf").glob("*/profile_export_aiperf.json")
-        ):
-            aiperf_json = candidate
-            break  # Use the first model's results for the summary row
-    metrics = _parse_aiperf_json(aiperf_json)
-    if metrics:
-        result.req_per_sec = metrics.get("req_per_sec", 0)
-        result.output_tok_per_sec = metrics.get("output_tok_per_sec", 0)
-        result.ttft_p50_ms = metrics.get("ttft_p50_ms", 0)
-        result.ttft_p99_ms = metrics.get("ttft_p99_ms", 0)
-        result.itl_p50_ms = metrics.get("itl_p50_ms", 0)
-        result.itl_p99_ms = metrics.get("itl_p99_ms", 0)
-        result.duration_sec = metrics.get("duration_sec", 0)
-
-    return result
-
-
-def _generate_report(run_dir: Path):
-    """Run create_report.py on a single run directory."""
-    try:
-        sys.path.insert(0, str(ANALYSIS_DIR))
-        from create_report import run_analysis
-
-        report = run_analysis(run_dir)
-        (run_dir / "report.md").write_text(report)
-    except Exception as e:
-        print(f"    Report generation failed: {e}")
-
-
-# ── Output ───────────────────────────────────────────────────────────────────
-
-
-def _write_csv(results: list[RunResult], csv_path: Path):
-    """Write incremental CSV (called after each run)."""
-    fieldnames = [
-        "run_id",
-        "backend",
-        "tokenizer",
-        "concurrency",
-        "isl",
-        "osl",
-        "workers",
-        "speedup_ratio",
-        "status",
-        "req_per_sec",
-        "output_tok_per_sec",
-        "ttft_p50_ms",
-        "ttft_p99_ms",
-        "itl_p50_ms",
-        "itl_p99_ms",
-        "duration_sec",
-        "run_dir",
-    ]
-    with open(csv_path, "w", newline="") as f:
-        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
-        writer.writeheader()
-        for r in results:
-            row = {
-                "run_id": r.config.run_id,
-                "backend": r.config.backend,
-                "tokenizer": r.config.tokenizer,
-                "concurrency": r.config.concurrency,
-                "isl": r.config.isl,
-                "osl": r.config.osl,
-                "workers": r.config.workers,
-                "speedup_ratio": r.config.speedup_ratio,
-                "status": r.status,
-                "req_per_sec": f"{r.req_per_sec:.2f}" if r.req_per_sec else "",
-                "output_tok_per_sec": f"{r.output_tok_per_sec:.1f}"
-                if r.output_tok_per_sec
-                else "",
-                "ttft_p50_ms": f"{r.ttft_p50_ms:.1f}" if r.ttft_p50_ms else "",
-                "ttft_p99_ms": f"{r.ttft_p99_ms:.1f}" if r.ttft_p99_ms else "",
-                "itl_p50_ms": f"{r.itl_p50_ms:.1f}" if r.itl_p50_ms else "",
-                "itl_p99_ms": f"{r.itl_p99_ms:.1f}" if r.itl_p99_ms else "",
-                "duration_sec": f"{r.duration_sec:.1f}" if r.duration_sec else "",
-                "run_dir": r.run_dir,
-            }
-            writer.writerow(row)
-
-
-def _write_summary(results: list[RunResult], summary_path: Path):
-    """Write markdown summary table."""
-    lines = ["# Sweep Summary\n"]
-    lines.append(f"**Generated:** {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
-    lines.append(
-        "| Run ID | Req/s | Tok/s | TTFT p50 | TTFT p99 | ITL p50 | Duration | Status |"
-    )
-    lines.append(
-        "|--------|------:|------:|---------:|---------:|--------:|---------:|--------|"
-    )
-
-    for r in results:
-        rps = f"{r.req_per_sec:.1f}" if r.req_per_sec else "-"
-        tps = f"{r.output_tok_per_sec:.0f}" if r.output_tok_per_sec else "-"
-        tp50 = f"{r.ttft_p50_ms:.1f}ms" if r.ttft_p50_ms else "-"
-        tp99 = f"{r.ttft_p99_ms:.1f}ms" if r.ttft_p99_ms else "-"
-        ip50 = f"{r.itl_p50_ms:.1f}ms" if r.itl_p50_ms else "-"
-        dur = f"{r.duration_sec:.0f}s" if r.duration_sec else "-"
-        lines.append(
-            f"| {r.config.run_id} | {rps} | {tps} | {tp50} | {tp99} | {ip50} | {dur} | {r.status} |"
-        )
-
-    lines.append("")
-    ok = sum(1 for r in results if r.status == "ok")
-    fail = sum(1 for r in results if r.status == "fail")
-    skip = sum(1 for r in results if r.status == "skipped")
-    lines.append(
-        f"**Totals:** {ok} passed, {fail} failed, {skip} skipped out of {len(results)}"
-    )
-
-    summary_path.write_text("\n".join(lines) + "\n")
-
-
-def _print_results_table(results: list[RunResult]):
-    """Print a compact results table to stdout."""
-    print(f"\n{'='*90}")
-    print(
-        f"  {'Run ID':<30} {'Req/s':>8} {'Tok/s':>8} {'TTFT p50':>10} {'TTFT p99':>10} {'Status':>8}"
-    )
-    print(f"  {'-'*30} {'-'*8} {'-'*8} {'-'*10} {'-'*10} {'-'*8}")
-    for r in results:
-        rps = f"{r.req_per_sec:.1f}" if r.req_per_sec else "N/A"
-        tps = f"{r.output_tok_per_sec:.0f}" if r.output_tok_per_sec else "N/A"
-        tp50 = f"{r.ttft_p50_ms:.1f}ms" if r.ttft_p50_ms else "N/A"
-        tp99 = f"{r.ttft_p99_ms:.1f}ms" if r.ttft_p99_ms else "N/A"
-        print(
-            f"  {r.config.run_id:<30} {rps:>8} {tps:>8} {tp50:>10} {tp99:>10} {r.status:>8}"
-        )
-    print(f"{'='*90}")
-
-
-# ── Main ─────────────────────────────────────────────────────────────────────
+from sweep_core.config import build_argument_parser, config_from_args  # noqa: E402
+from sweep_core.orchestrator import run as run_sweep  # noqa: E402
+from sweep_core.planner import build_plan, print_plan  # noqa: E402


 def main():
-    parser = argparse.ArgumentParser(
-        description="Frontend performance sweep runner",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""Examples:
-  # Smoke test
-  python3 sweep_runner.py --tokenizers hf,fastokens --concurrency 32 --isl 512 \\
-      --benchmark-duration 30 --speedup-ratio 0
+    parser = build_argument_parser()

-  # Full tokenizer comparison
-  python3 sweep_runner.py --tokenizers hf,fastokens --concurrency 32,64 --isl 512,1024,2048
-
-  # vLLM backend (real inference)
-  python3 sweep_runner.py --backend vllm --tokenizers hf --concurrency 128 --isl 1024
-
-  # Transport saturation (high concurrency, vary workers)
-  python3 sweep_runner.py --tokenizers hf --concurrency 4096 \\
-      --num-requests 16384,32768 --workers 1,2,4,8 --speedup-ratio 0
-
-  # With profilers (needs sudo for BPF)
-  sudo -E python3 sweep_runner.py --tokenizers hf --concurrency 64 --isl 1024 \\
-      -- --with-nsys --with-perf --with-bpf
-""",
-    )
-
-    parser.add_argument("--model", default=DEFAULT_MODEL, help="HF model path")
-    parser.add_argument(
-        "--backend",
-        choices=["mocker", "vllm"],
-        default="mocker",
-        help="Engine backend: mocker (synthetic) or vllm (real inference)",
-    )
-    parser.add_argument(
-        "--tokenizers",
-        default="hf,fastokens",
-        help="Comma-separated tokenizer backends (hf, fastokens)",
-    )
-    parser.add_argument(
-        "--concurrency", default="50,100,200", help="Comma-separated concurrency levels"
-    )
-    parser.add_argument(
-        "--isl", default="512,1024,2048", help="Comma-separated ISL values"
-    )
-    parser.add_argument(
-        "--osl", type=int, default=DEFAULT_OSL, help="Output sequence length"
-    )
-    parser.add_argument(
-        "--workers", default="2", help="Comma-separated worker counts per model"
-    )
-    parser.add_argument(
-        "--num-models",
-        type=int,
-        default=1,
-        help="Number of model instances (each gets --workers workers, named model-1, model-2, ...)",
-    )
-    parser.add_argument(
-        "--aiperf-targets",
-        choices=["first", "all"],
-        default="first",
-        help="'first': aiperf targets model-1 only (default). 'all': run aiperf for each model.",
-    )
-    parser.add_argument(
-        "--speedup-ratio",
-        type=float,
-        default=DEFAULT_SPEEDUP,
-        help="Mocker speedup (0=infinite)",
-    )
-    parser.add_argument(
-        "--benchmark-duration",
-        type=int,
-        default=DEFAULT_BENCHMARK_DURATION,
-        help="aiperf duration (seconds)",
-    )
-    parser.add_argument(
-        "--num-requests",
-        default=None,
-        help="Comma-separated request counts (overrides --benchmark-duration)",
-    )
-    parser.add_argument(
-        "--rps",
-        default=None,
-        help="Comma-separated target request rates (req/s). Sweep dimension when multiple values given.",
-    )
-    parser.add_argument(
-        "--output-dir",
-        default=None,
-        help="Output directory (default: auto timestamped)",
-    )
-    parser.add_argument(
-        "--max-consecutive-fails", type=int, default=DEFAULT_MAX_CONSECUTIVE_FAILS
-    )
+    # Add CLI-only flags that don't belong in SweepConfig
    parser.add_argument(
-        "--cooldown", type=int, default=DEFAULT_COOLDOWN, help="Seconds between runs"
-    )
-    parser.add_argument(
-        "--dry-run", action="store_true", help="Print plan without executing"
-    )
-    parser.add_argument(
-        "--no-report", action="store_true", help="Skip per-run report generation"
-    )
-    parser.add_argument(
-        "passthrough", nargs="*", help="Extra args passed to run_perf.sh (after --)"
+        "--emit-plan",
+        action="store_true",
+        help="Print the sweep plan as JSON and exit (no execution)",
    )

    args = parser.parse_args()

-    # Parse lists
-    tokenizers = [t.strip() for t in args.tokenizers.split(",")]
-    concurrencies = [int(c) for c in args.concurrency.split(",")]
-    isls = [int(i) for i in args.isl.split(",")]
-    worker_counts = [int(w) for w in args.workers.split(",")]
-    num_requests_list = (
-        [int(n) for n in args.num_requests.split(",")] if args.num_requests else [None]
-    )
-    rps_list = [int(r) for r in args.rps.split(",")] if args.rps else [None]
-
-    # Build sweep grid
-    configs: list[RunConfig] = []
-    for tokenizer in tokenizers:
-        for workers in worker_counts:
-            for concurrency in concurrencies:
-                for isl in isls:
-                    for nr in num_requests_list:
-                        for rps in rps_list:
-                            configs.append(
-                                RunConfig(
-                                    backend=args.backend,
-                                    tokenizer=tokenizer,
-                                    concurrency=concurrency,
-                                    isl=isl,
-                                    osl=args.osl,
-                                    workers=workers,
-                                    num_models=args.num_models,
-                                    aiperf_targets=args.aiperf_targets,
-                                    speedup_ratio=args.speedup_ratio,
-                                    model=args.model,
-                                    benchmark_duration=args.benchmark_duration
-                                    if nr is None
-                                    else None,
-                                    num_requests=nr,
-                                    request_rate=rps,
-                                )
-                            )
+    # Build typed config from args
+    config = config_from_args(args)

-    # Output directory
-    if args.output_dir:
-        output_root = Path(args.output_dir)
-    else:
-        ts = time.strftime("%Y%m%d_%H%M%S")
-        output_root = REPO_ROOT / "artifacts" / f"sweep_{ts}"
-
-    total = len(configs)
-    print(f"Sweep plan: {total} runs")
-    print(f"  Model:          {args.model}")
-    print(f"  Backend:        {args.backend}")
-    print(f"  Tokenizers:     {tokenizers}")
-    print(f"  Concurrencies:  {concurrencies}")
-    print(f"  ISLs:           {isls}")
-    print(f"  Workers/model:  {worker_counts}")
-    print(f"  Models:         {args.num_models}")
-    print(f"  Benchmark dur:  {args.benchmark_duration}s")
-    if args.num_requests:
-        print(f"  Num requests:   {[int(n) for n in args.num_requests.split(',')]}")
-    if args.rps:
-        print(f"  Request rates:  {[int(r) for r in args.rps.split(',')]} req/s")
-    print(f"  Output:         {output_root}")
-    print()
+    # Build plan
+    plan = build_plan(config)
+    print_plan(plan)

-    if args.dry_run:
-        for i, cfg in enumerate(configs, 1):
-            print(f"  [{i}/{total}] {cfg.run_id}")
+    # Emit plan JSON mode
+    if args.emit_plan:
+        print(plan.to_json())
        return

-    output_root.mkdir(parents=True, exist_ok=True)
-    csv_path = output_root / "results.csv"
-    summary_path = output_root / "summary.md"
-
-    # Passthrough args for run_perf.sh (e.g., --skip-bpf --skip-nsys)
-    passthrough = args.passthrough or []
-
-    results: list[RunResult] = []
-    consecutive_fails: dict[tuple, int] = {}  # (backend, concurrency, workers) -> count
-
-    try:
-        for i, cfg in enumerate(configs, 1):
-            key = (cfg.backend, cfg.concurrency, cfg.workers)
-            run_dir = output_root / cfg.run_id
-
-            # Skip after consecutive failures
-            if consecutive_fails.get(key, 0) >= args.max_consecutive_fails:
-                result = RunResult(config=cfg, status="skipped", run_dir=str(run_dir))
-                results.append(result)
-                print(
-                    f"\n  [{i}/{total}] SKIPPED {cfg.run_id} ({args.max_consecutive_fails} consecutive failures)"
-                )
-                continue
-
-            print(f"\n{'='*60}")
-            print(f"  [{i}/{total}] {cfg.run_id}")
-            print(f"{'='*60}")
-
-            # Wait for port from previous run
-            _wait_port_free(8000)
-
-            # Run
-            result = _run_single(cfg, run_dir, passthrough)
-            results.append(result)
-
-            # Update consecutive failure tracking
-            if result.status == "ok":
-                consecutive_fails[key] = 0
-                rps = f"{result.req_per_sec:.1f}" if result.req_per_sec else "N/A"
-                tp50 = f"{result.ttft_p50_ms:.1f}ms" if result.ttft_p50_ms else "N/A"
-                print(f"    OK: {rps} req/s, TTFT p50={tp50}")
-            else:
-                consecutive_fails[key] = consecutive_fails.get(key, 0) + 1
-                print(
-                    f"    FAIL (consecutive: {consecutive_fails[key]}/{args.max_consecutive_fails})"
-                )
-
-            # Generate per-run report
-            if not args.no_report and result.status == "ok":
-                _generate_report(run_dir)
+    # Dry run mode
+    if config.dry_run:
+        for i, run_spec in enumerate(plan.runs, 1):
+            print(f"  [{i}/{plan.total_runs}] {run_spec.run_id}")
+        return

-            # Write incremental CSV + summary
-            _write_csv(results, csv_path)
-            _write_summary(results, summary_path)
+    # Select executor based on mode
+    if config.mode == "local":
+        from sweep_executors.local import LocalExecutor

-            # Cooldown
-            if i < total:
-                time.sleep(args.cooldown)
+        executor = LocalExecutor()
+    elif config.mode == "k8s":
+        from sweep_executors.k8s_dgd import K8sDgdExecutor

-    except KeyboardInterrupt:
-        print("\n\nInterrupted! Partial results saved.")
-    finally:
-        _write_csv(results, csv_path)
-        _write_summary(results, summary_path)
+        executor = K8sDgdExecutor()
+    else:
+        print(
+            f"ERROR: Unknown mode '{config.mode}'. Use 'local' or 'k8s'.",
+            file=sys.stderr,
+        )
+        sys.exit(1)

-    # Final output
-    _print_results_table(results)
-    print(f"\nResults:  {csv_path}")
-    print(f"Summary:  {summary_path}")
-    print(f"Per-run:  {output_root}/<run_id>/report.md")
+    # Run the sweep
+    run_sweep(plan, executor)


 if __name__ == "__main__":