feat: remove kubectl dependencies from benchmarking (#3098)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Signed-off-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

feat: remove kubectl dependencies from benchmarking (#3098)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Signed-off-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
20b7a8ae · hhzhang16 · GitHub · 3b6dbef2 · 20b7a8ae · 20b7a8ae
Unverified Commit 20b7a8ae authored Sep 18, 2025 by hhzhang16 Committed by GitHub Sep 18, 2025
4 changed files
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -24,10 +24,10 @@ First, deploy your DynamoGraphDeployment using the [deployment documentation](..

 ```bash
 # Port-forward your deployment to http://localhost:8000
-kubectl port-forward -n <namespace> svc/<frontend-service-name> 8000:8000 &
+kubectl port-forward -n <namespace> svc/<frontend-service-name> 8000:8000 > /dev/null 2>&1 &

 # Run benchmark
-python3 -m benchmarks.utils.benchmark --namespace <namespace> \
+python3 -m benchmarks.utils.benchmark \
    --input my-benchmark=http://localhost:8000 \
    --model "<your-model>"


--- a/benchmarks/utils/benchmark.py
+++ b/benchmarks/utils/benchmark.py
@@ -4,24 +4,39 @@
 # SPDX-License-Identifier: Apache-2.0

 import argparse
-import asyncio
+import re
 import sys
-from typing import Tuple
+from typing import Dict, Tuple

-from benchmarks.utils.workflow import categorize_inputs, run_benchmark_workflow
+from benchmarks.utils.workflow import run_benchmark_workflow
+
+
+def validate_inputs(inputs: Dict[str, str]) -> None:
+    """Validate that all inputs are HTTP endpoints"""
+    for label, value in inputs.items():
+        if not value.lower().startswith(("http://", "https://")):
+            raise ValueError(
+                f"Input '{label}' must be an HTTP endpoint (starting with http:// or https://). Got: {value}"
+            )
+
+        # Validate reserved labels
+        if label.lower() == "plots":
+            raise ValueError(
+                "Label 'plots' is reserved and cannot be used. Please choose a different label."
+            )


 def parse_input(input_str: str) -> Tuple[str, str]:
    """Parse input string in format key=value with additional validation"""
    if "=" not in input_str:
        raise ValueError(
-            f"Invalid input format. Expected: <label>=<manifest_path_or_endpoint>, got: {input_str}"
+            f"Invalid input format. Expected: <label>=<endpoint>, got: {input_str}"
        )

    parts = input_str.split("=", 1)  # Split on first '=' only
    if len(parts) != 2:
        raise ValueError(
-            f"Invalid input format. Expected: <label>=<manifest_path_or_endpoint>, got: {input_str}"
+            f"Invalid input format. Expected: <label>=<endpoint>, got: {input_str}"
        )

    label, value = parts
@@ -35,8 +50,6 @@ def parse_input(input_str: str) -> Tuple[str, str]:
    value = value.strip()

    # Validate label characters
-    import re
-
    if not re.match(r"^[a-zA-Z0-9_-]+$", label):
        raise ValueError(
            f"Label must contain only letters, numbers, hyphens, and underscores. Invalid label: {label}"
@@ -51,9 +64,8 @@ def main() -> int:
        "--input",
        action="append",
        dest="inputs",
-        help="Input in format <label>=<manifest_path_or_endpoint>. Can be specified multiple times for comparisons.",
+        help="Input in format <label>=<endpoint>. Can be specified multiple times for comparisons.",
    )
-    parser.add_argument("--namespace", required=True, help="Kubernetes namespace")
    parser.add_argument("--isl", type=int, default=2000, help="Input sequence length")
    parser.add_argument(
        "--std",
@@ -102,23 +114,21 @@ def main() -> int:
            )
            print()

-        endpoints, manifests = categorize_inputs(parsed_inputs)
+        # Validate that all inputs are HTTP endpoints
+        validate_inputs(parsed_inputs)

-    except (ValueError, FileNotFoundError) as e:
+    except ValueError as e:
        print(f"ERROR: {e}")
        return 1

    # Run the benchmark workflow with the parsed inputs
-    asyncio.run(
-        run_benchmark_workflow(
-            namespace=args.namespace,
-            inputs=parsed_inputs,
-            isl=args.isl,
-            std=args.std,
-            osl=args.osl,
-            model=args.model,
-            output_dir=args.output_dir,
-        )
+    run_benchmark_workflow(
+        inputs=parsed_inputs,
+        isl=args.isl,
+        std=args.std,
+        osl=args.osl,
+        model=args.model,
+        output_dir=args.output_dir,
    )
    return 0


--- a/benchmarks/utils/workflow.py
+++ b/benchmarks/utils/workflow.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-from dataclasses import dataclass
 from pathlib import Path
-from typing import Callable, Dict, List, Tuple
+from typing import Dict, List

 from benchmarks.utils.genai import run_concurrency_sweep
 from benchmarks.utils.plot import generate_plots
-from deploy.utils.dynamo_deployment import DynamoDeploymentClient
-
-
-@dataclass
-class DeploymentConfig:
-    """Configuration for a single deployment type"""
-
-    name: str  # Human-readable name (e.g., "aggregated")
-    manifest_path: str  # Path to deployment manifest
-    output_subdir: str  # Subdirectory name for results (e.g., "agg")
-    client_factory: Callable  # Function to create the client
-    deploy_func: Callable  # Function to deploy the client
-
-
-def create_dynamo_client(
-    namespace: str, deployment_name: str
-) -> DynamoDeploymentClient:
-    """Factory function for DynamoDeploymentClient"""
-    return DynamoDeploymentClient(namespace=namespace, deployment_name=deployment_name)
-
-
-async def deploy_dynamo_client(
-    client: DynamoDeploymentClient, manifest_path: str
-) -> None:
-    """Deploy a DynamoDeploymentClient"""
-    await client.create_deployment(manifest_path)
-    await client.wait_for_deployment_ready(timeout=1800)
-
-
-async def teardown(client) -> None:
-    """Clean up deployment and stop port forwarding"""
-    try:
-        if hasattr(client, "stop_port_forward"):
-            client.stop_port_forward()
-        await client.delete_deployment()
-    except Exception:
-        pass
-
-
-def print_deployment_start(config: DeploymentConfig, output_dir: str) -> None:
-    """Print deployment start messages"""
-    print(f"🚀 Starting {config.name} deployment benchmark...")
-    print(f"📄 Manifest: {config.manifest_path}")
-    print(f"📁 Results will be saved to: {Path(output_dir) / config.output_subdir}")


 def print_concurrency_start(
-    deployment_name: str, model: str, isl: int, osl: int, std: int
+    label: str, model: str, isl: int, osl: int, std: int
 ) -> None:
    """Print concurrency sweep start messages"""
-    print(f"⚙️  Starting {deployment_name} concurrency sweep!", flush=True)
+    print(f"⚙️  Starting {label} concurrency sweep!", flush=True)
    print(
        "⏱️  This may take several minutes - running through multiple concurrency levels...",
        flush=True,
@@ -65,65 +20,22 @@ def print_concurrency_start(
    print(f"🎯 Model: {model} | ISL: {isl} | OSL: {osl} | StdDev: {std}")


-def print_deployment_complete(config: DeploymentConfig) -> None:
-    """Print deployment completion message"""
-    print(f"✅ {config.name.title()} deployment benchmark completed successfully!")
-
-
-def print_deployment_skip(deployment_type: str) -> None:
-    """Print deployment skip message"""
-    print(f"⏭️  Skipping {deployment_type} deployment (not specified)")
-
-
-async def run_single_deployment_benchmark(
-    config: DeploymentConfig,
-    namespace: str,
-    output_dir: str,
-    model: str,
-    isl: int,
-    osl: int,
-    std: int,
-) -> None:
-    """Run benchmark for a single deployment type"""
-    print_deployment_start(config, output_dir)
-
-    # Create and deploy client
-    client = config.client_factory(namespace, config.output_subdir)
-    await config.deploy_func(client, config.manifest_path)
-
-    try:
-        print_concurrency_start(config.name, model, isl, osl, std)
-
-        # Run concurrency sweep
-        (Path(output_dir) / config.output_subdir).mkdir(parents=True, exist_ok=True)
-        run_concurrency_sweep(
-            service_url=client.port_forward_frontend(quiet=True),
-            model_name=model,
-            isl=isl,
-            osl=osl,
-            stddev=std,
-            output_dir=Path(output_dir) / config.output_subdir,
-        )
-
-    finally:
-        await teardown(client)
-
-    print_deployment_complete(config)
-
-
-async def run_endpoint_benchmark(
+def run_endpoint_benchmark(
    label: str,
    endpoint: str,
    model: str,
    isl: int,
    osl: int,
    std: int,
-    output_dir: str,
+    output_dir: Path,
 ) -> None:
    """Run benchmark for an existing endpoint with custom label"""
    print(f"🚀 Starting benchmark of endpoint '{label}': {endpoint}")
-    print(f"📁 Results will be saved to: {Path(output_dir) / label}")
-    print_concurrency_start(f"endpoint ({label})", model, isl, osl, std)
+    print(f"📁 Results will be saved to: {output_dir / label}")
+    print_concurrency_start(label, model, isl, osl, std)
+
+    # Create output directory
+    (output_dir / label).mkdir(parents=True, exist_ok=True)

    run_concurrency_sweep(
        service_url=endpoint,
@@ -131,122 +43,45 @@ async def run_endpoint_benchmark(
        isl=isl,
        osl=osl,
        stddev=std,
-        output_dir=Path(output_dir) / label,
+        output_dir=output_dir / label,
    )
    print("✅ Endpoint benchmark completed successfully!")


-def print_final_summary(output_dir: str, deployed_types: List[str]) -> None:
+def print_final_summary(output_dir: Path, labels: List[str]) -> None:
    """Print final benchmark summary"""
    print("📊 Generating performance plots...")
-    generate_plots(
-        base_output_dir=Path(output_dir), output_dir=Path(output_dir) / "plots"
-    )
-    print(f"📈 Plots saved to: {Path(output_dir) / 'plots'}")
-    print(f"📋 Summary saved to: {Path(output_dir) / 'SUMMARY.txt'}")
+    generate_plots(base_output_dir=output_dir, output_dir=output_dir / "plots")
+    print(f"📈 Plots saved to: {output_dir / 'plots'}")
+    print(f"📋 Summary saved to: {output_dir / 'plots' / 'SUMMARY.txt'}")

    print()
    print("🎉 Benchmark workflow completed successfully!")
    print(f"📁 All results available at: {output_dir}")

-    if deployed_types:
-        print(f"🚀 Benchmarked deployments: {', '.join(deployed_types)}")
+    if labels:
+        print(f"🚀 Benchmarked: {', '.join(labels)}")

-    print(f"📊 View plots at: {Path(output_dir) / 'plots'}")
+    print(f"📊 View plots at: {output_dir / 'plots'}")


-def categorize_inputs(inputs: Dict[str, str]) -> Tuple[Dict[str, str], Dict[str, str]]:
-    """Categorize inputs into endpoints and manifests"""
-    endpoints = {}
-    manifests = {}
-
-    for label, value in inputs.items():
-        # Validate reserved labels
-        if label.lower() == "plots":
-            raise ValueError(
-                "Label 'plots' is reserved and cannot be used. Please choose a different label."
-            )
-
-        if value.startswith(("http://", "https://")):
-            endpoints[label] = value
-        else:
-            # It should be a file path - validate it exists
-            if not Path(value).is_file():
-                raise FileNotFoundError(
-                    f"Manifest file not found for input '{label}': {value}"
-                )
-            manifests[label] = value
-
-    return endpoints, manifests
-
-
-def validate_dynamo_manifest(manifest_path: str) -> None:
-    """Validate that the manifest is a DynamoGraphDeployment"""
-    try:
-        with open(manifest_path, "r") as f:
-            content = f.read()
-
-        # Check for DynamoGraphDeployment
-        if "kind: DynamoGraphDeployment" not in content:
-            raise ValueError(
-                f"Manifest {manifest_path} is not a DynamoGraphDeployment. Only DynamoGraphDeployments are supported for deployment benchmarking."
-            )
-
-    except FileNotFoundError:
-        raise FileNotFoundError(f"Manifest file not found: {manifest_path}")
-    except Exception as e:
-        raise ValueError(f"Error reading manifest {manifest_path}: {e}")
-
-
-async def run_benchmark_workflow(
-    namespace: str,
+def run_benchmark_workflow(
    inputs: Dict[str, str],
-    isl: int = 200,
+    isl: int = 2000,
    std: int = 10,
-    osl: int = 200,
-    model: str = "nvidia/Llama-3.1-8B-Instruct-FP8",
+    osl: int = 256,
+    model: str = "Qwen/Qwen3-0.6B",
    output_dir: str = "benchmarks/results",
 ) -> None:
-    """Main benchmark workflow orchestrator with dynamic inputs"""
-    Path(output_dir).mkdir(parents=True, exist_ok=True)
-
-    # Categorize inputs into endpoints and manifests
-    endpoints, manifests = categorize_inputs(inputs)
+    """Main benchmark workflow orchestrator for HTTP endpoints only"""
+    output_dir_path = Path(output_dir)
+    output_dir_path.mkdir(parents=True, exist_ok=True)

    # Run endpoint benchmarks
-    for label, endpoint in endpoints.items():
-        await run_endpoint_benchmark(label, endpoint, model, isl, osl, std, output_dir)
-
-    # Create deployment configurations for manifests
-    deployment_configs = []
-
-    for label, manifest_path in manifests.items():
-        # Validate that it's a DynamoGraphDeployment
-        validate_dynamo_manifest(manifest_path)
-
-        config = DeploymentConfig(
-            name=label,
-            manifest_path=manifest_path,
-            output_subdir=label,
-            client_factory=create_dynamo_client,
-            deploy_func=deploy_dynamo_client,
-        )
-
-        deployment_configs.append(config)
-
-    # Run benchmarks for each deployment type
-    deployed_labels = list(endpoints.keys())
-    for config in deployment_configs:
-        await run_single_deployment_benchmark(
-            config=config,
-            namespace=namespace,
-            output_dir=output_dir,
-            model=model,
-            isl=isl,
-            osl=osl,
-            std=std,
-        )
-        deployed_labels.append(config.name)
+    benchmarked_labels = []
+    for label, endpoint in inputs.items():
+        run_endpoint_benchmark(label, endpoint, model, isl, osl, std, output_dir_path)
+        benchmarked_labels.append(label)

    # Generate final summary
-    print_final_summary(output_dir, deployed_labels)
+    print_final_summary(output_dir_path, benchmarked_labels)
--- a/docs/benchmarks/benchmarking.md
+++ b/docs/benchmarks/benchmarking.md
@@ -41,7 +41,12 @@ The framework is a Python-based wrapper around `genai-perf` that:

 3. **kubectl access** - You need `kubectl` installed and configured to access your Kubernetes cluster.

-4. **Benchmark dependencies** - Since benchmarks run locally, you need to install the required Python dependencies. Install them using:
+4. **HTTP endpoints** - Ensure you have HTTP endpoints available for benchmarking. These can be:
+   - DynamoGraphDeployments exposed via HTTP endpoints
+   - External services (vLLM, llm-d, AIBrix, etc.)
+   - Any HTTP endpoint serving HuggingFace-compatible models
+
+5. **Benchmark dependencies** - Since benchmarks run locally, you need to install the required Python dependencies. Install them using:
   ```bash
   pip install -r deploy/utils/requirements.txt
   ```
@@ -59,11 +64,11 @@ Deploy your DynamoGraphDeployments separately using the [deployment documentatio
 ### Step 3: Port-Forward and Benchmark Deployment A
 ```bash
 # Port-forward the frontend service for deployment A
-kubectl port-forward -n <namespace> svc/<frontend-service-name> 8000:8000 &
+kubectl port-forward -n <namespace> svc/<frontend-service-name> 8000:8000 > /dev/null 2>&1 &
 # Note: remember to stop the port-forward process after benchmarking.

 # Benchmark deployment A using Python scripts
-python3 -m benchmarks.utils.benchmark --namespace <namespace> \
+python3 -m benchmarks.utils.benchmark \
   --input deployment-a=http://localhost:8000 \
   --model "your-model-name" \
   --output-dir ./benchmarks/results
@@ -75,10 +80,10 @@ If comparing multiple deployments, teardown deployment A and deploy deployment B
 ### Step 5: [If Comparative] Port-Forward and Benchmark Deployment B
 ```bash
 # Port-forward the frontend service for deployment B
-kubectl port-forward -n <namespace> <frontend-service-name> 8001:8000 &
+kubectl port-forward -n <namespace> svc/<frontend-service-name> 8001:8000 > /dev/null 2>&1 &

 # Benchmark deployment B using Python scripts
-python3 -m benchmarks.utils.benchmark --namespace <namespace> \
+python3 -m benchmarks.utils.benchmark \
   --input deployment-b=http://localhost:8001 \
   --model "your-model-name" \
   --output-dir ./benchmarks/results
@@ -90,35 +95,6 @@ python3 -m benchmarks.utils.benchmark --namespace <namespace> \
 python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results
 ```

-## Example Commands
-
-### Single Deployment Benchmark
-```bash
-# Port-forward and benchmark a single deployment
-kubectl port-forward -n my-namespace svc/my-frontend-service 8000:8000 &
-python3 -m benchmarks.utils.benchmark --namespace my-namespace \
-   --input my-deployment=http://localhost:8000 \
-   --model "meta-llama/Meta-Llama-3-8B"
-```
-
-### Comparative Benchmark
-```bash
-# Benchmark deployment A
-kubectl port-forward -n my-namespace svc/agg-frontend 8000:8000 &
-python3 -m benchmarks.utils.benchmark --namespace my-namespace \
-   --input aggregated=http://localhost:8000 \
-   --model "meta-llama/Meta-Llama-3-8B"
-
-# Benchmark deployment B (different port)
-kubectl port-forward -n my-namespace svc/disagg-frontend 8001:8000 &
-python3 -m benchmarks.utils.benchmark --namespace my-namespace \
-   --input disaggregated=http://localhost:8001 \
-   --model "meta-llama/Meta-Llama-3-8B"
-
-# Generate comparison plots
-python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results
-```
-
 ## Use Cases

 The benchmarking framework supports various comparative analysis scenarios:
@@ -135,12 +111,11 @@ The benchmarking framework supports various comparative analysis scenarios:
 ### Command Line Options

 ```bash
-python3 -m benchmarks.utils.benchmark --namespace NAMESPACE --input <label>=<endpoint_url> [--input <label>=<endpoint_url>]... [OPTIONS]
+python3 -m benchmarks.utils.benchmark --input <label>=<endpoint_url> [--input <label>=<endpoint_url>]... [OPTIONS]

 REQUIRED:
-  -n, --namespace NAMESPACE           Kubernetes namespace
  --input <label>=<endpoint_url>     Benchmark input with custom label
-                                        - <label>: becomes the name/label in plots
+                                        - <label>: becomes the name/label in plots (see Important Notes for restrictions)
                                        - <endpoint_url>: HTTP endpoint URL (e.g., http://localhost:8000)
                                        Can be specified multiple times for comparisons

@@ -179,35 +154,8 @@ The Python plotting module:

 The benchmarking framework supports any HuggingFace-compatible LLM model. Specify your model in the benchmark script's `--model` parameter. It must match the model name of the deployment. You can override the default sequence lengths (2000/256 tokens) with `--isl` and `--osl` flags if needed for your specific workload.

-### Python Script Usage
-
-The benchmarking framework is built around Python modules that provide direct control over the benchmark workflow:
-
-```bash
-# Endpoint benchmarking
-python3 -u -m benchmarks.utils.benchmark \
-   --input experiment-a=http://your-endpoint:8000 \
-   --namespace $NAMESPACE \
-   --isl 2000 \
-   --std 10 \
-   --osl 256 \
-   --output-dir $OUTPUT_DIR
-
-# Deployment benchmarking (any combination)
-python3 -u -m benchmarks.utils.benchmark \
-   --input experiment-a=http://localhost:8000 \
-   --input experiment-b=http://localhost:8005 \
-   --namespace my-namespace \
-   --isl 2000 \
-   --std 10 \
-   --osl 256 \
-   --output-dir ./benchmarks/results
-
-# Generate plots separately
-python3 -m benchmarks.utils.plot --data-dir $OUTPUT_DIR
-```
+The benchmarking framework is built around Python modules that provide direct control over the benchmark workflow. The Python benchmarking module connects to your existing endpoints, runs the benchmarks, and can generate plots. Deployment is user-managed and out of scope for this tool.

-**Note**: The Python benchmarking module connects to your existing endpoints, runs the benchmarks, and can generate plots. Deployment is user-managed and out of scope for this tool.
 ### Comparison Limitations

 The plotting system supports up to 12 different inputs in a single comparison. If you need to compare more than 12 different deployments/endpoints, consider running separate benchmark sessions or grouping related comparisons together.
@@ -218,11 +166,13 @@ You can customize the concurrency levels using the CONCURRENCIES environment var

 ```bash
 # Custom concurrency levels
-CONCURRENCIES="1,5,20,50" python3 -m benchmarks.utils.benchmark --namespace $NAMESPACE --input my-test=http://localhost:8000
+CONCURRENCIES="1,5,20,50" python3 -m benchmarks.utils.benchmark \
+    --input my-test=http://localhost:8000

 # Or set permanently
 export CONCURRENCIES="1,2,5,10,25,50,100"
-python3 -m benchmarks.utils.benchmark --namespace $NAMESPACE --input test=http://localhost:8000
+python3 -m benchmarks.utils.benchmark \
+    --input test=http://localhost:8000
 ```

 ## Understanding Your Results