feat: update how inputs are input into the benchmark script (#3187)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

feat: update how inputs are input into the benchmark script (#3187)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
fb12b67f · hhzhang16 · GitHub · 980727bb · fb12b67f · fb12b67f
Unverified Commit fb12b67f authored Sep 24, 2025 by hhzhang16 Committed by GitHub Sep 24, 2025
5 changed files
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -28,7 +28,8 @@ kubectl port-forward -n <namespace> svc/<frontend-service-name> 8000:8000 > /dev
 # Run benchmark
 python3 -m benchmarks.utils.benchmark \
-    --input my-benchmark=http://localhost:8000 \
+    --benchmark-name my-benchmark \
+    --endpoint-url http://localhost:8000 \
    --model "<your-model>"
 # Generate plots
@@ -43,15 +44,10 @@ python3 -m benchmarks.utils.plot --data-dir ./benchmarks/results --benchmark-nam
 Benchmark any HTTP endpoints! The benchmarking framework supports:
 **Flexible Configuration:**
- User-defined labels for each input using `--input label=value` format
+- User-defined benchmark names using the `--benchmark-name` flag
- Support for multiple inputs to enable comparisons
+- Support for single-endpoint benchmarking with the `--endpoint-url` flag
 - Customizable concurrency levels (configurable via CONCURRENCIES env var), sequence lengths, and models
- Automated performance plot generation with custom labels
+- Automated performance plot generation with custom benchmark names
-**Sequential Execution:**
- Benchmarks are run sequentially, not in parallel
- To avoid interference, ensure only one deployment is utilizing the target GPUs during a run
- This helps produce more comparable measurements across configurations
 **Supported Backends:**
 - DynamoGraphDeployments with port-forwarded endpoints

--- a/benchmarks/incluster/benchmark_job.yaml
+++ b/benchmarks/incluster/benchmark_job.yaml
@@ -17,13 +17,13 @@ spec:
        fsGroup: 1000
      containers:
      - name: benchmark-runner
+        # TODO: update to latest public image in next release
        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
-          readOnlyRootFilesystem: true
          runAsNonRoot: true
        resources:
          requests:
@@ -50,11 +50,10 @@ spec:
          - "256"
          - --output-dir
          - /data/results
-          - --input
+          - --benchmark-name
-          - "qwen-vllm-agg=vllm-agg-frontend:8000"
+          - "qwen3-0p6b-vllm-agg"
-          # add more copies of lines 58-59 for each additional service if you want to benchmark multiple services
+          - --endpoint-url
-          # - --input
+          - "vllm-agg-frontend:8000"
-          # - "name=service-url:port"
        volumeMounts:
          - name: data-volume
            mountPath: /data

--- a/benchmarks/utils/benchmark.py
+++ b/benchmarks/utils/benchmark.py
@@ -6,71 +6,59 @@
 import argparse
 import re
 import sys
-from typing import Dict, Tuple
 from urllib.parse import urlsplit
 from benchmarks.utils.workflow import has_http_scheme, run_benchmark_workflow
 from deploy.utils.kubernetes import is_running_in_cluster
-def validate_inputs(inputs: Dict[str, str]) -> None:
+def validate_endpoint(endpoint: str) -> None:
-    """Validate that all inputs are HTTP endpoints or internal service URLs when running in cluster"""
+    """Validate that endpoint is HTTP endpoint or internal service URL when running in cluster"""
-    for label, value in inputs.items():
+    v = endpoint.strip()
-        v = value.strip()
+    if is_running_in_cluster():
-        if is_running_in_cluster():
+        # Allow HTTP(S) or internal service URLs like host[:port][/path]
-            # Allow HTTP(S) or internal service URLs like host[:port][/path]
+        if has_http_scheme(v):
-            if has_http_scheme(v):
+            pass
-                pass
-            else:
-                parts = urlsplit(f"//{v}")
-                host_ok = bool(parts.hostname)
-                port_ok = parts.port is None or (1 <= parts.port <= 65535)
-                if not (host_ok and port_ok):
-                    raise ValueError(
-                        f"Input '{label}' must be HTTP(S) or internal service URL. Got: {value}"
-                    )
        else:
-            if not has_http_scheme(v):
+            parts = urlsplit(f"//{v}")
-                raise ValueError(f"Input '{label}' must be HTTP endpoint. Got: {value}")
+            host_ok = bool(parts.hostname)
+            port_ok = parts.port is None or (1 <= parts.port <= 65535)
-        # Validate reserved labels
+            if not (host_ok and port_ok):
-        if label.lower() == "plots":
+                raise ValueError(
-            raise ValueError("Label 'plots' is reserved")
+                    f"Endpoint must be HTTP(S) or internal service URL. Got: {endpoint}"
+                )
+    else:
-def parse_input(input_str: str) -> Tuple[str, str]:
+        if not has_http_scheme(v):
-    """Parse input string in format key=value with additional validation"""
+            raise ValueError(f"Endpoint must be HTTP endpoint. Got: {endpoint}")
-    if "=" not in input_str:
-        raise ValueError(f"Invalid input format: {input_str}")
-    parts = input_str.split("=", 1)  # Split on first '=' only
-    if len(parts) != 2:
-        raise ValueError(f"Invalid input format: {input_str}")
-    label, value = parts
-    if not label.strip():
+def validate_benchmark_name(name: str) -> None:
-        raise ValueError("Empty label")
+    """Validate benchmark name"""
-    if not value.strip():
+    if not name.strip():
-        raise ValueError("Empty value")
+        raise ValueError("Benchmark name cannot be empty")
-    label = label.strip()
+    name = name.strip()
-    value = value.strip()
-    # Validate label characters
+    # Validate name characters
-    if not re.match(r"^[a-zA-Z0-9_-]+$", label):
+    if not re.match(r"^[a-zA-Z0-9_-]+$", name):
-        raise ValueError(f"Invalid label: {label}")
+        raise ValueError(f"Invalid benchmark name: {name}")
-    return label, value
+    # Validate reserved names
+    if name.lower() == "plots":
+        raise ValueError("Benchmark name 'plots' is reserved")
 def main() -> int:
    parser = argparse.ArgumentParser(description="Benchmark Orchestrator")
    parser.add_argument(
-        "--input",
+        "--benchmark-name",
-        action="append",
+        required=True,
-        dest="inputs",
+        help="Name/label for this benchmark (used in plots and results)",
-        help="Input in format <label>=<endpoint>. Can be specified multiple times for comparisons.",
+    )
+    parser.add_argument(
+        "--endpoint-url",
+        required=True,
+        help="Endpoint to benchmark: HTTP(S) URL (e.g., http://localhost:8000) or in-cluster service URL host[:port]",
    )
    parser.add_argument("--isl", type=int, default=2000, help="Input sequence length")
    parser.add_argument(
@@ -83,7 +71,7 @@ def main() -> int:
    parser.add_argument(
        "--model",
        default="Qwen/Qwen3-0.6B",
-        help="Model name",
+        help="Model name (must match the model deployed at the endpoint)",
    )
    parser.add_argument(
        "--output-dir", type=str, default="benchmarks/results", help="Output directory"
@@ -91,45 +79,16 @@ def main() -> int:
    args = parser.parse_args()
    # Validate inputs
-    if not args.inputs:
-        print("ERROR: At least one --input must be specified")
-        return 1
-    # Parse inputs
    try:
-        parsed_inputs = {}
+        validate_benchmark_name(args.benchmark_name)
-        for input_str in args.inputs:
+        validate_endpoint(args.endpoint_url)
-            label, value = parse_input(input_str)
-            if label in parsed_inputs:
-                print(
-                    f"ERROR: Duplicate label '{label}' found. Each label must be unique."
-                )
-                return 1
-            parsed_inputs[label] = value
-        # Check for plotting limitations
-        if len(parsed_inputs) > 12:
-            print(
-                f"WARNING: You provided {len(parsed_inputs)} inputs, but the plotting system supports up to 12 inputs."
-            )
-            print(
-                "Consider running separate benchmark sessions or grouping related comparisons together."
-            )
-            print(
-                "Continuing with benchmark, but some inputs may not appear in plots..."
-            )
-            print()
-        # Validate that inputs are HTTP endpoints or in-cluster service URLs
-        validate_inputs(parsed_inputs)
    except ValueError as e:
        print(f"ERROR: {e}")
        return 1
    # Run the benchmark workflow with the parsed inputs
    run_benchmark_workflow(
-        inputs=parsed_inputs,
+        inputs={args.benchmark_name: args.endpoint_url},
        isl=args.isl,
        std=args.std,
        osl=args.osl,

--- a/benchmarks/utils/workflow.py
+++ b/benchmarks/utils/workflow.py
@@ -5,7 +5,6 @@ from pathlib import Path
 from typing import Dict, List
 from benchmarks.utils.genai import run_concurrency_sweep
-from benchmarks.utils.plot import generate_plots
 from deploy.utils.kubernetes import is_running_in_cluster
@@ -68,20 +67,12 @@ def run_endpoint_benchmark(
 def print_final_summary(output_dir: Path, labels: List[str]) -> None:
    """Print final benchmark summary"""
-    print("📊 Generating performance plots...")
-    generate_plots(base_output_dir=output_dir, output_dir=output_dir / "plots")
-    print(f"📈 Plots saved to: {output_dir / 'plots'}")
-    print(f"📋 Summary saved to: {output_dir / 'plots' / 'SUMMARY.txt'}")
-    print()
    print("🎉 Benchmark workflow completed successfully!")
    print(f"📁 All results available at: {output_dir}")
    if labels:
        print(f"🚀 Benchmarked: {', '.join(labels)}")
-    print(f"📊 View plots at: {output_dir / 'plots'}")
 def run_benchmark_workflow(
    inputs: Dict[str, str],

--- a/docs/benchmarks/benchmarking.md
+++ b/docs/benchmarks/benchmarking.md
@@ -110,7 +110,8 @@ kubectl port-forward -n <namespace> svc/<frontend-service-name> 8000:8000 > /dev
 # Benchmark deployment A using Python scripts
 python3 -m benchmarks.utils.benchmark \
-   --input deployment-a=http://localhost:8000 \
+   --benchmark-name deployment-a \
+   --endpoint-url http://localhost:8000 \
   --model "your-model-name" \
   --output-dir ./benchmarks/results
 ```
@@ -125,7 +126,8 @@ kubectl port-forward -n <namespace> svc/<frontend-service-name> 8001:8000 > /dev
 # Benchmark deployment B using Python scripts
 python3 -m benchmarks.utils.benchmark \
-   --input deployment-b=http://localhost:8001 \
+   --benchmark-name deployment-b \
+   --endpoint-url http://localhost:8001 \
   --model "your-model-name" \
   --output-dir ./benchmarks/results
 ```
@@ -155,13 +157,11 @@ The benchmarking framework supports various comparative analysis scenarios:
 ### Command Line Options
 ```bash
-python3 -m benchmarks.utils.benchmark --input <label>=<endpoint_url> [--input <label>=<endpoint_url>]... [OPTIONS]
+python3 -m benchmarks.utils.benchmark --benchmark-name <name> --endpoint-url <endpoint_url> [OPTIONS]
 REQUIRED:
-  --input <label>=<endpoint_url>     Benchmark input with custom label
+  --benchmark-name NAME           Name/label for this benchmark (used in plots and results)
-                                        - <label>: becomes the name/label in plots (see Important Notes for restrictions)
+  --endpoint-url URL              HTTP endpoint URL to benchmark (e.g., http://localhost:8000)
-                                        - <endpoint_url>: HTTP endpoint URL (e.g., http://localhost:8000)
-                                        Can be specified multiple times for comparisons
 OPTIONS:
  -h, --help                    Show help message and examples
@@ -176,8 +176,8 @@ OPTIONS:
 ### Important Notes
- **Custom Labels**: Each input must have a unique label that becomes the name in plots and results
+- **Benchmark Name**: The benchmark name becomes the label in plots and results
- **Label Restrictions**: Labels can only contain letters, numbers, hyphens, and underscores. The label `plots` is reserved.
+- **Name Restrictions**: Names can only contain letters, numbers, hyphens, and underscores. The name `plots` is reserved.
 - **Port-Forwarding**: You must have an exposed endpoint before benchmarking
 - **Model Parameter**: The `--model` parameter configures GenAI-Perf for testing and logging, and must match the model deployed at the endpoint
 - **Sequential Benchmarking**: For comparative benchmarks, deploy and benchmark each configuration separately
@@ -188,10 +188,10 @@ The Python benchmarking module:
 1. **Connects** to your port-forwarded endpoint
 2. **Benchmarks** using GenAI-Perf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250)
 3. **Measures** key metrics: latency, throughput, time-to-first-token
-4. **Saves** results to an output directory organized by input labels
+4. **Saves** results to an output directory organized by benchmark name
 The Python plotting module:
-1. **Generates** comparison plots using your custom labels in `<OUTPUT_DIR>/plots/`
+1. **Generates** comparison plots using your benchmark name in `<OUTPUT_DIR>/plots/`
 2. **Creates** summary statistics and visualizations
 ### Plotting Options
@@ -224,7 +224,7 @@ The benchmarking framework is built around Python modules that provide direct co
 ### Comparison Limitations
-The plotting system supports up to 12 different inputs in a single comparison. If you need to compare more than 12 different deployments/endpoints, consider running separate benchmark sessions or grouping related comparisons together.
+The plotting system supports up to 12 different benchmarks in a single comparison.
 ### Concurrency Configuration
@@ -233,12 +233,14 @@ You can customize the concurrency levels using the CONCURRENCIES environment var
 ```bash
 # Custom concurrency levels
 CONCURRENCIES="1,5,20,50" python3 -m benchmarks.utils.benchmark \
-    --input my-test=http://localhost:8000
+    --benchmark-name my-test \
+    --endpoint-url http://localhost:8000
 # Or set permanently
 export CONCURRENCIES="1,2,5,10,25,50,100"
 python3 -m benchmarks.utils.benchmark \
-    --input test=http://localhost:8000
+    --benchmark-name test \
+    --endpoint-url http://localhost:8000
 ```
 ## Understanding Your Results
@@ -247,62 +249,60 @@ After benchmarking completes, check `./benchmarks/results/` (or your custom outp
 ### Plot Labels and Organization
-The plotting script uses the `--input` labels (the keys before the `=` sign) as the experiment names in all generated plots. For example:
+The plotting script uses the `--benchmark-name` value as the experiment name in all generated plots. For example:
- `--input aggregated=http://localhost:8000` → plots will show "aggregated" as the label
+- `--benchmark-name aggregated` → plots will show "aggregated" as the label
- `--input vllm-disagg=http://localhost:8001` → plots will show "vllm-disagg" as the label
+- `--benchmark-name vllm-disagg` → plots will show "vllm-disagg" as the label
 This allows you to easily identify and compare different configurations in the visualization plots.
 ### Summary and Plots
 ```text
-benchmarks/results/
+benchmarks/results/plots
-├── SUMMARY.txt          # Quick overview of all results
+├── SUMMARY.txt                                     # Quick overview of all results
-└── plots/               # Visual comparisons (these are what you want!)
+├── p50_inter_token_latency_vs_concurrency.png      # Token generation speed
-    ├── p50_inter_token_latency_vs_concurrency.png      # Token generation speed
+├── avg_time_to_first_token_vs_concurrency.png      # Response time
-    ├── avg_time_to_first_token_vs_concurrency.png      # Response time
+├── request_throughput_vs_concurrency.png           # Requests per second
-    ├── request_throughput_vs_concurrency.png           # Requests per second
+├── efficiency_tok_s_gpu_vs_user.png                # GPU efficiency
-    ├── efficiency_tok_s_gpu_vs_user.png                # GPU efficiency
+└── avg_inter_token_latency_vs_concurrency.png      # Average latency
-    └── avg_inter_token_latency_vs_concurrency.png      # Average latency
 ```
 ### Data Files
 Raw data is organized by deployment/benchmark type and concurrency level:
-**For Any Benchmarking (uses your custom labels):**
+**For Any Benchmarking (uses your custom benchmark name):**
 ```text
-results/                          # Client-side: ./benchmarks/results/ or custom dir
+results/                         # Client-side: ./benchmarks/results/ or custom dir
-├── plots/                        # Server-side: /data/results/
+├── plots/                       # Server-side: /data/results/
 │   ├── SUMMARY.txt              # Performance visualization plots
 │   ├── p50_inter_token_latency_vs_concurrency.png
 │   ├── avg_inter_token_latency_vs_concurrency.png
 │   ├── request_throughput_vs_concurrency.png
 │   ├── efficiency_tok_s_gpu_vs_user.png
 │   └── avg_time_to_first_token_vs_concurrency.png
-├── <your-label-1>/              # Results for first input (uses your custom label)
+├── <your-benchmark-name>/       # Results for your benchmark (uses your custom name)
 │   ├── c1/                      # Concurrency level 1
 │   │   └── profile_export_genai_perf.json
 │   ├── c2/                      # Concurrency level 2
 │   ├── c5/                      # Concurrency level 5
 │   └── ...                      # Other concurrency levels (10, 50, 100, 250)
-├── <your-label-2>/              # Results for second input (if provided)
+└── <your-benchmark-name-N>/     # Results for additional benchmarking runs
-│   └── c*/                      # Same structure as above
-└── <your-label-N>/              # Results for additional inputs
    └── c*/                      # Same structure as above
 ```
-**Example with actual labels:**
+**Example with actual benchmark names:**
 ```text
 results/
 ├── plots/
-├── experiment-a/                  # --input experiment-a=http://localhost:8000
+├── experiment-a/                  # --benchmark-name experiment-a
-├── experiment-b/                  # --input experiment-b=http://localhost:8001
+├── experiment-b/                  # --benchmark-name experiment-b
-└── experiment-c/                  # --input experiment-c=http://localhost:8002
+└── experiment-c/                  # --benchmark-name experiment-c
 ```
 Each concurrency directory contains:
 - **`profile_export_genai_perf.json`** - Structured metrics from GenAI-Perf
+- **`profile_export_genai_perf.csv`** - CSV format metrics from GenAI-Perf
 - **`profile_export.json`** - Raw GenAI-Perf results
 - **`inputs.json`** - Generated test inputs
@@ -336,6 +336,7 @@ Deploy your DynamoGraphDeployment using the [deployment documentation](../../com
 ### Step 2: Deploy and Run Benchmark Job
+**Note**: The server-side benchmarking job requires a Docker image containing the Dynamo benchmarking tools. Before the 0.5.1 release, you must build your own Docker image using the [container build instructions](../../container/README.md), push it to your container registry, then update the `image` field in `benchmarks/incluster/benchmark_job.yaml` to use your built image tag.
 ```bash
 export NAMESPACE=benchmarking
@@ -352,7 +353,8 @@ kubectl logs -f job/dynamo-benchmark -n $NAMESPACE
 To customize the benchmark parameters, edit the `benchmarks/incluster/benchmark_job.yaml` file and modify:
 - **Model name**: Change `"Qwen/Qwen3-0.6B"` in the args section
- **Experiment name and service URL**: Change `"qwen-vllm-agg=vllm-agg-frontend:8000"` so the service URL matches your deployed service
+- **Benchmark name**: Change `"qwen3-0p6b-vllm-agg"` to your desired benchmark name
+- **Service URL**: Change `"vllm-agg-frontend:8000"` so the service URL matches your deployed service
 - **Docker image**: Change the image field if needed
 Then deploy:
@@ -365,11 +367,20 @@ kubectl apply -f benchmarks/incluster/benchmark_job.yaml -n $NAMESPACE
 # Download results from PVC (recommended)
 python3 -m deploy.utils.download_pvc_results \
  --namespace $NAMESPACE \
-  --output-dir ./benchmarks/results/${INPUT_NAME} \
+  --output-dir ./benchmarks/results/<benchmark-name> \
-  --folder /data/results/${INPUT_NAME} \
+  --folder /data/results/<benchmark-name> \
  --no-config
 ```
+### Step 4: Generate Plots
+```bash
+# Generate performance plots from the downloaded results
+python3 -m benchmarks.utils.plot \
+  --data-dir ./benchmarks/results
+```
+This will create visualization plots. For more details on interpreting these plots, see the [Summary and Plots](#summary-and-plots) section above.
 ## Cross-Namespace Service Access
 Server-side benchmarking can benchmark services across multiple namespaces from a single job using Kubernetes DNS. When referencing services in other namespaces, use the full DNS format:
@@ -397,7 +408,8 @@ The benchmark job is configured directly in the YAML file.
 ### Default Configuration
 - **Model**: `Qwen/Qwen3-0.6B`
- **Service**: `qwen-vllm-agg=vllm-agg-frontend:8000`
+- **Benchmark Name**: `qwen3-0p6b-vllm-agg`
+- **Service**: `vllm-agg-frontend:8000`
 - **Docker Image**: `nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag`
 ### Customizing the Job
@@ -405,32 +417,36 @@ The benchmark job is configured directly in the YAML file.
 To customize the benchmark, edit `benchmarks/incluster/benchmark_job.yaml`:
 1. **Change the model**: Update the `--model` argument
-2. **Change the experiment name and/or service URL**: Update the `--input` argument (use `svc_name.namespace.svc.cluster.local:port` for cross-namespace access)
+2. **Change the benchmark name**: Update the `--benchmark-name` argument
-3. **Add multiple services**: Uncomment and add more `--input` lines
+3. **Change the service URL**: Update the `--endpoint-url` argument (use `<svc_name>.<namespace>.svc.cluster.local:port` for cross-namespace access)
 4. **Change Docker image**: Update the image field if needed
 ### Example: Multi-Namespace Benchmarking
-To benchmark services across multiple namespaces, modify the `--input` arguments:
+To benchmark services across multiple namespaces, run a separate benchmark job for each service, because each job benchmarks a single endpoint. The results of all jobs are stored in the same PVC, so they can still be accessed together.
 ```yaml
+# Job 1: Production service
 args:
  - --model
  - "Qwen/Qwen3-0.6B"
-  - --isl
+  - --benchmark-name
-  - "2000"
+  - "prod-vllm"
-  - --std
+  - --endpoint-url
-  - "10"
+  - "vllm-agg-frontend.production.svc.cluster.local:8000"
-  - --osl
+  - --output-dir
-  - "256"
+  - /data/results
+# Job 2: Staging service
+args:
+  - --model
+  - "Qwen/Qwen3-0.6B"
+  - --benchmark-name
+  - "staging-vllm"
+  - --endpoint-url
+  - "vllm-agg-frontend.staging.svc.cluster.local:8000"
  - --output-dir
  - /data/results
-  - --input
-  - "prod-vllm=vllm-agg-frontend.production.svc.cluster.local:8000"
-  - --input
-  - "staging-vllm=vllm-agg-frontend.staging.svc.cluster.local:8000"
-  - --input
-  - "dev-vllm=vllm-agg-frontend.development.svc.cluster.local:8000"
 ```
 ## Understanding Your Results
@@ -439,14 +455,7 @@ Results are stored in `/data/results` and follow the same structure as client-si
 ```text
 /data/results/
-├── plots/                           # Performance visualization plots
+└── <benchmark-name>/                # Results for your benchmark name
-│   ├── SUMMARY.txt                  # Human-readable benchmark summary
-│   ├── p50_inter_token_latency_vs_concurrency.png
-│   ├── avg_inter_token_latency_vs_concurrency.png
-│   ├── request_throughput_vs_concurrency.png
-│   ├── efficiency_tok_s_gpu_vs_user.png
-│   └── avg_time_to_first_token_vs_concurrency.png
-└── dsr1/                           # Results for dsr1 input
    ├── c1/                          # Concurrency level 1
    │   └── profile_export_genai_perf.json
    ├── c2/                          # Concurrency level 2