fix: update invalid AIPerf scripts and parsing logic (#3675)

Signed-off-by: Anthony Casagrande <acasagrande@nvidia.com>

fix: update invalid AIPerf scripts and parsing logic (#3675)
Signed-off-by: Anthony Casagrande <acasagrande@nvidia.com>
333ee983 · Anthony Casagrande · GitHub · 2f793b45 · 333ee983 · 333ee983
Unverified commit 333ee983, authored Oct 16, 2025 by Anthony Casagrande; committed by GitHub on Oct 16, 2025
15 changed files
--- a/benchmarks/llm/perf.sh
+++ b/benchmarks/llm/perf.sh
@@ -235,9 +235,8 @@ for concurrency in "${concurrency_array[@]}"; do
    --num-dataset-entries $(($concurrency*12)) \
    --random-seed 100 \
    --artifact-dir ${artifact_dir} \
-    -- \
+    --ui simple \
    -v \
-    --max-threads ${concurrency} \
    -H 'Authorization: Bearer NOT USED' \
    -H 'Accept: text/event-stream'

--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -253,7 +253,7 @@ async def run_profile(args):
                    base_url=base_url,
                )
                if aiperf_result is not None:
-                    ttft = aiperf_result["records"]["ttft"]["avg"]
+                    ttft = aiperf_result["time_to_first_token"]["avg"]
                logger.info("Cleaning up deployment...")
                await client.delete_deployment()
@@ -432,11 +432,9 @@ async def run_profile(args):
                            base_url=base_url,
                        )
                        if aiperf_result is not None:
-                            itl = aiperf_result["records"]["inter_token_latency"]["avg"]
+                            itl = aiperf_result["inter_token_latency"]["avg"]
                            thpt_per_gpu = (
-                                aiperf_result["records"]["output_token_throughput"][
+                                aiperf_result["output_token_throughput"]["avg"]
-                                    "avg"
-                                ]
                                / num_gpus
                            )

--- a/benchmarks/profiler/utils/profile_decode.py
+++ b/benchmarks/profiler/utils/profile_decode.py
@@ -124,10 +124,8 @@ def profile_decode(
            base_url=url,
        )
        if aiperf_result is not None:
-            itl = aiperf_result["records"]["inter_token_latency"]["avg"]
+            itl = aiperf_result["inter_token_latency"]["avg"]
-            thpt_per_gpu = (
+            thpt_per_gpu = aiperf_result["output_token_throughput"]["avg"] / num_gpus
-                aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus
-            )
            return itl, thpt_per_gpu
        return None, None

--- a/benchmarks/profiler/utils/profile_prefill.py
+++ b/benchmarks/profiler/utils/profile_prefill.py
@@ -90,7 +90,7 @@ def profile_prefill(
            base_url=url,
        )
        if aiperf_result is not None:
-            return aiperf_result["records"]["ttft"]["avg"]
+            return aiperf_result["time_to_first_token"]["avg"]
        return None
    return _profile_prefill_helper(

--- a/benchmarks/router/README.md
+++ b/benchmarks/router/README.md
@@ -232,7 +232,7 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli
 > [!Note]
 > At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to loadgen on the trace files:
 > ```bash
-> pip install git+https://github.com/ai-dynamo/aiperf.git#subdirectory=aiperf
+> pip install git+https://github.com/ai-dynamo/aiperf.git
 > ```
 > However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is.

--- a/benchmarks/sin_load_generator/README.md
+++ b/benchmarks/sin_load_generator/README.md
@@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0
 # Sinusoidal Load Generator
-`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main/aiperf).
+`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf).
 ## Usage

--- a/docs/backends/trtllm/gpt-oss.md
+++ b/docs/backends/trtllm/gpt-oss.md
@@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions   -H "Content-Type: application/json"
 ### Performance Testing with AIPerf
-The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
+The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main?tab=readme-ov-file#aiperf), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
 **Run the following benchmark from inside the container** (after completing the deployment steps above):

--- a/recipes/gpt-oss-120b/trtllm/agg/perf.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/perf.yaml
@@ -57,19 +57,20 @@ spec:
            aiperf profile --artifact-dir $ARTIFACT_DIR \
                --model $TARGET_MODEL \
                --tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                --streaming \
                --url http://$ENDPOINT \
                --synthetic-input-tokens-mean $isl \
                --synthetic-input-tokens-stddev 0 \
                --output-tokens-mean $osl \
                --output-tokens-stddev 0 \
-                --extra-inputs "{\"max_tokens\":$osl}" \
+                --extra-inputs "max_tokens:$osl" \
-                --extra-inputs "{\"min_tokens\":$osl}" \
+                --extra-inputs "min_tokens:$osl" \
-                --extra-inputs "{\"ignore_eos\":true}" \
+                --extra-inputs "ignore_eos:true" \
                --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
-                --extra-inputs "{\"repetition_penalty\":1.0}" \
+                --extra-inputs "repetition_penalty:1.0" \
-                --extra-inputs "{\"temperature\": 0.0}" \
+                --extra-inputs "temperature: 0.0" \
                --concurrency $concurrency \
                --request-count $((10*concurrency)) \
                --warmup-request-count $concurrency \

--- a/recipes/llama-3-70b/vllm/agg/perf.yaml
+++ b/recipes/llama-3-70b/vllm/agg/perf.yaml
@@ -49,7 +49,8 @@ spec:
            aiperf profile --artifact-dir $ARTIFACT_DIR \
                --model $TARGET_MODEL \
                --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                --streaming \
                --url http://$ENDPOINT \
                --synthetic-input-tokens-mean $isl \

--- a/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
@@ -49,7 +49,8 @@ spec:
            aiperf profile --artifact-dir $ARTIFACT_DIR \
                --model $TARGET_MODEL \
                --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                --streaming \
                --url http://$ENDPOINT \
                --synthetic-input-tokens-mean $isl \

--- a/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-single-node/perf.yaml
@@ -49,7 +49,8 @@ spec:
            aiperf profile --artifact-dir $ARTIFACT_DIR \
                --model $TARGET_MODEL \
                --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
-                --endpoint-type chat  --endpoint /v1/chat/completions \
+                --endpoint-type chat  \
+                --endpoint /v1/chat/completions \
                --streaming \
                --url http://$ENDPOINT \
                --synthetic-input-tokens-mean $isl \

--- a/tests/fault_tolerance/deploy/client.py
+++ b/tests/fault_tolerance/deploy/client.py
@@ -383,27 +383,19 @@ def log_summary_metrics(
            with open(profile_json) as f:
                metrics = json.load(f)
-            # Extract key metrics from AI-Perf format
+            # Request count
-            records = metrics.get("records", {})
+            request_count = int(metrics.get("request_count", {}).get("avg", 0))
-            # Request count from request_count record
-            request_count_record = records.get("request_count", {})
-            request_count = (
-                int(request_count_record.get("avg", 0)) if request_count_record else 0
-            )
            # Check for errors
-            error_summary = metrics.get("error_summary", [])
+            error_count = len(metrics.get("error_summary", []))
-            error_count = len(error_summary)
            # Latency metrics (in milliseconds)
-            request_latency = records.get("request_latency", {})
+            request_latency = metrics.get("request_latency", {})
            avg_latency = request_latency.get("avg", 0) / 1000.0  # Convert to seconds
            p99_latency = request_latency.get("p99", 0) / 1000.0  # Convert to seconds
            # Throughput metrics
-            request_throughput = records.get("request_throughput", {})
+            throughput = metrics.get("request_throughput", {}).get("avg", 0)
-            throughput = request_throughput.get("avg", 0)
            # Log summary
            logger.info(
@@ -417,7 +409,7 @@ def log_summary_metrics(
            # Log success rate
            if request_count > 0:
-                success_rate = (request_count - error_count) / request_count * 100
+                success_rate = ((request_count - error_count) / request_count) * 100
                logger.info(f"Success rate: {success_rate:.1f}%")
            # Also write summary to CSV file for aggregation

--- a/tests/fault_tolerance/deploy/parse_results.py
+++ b/tests/fault_tolerance/deploy/parse_results.py
@@ -293,63 +293,44 @@ def parse_aiperf_client_results(log_dir: str) -> Dict[str, Any]:
                with open(profile_json) as f:
                    client_metrics = json.load(f)
-                # AI-Perf format has "records" dictionary at the top level
+                # Extract request count (this is the total successful requests made)
-                records = client_metrics.get("records", {})
+                request_count = int(
+                    client_metrics.get("request_count", {}).get("avg", 0)
-                # Extract request count (this is the total requests made)
-                request_count_record = records.get("request_count", {})
-                request_count = (
-                    int(request_count_record.get("avg", 0))
-                    if request_count_record
-                    else 0
                )
                # Check for errors in error_summary
-                error_summary = client_metrics.get("error_summary", [])
+                error_count = len(client_metrics.get("error_summary", []))
-                error_count = len(error_summary)
                # Check if test was cancelled
-                was_cancelled = client_metrics.get("was_cancelled", False)
+                if client_metrics.get("was_cancelled", False):
-                if was_cancelled:
                    error_count = request_count  # Mark all as failed if cancelled
                all_metrics["total_requests"] += request_count
                all_metrics["successful_requests"] += request_count - error_count
                all_metrics["failed_requests"] += error_count
-                # Extract latency from request_latency record
+                # Extract latency metrics
-                request_latency = records.get("request_latency", {})
+                request_latency = client_metrics.get("request_latency", None)
                if request_latency:
-                    # Convert milliseconds to seconds for consistency
+                    all_metrics["latencies"].append(request_latency["avg"] / 1000.0)
-                    if "avg" in request_latency:
+                    all_metrics["p50_latencies"].append(request_latency["p50"] / 1000.0)
-                        all_metrics["latencies"].append(request_latency["avg"] / 1000.0)
+                    all_metrics["p90_latencies"].append(request_latency["p90"] / 1000.0)
-                    if "p50" in request_latency:
+                    all_metrics["p99_latencies"].append(request_latency["p99"] / 1000.0)
-                        all_metrics["p50_latencies"].append(
-                            request_latency["p50"] / 1000.0
+                # Time to first token
-                        )
+                ttft = client_metrics.get("time_to_first_token", {}).get("avg", None)
-                    if "p90" in request_latency:
+                if ttft:
-                        all_metrics["p90_latencies"].append(
+                    all_metrics["ttft"].append(ttft / 1000.0)  # Convert ms to s
-                            request_latency["p90"] / 1000.0
-                        )
+                # Inter-token latency
-                    if "p99" in request_latency:
+                itl = client_metrics.get("inter_token_latency", {}).get("avg", None)
-                        all_metrics["p99_latencies"].append(
+                if itl:
-                            request_latency["p99"] / 1000.0
+                    all_metrics["itl"].append(itl / 1000.0)  # Convert ms to s
-                        )
-                # Time to first token (if available in records)
-                ttft = records.get("time_to_first_token", {}) or records.get("ttft", {})
-                if ttft and "avg" in ttft:
-                    all_metrics["ttft"].append(ttft["avg"] / 1000.0)  # Convert ms to s
-                # Inter-token latency (if available in records)
-                itl = records.get("inter_token_latency", {}) or records.get("itl", {})
-                if itl and "avg" in itl:
-                    all_metrics["itl"].append(itl["avg"] / 1000.0)  # Convert ms to s
                # Throughput from request_throughput record
-                request_throughput = records.get("request_throughput", {})
+                req_throughput = client_metrics.get("request_throughput", {}).get(
-                req_throughput = request_throughput.get("avg", 0)
+                    "avg", 0
+                )
                if req_throughput:
                    all_metrics["throughputs"].append(req_throughput)

--- a/tests/planner/README.md
+++ b/tests/planner/README.md
@@ -227,7 +227,7 @@ aiperf profile \
  --input-file payload:/workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \
  --fixed-schedule True \
  --goodput time_to_first_token:200 inter_token_latency:10 \
-  -v \
+  -v
 ```
 > [!NOTE]

--- a/tests/planner/utils/load_generator.py
+++ b/tests/planner/utils/load_generator.py
@@ -116,8 +116,6 @@ class LoadGenerator:
            str(params["request_rate"]),
            "--request-count",
            str(request_count),  # Use request count to limit test duration
-            "--stability-percentage",
-            "50",
            "--num-dataset-entries",
            str(
                max(20, int(params["request_rate"] * 10))
@@ -210,35 +208,16 @@ class LoadGenerator:
            logger.info(f"Parsing results from: {results_file}")
            with open(results_file, "r") as f:
-                data = json.load(f)
+                metrics = json.load(f)
-            results = {}
+            results = {
-            if "experiments" in data and data["experiments"]:
+                "throughput": metrics.get("output_token_throughput", {}).get("avg", 0),
-                exp = data["experiments"][0]
+                "ttft_mean": metrics.get("time_to_first_token", {}).get("avg", 0),
-                if "perf_metrics" in exp:
+                "itl_mean": metrics.get("inter_token_latency", {}).get("avg", 0),
-                    metrics = exp["perf_metrics"]
+                "end_to_end_latency_mean": metrics.get("request_latency", {}).get(
-                    results.update(
+                    "avg", 0
-                        {
+                ),
-                            "throughput": metrics.get("throughput", {}).get("avg", 0),
+            }
-                            "ttft_mean": metrics.get("ttft", {}).get("avg", 0),
-                            "itl_mean": metrics.get("inter_token_latency", {}).get(
-                                "avg", 0
-                            ),
-                            "end_to_end_latency_mean": metrics.get(
-                                "request_latency", {}
-                            ).get("avg", 0),
-                        }
-                    )
-            if not results and "profile_export_aiperf" in data:
-                summary = data.get("summary", {})
-                results.update(
-                    {
-                        "throughput": summary.get("throughput", 0),
-                        "ttft_mean": summary.get("time_to_first_token_ms", 0),
-                        "itl_mean": summary.get("inter_token_latency_ms", 0),
-                    }
-                )
            logger.info(f"Parsed results: {results}")
            return results