feat: update benchmarking script to use aiperf (#3306)

Signed-off-by: Biswa Panda <biswa.panda@gmail.com>
Signed-off-by: lkomali <lkomali@nvidia.com>
Co-authored-by: lkomali <lkomali@nvidia.com>
Co-authored-by: Harshini Komali <157742537+lkomali@users.noreply.github.com>

feat: update benchmarking script to use aiperf (#3306)
Signed-off-by: Biswa Panda <biswa.panda@gmail.com>
Signed-off-by: lkomali <lkomali@nvidia.com>
Co-authored-by: lkomali <lkomali@nvidia.com>
Co-authored-by: Harshini Komali <157742537+lkomali@users.noreply.github.com>
de3ca70b · Biswa Panda · GitHub · 9b0948c6 · de3ca70b · de3ca70b
Unverified Commit de3ca70b authored Oct 13, 2025 by Biswa Panda Committed by GitHub Oct 14, 2025
20 changed files
--- a/README.md
+++ b/README.md
@@ -178,7 +178,7 @@ Rerun with `curl -N` and change `stream` in the request to `true` to get the res

 Dynamo provides comprehensive benchmarking tools to evaluate and optimize your deployments:

- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using GenAI-Perf
+- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using AIPerf
 - **[Pre-Deployment Profiling](docs/benchmarks/pre_deployment_profiling.md)** – Optimize configurations before deployment to meet SLA requirements

 # Engines

--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -15,7 +15,7 @@

 # Benchmarks

-This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around genai-perf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints.
+This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around aiperf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints.

 ## Quick Start


--- a/benchmarks/llm/perf.sh
+++ b/benchmarks/llm/perf.sh
@@ -202,7 +202,7 @@ if [ $index -gt 0 ]; then
    echo "--------------------------------"
 fi

-echo "Running genai-perf with:"
+echo "Running aiperf with:"
 echo "Model: $model"
 echo "ISL: $isl"
 echo "OSL: $osl"
@@ -214,7 +214,7 @@ for concurrency in "${concurrency_array[@]}"; do

  # NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like
  # `ignore_eos` since they are not in the official OpenAI spec.
-  genai-perf profile \
+  aiperf profile \
    --model ${model} \
    --tokenizer ${model} \
    --endpoint-type chat \

--- a/benchmarks/llm/plot_pareto.py
+++ b/benchmarks/llm/plot_pareto.py
@@ -26,7 +26,7 @@ from matplotlib.ticker import MultipleLocator


 def get_json_paths(search_paths):
-    genai_perf_profile_export_json_paths = []
+    aiperf_profile_export_json_paths = []
    deployment_config_json_paths = []
    for search_path in search_paths:
        deployment_config_json_path = os.path.join(
@@ -34,15 +34,13 @@ def get_json_paths(search_paths):
        )
        if not os.path.exists(deployment_config_json_path):
            raise Exception(f"deployment_config.json not found in {search_path}")
-        for root, dirs, files in os.walk(search_path):
+        for root, _, files in os.walk(search_path):
            for file in files:
-                if file == "profile_export_genai_perf.json":
-                    genai_perf_profile_export_json_paths.append(
-                        os.path.join(root, file)
-                    )
+                if file == "profile_export_aiperf.json":
+                    aiperf_profile_export_json_paths.append(os.path.join(root, file))
                    deployment_config_json_paths.append(deployment_config_json_path)

-    return genai_perf_profile_export_json_paths, deployment_config_json_paths
+    return aiperf_profile_export_json_paths, deployment_config_json_paths


 # search for -concurrency<number> in the name
@@ -81,13 +79,13 @@ def parse_kind_and_mode(deployment_config_json_path):


 def extract_val_and_concurrency(
-    genai_perf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg"
+    aiperf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg"
 ):
    results = []
-    for genai_perf_profile_export_json_path, deployment_config_json_path in zip(
-        genai_perf_profile_export_json_paths, deployment_config_json_paths
+    for aiperf_profile_export_json_path, deployment_config_json_path in zip(
+        aiperf_profile_export_json_paths, deployment_config_json_paths
    ):
-        with open(genai_perf_profile_export_json_path, "r") as f:
+        with open(aiperf_profile_export_json_path, "r") as f:
            data = json.load(f)
            # output_token_throughput contains only avg
            output_token_throughput = data.get("output_token_throughput", {}).get("avg")
@@ -99,7 +97,7 @@ def extract_val_and_concurrency(
            # request_throughput contains only avg
            request_throughput = data.get("request_throughput", {}).get("avg")

-        concurrency = parse_concurrency(genai_perf_profile_export_json_path)
+        concurrency = parse_concurrency(aiperf_profile_export_json_path)
        num_gpus = parse_gpus(deployment_config_json_path)
        kind, mode = parse_kind_and_mode(deployment_config_json_path)

@@ -116,7 +114,7 @@ def extract_val_and_concurrency(

        results.append(
            {
-                "configuration": genai_perf_profile_export_json_path,
+                "configuration": aiperf_profile_export_json_path,
                "kind": kind,
                "mode": mode,
                "num_gpus": num_gpus,
@@ -241,12 +239,12 @@ if __name__ == "__main__":
    import os

    parser = argparse.ArgumentParser(
-        description="Plot Pareto graph from GenAI-Perf artifacts"
+        description="Plot Pareto graph from AIPerf artifacts"
    )
    parser.add_argument(
        "--artifacts-root-dir",
        required=True,
-        help="Root directory containing artifact directories to search for profile_export_genai_perf.json files",
+        help="Root directory containing artifact directories to search for profile_export_aiperf.json files",
    )
    parser.add_argument(
        "--title",
@@ -260,16 +258,16 @@ if __name__ == "__main__":
    if not artifacts_dirs:
        raise ValueError(f"No artifacts directories found in {args.artifacts_root_dir}")

-    genai_perf_profile_export_json_paths, deployment_config_json_paths = get_json_paths(
+    aiperf_profile_export_json_paths, deployment_config_json_paths = get_json_paths(
        artifacts_dirs
    )

-    if len(genai_perf_profile_export_json_paths) != len(deployment_config_json_paths):
+    if len(aiperf_profile_export_json_paths) != len(deployment_config_json_paths):
        raise ValueError(
-            f"Number of genai_perf_profile_export_json_paths ({len(genai_perf_profile_export_json_paths)}) does not match number of deployment_config_json_paths ({len(deployment_config_json_paths)})"
+            f"Number of aiperf_profile_export_json_paths ({len(aiperf_profile_export_json_paths)}) does not match number of deployment_config_json_paths ({len(deployment_config_json_paths)})"
        )

    extracted_values = extract_val_and_concurrency(
-        genai_perf_profile_export_json_paths, deployment_config_json_paths
+        aiperf_profile_export_json_paths, deployment_config_json_paths
    )
    create_pareto_graph(extracted_values, title=args.title)
--- a/benchmarks/profiler/profile_endpoint.py
+++ b/benchmarks/profiler/profile_endpoint.py
@@ -5,8 +5,8 @@ import argparse
 import logging
 import os

-from utils.profile_decode import profile_decode
-from utils.profile_prefill import profile_prefill
+from benchmarks.profiler.utils.profile_decode import profile_decode
+from benchmarks.profiler.utils.profile_prefill import profile_prefill

 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)

--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -22,13 +22,13 @@ import os
 import numpy as np
 import yaml

+from benchmarks.profiler.utils.aiperf import benchmark_decode, benchmark_prefill
 from benchmarks.profiler.utils.config import (
    CONFIG_MODIFIERS,
    WORKER_COMPONENT_NAMES,
    generate_dgd_config_with_planner,
 )
 from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
-from benchmarks.profiler.utils.genai_perf import benchmark_decode, benchmark_prefill
 from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser
 from benchmarks.profiler.utils.plot import (
    plot_decode_performance,
@@ -245,18 +245,18 @@ async def run_profile(args):
                    f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
                )

-                # run genai-perf
+                # run aiperf
                base_url = client.get_service_url()
-                genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}"
-                gap_result = benchmark_prefill(
+                ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{args.isl}"
+                aiperf_result = benchmark_prefill(
                    args.isl,
-                    genai_perf_artifact_dir,
+                    ai_perf_artifact_dir,
                    model_name,
                    model_name,
                    base_url=base_url,
                )
-                if gap_result is not None:
-                    ttft = gap_result["time_to_first_token"]["avg"]
+                if aiperf_result is not None:
+                    ttft = aiperf_result["records"]["ttft"]["avg"]

                logger.info("Cleaning up deployment...")
                await client.delete_deployment()
@@ -424,20 +424,23 @@ async def run_profile(args):
                        )
                    else:
                        base_url = client.get_service_url()
-                        genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}"
-                        gap_result = benchmark_decode(
+                        ai_perf_artifact_dir = f"{work_dir}/aiperf_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}"
+                        aiperf_result = benchmark_decode(
                            args.isl,
                            args.osl,
                            num_request,
-                            genai_perf_artifact_dir,
+                            ai_perf_artifact_dir,
                            model_name,
                            model_name,
                            base_url=base_url,
                        )
-                        if gap_result is not None:
-                            itl = gap_result["inter_token_latency"]["avg"]
+                        if aiperf_result is not None:
+                            itl = aiperf_result["records"]["inter_token_latency"]["avg"]
                            thpt_per_gpu = (
-                                gap_result["output_token_throughput"]["avg"] / num_gpus
+                                aiperf_result["records"]["output_token_throughput"][
+                                    "avg"
+                                ]
+                                / num_gpus
                            )

                    if itl is not None and thpt_per_gpu is not None:

--- a/benchmarks/profiler/utils/genai_perf.py
+++ b/benchmarks/profiler/utils/genai_perf.py
@@ -30,7 +30,7 @@ console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)


-def _get_common_genai_perf_cmd(
+def _get_common_aiperf_cmd(
    artifact_dir,
    seed=100,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
@@ -38,7 +38,7 @@ def _get_common_genai_perf_cmd(
    base_url="http://localhost:8000",
 ):
    return [
-        "genai-perf",
+        "aiperf",
        "profile",
        "--model",
        model,
@@ -64,7 +64,7 @@ def _get_common_genai_perf_cmd(
    ]


-def get_prefill_genai_perf_cmd(
+def get_prefill_aiperf_cmd(
    isl,
    artifact_dir,
    seed=100,
@@ -73,7 +73,7 @@ def get_prefill_genai_perf_cmd(
    osl=5,
    base_url="http://localhost:8000",
 ):
-    return _get_common_genai_perf_cmd(
+    return _get_common_aiperf_cmd(
        artifact_dir,
        seed,
        model,
@@ -99,7 +99,7 @@ def get_prefill_genai_perf_cmd(
    ]


-def get_decode_genai_perf_cmd(
+def get_decode_aiperf_cmd(
    isl,
    osl,
    artifact_dir,
@@ -109,7 +109,7 @@ def get_decode_genai_perf_cmd(
    tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    base_url="http://localhost:8000",
 ):
-    return _get_common_genai_perf_cmd(
+    return _get_common_aiperf_cmd(
        artifact_dir,
        seed,
        model,
@@ -137,15 +137,15 @@ def get_decode_genai_perf_cmd(
    ]


-def get_gap_result(artifact_dir: str) -> dict:
+def get_aiperf_result(artifact_dir: str) -> dict:
    json_file_path = None
    for root, _, files in os.walk(artifact_dir):
-        if "profile_export_genai_perf.json" in files:
-            json_file_path = os.path.join(root, "profile_export_genai_perf.json")
+        if "profile_export_aiperf.json" in files:
+            json_file_path = os.path.join(root, "profile_export_aiperf.json")
            break
    if json_file_path is None:
        raise FileNotFoundError(
-            f"profile_export_genai_perf.json not found in {artifact_dir}"
+            f"profile_export_aiperf.json not found in {artifact_dir}"
        )
    with open(json_file_path, "r") as f:
        return json.load(f)
@@ -153,35 +153,35 @@ def get_gap_result(artifact_dir: str) -> dict:

 def benchmark_prefill(
    isl,
-    genai_perf_artifact_dir,
+    aiperf_artifact_dir,
    model_name,
    tokenizer,
    base_url="http://localhost:8000",
 ):
-    logger.info(f"Running genai-perf with isl {isl}")
-    genai_perf_cmd = get_prefill_genai_perf_cmd(
+    logger.info(f"Running aiperf with isl {isl}")
+    aiperf_cmd = get_prefill_aiperf_cmd(
        isl,
-        genai_perf_artifact_dir,
+        aiperf_artifact_dir,
        model=model_name,
        tokenizer=tokenizer,
        base_url=base_url,
    )
-    print(f"genai-perf cmd: {genai_perf_cmd}")
+    print(f"aiperf cmd: {aiperf_cmd}")
    # import pdb; pdb.set_trace()
-    gap_process = subprocess.Popen(
-        genai_perf_cmd,
+    aiperf_process = subprocess.Popen(
+        aiperf_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
-    stdout, stderr = gap_process.communicate()
-    if gap_process.returncode == 0:
-        logger.info("Genai-perf profiling completed successfully")
+    stdout, stderr = aiperf_process.communicate()
+    if aiperf_process.returncode == 0:
+        logger.info("AIperf profiling completed successfully")
        logger.info(stdout)
-        gap_result = get_gap_result(genai_perf_artifact_dir)
-        return gap_result
+        aiperf_result = get_aiperf_result(aiperf_artifact_dir)
+        return aiperf_result
    else:
-        logger.error(f"Genai-perf failed with error code: {gap_process.returncode}")
+        logger.error(f"AIPerf failed with error code: {aiperf_process.returncode}")
        logger.error(f"stderr: {stderr}")
        return None

@@ -190,7 +190,7 @@ def benchmark_decode(
    isl,
    osl,
    num_request,
-    genai_perf_artifact_dir,
+    aiperf_artifact_dir,
    model_name,
    tokenizer,
    base_url="http://localhost:8000",
@@ -201,47 +201,47 @@ def benchmark_decode(
    # we use the same random seed to make sure the prompt is the same
    seed = random.randint(0, 1000000)

-    genai_perf_cmd = get_decode_genai_perf_cmd(
+    aiperf_cmd = get_decode_aiperf_cmd(
        isl,
        osl,
-        f"{genai_perf_artifact_dir}_warmup",
+        f"{aiperf_artifact_dir}_warmup",
        num_request,
        seed=seed,
        model=model_name,
        tokenizer=tokenizer,
        base_url=base_url,
    )
-    gap_process = subprocess.Popen(
-        genai_perf_cmd,
+    aiperf_process = subprocess.Popen(
+        aiperf_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
-    gap_process.communicate()
+    aiperf_process.communicate()
    # then send out the real requests, hopefully, this will skip all prefill computation
-    genai_perf_cmd = get_decode_genai_perf_cmd(
+    aiperf_cmd = get_decode_aiperf_cmd(
        isl,
        osl,
-        genai_perf_artifact_dir,
+        aiperf_artifact_dir,
        num_request,
        seed=seed,
        model=model_name,
        tokenizer=tokenizer,
        base_url=base_url,
    )
-    gap_process = subprocess.Popen(
-        genai_perf_cmd,
+    aiperf_process = subprocess.Popen(
+        aiperf_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
-    stdout, stderr = gap_process.communicate()
-    if gap_process.returncode == 0:
-        logger.info("Genai-perf profiling completed successfully")
+    stdout, stderr = aiperf_process.communicate()
+    if aiperf_process.returncode == 0:
+        logger.info("AIperf profiling completed successfully")
        logger.info(stdout)
-        gap_result = get_gap_result(genai_perf_artifact_dir)
-        return gap_result
+        aiperf_result = get_aiperf_result(aiperf_artifact_dir)
+        return aiperf_result
    else:
-        logger.error(f"Genai-perf failed with error code: {gap_process.returncode}")
+        logger.error(f"AIPerf failed with error code: {aiperf_process.returncode}")
        logger.error(f"stderr: {stderr}")
        return None
--- a/benchmarks/profiler/utils/profile_cache.py
+++ b/benchmarks/profiler/utils/profile_cache.py
@@ -26,13 +26,13 @@ logger = logging.getLogger(__name__)
 def check_prefill_results_exist(output_dir: str, tp_size: int, isl: int) -> bool:
    """Check if prefill results already exist for a given TP size."""
    work_dir = f"{output_dir}/prefill_tp{tp_size}"
-    result_file = f"{work_dir}/gap_isl{isl}/*/profile_export_genai_perf.json"
+    result_file = f"{work_dir}/aiperf_isl{isl}/*/profile_export_aiperf.json"

    # Check if the work directory exists
    if not os.path.exists(work_dir):
        return False

-    # Look for the genai-perf result file
+    # Look for the aiperf result file
    result_files = glob.glob(result_file)
    if not result_files:
        return False
@@ -65,7 +65,7 @@ def check_decode_results_exist(

    # Look for at least one decode result file
    result_pattern = (
-        f"{work_dir}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json"
+        f"{work_dir}/aiperf_request*_isl{isl}_osl{osl}_n*/*/profile_export_aiperf.json"
    )
    result_files = glob.glob(result_pattern)

@@ -93,7 +93,7 @@ def load_existing_prefill_results(
 ) -> Tuple[Optional[float], Optional[float]]:
    """Load existing prefill results from disk."""
    work_dir = f"{output_dir}/prefill_tp{tp_size}"
-    result_file = f"{work_dir}/gap_isl{isl}/*/profile_export_genai_perf.json"
+    result_file = f"{work_dir}/aiperf_isl{isl}/*/profile_export_aiperf.json"

    result_files = glob.glob(result_file)
    if result_files:
@@ -115,7 +115,7 @@ def load_existing_decode_results(
    work_dir = f"{output_dir}/decode_tp{tp_size}"

    result_pattern = (
-        f"{work_dir}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json"
+        f"{work_dir}/aiperf_request*_isl{isl}_osl{osl}_n*/*/profile_export_aiperf.json"
    )
    result_files = glob.glob(result_pattern)

@@ -128,7 +128,7 @@ def load_existing_decode_results(
                thpt_per_gpu = data["output_token_throughput"]["avg"] / tp_size

                # Extract concurrency from filename
-                match = re.search(r"gap_request(\d+)_", result_file)
+                match = re.search(r"aiperf_request(\d+)_", result_file)
                if match:
                    concurrency = int(match.group(1))
                    decode_results.append((itl, thpt_per_gpu, concurrency))

--- a/benchmarks/profiler/utils/profile_decode.py
+++ b/benchmarks/profiler/utils/profile_decode.py
@@ -6,9 +6,9 @@ from typing import Callable, Optional, Tuple

 import numpy as np

+from benchmarks.profiler.utils.aiperf import benchmark_decode
 from benchmarks.profiler.utils.defaults import DECODE_MAX_CONCURRENCY
 from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
-from benchmarks.profiler.utils.genai_perf import benchmark_decode
 from benchmarks.profiler.utils.plot import plot_decode_3d_surface

 logger = logging.getLogger(__name__)
@@ -113,19 +113,21 @@ def profile_decode(
    attention_dp_size,
 ):
    def get_itl_and_thpt_per_gpu(isl, osl, num_request):
-        genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
-        gap_result = benchmark_decode(
+        ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}_osl{osl}_n{num_request}"
+        aiperf_result = benchmark_decode(
            isl,
            osl,
            num_request,
-            genai_perf_artifact_dir,
+            ai_perf_artifact_dir,
            model_name,
            tokenizer,
            base_url=url,
        )
-        if gap_result is not None:
-            itl = gap_result["inter_token_latency"]["avg"]
-            thpt_per_gpu = gap_result["output_token_throughput"]["avg"] / num_gpus
+        if aiperf_result is not None:
+            itl = aiperf_result["records"]["inter_token_latency"]["avg"]
+            thpt_per_gpu = (
+                aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus
+            )
            return itl, thpt_per_gpu
        return None, None


--- a/benchmarks/profiler/utils/profile_prefill.py
+++ b/benchmarks/profiler/utils/profile_prefill.py
@@ -6,8 +6,8 @@ from typing import Callable, Optional

 import numpy as np

+from benchmarks.profiler.utils.aiperf import benchmark_prefill
 from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
-from benchmarks.profiler.utils.genai_perf import benchmark_prefill
 from benchmarks.profiler.utils.plot import plot_prefill_interpolation

 logger = logging.getLogger(__name__)
@@ -81,16 +81,16 @@ def profile_prefill(
    interpolation_granularity,
 ):
    def get_ttft(isl):
-        genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
-        gap_result = benchmark_prefill(
+        ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}"
+        aiperf_result = benchmark_prefill(
            isl,
-            genai_perf_artifact_dir,
+            ai_perf_artifact_dir,
            model_name,
            tokenizer,
            base_url=url,
        )
-        if gap_result is not None:
-            return gap_result["time_to_first_token"]["avg"]
+        if aiperf_result is not None:
+            return aiperf_result["records"]["ttft"]["avg"]
        return None

    return _profile_prefill_helper(

--- a/benchmarks/utils/genai.py
+++ b/benchmarks/utils/genai.py
@@ -33,7 +33,7 @@ def get_concurrency_levels() -> List[int]:
 CONCURRENCIES: List[int] = get_concurrency_levels()


-def run_genai_perf(
+def run_aiperf(
    service_url: str,
    model_name: str,
    isl: int,
@@ -44,7 +44,7 @@ def run_genai_perf(
 ) -> None:
    output_dir.mkdir(parents=True, exist_ok=True)
    cmd = [
-        "genai-perf",
+        "aiperf",
        "profile",
        "-m",
        model_name,
@@ -76,28 +76,28 @@ def run_genai_perf(
        "--max-threads=300",
    ]
    print(
-        f"Running genai-perf with isl {isl}, osl {osl}, concurrency {concurrency}",
+        f"Running aiperf with isl {isl}, osl {osl}, concurrency {concurrency}",
        flush=True,
    )

-    gap_process = subprocess.Popen(
+    aip_process = subprocess.Popen(
        cmd,
        cwd=str(output_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
-    stdout, stderr = gap_process.communicate()
-    if gap_process.returncode == 0:
-        print("Genai-perf profiling completed successfully", flush=True)
+    stdout, stderr = aip_process.communicate()
+    if aip_process.returncode == 0:
+        print("Aiperf profiling completed successfully", flush=True)
        if stdout:
            print(stdout)
    else:
-        print(f"Genai-perf failed with error code: {gap_process.returncode}")
+        print(f"Aiperf failed with error code: {aip_process.returncode}")
        if stderr:
            print(f"stderr: {stderr}")
        raise subprocess.CalledProcessError(
-            gap_process.returncode, cmd, output=stdout, stderr=stderr
+            aip_process.returncode, cmd, output=stdout, stderr=stderr
        )


@@ -113,6 +113,4 @@ def run_concurrency_sweep(

    for c in concurrency_levels:
        print(f"Starting concurrency level {c}", flush=True)
-        run_genai_perf(
-            service_url, model_name, isl, osl, stddev, c, output_dir / f"c{c}"
-        )
+        run_aiperf(service_url, model_name, isl, osl, stddev, c, output_dir / f"c{c}")
--- a/benchmarks/utils/plot.py
+++ b/benchmarks/utils/plot.py
@@ -32,22 +32,22 @@ def parse_benchmark_results(result_dir: Path) -> List[Tuple[int, Dict]]:
            continue
        concurrency = int(match.group(1))

-        # Find the genai-perf JSON file
-        genai_perf_json = None
-        for json_file in concurrency_dir.rglob("profile_export_genai_perf.json"):
-            genai_perf_json = json_file
+        # Find the aiperf JSON file
+        aiperf_json = None
+        for json_file in concurrency_dir.rglob("profile_export_aiperf.json"):
+            aiperf_json = json_file
            break

-        if genai_perf_json and genai_perf_json.exists():
+        if aiperf_json and aiperf_json.exists():
            try:
-                with open(genai_perf_json, "r") as f:
+                with open(aiperf_json, "r") as f:
                    metrics = json.load(f)
                results.append((concurrency, metrics))
                print(f"Loaded metrics for concurrency {concurrency}")
            except Exception as e:
-                print(f"Error loading {genai_perf_json}: {e}")
+                print(f"Error loading {aiperf_json}: {e}")
        else:
-            print(f"Warning: No genai-perf JSON found for {concurrency_dir}")
+            print(f"Warning: No aiperf JSON found for {concurrency_dir}")

    # Sort by concurrency level
    results.sort(key=lambda x: x[0])

--- a/container/Dockerfile.sglang
+++ b/container/Dockerfile.sglang
@@ -64,6 +64,7 @@ RUN apt-get update -y \
        # Python runtime - CRITICAL for virtual environment to work
        python${PYTHON_VERSION}-dev \
        build-essential \
+        git \
        # SGLang build dependencies
        cmake \
        ibverbs-providers \
@@ -147,6 +148,7 @@ RUN apt-get update && \
        build-essential \
        # jq and curl for polling various endpoints and health checks
        jq \
+        git \
        curl \
        # Libraries required by UCX to find RDMA devices
        libibverbs1 rdma-core ibverbs-utils libibumad3 \

--- a/container/Dockerfile.sglang-wideep
+++ b/container/Dockerfile.sglang-wideep
@@ -17,7 +17,7 @@ ARG CARGO_BUILD_JOBS="16"
 RUN apt-get update -y && \
    apt-get install -y \
      cmake meson ninja-build pybind11-dev patchelf net-tools \
-      build-essential protobuf-compiler libssl-dev pkg-config \
+      build-essential protobuf-compiler libssl-dev pkg-config git \
      clang libclang-dev git rapidjson-dev zlib1g-dev jq && \
    pip install --break-system-packages meson-python wheel build

@@ -128,7 +128,7 @@ RUN git clone --depth=1 \
    cmake --build perf_analyzer/build -- -j$(nproc)

 ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH
-RUN pip install --break-system-packages genai-perf
+RUN pip install --break-system-packages aiperf

 # Enable forceful shutdown of inflight requests
 ENV SGL_FORCE_SHUTDOWN=1

--- a/container/Dockerfile.trtllm
+++ b/container/Dockerfile.trtllm
@@ -76,6 +76,7 @@ RUN apt-get update && \
        build-essential \
        g++ \
        ninja-build \
+        git \
        # Python runtime - CRITICAL for virtual environment to work
        python${PYTHON_VERSION}-dev \
        python3-pip \

--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -187,6 +187,7 @@ RUN apt-get update && \
        build-essential \
        # jq and curl for polling various endpoints and health checks
        jq \
+        git \
        curl \
        # Libraries required by UCX to find RDMA devices
        libibverbs1 rdma-core ibverbs-utils libibumad3 \

--- a/container/deps/requirements.txt
+++ b/container/deps/requirements.txt
@@ -2,7 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0

 accelerate==1.6.0
+aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@e46d9089ffe4f5dd62c46914489c55b6dfdbc903
 aiofiles
+aiperf @ git+https://github.com/ai-dynamo/aiperf.git@e8f69abf180ff9ea96de9f9a8c955df8c024625b
 av==15.0.0
 fastapi==0.115.12
 ftfy
@@ -15,7 +17,6 @@ kubernetes_asyncio
 matplotlib
 msgspec
 mypy
-numpy==1.26.4 # pmdarima is not compatible with numpy 2
 nvidia-ml-py==13.580.65
 opentelemetry-api
 opentelemetry-sdk
@@ -26,7 +27,7 @@ prometheus-api-client
 prometheus_client
 prophet
 protobuf==5.29.5
-pydantic==2.10.6
+pydantic>=2.10.6
 pyright
 PyYAML
 scikit-learn

--- a/docs/benchmarks/benchmarking.md
+++ b/docs/benchmarks/benchmarking.md
@@ -61,7 +61,7 @@ Just quick testing/comparison? Client-side.

 ## What This Tool Does

-The framework is a Python-based wrapper around `genai-perf` that:
+The framework is a Python-based wrapper around `aiperf` that:
 - Benchmarks any HTTP endpoints
 - Runs concurrency sweeps across configurable load levels
 - Generates comparison plots with your custom labels
@@ -70,7 +70,7 @@ The framework is a Python-based wrapper around `genai-perf` that:

 **Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`)

-**Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s).
+**Important**: The `--model` parameter configures AIPerf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s).

 ---

@@ -165,7 +165,7 @@ REQUIRED:

 OPTIONS:
  -h, --help                    Show help message and examples
-  -m, --model MODEL             Model name for GenAI-Perf configuration and logging (default: Qwen/Qwen3-0.6B)
+  -m, --model MODEL             Model name for AIPerf configuration and logging (default: Qwen/Qwen3-0.6B)
                                NOTE: This must match the model deployed at the endpoint
  -i, --isl LENGTH              Input sequence length (default: 2000)
  -s, --std STDDEV              Input sequence standard deviation (default: 10)
@@ -179,14 +179,14 @@ OPTIONS:
 - **Benchmark Name**: The benchmark name becomes the label in plots and results
 - **Name Restrictions**: Names can only contain letters, numbers, hyphens, and underscores. The name `plots` is reserved.
 - **Port-Forwarding**: You must have an exposed endpoint before benchmarking
- **Model Parameter**: The `--model` parameter configures GenAI-Perf for testing and logging, and must match the model deployed at the endpoint
+- **Model Parameter**: The `--model` parameter configures AIPerf for testing and logging, and must match the model deployed at the endpoint
 - **Sequential Benchmarking**: For comparative benchmarks, deploy and benchmark each configuration separately

 ### What Happens During Benchmarking

 The Python benchmarking module:
 1. **Connects** to your port-forwarded endpoint
-2. **Benchmarks** using GenAI-Perf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250)
+2. **Benchmarks** using AIPerf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250)
 3. **Measures** key metrics: latency, throughput, time-to-first-token
 4. **Saves** results to an output directory organized by benchmark name

@@ -301,9 +301,9 @@ results/
 ```

 Each concurrency directory contains:
- **`profile_export_genai_perf.json`** - Structured metrics from GenAI-Perf
- **`profile_export_genai_perf.csv`** - CSV format metrics from GenAI-Perf
- **`profile_export.json`** - Raw GenAI-Perf results
+- **`profile_export_aiperf.json`** - Structured metrics from AIPerf
+- **`profile_export_aiperf.csv`** - CSV format metrics from AIPerf
+- **`profile_export.json`** - Raw AIPerf results
 - **`inputs.json`** - Generated test inputs

 ---
@@ -516,7 +516,7 @@ kubectl get endpoints "$SVC_NAME" -n "$NAMESPACE"

 ## Customize Benchmarking Behavior

-The built-in Python workflow connects to endpoints, benchmarks with genai-perf, and generates plots. If you want to modify the behavior:
+The built-in Python workflow connects to endpoints, benchmarks with aiperf, and generates plots. If you want to modify the behavior:

 1. **Extend the workflow**: Modify `benchmarks/utils/workflow.py` to add custom deployment types or metrics collection


--- a/recipes/llama-3-70b/vllm/agg/perf.yaml
+++ b/recipes/llama-3-70b/vllm/agg/perf.yaml
@@ -38,7 +38,7 @@ spec:
          mkdir -p "$ARTIFACT_DIR"
          echo "Running benchmark..."
          export COLUMNS=200
-          genai-perf profile \
+          aiperf profile \
            --model "$TARGET_MODEL" \
            --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
            --endpoint-type chat --url "$ENDPOINT" --streaming \
@@ -58,10 +58,10 @@ spec:
            --num-dataset-entries=3000 -- \
            --max-threads 64
          echo "----------------json----------------"
-          PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_genai_perf.json)
+          PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
          cat $PERF_JSON | jq .
          echo "----------------csv-----------------"
-          PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_genai_perf.csv)
+          PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
          cat $PERF_CSV
          echo "Benchmark completed successfully!"
        volumeMounts:

--- a/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-multi-node/perf.yaml
@@ -38,7 +38,7 @@ spec:
          mkdir -p "$ARTIFACT_DIR"
          echo "Running benchmark..."
          export COLUMNS=200
-          genai-perf profile \
+          aiperf profile \
            --model "$TARGET_MODEL" \
            --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd  \
            --endpoint-type chat --url "$ENDPOINT" --streaming \
@@ -58,10 +58,10 @@ spec:
            --num-dataset-entries=3000 -- \
            --max-threads 64
          echo "----------------json----------------"
-          PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_genai_perf.json)
+          PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
          cat $PERF_JSON | jq .
          echo "----------------csv-----------------"
-          PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_genai_perf.csv)
+          PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
          cat $PERF_CSV
          echo "Benchmark completed successfully!"
        volumeMounts: