"vscode:/vscode.git/clone" did not exist on "d870d4b090dceedfb937b048d88f1a07275f896d"
Unverified Commit de3ca70b authored by Biswa Panda, committed by GitHub
Browse files

feat: update benchmarking script to use aiperf (#3306)


Signed-off-by: Biswa Panda <biswa.panda@gmail.com>
Signed-off-by: lkomali <lkomali@nvidia.com>
Co-authored-by: lkomali <lkomali@nvidia.com>
Co-authored-by: Harshini Komali <157742537+lkomali@users.noreply.github.com>
parent 9b0948c6
...@@ -178,7 +178,7 @@ Rerun with `curl -N` and change `stream` in the request to `true` to get the res ...@@ -178,7 +178,7 @@ Rerun with `curl -N` and change `stream` in the request to `true` to get the res
Dynamo provides comprehensive benchmarking tools to evaluate and optimize your deployments: Dynamo provides comprehensive benchmarking tools to evaluate and optimize your deployments:
- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using GenAI-Perf - **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using AIPerf
- **[Pre-Deployment Profiling](docs/benchmarks/pre_deployment_profiling.md)** – Optimize configurations before deployment to meet SLA requirements - **[Pre-Deployment Profiling](docs/benchmarks/pre_deployment_profiling.md)** – Optimize configurations before deployment to meet SLA requirements
# Engines # Engines
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
# Benchmarks # Benchmarks
This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around genai-perf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints. This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around aiperf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints.
## Quick Start ## Quick Start
......
...@@ -202,7 +202,7 @@ if [ $index -gt 0 ]; then ...@@ -202,7 +202,7 @@ if [ $index -gt 0 ]; then
echo "--------------------------------" echo "--------------------------------"
fi fi
echo "Running genai-perf with:" echo "Running aiperf with:"
echo "Model: $model" echo "Model: $model"
echo "ISL: $isl" echo "ISL: $isl"
echo "OSL: $osl" echo "OSL: $osl"
...@@ -214,7 +214,7 @@ for concurrency in "${concurrency_array[@]}"; do ...@@ -214,7 +214,7 @@ for concurrency in "${concurrency_array[@]}"; do
# NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like # NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like
# `ignore_eos` since they are not in the official OpenAI spec. # `ignore_eos` since they are not in the official OpenAI spec.
genai-perf profile \ aiperf profile \
--model ${model} \ --model ${model} \
--tokenizer ${model} \ --tokenizer ${model} \
--endpoint-type chat \ --endpoint-type chat \
......
...@@ -26,7 +26,7 @@ from matplotlib.ticker import MultipleLocator ...@@ -26,7 +26,7 @@ from matplotlib.ticker import MultipleLocator
def get_json_paths(search_paths): def get_json_paths(search_paths):
genai_perf_profile_export_json_paths = [] aiperf_profile_export_json_paths = []
deployment_config_json_paths = [] deployment_config_json_paths = []
for search_path in search_paths: for search_path in search_paths:
deployment_config_json_path = os.path.join( deployment_config_json_path = os.path.join(
...@@ -34,15 +34,13 @@ def get_json_paths(search_paths): ...@@ -34,15 +34,13 @@ def get_json_paths(search_paths):
) )
if not os.path.exists(deployment_config_json_path): if not os.path.exists(deployment_config_json_path):
raise Exception(f"deployment_config.json not found in {search_path}") raise Exception(f"deployment_config.json not found in {search_path}")
for root, dirs, files in os.walk(search_path): for root, _, files in os.walk(search_path):
for file in files: for file in files:
if file == "profile_export_genai_perf.json": if file == "profile_export_aiperf.json":
genai_perf_profile_export_json_paths.append( aiperf_profile_export_json_paths.append(os.path.join(root, file))
os.path.join(root, file)
)
deployment_config_json_paths.append(deployment_config_json_path) deployment_config_json_paths.append(deployment_config_json_path)
return genai_perf_profile_export_json_paths, deployment_config_json_paths return aiperf_profile_export_json_paths, deployment_config_json_paths
# search for -concurrency<number> in the name # search for -concurrency<number> in the name
...@@ -81,13 +79,13 @@ def parse_kind_and_mode(deployment_config_json_path): ...@@ -81,13 +79,13 @@ def parse_kind_and_mode(deployment_config_json_path):
def extract_val_and_concurrency( def extract_val_and_concurrency(
genai_perf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg" aiperf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg"
): ):
results = [] results = []
for genai_perf_profile_export_json_path, deployment_config_json_path in zip( for aiperf_profile_export_json_path, deployment_config_json_path in zip(
genai_perf_profile_export_json_paths, deployment_config_json_paths aiperf_profile_export_json_paths, deployment_config_json_paths
): ):
with open(genai_perf_profile_export_json_path, "r") as f: with open(aiperf_profile_export_json_path, "r") as f:
data = json.load(f) data = json.load(f)
# output_token_throughput contains only avg # output_token_throughput contains only avg
output_token_throughput = data.get("output_token_throughput", {}).get("avg") output_token_throughput = data.get("output_token_throughput", {}).get("avg")
...@@ -99,7 +97,7 @@ def extract_val_and_concurrency( ...@@ -99,7 +97,7 @@ def extract_val_and_concurrency(
# request_throughput contains only avg # request_throughput contains only avg
request_throughput = data.get("request_throughput", {}).get("avg") request_throughput = data.get("request_throughput", {}).get("avg")
concurrency = parse_concurrency(genai_perf_profile_export_json_path) concurrency = parse_concurrency(aiperf_profile_export_json_path)
num_gpus = parse_gpus(deployment_config_json_path) num_gpus = parse_gpus(deployment_config_json_path)
kind, mode = parse_kind_and_mode(deployment_config_json_path) kind, mode = parse_kind_and_mode(deployment_config_json_path)
...@@ -116,7 +114,7 @@ def extract_val_and_concurrency( ...@@ -116,7 +114,7 @@ def extract_val_and_concurrency(
results.append( results.append(
{ {
"configuration": genai_perf_profile_export_json_path, "configuration": aiperf_profile_export_json_path,
"kind": kind, "kind": kind,
"mode": mode, "mode": mode,
"num_gpus": num_gpus, "num_gpus": num_gpus,
...@@ -241,12 +239,12 @@ if __name__ == "__main__": ...@@ -241,12 +239,12 @@ if __name__ == "__main__":
import os import os
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Plot Pareto graph from GenAI-Perf artifacts" description="Plot Pareto graph from AIPerf artifacts"
) )
parser.add_argument( parser.add_argument(
"--artifacts-root-dir", "--artifacts-root-dir",
required=True, required=True,
help="Root directory containing artifact directories to search for profile_export_genai_perf.json files", help="Root directory containing artifact directories to search for profile_export_aiperf.json files",
) )
parser.add_argument( parser.add_argument(
"--title", "--title",
...@@ -260,16 +258,16 @@ if __name__ == "__main__": ...@@ -260,16 +258,16 @@ if __name__ == "__main__":
if not artifacts_dirs: if not artifacts_dirs:
raise ValueError(f"No artifacts directories found in {args.artifacts_root_dir}") raise ValueError(f"No artifacts directories found in {args.artifacts_root_dir}")
genai_perf_profile_export_json_paths, deployment_config_json_paths = get_json_paths( aiperf_profile_export_json_paths, deployment_config_json_paths = get_json_paths(
artifacts_dirs artifacts_dirs
) )
if len(genai_perf_profile_export_json_paths) != len(deployment_config_json_paths): if len(aiperf_profile_export_json_paths) != len(deployment_config_json_paths):
raise ValueError( raise ValueError(
f"Number of genai_perf_profile_export_json_paths ({len(genai_perf_profile_export_json_paths)}) does not match number of deployment_config_json_paths ({len(deployment_config_json_paths)})" f"Number of aiperf_profile_export_json_paths ({len(aiperf_profile_export_json_paths)}) does not match number of deployment_config_json_paths ({len(deployment_config_json_paths)})"
) )
extracted_values = extract_val_and_concurrency( extracted_values = extract_val_and_concurrency(
genai_perf_profile_export_json_paths, deployment_config_json_paths aiperf_profile_export_json_paths, deployment_config_json_paths
) )
create_pareto_graph(extracted_values, title=args.title) create_pareto_graph(extracted_values, title=args.title)
...@@ -5,8 +5,8 @@ import argparse ...@@ -5,8 +5,8 @@ import argparse
import logging import logging
import os import os
from utils.profile_decode import profile_decode from benchmarks.profiler.utils.profile_decode import profile_decode
from utils.profile_prefill import profile_prefill from benchmarks.profiler.utils.profile_prefill import profile_prefill
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
......
...@@ -22,13 +22,13 @@ import os ...@@ -22,13 +22,13 @@ import os
import numpy as np import numpy as np
import yaml import yaml
from benchmarks.profiler.utils.aiperf import benchmark_decode, benchmark_prefill
from benchmarks.profiler.utils.config import ( from benchmarks.profiler.utils.config import (
CONFIG_MODIFIERS, CONFIG_MODIFIERS,
WORKER_COMPONENT_NAMES, WORKER_COMPONENT_NAMES,
generate_dgd_config_with_planner, generate_dgd_config_with_planner,
) )
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.genai_perf import benchmark_decode, benchmark_prefill
from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser
from benchmarks.profiler.utils.plot import ( from benchmarks.profiler.utils.plot import (
plot_decode_performance, plot_decode_performance,
...@@ -245,18 +245,18 @@ async def run_profile(args): ...@@ -245,18 +245,18 @@ async def run_profile(args):
f"Logs have been saved to {client.base_log_dir / client.deployment_name}" f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
) )
# run genai-perf # run ai-perf
base_url = client.get_service_url() base_url = client.get_service_url()
genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{args.isl}"
gap_result = benchmark_prefill( aiperf_result = benchmark_prefill(
args.isl, args.isl,
genai_perf_artifact_dir, ai_perf_artifact_dir,
model_name, model_name,
model_name, model_name,
base_url=base_url, base_url=base_url,
) )
if gap_result is not None: if aiperf_result is not None:
ttft = gap_result["time_to_first_token"]["avg"] ttft = aiperf_result["records"]["ttft"]["avg"]
logger.info("Cleaning up deployment...") logger.info("Cleaning up deployment...")
await client.delete_deployment() await client.delete_deployment()
...@@ -424,20 +424,23 @@ async def run_profile(args): ...@@ -424,20 +424,23 @@ async def run_profile(args):
) )
else: else:
base_url = client.get_service_url() base_url = client.get_service_url()
genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" ai_perf_artifact_dir = f"{work_dir}/aiperf_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}"
gap_result = benchmark_decode( aiperf_result = benchmark_decode(
args.isl, args.isl,
args.osl, args.osl,
num_request, num_request,
genai_perf_artifact_dir, ai_perf_artifact_dir,
model_name, model_name,
model_name, model_name,
base_url=base_url, base_url=base_url,
) )
if gap_result is not None: if aiperf_result is not None:
itl = gap_result["inter_token_latency"]["avg"] itl = aiperf_result["records"]["inter_token_latency"]["avg"]
thpt_per_gpu = ( thpt_per_gpu = (
gap_result["output_token_throughput"]["avg"] / num_gpus aiperf_result["records"]["output_token_throughput"][
"avg"
]
/ num_gpus
) )
if itl is not None and thpt_per_gpu is not None: if itl is not None and thpt_per_gpu is not None:
......
...@@ -30,7 +30,7 @@ console_handler.setFormatter(formatter) ...@@ -30,7 +30,7 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
def _get_common_genai_perf_cmd( def _get_common_aiperf_cmd(
artifact_dir, artifact_dir,
seed=100, seed=100,
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
...@@ -38,7 +38,7 @@ def _get_common_genai_perf_cmd( ...@@ -38,7 +38,7 @@ def _get_common_genai_perf_cmd(
base_url="http://localhost:8000", base_url="http://localhost:8000",
): ):
return [ return [
"genai-perf", "aiperf",
"profile", "profile",
"--model", "--model",
model, model,
...@@ -64,7 +64,7 @@ def _get_common_genai_perf_cmd( ...@@ -64,7 +64,7 @@ def _get_common_genai_perf_cmd(
] ]
def get_prefill_genai_perf_cmd( def get_prefill_aiperf_cmd(
isl, isl,
artifact_dir, artifact_dir,
seed=100, seed=100,
...@@ -73,7 +73,7 @@ def get_prefill_genai_perf_cmd( ...@@ -73,7 +73,7 @@ def get_prefill_genai_perf_cmd(
osl=5, osl=5,
base_url="http://localhost:8000", base_url="http://localhost:8000",
): ):
return _get_common_genai_perf_cmd( return _get_common_aiperf_cmd(
artifact_dir, artifact_dir,
seed, seed,
model, model,
...@@ -99,7 +99,7 @@ def get_prefill_genai_perf_cmd( ...@@ -99,7 +99,7 @@ def get_prefill_genai_perf_cmd(
] ]
def get_decode_genai_perf_cmd( def get_decode_aiperf_cmd(
isl, isl,
osl, osl,
artifact_dir, artifact_dir,
...@@ -109,7 +109,7 @@ def get_decode_genai_perf_cmd( ...@@ -109,7 +109,7 @@ def get_decode_genai_perf_cmd(
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
base_url="http://localhost:8000", base_url="http://localhost:8000",
): ):
return _get_common_genai_perf_cmd( return _get_common_aiperf_cmd(
artifact_dir, artifact_dir,
seed, seed,
model, model,
...@@ -137,15 +137,15 @@ def get_decode_genai_perf_cmd( ...@@ -137,15 +137,15 @@ def get_decode_genai_perf_cmd(
] ]
def get_gap_result(artifact_dir: str) -> dict: def get_aiperf_result(artifact_dir: str) -> dict:
json_file_path = None json_file_path = None
for root, _, files in os.walk(artifact_dir): for root, _, files in os.walk(artifact_dir):
if "profile_export_genai_perf.json" in files: if "profile_export_aiperf.json" in files:
json_file_path = os.path.join(root, "profile_export_genai_perf.json") json_file_path = os.path.join(root, "profile_export_aiperf.json")
break break
if json_file_path is None: if json_file_path is None:
raise FileNotFoundError( raise FileNotFoundError(
f"profile_export_genai_perf.json not found in {artifact_dir}" f"profile_export_aiperf.json not found in {artifact_dir}"
) )
with open(json_file_path, "r") as f: with open(json_file_path, "r") as f:
return json.load(f) return json.load(f)
...@@ -153,35 +153,35 @@ def get_gap_result(artifact_dir: str) -> dict: ...@@ -153,35 +153,35 @@ def get_gap_result(artifact_dir: str) -> dict:
def benchmark_prefill( def benchmark_prefill(
isl, isl,
genai_perf_artifact_dir, aiperf_artifact_dir,
model_name, model_name,
tokenizer, tokenizer,
base_url="http://localhost:8000", base_url="http://localhost:8000",
): ):
logger.info(f"Running genai-perf with isl {isl}") logger.info(f"Running aiperf with isl {isl}")
genai_perf_cmd = get_prefill_genai_perf_cmd( aiperf_cmd = get_prefill_aiperf_cmd(
isl, isl,
genai_perf_artifact_dir, aiperf_artifact_dir,
model=model_name, model=model_name,
tokenizer=tokenizer, tokenizer=tokenizer,
base_url=base_url, base_url=base_url,
) )
print(f"genai-perf cmd: {genai_perf_cmd}") print(f"aiperf cmd: {aiperf_cmd}")
# import pdb; pdb.set_trace() # import pdb; pdb.set_trace()
gap_process = subprocess.Popen( aiperf_process = subprocess.Popen(
genai_perf_cmd, aiperf_cmd,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
text=True, text=True,
) )
stdout, stderr = gap_process.communicate() stdout, stderr = aiperf_process.communicate()
if gap_process.returncode == 0: if aiperf_process.returncode == 0:
logger.info("Genai-perf profiling completed successfully") logger.info("AIperf profiling completed successfully")
logger.info(stdout) logger.info(stdout)
gap_result = get_gap_result(genai_perf_artifact_dir) aiperf_result = get_aiperf_result(aiperf_artifact_dir)
return gap_result return aiperf_result
else: else:
logger.error(f"Genai-perf failed with error code: {gap_process.returncode}") logger.error(f"AIPerf failed with error code: {aiperf_process.returncode}")
logger.error(f"stderr: {stderr}") logger.error(f"stderr: {stderr}")
return None return None
...@@ -190,7 +190,7 @@ def benchmark_decode( ...@@ -190,7 +190,7 @@ def benchmark_decode(
isl, isl,
osl, osl,
num_request, num_request,
genai_perf_artifact_dir, aiperf_artifact_dir,
model_name, model_name,
tokenizer, tokenizer,
base_url="http://localhost:8000", base_url="http://localhost:8000",
...@@ -201,47 +201,47 @@ def benchmark_decode( ...@@ -201,47 +201,47 @@ def benchmark_decode(
# we use the same random seed to make sure the prompt is the same # we use the same random seed to make sure the prompt is the same
seed = random.randint(0, 1000000) seed = random.randint(0, 1000000)
genai_perf_cmd = get_decode_genai_perf_cmd( aiperf_cmd = get_decode_aiperf_cmd(
isl, isl,
osl, osl,
f"{genai_perf_artifact_dir}_warmup", f"{aiperf_artifact_dir}_warmup",
num_request, num_request,
seed=seed, seed=seed,
model=model_name, model=model_name,
tokenizer=tokenizer, tokenizer=tokenizer,
base_url=base_url, base_url=base_url,
) )
gap_process = subprocess.Popen( aiperf_process = subprocess.Popen(
genai_perf_cmd, aiperf_cmd,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
text=True, text=True,
) )
gap_process.communicate() aiperf_process.communicate()
# then send out the real requests, hopefully, this will skip all prefill computation # then send out the real requests, hopefully, this will skip all prefill computation
genai_perf_cmd = get_decode_genai_perf_cmd( aiperf_cmd = get_decode_aiperf_cmd(
isl, isl,
osl, osl,
genai_perf_artifact_dir, aiperf_artifact_dir,
num_request, num_request,
seed=seed, seed=seed,
model=model_name, model=model_name,
tokenizer=tokenizer, tokenizer=tokenizer,
base_url=base_url, base_url=base_url,
) )
gap_process = subprocess.Popen( aiperf_process = subprocess.Popen(
genai_perf_cmd, aiperf_cmd,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
text=True, text=True,
) )
stdout, stderr = gap_process.communicate() stdout, stderr = aiperf_process.communicate()
if gap_process.returncode == 0: if aiperf_process.returncode == 0:
logger.info("Genai-perf profiling completed successfully") logger.info("AIperf profiling completed successfully")
logger.info(stdout) logger.info(stdout)
gap_result = get_gap_result(genai_perf_artifact_dir) aiperf_result = get_aiperf_result(aiperf_artifact_dir)
return gap_result return aiperf_result
else: else:
logger.error(f"Genai-perf failed with error code: {gap_process.returncode}") logger.error(f"AIPerf failed with error code: {aiperf_process.returncode}")
logger.error(f"stderr: {stderr}") logger.error(f"stderr: {stderr}")
return None return None
...@@ -26,13 +26,13 @@ logger = logging.getLogger(__name__) ...@@ -26,13 +26,13 @@ logger = logging.getLogger(__name__)
def check_prefill_results_exist(output_dir: str, tp_size: int, isl: int) -> bool: def check_prefill_results_exist(output_dir: str, tp_size: int, isl: int) -> bool:
"""Check if prefill results already exist for a given TP size.""" """Check if prefill results already exist for a given TP size."""
work_dir = f"{output_dir}/prefill_tp{tp_size}" work_dir = f"{output_dir}/prefill_tp{tp_size}"
result_file = f"{work_dir}/gap_isl{isl}/*/profile_export_genai_perf.json" result_file = f"{work_dir}/aiperf_isl{isl}/*/profile_export_aiperf.json"
# Check if the work directory exists # Check if the work directory exists
if not os.path.exists(work_dir): if not os.path.exists(work_dir):
return False return False
# Look for the genai-perf result file # Look for the aiperf result file
result_files = glob.glob(result_file) result_files = glob.glob(result_file)
if not result_files: if not result_files:
return False return False
...@@ -65,7 +65,7 @@ def check_decode_results_exist( ...@@ -65,7 +65,7 @@ def check_decode_results_exist(
# Look for at least one decode result file # Look for at least one decode result file
result_pattern = ( result_pattern = (
f"{work_dir}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json" f"{work_dir}/aiperf_request*_isl{isl}_osl{osl}_n*/*/profile_export_aiperf.json"
) )
result_files = glob.glob(result_pattern) result_files = glob.glob(result_pattern)
...@@ -93,7 +93,7 @@ def load_existing_prefill_results( ...@@ -93,7 +93,7 @@ def load_existing_prefill_results(
) -> Tuple[Optional[float], Optional[float]]: ) -> Tuple[Optional[float], Optional[float]]:
"""Load existing prefill results from disk.""" """Load existing prefill results from disk."""
work_dir = f"{output_dir}/prefill_tp{tp_size}" work_dir = f"{output_dir}/prefill_tp{tp_size}"
result_file = f"{work_dir}/gap_isl{isl}/*/profile_export_genai_perf.json" result_file = f"{work_dir}/aiperf_isl{isl}/*/profile_export_aiperf.json"
result_files = glob.glob(result_file) result_files = glob.glob(result_file)
if result_files: if result_files:
...@@ -115,7 +115,7 @@ def load_existing_decode_results( ...@@ -115,7 +115,7 @@ def load_existing_decode_results(
work_dir = f"{output_dir}/decode_tp{tp_size}" work_dir = f"{output_dir}/decode_tp{tp_size}"
result_pattern = ( result_pattern = (
f"{work_dir}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json" f"{work_dir}/aiperf_request*_isl{isl}_osl{osl}_n*/*/profile_export_aiperf.json"
) )
result_files = glob.glob(result_pattern) result_files = glob.glob(result_pattern)
...@@ -128,7 +128,7 @@ def load_existing_decode_results( ...@@ -128,7 +128,7 @@ def load_existing_decode_results(
thpt_per_gpu = data["output_token_throughput"]["avg"] / tp_size thpt_per_gpu = data["output_token_throughput"]["avg"] / tp_size
# Extract concurrency from filename # Extract concurrency from filename
match = re.search(r"gap_request(\d+)_", result_file) match = re.search(r"aiperf_request(\d+)_", result_file)
if match: if match:
concurrency = int(match.group(1)) concurrency = int(match.group(1))
decode_results.append((itl, thpt_per_gpu, concurrency)) decode_results.append((itl, thpt_per_gpu, concurrency))
......
...@@ -6,9 +6,9 @@ from typing import Callable, Optional, Tuple ...@@ -6,9 +6,9 @@ from typing import Callable, Optional, Tuple
import numpy as np import numpy as np
from benchmarks.profiler.utils.aiperf import benchmark_decode
from benchmarks.profiler.utils.defaults import DECODE_MAX_CONCURRENCY from benchmarks.profiler.utils.defaults import DECODE_MAX_CONCURRENCY
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.genai_perf import benchmark_decode
from benchmarks.profiler.utils.plot import plot_decode_3d_surface from benchmarks.profiler.utils.plot import plot_decode_3d_surface
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -113,19 +113,21 @@ def profile_decode( ...@@ -113,19 +113,21 @@ def profile_decode(
attention_dp_size, attention_dp_size,
): ):
def get_itl_and_thpt_per_gpu(isl, osl, num_request): def get_itl_and_thpt_per_gpu(isl, osl, num_request):
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}_osl{osl}_n{num_request}"
gap_result = benchmark_decode( aiperf_result = benchmark_decode(
isl, isl,
osl, osl,
num_request, num_request,
genai_perf_artifact_dir, ai_perf_artifact_dir,
model_name, model_name,
tokenizer, tokenizer,
base_url=url, base_url=url,
) )
if gap_result is not None: if aiperf_result is not None:
itl = gap_result["inter_token_latency"]["avg"] itl = aiperf_result["records"]["inter_token_latency"]["avg"]
thpt_per_gpu = gap_result["output_token_throughput"]["avg"] / num_gpus thpt_per_gpu = (
aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus
)
return itl, thpt_per_gpu return itl, thpt_per_gpu
return None, None return None, None
......
...@@ -6,8 +6,8 @@ from typing import Callable, Optional ...@@ -6,8 +6,8 @@ from typing import Callable, Optional
import numpy as np import numpy as np
from benchmarks.profiler.utils.aiperf import benchmark_prefill
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.genai_perf import benchmark_prefill
from benchmarks.profiler.utils.plot import plot_prefill_interpolation from benchmarks.profiler.utils.plot import plot_prefill_interpolation
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -81,16 +81,16 @@ def profile_prefill( ...@@ -81,16 +81,16 @@ def profile_prefill(
interpolation_granularity, interpolation_granularity,
): ):
def get_ttft(isl): def get_ttft(isl):
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}"
gap_result = benchmark_prefill( aiperf_result = benchmark_prefill(
isl, isl,
genai_perf_artifact_dir, ai_perf_artifact_dir,
model_name, model_name,
tokenizer, tokenizer,
base_url=url, base_url=url,
) )
if gap_result is not None: if aiperf_result is not None:
return gap_result["time_to_first_token"]["avg"] return aiperf_result["records"]["ttft"]["avg"]
return None return None
return _profile_prefill_helper( return _profile_prefill_helper(
......
...@@ -33,7 +33,7 @@ def get_concurrency_levels() -> List[int]: ...@@ -33,7 +33,7 @@ def get_concurrency_levels() -> List[int]:
CONCURRENCIES: List[int] = get_concurrency_levels() CONCURRENCIES: List[int] = get_concurrency_levels()
def run_genai_perf( def run_aiperf(
service_url: str, service_url: str,
model_name: str, model_name: str,
isl: int, isl: int,
...@@ -44,7 +44,7 @@ def run_genai_perf( ...@@ -44,7 +44,7 @@ def run_genai_perf(
) -> None: ) -> None:
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
cmd = [ cmd = [
"genai-perf", "aiperf",
"profile", "profile",
"-m", "-m",
model_name, model_name,
...@@ -76,28 +76,28 @@ def run_genai_perf( ...@@ -76,28 +76,28 @@ def run_genai_perf(
"--max-threads=300", "--max-threads=300",
] ]
print( print(
f"Running genai-perf with isl {isl}, osl {osl}, concurrency {concurrency}", f"Running aiperf with isl {isl}, osl {osl}, concurrency {concurrency}",
flush=True, flush=True,
) )
gap_process = subprocess.Popen( aip_process = subprocess.Popen(
cmd, cmd,
cwd=str(output_dir), cwd=str(output_dir),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
text=True, text=True,
) )
stdout, stderr = gap_process.communicate() stdout, stderr = aip_process.communicate()
if gap_process.returncode == 0: if aip_process.returncode == 0:
print("Genai-perf profiling completed successfully", flush=True) print("Aiperf profiling completed successfully", flush=True)
if stdout: if stdout:
print(stdout) print(stdout)
else: else:
print(f"Genai-perf failed with error code: {gap_process.returncode}") print(f"Aiperf failed with error code: {aip_process.returncode}")
if stderr: if stderr:
print(f"stderr: {stderr}") print(f"stderr: {stderr}")
raise subprocess.CalledProcessError( raise subprocess.CalledProcessError(
gap_process.returncode, cmd, output=stdout, stderr=stderr aip_process.returncode, cmd, output=stdout, stderr=stderr
) )
...@@ -113,6 +113,4 @@ def run_concurrency_sweep( ...@@ -113,6 +113,4 @@ def run_concurrency_sweep(
for c in concurrency_levels: for c in concurrency_levels:
print(f"Starting concurrency level {c}", flush=True) print(f"Starting concurrency level {c}", flush=True)
run_genai_perf( run_aiperf(service_url, model_name, isl, osl, stddev, c, output_dir / f"c{c}")
service_url, model_name, isl, osl, stddev, c, output_dir / f"c{c}"
)
...@@ -32,22 +32,22 @@ def parse_benchmark_results(result_dir: Path) -> List[Tuple[int, Dict]]: ...@@ -32,22 +32,22 @@ def parse_benchmark_results(result_dir: Path) -> List[Tuple[int, Dict]]:
continue continue
concurrency = int(match.group(1)) concurrency = int(match.group(1))
# Find the genai-perf JSON file # Find the aiperf JSON file
genai_perf_json = None aiperf_json = None
for json_file in concurrency_dir.rglob("profile_export_genai_perf.json"): for json_file in concurrency_dir.rglob("profile_export_aiperf.json"):
genai_perf_json = json_file aiperf_json = json_file
break break
if genai_perf_json and genai_perf_json.exists(): if aiperf_json and aiperf_json.exists():
try: try:
with open(genai_perf_json, "r") as f: with open(aiperf_json, "r") as f:
metrics = json.load(f) metrics = json.load(f)
results.append((concurrency, metrics)) results.append((concurrency, metrics))
print(f"Loaded metrics for concurrency {concurrency}") print(f"Loaded metrics for concurrency {concurrency}")
except Exception as e: except Exception as e:
print(f"Error loading {genai_perf_json}: {e}") print(f"Error loading {aiperf_json}: {e}")
else: else:
print(f"Warning: No genai-perf JSON found for {concurrency_dir}") print(f"Warning: No aiperf JSON found for {concurrency_dir}")
# Sort by concurrency level # Sort by concurrency level
results.sort(key=lambda x: x[0]) results.sort(key=lambda x: x[0])
......
...@@ -64,6 +64,7 @@ RUN apt-get update -y \ ...@@ -64,6 +64,7 @@ RUN apt-get update -y \
# Python runtime - CRITICAL for virtual environment to work # Python runtime - CRITICAL for virtual environment to work
python${PYTHON_VERSION}-dev \ python${PYTHON_VERSION}-dev \
build-essential \ build-essential \
git \
# SGLang build dependencies # SGLang build dependencies
cmake \ cmake \
ibverbs-providers \ ibverbs-providers \
...@@ -147,6 +148,7 @@ RUN apt-get update && \ ...@@ -147,6 +148,7 @@ RUN apt-get update && \
build-essential \ build-essential \
# jq and curl for polling various endpoints and health checks # jq and curl for polling various endpoints and health checks
jq \ jq \
git \
curl \ curl \
# Libraries required by UCX to find RDMA devices # Libraries required by UCX to find RDMA devices
libibverbs1 rdma-core ibverbs-utils libibumad3 \ libibverbs1 rdma-core ibverbs-utils libibumad3 \
......
...@@ -17,7 +17,7 @@ ARG CARGO_BUILD_JOBS="16" ...@@ -17,7 +17,7 @@ ARG CARGO_BUILD_JOBS="16"
RUN apt-get update -y && \ RUN apt-get update -y && \
apt-get install -y \ apt-get install -y \
cmake meson ninja-build pybind11-dev patchelf net-tools \ cmake meson ninja-build pybind11-dev patchelf net-tools \
build-essential protobuf-compiler libssl-dev pkg-config \ build-essential protobuf-compiler libssl-dev pkg-config git \
clang libclang-dev git rapidjson-dev zlib1g-dev jq && \ clang libclang-dev git rapidjson-dev zlib1g-dev jq && \
pip install --break-system-packages meson-python wheel build pip install --break-system-packages meson-python wheel build
...@@ -128,7 +128,7 @@ RUN git clone --depth=1 \ ...@@ -128,7 +128,7 @@ RUN git clone --depth=1 \
cmake --build perf_analyzer/build -- -j$(nproc) cmake --build perf_analyzer/build -- -j$(nproc)
ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH
RUN pip install --break-system-packages genai-perf RUN pip install --break-system-packages aiperf
# Enable forceful shutdown of inflight requests # Enable forceful shutdown of inflight requests
ENV SGL_FORCE_SHUTDOWN=1 ENV SGL_FORCE_SHUTDOWN=1
......
...@@ -76,6 +76,7 @@ RUN apt-get update && \ ...@@ -76,6 +76,7 @@ RUN apt-get update && \
build-essential \ build-essential \
g++ \ g++ \
ninja-build \ ninja-build \
git \
# Python runtime - CRITICAL for virtual environment to work # Python runtime - CRITICAL for virtual environment to work
python${PYTHON_VERSION}-dev \ python${PYTHON_VERSION}-dev \
python3-pip \ python3-pip \
......
...@@ -187,6 +187,7 @@ RUN apt-get update && \ ...@@ -187,6 +187,7 @@ RUN apt-get update && \
build-essential \ build-essential \
# jq and curl for polling various endpoints and health checks # jq and curl for polling various endpoints and health checks
jq \ jq \
git \
curl \ curl \
# Libraries required by UCX to find RDMA devices # Libraries required by UCX to find RDMA devices
libibverbs1 rdma-core ibverbs-utils libibumad3 \ libibverbs1 rdma-core ibverbs-utils libibumad3 \
......
...@@ -2,7 +2,9 @@ ...@@ -2,7 +2,9 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
accelerate==1.6.0 accelerate==1.6.0
aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@e46d9089ffe4f5dd62c46914489c55b6dfdbc903
aiofiles aiofiles
aiperf @ git+https://github.com/ai-dynamo/aiperf.git@e8f69abf180ff9ea96de9f9a8c955df8c024625b
av==15.0.0 av==15.0.0
fastapi==0.115.12 fastapi==0.115.12
ftfy ftfy
...@@ -15,7 +17,6 @@ kubernetes_asyncio ...@@ -15,7 +17,6 @@ kubernetes_asyncio
matplotlib matplotlib
msgspec msgspec
mypy mypy
numpy==1.26.4 # pmdarima is not compatible with numpy 2
nvidia-ml-py==13.580.65 nvidia-ml-py==13.580.65
opentelemetry-api opentelemetry-api
opentelemetry-sdk opentelemetry-sdk
...@@ -26,7 +27,7 @@ prometheus-api-client ...@@ -26,7 +27,7 @@ prometheus-api-client
prometheus_client prometheus_client
prophet prophet
protobuf==5.29.5 protobuf==5.29.5
pydantic==2.10.6 pydantic>=2.10.6
pyright pyright
PyYAML PyYAML
scikit-learn scikit-learn
......
...@@ -61,7 +61,7 @@ Just quick testing/comparison? Client-side. ...@@ -61,7 +61,7 @@ Just quick testing/comparison? Client-side.
## What This Tool Does ## What This Tool Does
The framework is a Python-based wrapper around `genai-perf` that: The framework is a Python-based wrapper around `aiperf` that:
- Benchmarks any HTTP endpoints - Benchmarks any HTTP endpoints
- Runs concurrency sweeps across configurable load levels - Runs concurrency sweeps across configurable load levels
- Generates comparison plots with your custom labels - Generates comparison plots with your custom labels
...@@ -70,7 +70,7 @@ The framework is a Python-based wrapper around `genai-perf` that: ...@@ -70,7 +70,7 @@ The framework is a Python-based wrapper around `genai-perf` that:
**Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`) **Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`)
**Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s). **Important**: The `--model` parameter configures AIPerf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s).
--- ---
...@@ -165,7 +165,7 @@ REQUIRED: ...@@ -165,7 +165,7 @@ REQUIRED:
OPTIONS: OPTIONS:
-h, --help Show help message and examples -h, --help Show help message and examples
-m, --model MODEL Model name for GenAI-Perf configuration and logging (default: Qwen/Qwen3-0.6B) -m, --model MODEL Model name for AIPerf configuration and logging (default: Qwen/Qwen3-0.6B)
NOTE: This must match the model deployed at the endpoint NOTE: This must match the model deployed at the endpoint
-i, --isl LENGTH Input sequence length (default: 2000) -i, --isl LENGTH Input sequence length (default: 2000)
-s, --std STDDEV Input sequence standard deviation (default: 10) -s, --std STDDEV Input sequence standard deviation (default: 10)
...@@ -179,14 +179,14 @@ OPTIONS: ...@@ -179,14 +179,14 @@ OPTIONS:
- **Benchmark Name**: The benchmark name becomes the label in plots and results - **Benchmark Name**: The benchmark name becomes the label in plots and results
- **Name Restrictions**: Names can only contain letters, numbers, hyphens, and underscores. The name `plots` is reserved. - **Name Restrictions**: Names can only contain letters, numbers, hyphens, and underscores. The name `plots` is reserved.
- **Port-Forwarding**: You must have an exposed endpoint before benchmarking - **Port-Forwarding**: You must have an exposed endpoint before benchmarking
- **Model Parameter**: The `--model` parameter configures GenAI-Perf for testing and logging, and must match the model deployed at the endpoint - **Model Parameter**: The `--model` parameter configures AIPerf for testing and logging, and must match the model deployed at the endpoint
- **Sequential Benchmarking**: For comparative benchmarks, deploy and benchmark each configuration separately - **Sequential Benchmarking**: For comparative benchmarks, deploy and benchmark each configuration separately
### What Happens During Benchmarking ### What Happens During Benchmarking
The Python benchmarking module: The Python benchmarking module:
1. **Connects** to your port-forwarded endpoint 1. **Connects** to your port-forwarded endpoint
2. **Benchmarks** using GenAI-Perf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250) 2. **Benchmarks** using AIPerf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250)
3. **Measures** key metrics: latency, throughput, time-to-first-token 3. **Measures** key metrics: latency, throughput, time-to-first-token
4. **Saves** results to an output directory organized by benchmark name 4. **Saves** results to an output directory organized by benchmark name
...@@ -301,9 +301,9 @@ results/ ...@@ -301,9 +301,9 @@ results/
``` ```
Each concurrency directory contains: Each concurrency directory contains:
- **`profile_export_genai_perf.json`** - Structured metrics from GenAI-Perf - **`profile_export_aiperf.json`** - Structured metrics from AIPerf
- **`profile_export_genai_perf.csv`** - CSV format metrics from GenAI-Perf - **`profile_export_aiperf.csv`** - CSV format metrics from AIPerf
- **`profile_export.json`** - Raw GenAI-Perf results - **`profile_export.json`** - Raw AIPerf results
- **`inputs.json`** - Generated test inputs - **`inputs.json`** - Generated test inputs
--- ---
...@@ -516,7 +516,7 @@ kubectl get endpoints "$SVC_NAME" -n "$NAMESPACE" ...@@ -516,7 +516,7 @@ kubectl get endpoints "$SVC_NAME" -n "$NAMESPACE"
## Customize Benchmarking Behavior ## Customize Benchmarking Behavior
The built-in Python workflow connects to endpoints, benchmarks with genai-perf, and generates plots. If you want to modify the behavior: The built-in Python workflow connects to endpoints, benchmarks with aiperf, and generates plots. If you want to modify the behavior:
1. **Extend the workflow**: Modify `benchmarks/utils/workflow.py` to add custom deployment types or metrics collection 1. **Extend the workflow**: Modify `benchmarks/utils/workflow.py` to add custom deployment types or metrics collection
......
...@@ -38,7 +38,7 @@ spec: ...@@ -38,7 +38,7 @@ spec:
mkdir -p "$ARTIFACT_DIR" mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..." echo "Running benchmark..."
export COLUMNS=200 export COLUMNS=200
genai-perf profile \ aiperf profile \
--model "$TARGET_MODEL" \ --model "$TARGET_MODEL" \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --url "$ENDPOINT" --streaming \ --endpoint-type chat --url "$ENDPOINT" --streaming \
...@@ -58,10 +58,10 @@ spec: ...@@ -58,10 +58,10 @@ spec:
--num-dataset-entries=3000 -- \ --num-dataset-entries=3000 -- \
--max-threads 64 --max-threads 64
echo "----------------json----------------" echo "----------------json----------------"
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_genai_perf.json) PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
cat $PERF_JSON | jq . cat $PERF_JSON | jq .
echo "----------------csv-----------------" echo "----------------csv-----------------"
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_genai_perf.csv) PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
cat $PERF_CSV cat $PERF_CSV
echo "Benchmark completed successfully!" echo "Benchmark completed successfully!"
volumeMounts: volumeMounts:
......
...@@ -38,7 +38,7 @@ spec: ...@@ -38,7 +38,7 @@ spec:
mkdir -p "$ARTIFACT_DIR" mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..." echo "Running benchmark..."
export COLUMNS=200 export COLUMNS=200
genai-perf profile \ aiperf profile \
--model "$TARGET_MODEL" \ --model "$TARGET_MODEL" \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ --tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --url "$ENDPOINT" --streaming \ --endpoint-type chat --url "$ENDPOINT" --streaming \
...@@ -58,10 +58,10 @@ spec: ...@@ -58,10 +58,10 @@ spec:
--num-dataset-entries=3000 -- \ --num-dataset-entries=3000 -- \
--max-threads 64 --max-threads 64
echo "----------------json----------------" echo "----------------json----------------"
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_genai_perf.json) PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
cat $PERF_JSON | jq . cat $PERF_JSON | jq .
echo "----------------csv-----------------" echo "----------------csv-----------------"
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_genai_perf.csv) PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
cat $PERF_CSV cat $PERF_CSV
echo "Benchmark completed successfully!" echo "Benchmark completed successfully!"
volumeMounts: volumeMounts:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment