Unverified commit de3ca70b, authored by Biswa Panda, committed by GitHub
Browse files

feat: update benchmarking script to use aiperf (#3306)


Signed-off-by: Biswa Panda <biswa.panda@gmail.com>
Signed-off-by: lkomali <lkomali@nvidia.com>
Co-authored-by: lkomali <lkomali@nvidia.com>
Co-authored-by: Harshini Komali <157742537+lkomali@users.noreply.github.com>
parent 9b0948c6
......@@ -178,7 +178,7 @@ Rerun with `curl -N` and change `stream` in the request to `true` to get the res
Dynamo provides comprehensive benchmarking tools to evaluate and optimize your deployments:
- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using GenAI-Perf
- **[Benchmarking Guide](docs/benchmarks/benchmarking.md)** – Compare deployment topologies (aggregated vs. disaggregated vs. vanilla vLLM) using AIPerf
- **[Pre-Deployment Profiling](docs/benchmarks/pre_deployment_profiling.md)** – Optimize configurations before deployment to meet SLA requirements
# Engines
......
......@@ -15,7 +15,7 @@
# Benchmarks
This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around genai-perf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints.
This directory contains benchmarking scripts and tools for performance evaluation of Dynamo deployments. The benchmarking framework is a wrapper around aiperf that makes it easy to benchmark DynamoGraphDeployments or other deployments with exposed endpoints.
## Quick Start
......
......@@ -202,7 +202,7 @@ if [ $index -gt 0 ]; then
echo "--------------------------------"
fi
echo "Running genai-perf with:"
echo "Running aiperf with:"
echo "Model: $model"
echo "ISL: $isl"
echo "OSL: $osl"
......@@ -214,7 +214,7 @@ for concurrency in "${concurrency_array[@]}"; do
# NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like
# `ignore_eos` since they are not in the official OpenAI spec.
genai-perf profile \
aiperf profile \
--model ${model} \
--tokenizer ${model} \
--endpoint-type chat \
......
......@@ -26,7 +26,7 @@ from matplotlib.ticker import MultipleLocator
def get_json_paths(search_paths):
genai_perf_profile_export_json_paths = []
aiperf_profile_export_json_paths = []
deployment_config_json_paths = []
for search_path in search_paths:
deployment_config_json_path = os.path.join(
......@@ -34,15 +34,13 @@ def get_json_paths(search_paths):
)
if not os.path.exists(deployment_config_json_path):
raise Exception(f"deployment_config.json not found in {search_path}")
for root, dirs, files in os.walk(search_path):
for root, _, files in os.walk(search_path):
for file in files:
if file == "profile_export_genai_perf.json":
genai_perf_profile_export_json_paths.append(
os.path.join(root, file)
)
if file == "profile_export_aiperf.json":
aiperf_profile_export_json_paths.append(os.path.join(root, file))
deployment_config_json_paths.append(deployment_config_json_path)
return genai_perf_profile_export_json_paths, deployment_config_json_paths
return aiperf_profile_export_json_paths, deployment_config_json_paths
# search for -concurrency<number> in the name
......@@ -81,13 +79,13 @@ def parse_kind_and_mode(deployment_config_json_path):
def extract_val_and_concurrency(
genai_perf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg"
aiperf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg"
):
results = []
for genai_perf_profile_export_json_path, deployment_config_json_path in zip(
genai_perf_profile_export_json_paths, deployment_config_json_paths
for aiperf_profile_export_json_path, deployment_config_json_path in zip(
aiperf_profile_export_json_paths, deployment_config_json_paths
):
with open(genai_perf_profile_export_json_path, "r") as f:
with open(aiperf_profile_export_json_path, "r") as f:
data = json.load(f)
# output_token_throughput contains only avg
output_token_throughput = data.get("output_token_throughput", {}).get("avg")
......@@ -99,7 +97,7 @@ def extract_val_and_concurrency(
# request_throughput contains only avg
request_throughput = data.get("request_throughput", {}).get("avg")
concurrency = parse_concurrency(genai_perf_profile_export_json_path)
concurrency = parse_concurrency(aiperf_profile_export_json_path)
num_gpus = parse_gpus(deployment_config_json_path)
kind, mode = parse_kind_and_mode(deployment_config_json_path)
......@@ -116,7 +114,7 @@ def extract_val_and_concurrency(
results.append(
{
"configuration": genai_perf_profile_export_json_path,
"configuration": aiperf_profile_export_json_path,
"kind": kind,
"mode": mode,
"num_gpus": num_gpus,
......@@ -241,12 +239,12 @@ if __name__ == "__main__":
import os
parser = argparse.ArgumentParser(
description="Plot Pareto graph from GenAI-Perf artifacts"
description="Plot Pareto graph from AIPerf artifacts"
)
parser.add_argument(
"--artifacts-root-dir",
required=True,
help="Root directory containing artifact directories to search for profile_export_genai_perf.json files",
help="Root directory containing artifact directories to search for profile_export_aiperf.json files",
)
parser.add_argument(
"--title",
......@@ -260,16 +258,16 @@ if __name__ == "__main__":
if not artifacts_dirs:
raise ValueError(f"No artifacts directories found in {args.artifacts_root_dir}")
genai_perf_profile_export_json_paths, deployment_config_json_paths = get_json_paths(
aiperf_profile_export_json_paths, deployment_config_json_paths = get_json_paths(
artifacts_dirs
)
if len(genai_perf_profile_export_json_paths) != len(deployment_config_json_paths):
if len(aiperf_profile_export_json_paths) != len(deployment_config_json_paths):
raise ValueError(
f"Number of genai_perf_profile_export_json_paths ({len(genai_perf_profile_export_json_paths)}) does not match number of deployment_config_json_paths ({len(deployment_config_json_paths)})"
f"Number of aiperf_profile_export_json_paths ({len(aiperf_profile_export_json_paths)}) does not match number of deployment_config_json_paths ({len(deployment_config_json_paths)})"
)
extracted_values = extract_val_and_concurrency(
genai_perf_profile_export_json_paths, deployment_config_json_paths
aiperf_profile_export_json_paths, deployment_config_json_paths
)
create_pareto_graph(extracted_values, title=args.title)
......@@ -5,8 +5,8 @@ import argparse
import logging
import os
from utils.profile_decode import profile_decode
from utils.profile_prefill import profile_prefill
from benchmarks.profiler.utils.profile_decode import profile_decode
from benchmarks.profiler.utils.profile_prefill import profile_prefill
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
......
......@@ -22,13 +22,13 @@ import os
import numpy as np
import yaml
from benchmarks.profiler.utils.aiperf import benchmark_decode, benchmark_prefill
from benchmarks.profiler.utils.config import (
CONFIG_MODIFIERS,
WORKER_COMPONENT_NAMES,
generate_dgd_config_with_planner,
)
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.genai_perf import benchmark_decode, benchmark_prefill
from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser
from benchmarks.profiler.utils.plot import (
plot_decode_performance,
......@@ -245,18 +245,18 @@ async def run_profile(args):
f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
)
# run genai-perf
# run ai-perf
base_url = client.get_service_url()
genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}"
gap_result = benchmark_prefill(
ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{args.isl}"
aiperf_result = benchmark_prefill(
args.isl,
genai_perf_artifact_dir,
ai_perf_artifact_dir,
model_name,
model_name,
base_url=base_url,
)
if gap_result is not None:
ttft = gap_result["time_to_first_token"]["avg"]
if aiperf_result is not None:
ttft = aiperf_result["records"]["ttft"]["avg"]
logger.info("Cleaning up deployment...")
await client.delete_deployment()
......@@ -424,20 +424,23 @@ async def run_profile(args):
)
else:
base_url = client.get_service_url()
genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}"
gap_result = benchmark_decode(
ai_perf_artifact_dir = f"{work_dir}/aiperf_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}"
aiperf_result = benchmark_decode(
args.isl,
args.osl,
num_request,
genai_perf_artifact_dir,
ai_perf_artifact_dir,
model_name,
model_name,
base_url=base_url,
)
if gap_result is not None:
itl = gap_result["inter_token_latency"]["avg"]
if aiperf_result is not None:
itl = aiperf_result["records"]["inter_token_latency"]["avg"]
thpt_per_gpu = (
gap_result["output_token_throughput"]["avg"] / num_gpus
aiperf_result["records"]["output_token_throughput"][
"avg"
]
/ num_gpus
)
if itl is not None and thpt_per_gpu is not None:
......
......@@ -30,7 +30,7 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
def _get_common_genai_perf_cmd(
def _get_common_aiperf_cmd(
artifact_dir,
seed=100,
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
......@@ -38,7 +38,7 @@ def _get_common_genai_perf_cmd(
base_url="http://localhost:8000",
):
return [
"genai-perf",
"aiperf",
"profile",
"--model",
model,
......@@ -64,7 +64,7 @@ def _get_common_genai_perf_cmd(
]
def get_prefill_genai_perf_cmd(
def get_prefill_aiperf_cmd(
isl,
artifact_dir,
seed=100,
......@@ -73,7 +73,7 @@ def get_prefill_genai_perf_cmd(
osl=5,
base_url="http://localhost:8000",
):
return _get_common_genai_perf_cmd(
return _get_common_aiperf_cmd(
artifact_dir,
seed,
model,
......@@ -99,7 +99,7 @@ def get_prefill_genai_perf_cmd(
]
def get_decode_genai_perf_cmd(
def get_decode_aiperf_cmd(
isl,
osl,
artifact_dir,
......@@ -109,7 +109,7 @@ def get_decode_genai_perf_cmd(
tokenizer="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
base_url="http://localhost:8000",
):
return _get_common_genai_perf_cmd(
return _get_common_aiperf_cmd(
artifact_dir,
seed,
model,
......@@ -137,15 +137,15 @@ def get_decode_genai_perf_cmd(
]
def get_gap_result(artifact_dir: str) -> dict:
def get_aiperf_result(artifact_dir: str) -> dict:
json_file_path = None
for root, _, files in os.walk(artifact_dir):
if "profile_export_genai_perf.json" in files:
json_file_path = os.path.join(root, "profile_export_genai_perf.json")
if "profile_export_aiperf.json" in files:
json_file_path = os.path.join(root, "profile_export_aiperf.json")
break
if json_file_path is None:
raise FileNotFoundError(
f"profile_export_genai_perf.json not found in {artifact_dir}"
f"profile_export_aiperf.json not found in {artifact_dir}"
)
with open(json_file_path, "r") as f:
return json.load(f)
......@@ -153,35 +153,35 @@ def get_gap_result(artifact_dir: str) -> dict:
def benchmark_prefill(
isl,
genai_perf_artifact_dir,
aiperf_artifact_dir,
model_name,
tokenizer,
base_url="http://localhost:8000",
):
logger.info(f"Running genai-perf with isl {isl}")
genai_perf_cmd = get_prefill_genai_perf_cmd(
logger.info(f"Running aiperf with isl {isl}")
aiperf_cmd = get_prefill_aiperf_cmd(
isl,
genai_perf_artifact_dir,
aiperf_artifact_dir,
model=model_name,
tokenizer=tokenizer,
base_url=base_url,
)
print(f"genai-perf cmd: {genai_perf_cmd}")
print(f"aiperf cmd: {aiperf_cmd}")
# import pdb; pdb.set_trace()
gap_process = subprocess.Popen(
genai_perf_cmd,
aiperf_process = subprocess.Popen(
aiperf_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
stdout, stderr = gap_process.communicate()
if gap_process.returncode == 0:
logger.info("Genai-perf profiling completed successfully")
stdout, stderr = aiperf_process.communicate()
if aiperf_process.returncode == 0:
logger.info("AIperf profiling completed successfully")
logger.info(stdout)
gap_result = get_gap_result(genai_perf_artifact_dir)
return gap_result
aiperf_result = get_aiperf_result(aiperf_artifact_dir)
return aiperf_result
else:
logger.error(f"Genai-perf failed with error code: {gap_process.returncode}")
logger.error(f"AIPerf failed with error code: {aiperf_process.returncode}")
logger.error(f"stderr: {stderr}")
return None
......@@ -190,7 +190,7 @@ def benchmark_decode(
isl,
osl,
num_request,
genai_perf_artifact_dir,
aiperf_artifact_dir,
model_name,
tokenizer,
base_url="http://localhost:8000",
......@@ -201,47 +201,47 @@ def benchmark_decode(
# we use the same random seed to make sure the prompt is the same
seed = random.randint(0, 1000000)
genai_perf_cmd = get_decode_genai_perf_cmd(
aiperf_cmd = get_decode_aiperf_cmd(
isl,
osl,
f"{genai_perf_artifact_dir}_warmup",
f"{aiperf_artifact_dir}_warmup",
num_request,
seed=seed,
model=model_name,
tokenizer=tokenizer,
base_url=base_url,
)
gap_process = subprocess.Popen(
genai_perf_cmd,
aiperf_process = subprocess.Popen(
aiperf_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
gap_process.communicate()
aiperf_process.communicate()
# then send out the real requests, hopefully, this will skip all prefill computation
genai_perf_cmd = get_decode_genai_perf_cmd(
aiperf_cmd = get_decode_aiperf_cmd(
isl,
osl,
genai_perf_artifact_dir,
aiperf_artifact_dir,
num_request,
seed=seed,
model=model_name,
tokenizer=tokenizer,
base_url=base_url,
)
gap_process = subprocess.Popen(
genai_perf_cmd,
aiperf_process = subprocess.Popen(
aiperf_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
stdout, stderr = gap_process.communicate()
if gap_process.returncode == 0:
logger.info("Genai-perf profiling completed successfully")
stdout, stderr = aiperf_process.communicate()
if aiperf_process.returncode == 0:
logger.info("AIperf profiling completed successfully")
logger.info(stdout)
gap_result = get_gap_result(genai_perf_artifact_dir)
return gap_result
aiperf_result = get_aiperf_result(aiperf_artifact_dir)
return aiperf_result
else:
logger.error(f"Genai-perf failed with error code: {gap_process.returncode}")
logger.error(f"AIPerf failed with error code: {aiperf_process.returncode}")
logger.error(f"stderr: {stderr}")
return None
......@@ -26,13 +26,13 @@ logger = logging.getLogger(__name__)
def check_prefill_results_exist(output_dir: str, tp_size: int, isl: int) -> bool:
"""Check if prefill results already exist for a given TP size."""
work_dir = f"{output_dir}/prefill_tp{tp_size}"
result_file = f"{work_dir}/gap_isl{isl}/*/profile_export_genai_perf.json"
result_file = f"{work_dir}/aiperf_isl{isl}/*/profile_export_aiperf.json"
# Check if the work directory exists
if not os.path.exists(work_dir):
return False
# Look for the genai-perf result file
# Look for the aiperf result file
result_files = glob.glob(result_file)
if not result_files:
return False
......@@ -65,7 +65,7 @@ def check_decode_results_exist(
# Look for at least one decode result file
result_pattern = (
f"{work_dir}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json"
f"{work_dir}/aiperf_request*_isl{isl}_osl{osl}_n*/*/profile_export_aiperf.json"
)
result_files = glob.glob(result_pattern)
......@@ -93,7 +93,7 @@ def load_existing_prefill_results(
) -> Tuple[Optional[float], Optional[float]]:
"""Load existing prefill results from disk."""
work_dir = f"{output_dir}/prefill_tp{tp_size}"
result_file = f"{work_dir}/gap_isl{isl}/*/profile_export_genai_perf.json"
result_file = f"{work_dir}/aiperf_isl{isl}/*/profile_export_aiperf.json"
result_files = glob.glob(result_file)
if result_files:
......@@ -115,7 +115,7 @@ def load_existing_decode_results(
work_dir = f"{output_dir}/decode_tp{tp_size}"
result_pattern = (
f"{work_dir}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json"
f"{work_dir}/aiperf_request*_isl{isl}_osl{osl}_n*/*/profile_export_aiperf.json"
)
result_files = glob.glob(result_pattern)
......@@ -128,7 +128,7 @@ def load_existing_decode_results(
thpt_per_gpu = data["output_token_throughput"]["avg"] / tp_size
# Extract concurrency from filename
match = re.search(r"gap_request(\d+)_", result_file)
match = re.search(r"aiperf_request(\d+)_", result_file)
if match:
concurrency = int(match.group(1))
decode_results.append((itl, thpt_per_gpu, concurrency))
......
......@@ -6,9 +6,9 @@ from typing import Callable, Optional, Tuple
import numpy as np
from benchmarks.profiler.utils.aiperf import benchmark_decode
from benchmarks.profiler.utils.defaults import DECODE_MAX_CONCURRENCY
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.genai_perf import benchmark_decode
from benchmarks.profiler.utils.plot import plot_decode_3d_surface
logger = logging.getLogger(__name__)
......@@ -113,19 +113,21 @@ def profile_decode(
attention_dp_size,
):
def get_itl_and_thpt_per_gpu(isl, osl, num_request):
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
gap_result = benchmark_decode(
ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}_osl{osl}_n{num_request}"
aiperf_result = benchmark_decode(
isl,
osl,
num_request,
genai_perf_artifact_dir,
ai_perf_artifact_dir,
model_name,
tokenizer,
base_url=url,
)
if gap_result is not None:
itl = gap_result["inter_token_latency"]["avg"]
thpt_per_gpu = gap_result["output_token_throughput"]["avg"] / num_gpus
if aiperf_result is not None:
itl = aiperf_result["records"]["inter_token_latency"]["avg"]
thpt_per_gpu = (
aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus
)
return itl, thpt_per_gpu
return None, None
......
......@@ -6,8 +6,8 @@ from typing import Callable, Optional
import numpy as np
from benchmarks.profiler.utils.aiperf import benchmark_prefill
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.genai_perf import benchmark_prefill
from benchmarks.profiler.utils.plot import plot_prefill_interpolation
logger = logging.getLogger(__name__)
......@@ -81,16 +81,16 @@ def profile_prefill(
interpolation_granularity,
):
def get_ttft(isl):
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
gap_result = benchmark_prefill(
ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{isl}"
aiperf_result = benchmark_prefill(
isl,
genai_perf_artifact_dir,
ai_perf_artifact_dir,
model_name,
tokenizer,
base_url=url,
)
if gap_result is not None:
return gap_result["time_to_first_token"]["avg"]
if aiperf_result is not None:
return aiperf_result["records"]["ttft"]["avg"]
return None
return _profile_prefill_helper(
......
......@@ -33,7 +33,7 @@ def get_concurrency_levels() -> List[int]:
CONCURRENCIES: List[int] = get_concurrency_levels()
def run_genai_perf(
def run_aiperf(
service_url: str,
model_name: str,
isl: int,
......@@ -44,7 +44,7 @@ def run_genai_perf(
) -> None:
output_dir.mkdir(parents=True, exist_ok=True)
cmd = [
"genai-perf",
"aiperf",
"profile",
"-m",
model_name,
......@@ -76,28 +76,28 @@ def run_genai_perf(
"--max-threads=300",
]
print(
f"Running genai-perf with isl {isl}, osl {osl}, concurrency {concurrency}",
f"Running aiperf with isl {isl}, osl {osl}, concurrency {concurrency}",
flush=True,
)
gap_process = subprocess.Popen(
aip_process = subprocess.Popen(
cmd,
cwd=str(output_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
stdout, stderr = gap_process.communicate()
if gap_process.returncode == 0:
print("Genai-perf profiling completed successfully", flush=True)
stdout, stderr = aip_process.communicate()
if aip_process.returncode == 0:
print("Aiperf profiling completed successfully", flush=True)
if stdout:
print(stdout)
else:
print(f"Genai-perf failed with error code: {gap_process.returncode}")
print(f"Aiperf failed with error code: {aip_process.returncode}")
if stderr:
print(f"stderr: {stderr}")
raise subprocess.CalledProcessError(
gap_process.returncode, cmd, output=stdout, stderr=stderr
aip_process.returncode, cmd, output=stdout, stderr=stderr
)
......@@ -113,6 +113,4 @@ def run_concurrency_sweep(
for c in concurrency_levels:
print(f"Starting concurrency level {c}", flush=True)
run_genai_perf(
service_url, model_name, isl, osl, stddev, c, output_dir / f"c{c}"
)
run_aiperf(service_url, model_name, isl, osl, stddev, c, output_dir / f"c{c}")
......@@ -32,22 +32,22 @@ def parse_benchmark_results(result_dir: Path) -> List[Tuple[int, Dict]]:
continue
concurrency = int(match.group(1))
# Find the genai-perf JSON file
genai_perf_json = None
for json_file in concurrency_dir.rglob("profile_export_genai_perf.json"):
genai_perf_json = json_file
# Find the aiperf JSON file
aiperf_json = None
for json_file in concurrency_dir.rglob("profile_export_aiperf.json"):
aiperf_json = json_file
break
if genai_perf_json and genai_perf_json.exists():
if aiperf_json and aiperf_json.exists():
try:
with open(genai_perf_json, "r") as f:
with open(aiperf_json, "r") as f:
metrics = json.load(f)
results.append((concurrency, metrics))
print(f"Loaded metrics for concurrency {concurrency}")
except Exception as e:
print(f"Error loading {genai_perf_json}: {e}")
print(f"Error loading {aiperf_json}: {e}")
else:
print(f"Warning: No genai-perf JSON found for {concurrency_dir}")
print(f"Warning: No aiperf JSON found for {concurrency_dir}")
# Sort by concurrency level
results.sort(key=lambda x: x[0])
......
......@@ -64,6 +64,7 @@ RUN apt-get update -y \
# Python runtime - CRITICAL for virtual environment to work
python${PYTHON_VERSION}-dev \
build-essential \
git \
# SGLang build dependencies
cmake \
ibverbs-providers \
......@@ -147,6 +148,7 @@ RUN apt-get update && \
build-essential \
# jq and curl for polling various endpoints and health checks
jq \
git \
curl \
# Libraries required by UCX to find RDMA devices
libibverbs1 rdma-core ibverbs-utils libibumad3 \
......
......@@ -17,7 +17,7 @@ ARG CARGO_BUILD_JOBS="16"
RUN apt-get update -y && \
apt-get install -y \
cmake meson ninja-build pybind11-dev patchelf net-tools \
build-essential protobuf-compiler libssl-dev pkg-config \
build-essential protobuf-compiler libssl-dev pkg-config git \
clang libclang-dev git rapidjson-dev zlib1g-dev jq && \
pip install --break-system-packages meson-python wheel build
......@@ -128,7 +128,7 @@ RUN git clone --depth=1 \
cmake --build perf_analyzer/build -- -j$(nproc)
ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH
RUN pip install --break-system-packages genai-perf
RUN pip install --break-system-packages aiperf
# Enable forceful shutdown of inflight requests
ENV SGL_FORCE_SHUTDOWN=1
......
......@@ -76,6 +76,7 @@ RUN apt-get update && \
build-essential \
g++ \
ninja-build \
git \
# Python runtime - CRITICAL for virtual environment to work
python${PYTHON_VERSION}-dev \
python3-pip \
......
......@@ -187,6 +187,7 @@ RUN apt-get update && \
build-essential \
# jq and curl for polling various endpoints and health checks
jq \
git \
curl \
# Libraries required by UCX to find RDMA devices
libibverbs1 rdma-core ibverbs-utils libibumad3 \
......
......@@ -2,7 +2,9 @@
# SPDX-License-Identifier: Apache-2.0
accelerate==1.6.0
aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@e46d9089ffe4f5dd62c46914489c55b6dfdbc903
aiofiles
aiperf @ git+https://github.com/ai-dynamo/aiperf.git@e8f69abf180ff9ea96de9f9a8c955df8c024625b
av==15.0.0
fastapi==0.115.12
ftfy
......@@ -15,7 +17,6 @@ kubernetes_asyncio
matplotlib
msgspec
mypy
numpy==1.26.4 # pmdarima is not compatible with numpy 2
nvidia-ml-py==13.580.65
opentelemetry-api
opentelemetry-sdk
......@@ -26,7 +27,7 @@ prometheus-api-client
prometheus_client
prophet
protobuf==5.29.5
pydantic==2.10.6
pydantic>=2.10.6
pyright
PyYAML
scikit-learn
......
......@@ -61,7 +61,7 @@ Just quick testing/comparison? Client-side.
## What This Tool Does
The framework is a Python-based wrapper around `genai-perf` that:
The framework is a Python-based wrapper around `aiperf` that:
- Benchmarks any HTTP endpoints
- Runs concurrency sweeps across configurable load levels
- Generates comparison plots with your custom labels
......@@ -70,7 +70,7 @@ The framework is a Python-based wrapper around `genai-perf` that:
**Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`)
**Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s).
**Important**: The `--model` parameter configures AIPerf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s).
---
......@@ -165,7 +165,7 @@ REQUIRED:
OPTIONS:
-h, --help Show help message and examples
-m, --model MODEL Model name for GenAI-Perf configuration and logging (default: Qwen/Qwen3-0.6B)
-m, --model MODEL Model name for AIPerf configuration and logging (default: Qwen/Qwen3-0.6B)
NOTE: This must match the model deployed at the endpoint
-i, --isl LENGTH Input sequence length (default: 2000)
-s, --std STDDEV Input sequence standard deviation (default: 10)
......@@ -179,14 +179,14 @@ OPTIONS:
- **Benchmark Name**: The benchmark name becomes the label in plots and results
- **Name Restrictions**: Names can only contain letters, numbers, hyphens, and underscores. The name `plots` is reserved.
- **Port-Forwarding**: You must have an exposed endpoint before benchmarking
- **Model Parameter**: The `--model` parameter configures GenAI-Perf for testing and logging, and must match the model deployed at the endpoint
- **Model Parameter**: The `--model` parameter configures AIPerf for testing and logging, and must match the model deployed at the endpoint
- **Sequential Benchmarking**: For comparative benchmarks, deploy and benchmark each configuration separately
### What Happens During Benchmarking
The Python benchmarking module:
1. **Connects** to your port-forwarded endpoint
2. **Benchmarks** using GenAI-Perf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250)
2. **Benchmarks** using AIPerf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250)
3. **Measures** key metrics: latency, throughput, time-to-first-token
4. **Saves** results to an output directory organized by benchmark name
......@@ -301,9 +301,9 @@ results/
```
Each concurrency directory contains:
- **`profile_export_genai_perf.json`** - Structured metrics from GenAI-Perf
- **`profile_export_genai_perf.csv`** - CSV format metrics from GenAI-Perf
- **`profile_export.json`** - Raw GenAI-Perf results
- **`profile_export_aiperf.json`** - Structured metrics from AIPerf
- **`profile_export_aiperf.csv`** - CSV format metrics from AIPerf
- **`profile_export.json`** - Raw AIPerf results
- **`inputs.json`** - Generated test inputs
---
......@@ -516,7 +516,7 @@ kubectl get endpoints "$SVC_NAME" -n "$NAMESPACE"
## Customize Benchmarking Behavior
The built-in Python workflow connects to endpoints, benchmarks with genai-perf, and generates plots. If you want to modify the behavior:
The built-in Python workflow connects to endpoints, benchmarks with aiperf, and generates plots. If you want to modify the behavior:
1. **Extend the workflow**: Modify `benchmarks/utils/workflow.py` to add custom deployment types or metrics collection
......
......@@ -38,7 +38,7 @@ spec:
mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..."
export COLUMNS=200
genai-perf profile \
aiperf profile \
--model "$TARGET_MODEL" \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --url "$ENDPOINT" --streaming \
......@@ -58,10 +58,10 @@ spec:
--num-dataset-entries=3000 -- \
--max-threads 64
echo "----------------json----------------"
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_genai_perf.json)
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
cat $PERF_JSON | jq .
echo "----------------csv-----------------"
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_genai_perf.csv)
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
cat $PERF_CSV
echo "Benchmark completed successfully!"
volumeMounts:
......
......@@ -38,7 +38,7 @@ spec:
mkdir -p "$ARTIFACT_DIR"
echo "Running benchmark..."
export COLUMNS=200
genai-perf profile \
aiperf profile \
--model "$TARGET_MODEL" \
--tokenizer ~/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --url "$ENDPOINT" --streaming \
......@@ -58,10 +58,10 @@ spec:
--num-dataset-entries=3000 -- \
--max-threads 64
echo "----------------json----------------"
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_genai_perf.json)
PERF_JSON=$(find $ARTIFACT_DIR -name profile_export_aiperf.json)
cat $PERF_JSON | jq .
echo "----------------csv-----------------"
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_genai_perf.csv)
PERF_CSV=$(find $ARTIFACT_DIR -name profile_export_aiperf.csv)
cat $PERF_CSV
echo "Benchmark completed successfully!"
volumeMounts:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment