Unverified Commit 9f310225 authored by Harshini Komali's avatar Harshini Komali Committed by GitHub
Browse files

feat: Replace genai-perf with aiperf (#3533)


Signed-off-by: lkomali <lkomali@nvidia.com>
parent 1972f71f
......@@ -13,7 +13,7 @@ This directory contains scripts for benchmarking the Dynamo router with prefix c
- etcd and NATS running (required for Dynamo coordination)
- Required Python packages:
- `dynamo` package (with vllm and frontend modules)
- `genai-perf` for benchmarking
- `aiperf` for benchmarking
- `matplotlib` for plotting results
- `data-generator` package (install with `pip install -e ./benchmarks` from repo root)
......@@ -230,11 +230,11 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli
```
> [!Note]
> At the time of writing this documentation, you may need to install the latest genai-perf from the main source branch to loadgen on the trace files:
> At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to generate load from the trace files:
> ```bash
> pip install git+https://github.com/triton-inference-server/perf_analyzer.git#subdirectory=genai-perf
> pip install git+https://github.com/ai-dynamo/aiperf.git#subdirectory=aiperf
> ```
> However, by the time of release, the genai-perf version included in the vLLM runtime container should be up to date enough to use as-is.
> However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is.
## Troubleshooting
......
......@@ -27,7 +27,7 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
def get_genai_perf_cmd(
def get_aiperf_cmd(
model,
tokenizer, # Add tokenizer parameter
prefix_ratio,
......@@ -40,12 +40,12 @@ def get_genai_perf_cmd(
artifact_dir,
url="http://localhost:8888",
):
"""Build genai-perf command based on prefix ratio"""
"""Build aiperf command based on prefix ratio"""
prefix_length = int(isl * prefix_ratio)
synthetic_input_length = int(isl * (1 - prefix_ratio))
return [
"genai-perf",
"aiperf",
"profile",
"--model",
model,
......@@ -84,10 +84,7 @@ def get_genai_perf_cmd(
str(num_prefix_prompts),
"--artifact-dir",
artifact_dir,
"--",
"-v",
"--max-threads",
"256",
"-H",
"Authorization: Bearer NOT USED",
"-H",
......@@ -95,17 +92,17 @@ def get_genai_perf_cmd(
]
def get_gap_result(artifact_dir: str) -> dict:
"""Parse genai-perf results from JSON file"""
def get_aiperf_result(artifact_dir: str) -> dict:
"""Parse aiperf results from JSON file"""
json_file_path = None
for root, _, files in os.walk(artifact_dir):
if "profile_export_genai_perf.json" in files:
json_file_path = os.path.join(root, "profile_export_genai_perf.json")
if "profile_export_aiperf.json" in files:
json_file_path = os.path.join(root, "profile_export_aiperf.json")
break
if json_file_path is None:
raise FileNotFoundError(
f"profile_export_genai_perf.json not found in {artifact_dir}"
f"profile_export_aiperf.json not found in {artifact_dir}"
)
with open(json_file_path, "r") as f:
......@@ -125,8 +122,8 @@ def run_benchmark_single_url(
artifact_dir,
url,
) -> Optional[Dict]:
"""Run genai-perf benchmark for a single URL"""
genai_perf_cmd = get_genai_perf_cmd(
"""Run aiperf benchmark for a single URL"""
aiperf_cmd = get_aiperf_cmd(
model,
tokenizer, # Pass tokenizer parameter
prefix_ratio,
......@@ -140,21 +137,21 @@ def run_benchmark_single_url(
url,
)
logger.info(f"Running command for URL {url}: {' '.join(genai_perf_cmd)}")
logger.info(f"Running command for URL {url}: {' '.join(aiperf_cmd)}")
try:
gap_process = subprocess.run(
genai_perf_cmd, capture_output=True, text=True, check=True
aiperf_process = subprocess.run(
aiperf_cmd, capture_output=True, text=True, check=True
)
logger.info(f"Genai-perf profiling completed successfully for URL {url}")
logger.info(gap_process.stdout)
logger.info(f"AIPerf profiling completed successfully for URL {url}")
logger.info(aiperf_process.stdout)
gap_result = get_gap_result(artifact_dir)
return gap_result
aiperf_result = get_aiperf_result(artifact_dir)
return aiperf_result
except subprocess.CalledProcessError as e:
logger.error(f"Genai-perf failed for URL {url} with error code: {e.returncode}")
logger.error(f"AIPerf failed for URL {url} with error code: {e.returncode}")
logger.error(f"stderr: {e.stderr}")
return None
......@@ -197,7 +194,7 @@ def run_benchmark(
output_dir,
urls,
) -> Optional[Dict]:
"""Run genai-perf benchmark for a specific prefix ratio"""
"""Run aiperf benchmark for a specific prefix ratio"""
logger.info(
f"Running benchmark with prefix_ratio={prefix_ratio}, seed={seed}, URLs={urls}"
)
......@@ -242,7 +239,7 @@ def run_benchmark(
os.makedirs(artifact_dir, exist_ok=True)
artifact_dirs.append(artifact_dir)
genai_perf_cmd = get_genai_perf_cmd(
aiperf_cmd = get_aiperf_cmd(
model,
tokenizer, # Pass tokenizer parameter
prefix_ratio,
......@@ -256,10 +253,10 @@ def run_benchmark(
url,
)
logger.info(f"Launching process for URL {url}: {' '.join(genai_perf_cmd)}")
logger.info(f"Launching process for URL {url}: {' '.join(aiperf_cmd)}")
process = subprocess.Popen(
genai_perf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
aiperf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
)
processes.append((process, url, artifact_dir))
......@@ -269,18 +266,18 @@ def run_benchmark(
stdout, stderr = process.communicate()
if process.returncode == 0:
logger.info(f"Genai-perf completed successfully for URL {url}")
logger.info(f"AIPerf completed successfully for URL {url}")
logger.info(stdout)
try:
gap_result = get_gap_result(artifact_dir)
results.append(gap_result)
aiperf_result = get_aiperf_result(artifact_dir)
results.append(aiperf_result)
except Exception as e:
logger.error(f"Failed to get results for URL {url}: {e}")
results.append(None)
else:
logger.error(
f"Genai-perf failed for URL {url} with error code: {process.returncode}"
f"AIPerf failed for URL {url} with error code: {process.returncode}"
)
logger.error(f"stderr: {stderr}")
results.append(None)
......
......@@ -24,7 +24,7 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
def get_genai_perf_cmd_for_trace(
def get_aiperf_cmd_for_trace(
model,
tokenizer,
input_dataset,
......@@ -33,7 +33,7 @@ def get_genai_perf_cmd_for_trace(
url="http://localhost:8888",
):
return [
"genai-perf",
"aiperf",
"profile",
"--model",
model,
......@@ -47,17 +47,13 @@ def get_genai_perf_cmd_for_trace(
"--url",
url,
"--input-file",
f"payload:{input_dataset}",
"--fixed-schedule",
"True",
f"{input_dataset}",
"--fixed-schedule-auto-offset",
"--random-seed",
str(seed),
"--artifact-dir",
artifact_dir,
"--",
"-v",
"--max-threads",
"256",
"-H",
"Authorization: Bearer NOT USED",
"-H",
......@@ -73,8 +69,8 @@ def run_benchmark_with_trace(
url,
seed,
):
"""Run genai-perf benchmark with a trace dataset"""
genai_perf_cmd = get_genai_perf_cmd_for_trace(
"""Run aiperf benchmark with a trace dataset"""
aiperf_cmd = get_aiperf_cmd_for_trace(
model,
tokenizer,
trace_dataset,
......@@ -83,17 +79,17 @@ def run_benchmark_with_trace(
url,
)
logger.info(f"Running genai-perf with trace dataset: {trace_dataset}")
logger.info(f"Command: {' '.join(genai_perf_cmd)}")
logger.info(f"Running aiperf with trace dataset: {trace_dataset}")
logger.info(f"Command: {' '.join(aiperf_cmd)}")
try:
# Run genai-perf and let it output directly to terminal
subprocess.run(genai_perf_cmd, check=True)
# Run aiperf and let it output directly to terminal
subprocess.run(aiperf_cmd, check=True)
logger.info("Genai-perf profiling completed successfully")
logger.info("AIPerf profiling completed successfully")
except subprocess.CalledProcessError as e:
logger.error(f"Genai-perf failed with error code: {e.returncode}")
logger.error(f"AIPerf failed with error code: {e.returncode}")
logger.error(f"stderr: {e.stderr}")
raise
......@@ -301,7 +297,7 @@ def main():
logger.info(f"Synthetic trace data saved to: {trace_dataset_path}")
# Run benchmark with the trace dataset
artifact_dir = os.path.join(args.output_dir, "genai_perf_artifacts")
artifact_dir = os.path.join(args.output_dir, "aiperf_artifacts")
os.makedirs(artifact_dir, exist_ok=True)
run_benchmark_with_trace(
......
......@@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0
# Sinusoidal Load Generator
`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf/genai_perf).
`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main/aiperf).
## Usage
......
......@@ -4,7 +4,7 @@
from pathlib import Path
from typing import Dict, List
from benchmarks.utils.genai import run_concurrency_sweep
from benchmarks.utils.aiperf import run_concurrency_sweep
from deploy.utils.kubernetes import is_running_in_cluster
......
......@@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
### Performance Testing with AIPerf
The Dynamo container includes [AIPerf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/aiperf/README.html), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
**Run the following benchmark from inside the container** (after completing the deployment steps above):
......
......@@ -283,7 +283,7 @@ results/ # Client-side: ./benchmarks/results/ or custom
│ └── avg_time_to_first_token_vs_concurrency.png
├── <your-benchmark-name>/ # Results for your benchmark (uses your custom name)
│ ├── c1/ # Concurrency level 1
│ │ └── profile_export_genai_perf.json
│ │ └── profile_export_aiperf.json
│ ├── c2/ # Concurrency level 2
│ ├── c5/ # Concurrency level 5
│ └── ... # Other concurrency levels (10, 50, 100, 250)
......@@ -457,7 +457,7 @@ Results are stored in `/data/results` and follow the same structure as client-si
/data/results/
└── <benchmark-name>/ # Results for your benchmark name
├── c1/ # Concurrency level 1
│ └── profile_export_genai_perf.json
│ └── profile_export_aiperf.json
├── c2/ # Concurrency level 2
└── ... # Other concurrency levels
```
......
......@@ -56,11 +56,11 @@ Typically, the number of GPUs vs the performance follows the following pattern:
| 2 | 269 | 135 | 1.19x |
| 4 | 578 | 144 | 1.28x |
The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed ISL/OSL/concurrency test using [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) and compare with the SLA.
GenAI-Perf is pre-installed in the dynamo container.
The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed ISL/OSL/concurrency tests using [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main) and comparing the results with the SLA.
AIPerf is pre-installed in the dynamo container.
> [!Tip]
> If you are unfamiliar with GenAI-Perf, please see this helpful [tutorial](https://github.com/triton-inference-server/perf_analyzer/blob/main/genai-perf/docs/tutorial.md) to get you started.
> If you are unfamiliar with AIPerf, please see this helpful [tutorial](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorial.md) to get you started.
Besides the parallelization mapping, other common knobs to tune are maximum batch size, maximum number of tokens, and block size.
For prefill engines, usually a small batch size and large `max_num_token` is preferred.
......
......@@ -54,4 +54,4 @@ curl localhost:8000/v1/chat/completions \
"max_tokens": 30
}'
```
You can also benchmark the performance of the endpoint by [GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html)
You can also benchmark the performance of the endpoint with [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md).
......@@ -39,4 +39,4 @@ curl localhost:8000/v1/chat/completions \
"max_tokens": 30
}'
```
You can also benchmark the performance of the endpoint by [GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html)
You can also benchmark the performance of the endpoint with [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md).
......@@ -80,7 +80,7 @@ While not implemented in this example, the router can also operate in a pure pre
- Integrates with vLLM's OpenAI serving components for request preprocessing and response formatting
### `perf.sh`
- Benchmarking script using `genai-perf` to test the router setup
- Benchmarking script using `aiperf` to test the router setup
- Configured for streaming chat completions with synthetic workloads
- Tests concurrent requests to evaluate routing performance
......
......@@ -28,7 +28,7 @@ num_unique_prompts=10
seed=42
genai-perf profile \
aiperf profile \
--model ${model} \
--tokenizer ${model} \
--endpoint-type ${type} \
......@@ -47,8 +47,6 @@ genai-perf profile \
--request-count ${num_requests} \
--num-dataset-entries ${num_unique_prompts} \
--random-seed ${seed} \
-- \
-v \
--max-threads 256 \
-H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream'
......@@ -215,10 +215,10 @@ When running deployment with sla-planner, to reduce the image pulling time, depl
kubectl apply -f ./perf_test_configs/image_cache_daemonset.yaml -n <namespace>
```
Then, port-forward or shell into the frontend pod and run GenAI-Perf to get the goodput:
Then, port-forward or shell into the frontend pod and run AIPerf to get the goodput:
```bash
genai-perf profile \
aiperf profile \
--model nvidia/Llama-3.1-8B-Instruct-FP8 \
--tokenizer nvidia/Llama-3.1-8B-Instruct-FP8 \
--endpoint-type chat \
......@@ -227,11 +227,11 @@ genai-perf profile \
--input-file payload:/workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \
--fixed-schedule True \
--goodput time_to_first_token:200 inter_token_latency:10 \
-- -v -max-threads 64 \
-v \
```
> [!NOTE]
> Sometimes, when sla planner scales down the number of workers, a few requests will error out and cause GenAI-Perf to stuck. We are aware of this issue and are working on fixing it.
> Sometimes, when the SLA planner scales down the number of workers, a few requests will error out and cause AIPerf to get stuck. We are aware of this issue and are working on fixing it.
#### E2E Perf Test Results
......
......@@ -64,9 +64,9 @@ check_prerequisites() {
exit 1
fi
# Check for genai-perf
if ! command -v genai-perf &> /dev/null; then
log_error "genai-perf not found. This tool is required for load generation."
# Check for aiperf
if ! command -v aiperf &> /dev/null; then
log_error "aiperf not found. This tool is required for load generation."
log_error "Please install the required dependencies by following the instructions in tests/planner/README.md"
exit 1
fi
......
......@@ -4,7 +4,7 @@
"""
Load generation script for SLA planner scaling tests.
This script uses genai-perf to generate load at specific request rates
This script uses aiperf to generate load at specific request rates
to test the planner's scaling behavior.
"""
......@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
class LoadGenerator:
"""Generate load using genai-perf to test planner scaling."""
"""Generate load using aiperf to test planner scaling."""
def __init__(
self,
......@@ -40,12 +40,12 @@ class LoadGenerator:
self.osl = osl
self.save_results = save_results
def _calculate_genai_perf_params(
def _calculate_aiperf_params(
self,
req_per_sec: float,
) -> Dict[str, Any]:
"""
Calculate genai-perf parameters to approximate desired request rate.
Calculate aiperf parameters to approximate desired request rate.
Args:
req_per_sec: Desired requests per second
......@@ -71,15 +71,15 @@ class LoadGenerator:
Args:
req_per_sec: Target requests per second
duration_sec: Duration to generate load (seconds)
artifact_dir: Directory to store genai-perf artifacts
artifact_dir: Directory to store aiperf artifacts
Returns:
Dictionary with load test results
"""
logger.info(f"Generating load: {req_per_sec} req/s for {duration_sec}s")
# Calculate genai-perf parameters
params = self._calculate_genai_perf_params(req_per_sec)
# Calculate aiperf parameters
params = self._calculate_aiperf_params(req_per_sec)
logger.info(f"Using request_rate={params['request_rate']} req/s")
# Create artifact directory if not provided
......@@ -95,9 +95,9 @@ class LoadGenerator:
f"Adjusted parameters: duration={duration_sec}s, request_count={request_count}"
)
# Build genai-perf command based on coworker's successful approach
# Build aiperf command based on coworker's successful approach
cmd = [
"genai-perf",
"aiperf",
"profile",
"--model",
self.model,
......@@ -124,10 +124,7 @@ class LoadGenerator:
), # Generate reasonable dataset size
"--artifact-dir",
artifact_dir,
"--",
"-v",
"-max-threads",
"64",
]
logger.info(f"Running command: {' '.join(cmd)}")
......@@ -135,7 +132,7 @@ class LoadGenerator:
f"Expected duration: {duration_sec}s, timeout: {max(duration_sec * 2 + 120, int(duration_sec * 2.5))}s"
)
# Run genai-perf (async)
# Run aiperf (async)
start_time = time.time()
# More generous timeout for high-load tests - allow 2x duration + 2 minutes buffer
timeout = max(duration_sec * 2 + 120, int(duration_sec * 2.5))
......@@ -152,7 +149,7 @@ class LoadGenerator:
except asyncio.TimeoutError:
proc.kill()
await proc.communicate()
logger.error("genai-perf timed out")
logger.error("aiperf timed out")
raise RuntimeError("Load generation timed out")
end_time = time.time()
......@@ -160,13 +157,9 @@ class LoadGenerator:
# Persist logs for debugging
try:
with open(
os.path.join(artifact_dir, "genai_perf.stdout.log"), "wb"
) as f:
with open(os.path.join(artifact_dir, "aiperf.stdout.log"), "wb") as f:
f.write(stdout or b"")
with open(
os.path.join(artifact_dir, "genai_perf.stderr.log"), "wb"
) as f:
with open(os.path.join(artifact_dir, "aiperf.stderr.log"), "wb") as f:
f.write(stderr or b"")
except Exception:
pass
......@@ -174,31 +167,31 @@ class LoadGenerator:
if proc.returncode == 0:
logger.info("Load generation completed successfully")
logger.info(f"Actual duration: {actual_duration:.2f}s")
results = self._parse_genai_perf_results(artifact_dir)
results = self._parse_aiperf_results(artifact_dir)
results.update(
{
"requested_req_per_sec": req_per_sec,
"actual_duration": actual_duration,
"target_duration": duration_sec,
"genai_perf_params": params,
"aiperf_params": params,
"artifact_dir": artifact_dir,
"success": True,
}
)
return results
else:
logger.error(f"genai-perf failed with return code {proc.returncode}")
raise RuntimeError("genai-perf failed; see logs in artifact dir")
logger.error(f"aiperf failed with return code {proc.returncode}")
raise RuntimeError("aiperf failed; see logs in artifact dir")
except RuntimeError:
raise
except Exception as e:
logger.error(f"genai-perf execution error: {e}")
logger.error(f"aiperf execution error: {e}")
raise
def _parse_genai_perf_results(self, artifact_dir: str) -> Dict[str, Any]:
"""Parse genai-perf results from artifact directory."""
def _parse_aiperf_results(self, artifact_dir: str) -> Dict[str, Any]:
"""Parse aiperf results from artifact directory."""
try:
# Look for the profile_export_genai_perf.json file
# Look for the profile_export_aiperf.json file
json_files = [f for f in os.listdir(artifact_dir) if f.endswith(".json")]
if not json_files:
logger.warning("No JSON results found in artifact directory")
......@@ -207,7 +200,7 @@ class LoadGenerator:
# Main results file
results_file = None
for json_file in json_files:
if "profile_export" in json_file or "genai_perf" in json_file:
if "profile_export" in json_file or "aiperf" in json_file:
results_file = os.path.join(artifact_dir, json_file)
break
......@@ -236,7 +229,7 @@ class LoadGenerator:
).get("avg", 0),
}
)
if not results and "profile_export_genai_perf" in data:
if not results and "profile_export_aiperf" in data:
summary = data.get("summary", {})
results.update(
{
......@@ -250,7 +243,7 @@ class LoadGenerator:
return results
except Exception as e:
logger.warning(f"Failed to parse genai-perf results: {e}")
logger.warning(f"Failed to parse aiperf results: {e}")
return {}
async def run_scaling_test(self) -> Dict[str, Any]:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment