"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "e4a9c2cddb8c35b0eadaa65e81f9f6e5ce3c6058"
Unverified Commit 9f310225 authored by Harshini Komali's avatar Harshini Komali Committed by GitHub
Browse files

feat: Replace genai-perf with aiperf (#3533)


Signed-off-by: default avatarlkomali <lkomali@nvidia.com>
parent 1972f71f
...@@ -13,7 +13,7 @@ This directory contains scripts for benchmarking the Dynamo router with prefix c ...@@ -13,7 +13,7 @@ This directory contains scripts for benchmarking the Dynamo router with prefix c
- etcd and NATS running (required for Dynamo coordination) - etcd and NATS running (required for Dynamo coordination)
- Required Python packages: - Required Python packages:
- `dynamo` package (with vllm and frontend modules) - `dynamo` package (with vllm and frontend modules)
- `genai-perf` for benchmarking - `aiperf` for benchmarking
- `matplotlib` for plotting results - `matplotlib` for plotting results
- `data-generator` package (install with `pip install -e ./benchmarks` from repo root) - `data-generator` package (install with `pip install -e ./benchmarks` from repo root)
...@@ -230,11 +230,11 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli ...@@ -230,11 +230,11 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli
``` ```
> [!Note] > [!Note]
> At the time of writing this documentation, you may need to install the latest genai-perf from the main source branch to run load generation on the trace files: > At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to run load generation on the trace files:
> ```bash > ```bash
> pip install git+https://github.com/triton-inference-server/perf_analyzer.git#subdirectory=genai-perf > pip install git+https://github.com/ai-dynamo/aiperf.git
> ``` > ```
> However, by the time of release, the genai-perf version included in the vLLM runtime container should be up to date enough to use as-is. > However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is.
## Troubleshooting ## Troubleshooting
......
...@@ -27,7 +27,7 @@ console_handler.setFormatter(formatter) ...@@ -27,7 +27,7 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
def get_genai_perf_cmd( def get_aiperf_cmd(
model, model,
tokenizer, # Add tokenizer parameter tokenizer, # Add tokenizer parameter
prefix_ratio, prefix_ratio,
...@@ -40,12 +40,12 @@ def get_genai_perf_cmd( ...@@ -40,12 +40,12 @@ def get_genai_perf_cmd(
artifact_dir, artifact_dir,
url="http://localhost:8888", url="http://localhost:8888",
): ):
"""Build genai-perf command based on prefix ratio""" """Build aiperf command based on prefix ratio"""
prefix_length = int(isl * prefix_ratio) prefix_length = int(isl * prefix_ratio)
synthetic_input_length = int(isl * (1 - prefix_ratio)) synthetic_input_length = int(isl * (1 - prefix_ratio))
return [ return [
"genai-perf", "aiperf",
"profile", "profile",
"--model", "--model",
model, model,
...@@ -84,10 +84,7 @@ def get_genai_perf_cmd( ...@@ -84,10 +84,7 @@ def get_genai_perf_cmd(
str(num_prefix_prompts), str(num_prefix_prompts),
"--artifact-dir", "--artifact-dir",
artifact_dir, artifact_dir,
"--",
"-v", "-v",
"--max-threads",
"256",
"-H", "-H",
"Authorization: Bearer NOT USED", "Authorization: Bearer NOT USED",
"-H", "-H",
...@@ -95,17 +92,17 @@ def get_genai_perf_cmd( ...@@ -95,17 +92,17 @@ def get_genai_perf_cmd(
] ]
def get_gap_result(artifact_dir: str) -> dict: def get_aiperf_result(artifact_dir: str) -> dict:
"""Parse genai-perf results from JSON file""" """Parse aiperf results from JSON file"""
json_file_path = None json_file_path = None
for root, _, files in os.walk(artifact_dir): for root, _, files in os.walk(artifact_dir):
if "profile_export_genai_perf.json" in files: if "profile_export_aiperf.json" in files:
json_file_path = os.path.join(root, "profile_export_genai_perf.json") json_file_path = os.path.join(root, "profile_export_aiperf.json")
break break
if json_file_path is None: if json_file_path is None:
raise FileNotFoundError( raise FileNotFoundError(
f"profile_export_genai_perf.json not found in {artifact_dir}" f"profile_export_aiperf.json not found in {artifact_dir}"
) )
with open(json_file_path, "r") as f: with open(json_file_path, "r") as f:
...@@ -125,8 +122,8 @@ def run_benchmark_single_url( ...@@ -125,8 +122,8 @@ def run_benchmark_single_url(
artifact_dir, artifact_dir,
url, url,
) -> Optional[Dict]: ) -> Optional[Dict]:
"""Run genai-perf benchmark for a single URL""" """Run aiperf benchmark for a single URL"""
genai_perf_cmd = get_genai_perf_cmd( aiperf_cmd = get_aiperf_cmd(
model, model,
tokenizer, # Pass tokenizer parameter tokenizer, # Pass tokenizer parameter
prefix_ratio, prefix_ratio,
...@@ -140,21 +137,21 @@ def run_benchmark_single_url( ...@@ -140,21 +137,21 @@ def run_benchmark_single_url(
url, url,
) )
logger.info(f"Running command for URL {url}: {' '.join(genai_perf_cmd)}") logger.info(f"Running command for URL {url}: {' '.join(aiperf_cmd)}")
try: try:
gap_process = subprocess.run( aiperf_process = subprocess.run(
genai_perf_cmd, capture_output=True, text=True, check=True aiperf_cmd, capture_output=True, text=True, check=True
) )
logger.info(f"Genai-perf profiling completed successfully for URL {url}") logger.info(f"AIPerf profiling completed successfully for URL {url}")
logger.info(gap_process.stdout) logger.info(aiperf_process.stdout)
gap_result = get_gap_result(artifact_dir) aiperf_result = get_aiperf_result(artifact_dir)
return gap_result return aiperf_result
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
logger.error(f"Genai-perf failed for URL {url} with error code: {e.returncode}") logger.error(f"AIPerf failed for URL {url} with error code: {e.returncode}")
logger.error(f"stderr: {e.stderr}") logger.error(f"stderr: {e.stderr}")
return None return None
...@@ -197,7 +194,7 @@ def run_benchmark( ...@@ -197,7 +194,7 @@ def run_benchmark(
output_dir, output_dir,
urls, urls,
) -> Optional[Dict]: ) -> Optional[Dict]:
"""Run genai-perf benchmark for a specific prefix ratio""" """Run aiperf benchmark for a specific prefix ratio"""
logger.info( logger.info(
f"Running benchmark with prefix_ratio={prefix_ratio}, seed={seed}, URLs={urls}" f"Running benchmark with prefix_ratio={prefix_ratio}, seed={seed}, URLs={urls}"
) )
...@@ -242,7 +239,7 @@ def run_benchmark( ...@@ -242,7 +239,7 @@ def run_benchmark(
os.makedirs(artifact_dir, exist_ok=True) os.makedirs(artifact_dir, exist_ok=True)
artifact_dirs.append(artifact_dir) artifact_dirs.append(artifact_dir)
genai_perf_cmd = get_genai_perf_cmd( aiperf_cmd = get_aiperf_cmd(
model, model,
tokenizer, # Pass tokenizer parameter tokenizer, # Pass tokenizer parameter
prefix_ratio, prefix_ratio,
...@@ -256,10 +253,10 @@ def run_benchmark( ...@@ -256,10 +253,10 @@ def run_benchmark(
url, url,
) )
logger.info(f"Launching process for URL {url}: {' '.join(genai_perf_cmd)}") logger.info(f"Launching process for URL {url}: {' '.join(aiperf_cmd)}")
process = subprocess.Popen( process = subprocess.Popen(
genai_perf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True aiperf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
) )
processes.append((process, url, artifact_dir)) processes.append((process, url, artifact_dir))
...@@ -269,18 +266,18 @@ def run_benchmark( ...@@ -269,18 +266,18 @@ def run_benchmark(
stdout, stderr = process.communicate() stdout, stderr = process.communicate()
if process.returncode == 0: if process.returncode == 0:
logger.info(f"Genai-perf completed successfully for URL {url}") logger.info(f"AIPerf completed successfully for URL {url}")
logger.info(stdout) logger.info(stdout)
try: try:
gap_result = get_gap_result(artifact_dir) aiperf_result = get_aiperf_result(artifact_dir)
results.append(gap_result) results.append(aiperf_result)
except Exception as e: except Exception as e:
logger.error(f"Failed to get results for URL {url}: {e}") logger.error(f"Failed to get results for URL {url}: {e}")
results.append(None) results.append(None)
else: else:
logger.error( logger.error(
f"Genai-perf failed for URL {url} with error code: {process.returncode}" f"AIPerf failed for URL {url} with error code: {process.returncode}"
) )
logger.error(f"stderr: {stderr}") logger.error(f"stderr: {stderr}")
results.append(None) results.append(None)
......
...@@ -24,7 +24,7 @@ console_handler.setFormatter(formatter) ...@@ -24,7 +24,7 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
def get_genai_perf_cmd_for_trace( def get_aiperf_cmd_for_trace(
model, model,
tokenizer, tokenizer,
input_dataset, input_dataset,
...@@ -33,7 +33,7 @@ def get_genai_perf_cmd_for_trace( ...@@ -33,7 +33,7 @@ def get_genai_perf_cmd_for_trace(
url="http://localhost:8888", url="http://localhost:8888",
): ):
return [ return [
"genai-perf", "aiperf",
"profile", "profile",
"--model", "--model",
model, model,
...@@ -47,17 +47,13 @@ def get_genai_perf_cmd_for_trace( ...@@ -47,17 +47,13 @@ def get_genai_perf_cmd_for_trace(
"--url", "--url",
url, url,
"--input-file", "--input-file",
f"payload:{input_dataset}", f"{input_dataset}",
"--fixed-schedule", "--fixed-schedule-auto-offset",
"True",
"--random-seed", "--random-seed",
str(seed), str(seed),
"--artifact-dir", "--artifact-dir",
artifact_dir, artifact_dir,
"--",
"-v", "-v",
"--max-threads",
"256",
"-H", "-H",
"Authorization: Bearer NOT USED", "Authorization: Bearer NOT USED",
"-H", "-H",
...@@ -73,8 +69,8 @@ def run_benchmark_with_trace( ...@@ -73,8 +69,8 @@ def run_benchmark_with_trace(
url, url,
seed, seed,
): ):
"""Run genai-perf benchmark with a trace dataset""" """Run aiperf benchmark with a trace dataset"""
genai_perf_cmd = get_genai_perf_cmd_for_trace( aiperf_cmd = get_aiperf_cmd_for_trace(
model, model,
tokenizer, tokenizer,
trace_dataset, trace_dataset,
...@@ -83,17 +79,17 @@ def run_benchmark_with_trace( ...@@ -83,17 +79,17 @@ def run_benchmark_with_trace(
url, url,
) )
logger.info(f"Running genai-perf with trace dataset: {trace_dataset}") logger.info(f"Running aiperf with trace dataset: {trace_dataset}")
logger.info(f"Command: {' '.join(genai_perf_cmd)}") logger.info(f"Command: {' '.join(aiperf_cmd)}")
try: try:
# Run genai-perf and let it output directly to terminal # Run aiperf and let it output directly to terminal
subprocess.run(genai_perf_cmd, check=True) subprocess.run(aiperf_cmd, check=True)
logger.info("Genai-perf profiling completed successfully") logger.info("AIPerf profiling completed successfully")
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
logger.error(f"Genai-perf failed with error code: {e.returncode}") logger.error(f"AIPerf failed with error code: {e.returncode}")
logger.error(f"stderr: {e.stderr}") logger.error(f"stderr: {e.stderr}")
raise raise
...@@ -301,7 +297,7 @@ def main(): ...@@ -301,7 +297,7 @@ def main():
logger.info(f"Synthetic trace data saved to: {trace_dataset_path}") logger.info(f"Synthetic trace data saved to: {trace_dataset_path}")
# Run benchmark with the trace dataset # Run benchmark with the trace dataset
artifact_dir = os.path.join(args.output_dir, "genai_perf_artifacts") artifact_dir = os.path.join(args.output_dir, "aiperf_artifacts")
os.makedirs(artifact_dir, exist_ok=True) os.makedirs(artifact_dir, exist_ok=True)
run_benchmark_with_trace( run_benchmark_with_trace(
......
...@@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0 ...@@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0
# Sinusoidal Load Generator # Sinusoidal Load Generator
`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf/genai_perf). `sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main/aiperf).
## Usage ## Usage
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
from pathlib import Path from pathlib import Path
from typing import Dict, List from typing import Dict, List
from benchmarks.utils.genai import run_concurrency_sweep from benchmarks.utils.aiperf import run_concurrency_sweep
from deploy.utils.kubernetes import is_running_in_cluster from deploy.utils.kubernetes import is_running_in_cluster
......
...@@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" ...@@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
### Performance Testing with AIPerf ### Performance Testing with AIPerf
The Dynamo container includes [AIPerf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/aiperf/README.html), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
**Run the following benchmark from inside the container** (after completing the deployment steps above): **Run the following benchmark from inside the container** (after completing the deployment steps above):
......
...@@ -283,7 +283,7 @@ results/ # Client-side: ./benchmarks/results/ or custom ...@@ -283,7 +283,7 @@ results/ # Client-side: ./benchmarks/results/ or custom
│ └── avg_time_to_first_token_vs_concurrency.png │ └── avg_time_to_first_token_vs_concurrency.png
├── <your-benchmark-name>/ # Results for your benchmark (uses your custom name) ├── <your-benchmark-name>/ # Results for your benchmark (uses your custom name)
│ ├── c1/ # Concurrency level 1 │ ├── c1/ # Concurrency level 1
│ │ └── profile_export_genai_perf.json │ │ └── profile_export_aiperf.json
│ ├── c2/ # Concurrency level 2 │ ├── c2/ # Concurrency level 2
│ ├── c5/ # Concurrency level 5 │ ├── c5/ # Concurrency level 5
│ └── ... # Other concurrency levels (10, 50, 100, 250) │ └── ... # Other concurrency levels (10, 50, 100, 250)
...@@ -457,7 +457,7 @@ Results are stored in `/data/results` and follow the same structure as client-si ...@@ -457,7 +457,7 @@ Results are stored in `/data/results` and follow the same structure as client-si
/data/results/ /data/results/
└── <benchmark-name>/ # Results for your benchmark name └── <benchmark-name>/ # Results for your benchmark name
├── c1/ # Concurrency level 1 ├── c1/ # Concurrency level 1
│ └── profile_export_genai_perf.json │ └── profile_export_aiperf.json
├── c2/ # Concurrency level 2 ├── c2/ # Concurrency level 2
└── ... # Other concurrency levels └── ... # Other concurrency levels
``` ```
......
...@@ -56,11 +56,11 @@ Typically, the number of GPUs vs the performance follows the following pattern: ...@@ -56,11 +56,11 @@ Typically, the number of GPUs vs the performance follows the following pattern:
| 2 | 269 | 135 | 1.19x | | 2 | 269 | 135 | 1.19x |
| 4 | 578 | 144 | 1.28x | | 4 | 578 | 144 | 1.28x |
The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed ISL/OSL/concurrency tests using [GenAI-Perf](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf) and comparing the results with the SLA. The best number of GPUs to use in the prefill and decode engines can be determined by running a few fixed ISL/OSL/concurrency tests using [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main) and comparing the results with the SLA.
GenAI-Perf is pre-installed in the dynamo container. AIPerf is pre-installed in the dynamo container.
> [!Tip] > [!Tip]
> If you are unfamiliar with GenAI-Perf, please see this helpful [tutorial](https://github.com/triton-inference-server/perf_analyzer/blob/main/genai-perf/docs/tutorial.md) to get you started. > If you are unfamiliar with AIPerf, please see this helpful [tutorial](https://github.com/ai-dynamo/aiperf/blob/main/docs/tutorial.md) to get you started.
Besides the parallelization mapping, other common knobs to tune are maximum batch size, maximum number of tokens, and block size. Besides the parallelization mapping, other common knobs to tune are maximum batch size, maximum number of tokens, and block size.
For prefill engines, usually a small batch size and large `max_num_token` is preferred. For prefill engines, usually a small batch size and large `max_num_token` is preferred.
......
...@@ -54,4 +54,4 @@ curl localhost:8000/v1/chat/completions \ ...@@ -54,4 +54,4 @@ curl localhost:8000/v1/chat/completions \
"max_tokens": 30 "max_tokens": 30
}' }'
``` ```
You can also benchmark the performance of the endpoint with [GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html). You can also benchmark the performance of the endpoint with [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md).
...@@ -39,4 +39,4 @@ curl localhost:8000/v1/chat/completions \ ...@@ -39,4 +39,4 @@ curl localhost:8000/v1/chat/completions \
"max_tokens": 30 "max_tokens": 30
}' }'
``` ```
You can also benchmark the performance of the endpoint with [GenAI-Perf](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html). You can also benchmark the performance of the endpoint with [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md).
...@@ -80,7 +80,7 @@ While not implemented in this example, the router can also operate in a pure pre ...@@ -80,7 +80,7 @@ While not implemented in this example, the router can also operate in a pure pre
- Integrates with vLLM's OpenAI serving components for request preprocessing and response formatting - Integrates with vLLM's OpenAI serving components for request preprocessing and response formatting
### `perf.sh` ### `perf.sh`
- Benchmarking script using `genai-perf` to test the router setup - Benchmarking script using `aiperf` to test the router setup
- Configured for streaming chat completions with synthetic workloads - Configured for streaming chat completions with synthetic workloads
- Tests concurrent requests to evaluate routing performance - Tests concurrent requests to evaluate routing performance
......
...@@ -28,7 +28,7 @@ num_unique_prompts=10 ...@@ -28,7 +28,7 @@ num_unique_prompts=10
seed=42 seed=42
genai-perf profile \ aiperf profile \
--model ${model} \ --model ${model} \
--tokenizer ${model} \ --tokenizer ${model} \
--endpoint-type ${type} \ --endpoint-type ${type} \
...@@ -47,8 +47,6 @@ genai-perf profile \ ...@@ -47,8 +47,6 @@ genai-perf profile \
--request-count ${num_requests} \ --request-count ${num_requests} \
--num-dataset-entries ${num_unique_prompts} \ --num-dataset-entries ${num_unique_prompts} \
--random-seed ${seed} \ --random-seed ${seed} \
-- \
-v \ -v \
--max-threads 256 \
-H 'Authorization: Bearer NOT USED' \ -H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream' -H 'Accept: text/event-stream'
...@@ -215,10 +215,10 @@ When running deployment with sla-planner, to reduce the image pulling time, depl ...@@ -215,10 +215,10 @@ When running deployment with sla-planner, to reduce the image pulling time, depl
kubectl apply -f ./perf_test_configs/image_cache_daemonset.yaml -n <namespace> kubectl apply -f ./perf_test_configs/image_cache_daemonset.yaml -n <namespace>
``` ```
Then, port-forward or shell into the frontend pod and run GenAI-Perf to get the goodput: Then, port-forward or shell into the frontend pod and run AIPerf to get the goodput:
```bash ```bash
genai-perf profile \ aiperf profile \
--model nvidia/Llama-3.1-8B-Instruct-FP8 \ --model nvidia/Llama-3.1-8B-Instruct-FP8 \
--tokenizer nvidia/Llama-3.1-8B-Instruct-FP8 \ --tokenizer nvidia/Llama-3.1-8B-Instruct-FP8 \
--endpoint-type chat \ --endpoint-type chat \
...@@ -227,11 +227,11 @@ genai-perf profile \ ...@@ -227,11 +227,11 @@ genai-perf profile \
--input-file payload:/workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \ --input-file payload:/workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \
--fixed-schedule True \ --fixed-schedule True \
--goodput time_to_first_token:200 inter_token_latency:10 \ --goodput time_to_first_token:200 inter_token_latency:10 \
-- -v -max-threads 64 \ -v \
``` ```
> [!NOTE] > [!NOTE]
> Sometimes, when the SLA planner scales down the number of workers, a few requests will error out and cause GenAI-Perf to get stuck. We are aware of this issue and are working on fixing it. > Sometimes, when the SLA planner scales down the number of workers, a few requests will error out and cause AIPerf to get stuck. We are aware of this issue and are working on fixing it.
#### E2E Perf Test Results #### E2E Perf Test Results
......
...@@ -64,9 +64,9 @@ check_prerequisites() { ...@@ -64,9 +64,9 @@ check_prerequisites() {
exit 1 exit 1
fi fi
# Check for genai-perf # Check for aiperf
if ! command -v genai-perf &> /dev/null; then if ! command -v aiperf &> /dev/null; then
log_error "genai-perf not found. This tool is required for load generation." log_error "aiperf not found. This tool is required for load generation."
log_error "Please install the required dependencies by following the instructions in tests/planner/README.md" log_error "Please install the required dependencies by following the instructions in tests/planner/README.md"
exit 1 exit 1
fi fi
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
""" """
Load generation script for SLA planner scaling tests. Load generation script for SLA planner scaling tests.
This script uses genai-perf to generate load at specific request rates This script uses aiperf to generate load at specific request rates
to test the planner's scaling behavior. to test the planner's scaling behavior.
""" """
...@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) ...@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
class LoadGenerator: class LoadGenerator:
"""Generate load using genai-perf to test planner scaling.""" """Generate load using aiperf to test planner scaling."""
def __init__( def __init__(
self, self,
...@@ -40,12 +40,12 @@ class LoadGenerator: ...@@ -40,12 +40,12 @@ class LoadGenerator:
self.osl = osl self.osl = osl
self.save_results = save_results self.save_results = save_results
def _calculate_genai_perf_params( def _calculate_aiperf_params(
self, self,
req_per_sec: float, req_per_sec: float,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
Calculate genai-perf parameters to approximate desired request rate. Calculate aiperf parameters to approximate desired request rate.
Args: Args:
req_per_sec: Desired requests per second req_per_sec: Desired requests per second
...@@ -71,15 +71,15 @@ class LoadGenerator: ...@@ -71,15 +71,15 @@ class LoadGenerator:
Args: Args:
req_per_sec: Target requests per second req_per_sec: Target requests per second
duration_sec: Duration to generate load (seconds) duration_sec: Duration to generate load (seconds)
artifact_dir: Directory to store genai-perf artifacts artifact_dir: Directory to store aiperf artifacts
Returns: Returns:
Dictionary with load test results Dictionary with load test results
""" """
logger.info(f"Generating load: {req_per_sec} req/s for {duration_sec}s") logger.info(f"Generating load: {req_per_sec} req/s for {duration_sec}s")
# Calculate genai-perf parameters # Calculate aiperf parameters
params = self._calculate_genai_perf_params(req_per_sec) params = self._calculate_aiperf_params(req_per_sec)
logger.info(f"Using request_rate={params['request_rate']} req/s") logger.info(f"Using request_rate={params['request_rate']} req/s")
# Create artifact directory if not provided # Create artifact directory if not provided
...@@ -95,9 +95,9 @@ class LoadGenerator: ...@@ -95,9 +95,9 @@ class LoadGenerator:
f"Adjusted parameters: duration={duration_sec}s, request_count={request_count}" f"Adjusted parameters: duration={duration_sec}s, request_count={request_count}"
) )
# Build genai-perf command based on coworker's successful approach # Build aiperf command based on coworker's successful approach
cmd = [ cmd = [
"genai-perf", "aiperf",
"profile", "profile",
"--model", "--model",
self.model, self.model,
...@@ -124,10 +124,7 @@ class LoadGenerator: ...@@ -124,10 +124,7 @@ class LoadGenerator:
), # Generate reasonable dataset size ), # Generate reasonable dataset size
"--artifact-dir", "--artifact-dir",
artifact_dir, artifact_dir,
"--",
"-v", "-v",
"-max-threads",
"64",
] ]
logger.info(f"Running command: {' '.join(cmd)}") logger.info(f"Running command: {' '.join(cmd)}")
...@@ -135,7 +132,7 @@ class LoadGenerator: ...@@ -135,7 +132,7 @@ class LoadGenerator:
f"Expected duration: {duration_sec}s, timeout: {max(duration_sec * 2 + 120, int(duration_sec * 2.5))}s" f"Expected duration: {duration_sec}s, timeout: {max(duration_sec * 2 + 120, int(duration_sec * 2.5))}s"
) )
# Run genai-perf (async) # Run aiperf (async)
start_time = time.time() start_time = time.time()
# More generous timeout for high-load tests - allow 2x duration + 2 minutes buffer # More generous timeout for high-load tests - allow 2x duration + 2 minutes buffer
timeout = max(duration_sec * 2 + 120, int(duration_sec * 2.5)) timeout = max(duration_sec * 2 + 120, int(duration_sec * 2.5))
...@@ -152,7 +149,7 @@ class LoadGenerator: ...@@ -152,7 +149,7 @@ class LoadGenerator:
except asyncio.TimeoutError: except asyncio.TimeoutError:
proc.kill() proc.kill()
await proc.communicate() await proc.communicate()
logger.error("genai-perf timed out") logger.error("aiperf timed out")
raise RuntimeError("Load generation timed out") raise RuntimeError("Load generation timed out")
end_time = time.time() end_time = time.time()
...@@ -160,13 +157,9 @@ class LoadGenerator: ...@@ -160,13 +157,9 @@ class LoadGenerator:
# Persist logs for debugging # Persist logs for debugging
try: try:
with open( with open(os.path.join(artifact_dir, "aiperf.stdout.log"), "wb") as f:
os.path.join(artifact_dir, "genai_perf.stdout.log"), "wb"
) as f:
f.write(stdout or b"") f.write(stdout or b"")
with open( with open(os.path.join(artifact_dir, "aiperf.stderr.log"), "wb") as f:
os.path.join(artifact_dir, "genai_perf.stderr.log"), "wb"
) as f:
f.write(stderr or b"") f.write(stderr or b"")
except Exception: except Exception:
pass pass
...@@ -174,31 +167,31 @@ class LoadGenerator: ...@@ -174,31 +167,31 @@ class LoadGenerator:
if proc.returncode == 0: if proc.returncode == 0:
logger.info("Load generation completed successfully") logger.info("Load generation completed successfully")
logger.info(f"Actual duration: {actual_duration:.2f}s") logger.info(f"Actual duration: {actual_duration:.2f}s")
results = self._parse_genai_perf_results(artifact_dir) results = self._parse_aiperf_results(artifact_dir)
results.update( results.update(
{ {
"requested_req_per_sec": req_per_sec, "requested_req_per_sec": req_per_sec,
"actual_duration": actual_duration, "actual_duration": actual_duration,
"target_duration": duration_sec, "target_duration": duration_sec,
"genai_perf_params": params, "aiperf_params": params,
"artifact_dir": artifact_dir, "artifact_dir": artifact_dir,
"success": True, "success": True,
} }
) )
return results return results
else: else:
logger.error(f"genai-perf failed with return code {proc.returncode}") logger.error(f"aiperf failed with return code {proc.returncode}")
raise RuntimeError("genai-perf failed; see logs in artifact dir") raise RuntimeError("aiperf failed; see logs in artifact dir")
except RuntimeError: except RuntimeError:
raise raise
except Exception as e: except Exception as e:
logger.error(f"genai-perf execution error: {e}") logger.error(f"aiperf execution error: {e}")
raise raise
def _parse_genai_perf_results(self, artifact_dir: str) -> Dict[str, Any]: def _parse_aiperf_results(self, artifact_dir: str) -> Dict[str, Any]:
"""Parse genai-perf results from artifact directory.""" """Parse aiperf results from artifact directory."""
try: try:
# Look for the profile_export_genai_perf.json file # Look for the profile_export_aiperf.json file
json_files = [f for f in os.listdir(artifact_dir) if f.endswith(".json")] json_files = [f for f in os.listdir(artifact_dir) if f.endswith(".json")]
if not json_files: if not json_files:
logger.warning("No JSON results found in artifact directory") logger.warning("No JSON results found in artifact directory")
...@@ -207,7 +200,7 @@ class LoadGenerator: ...@@ -207,7 +200,7 @@ class LoadGenerator:
# Main results file # Main results file
results_file = None results_file = None
for json_file in json_files: for json_file in json_files:
if "profile_export" in json_file or "genai_perf" in json_file: if "profile_export" in json_file or "aiperf" in json_file:
results_file = os.path.join(artifact_dir, json_file) results_file = os.path.join(artifact_dir, json_file)
break break
...@@ -236,7 +229,7 @@ class LoadGenerator: ...@@ -236,7 +229,7 @@ class LoadGenerator:
).get("avg", 0), ).get("avg", 0),
} }
) )
if not results and "profile_export_genai_perf" in data: if not results and "profile_export_aiperf" in data:
summary = data.get("summary", {}) summary = data.get("summary", {})
results.update( results.update(
{ {
...@@ -250,7 +243,7 @@ class LoadGenerator: ...@@ -250,7 +243,7 @@ class LoadGenerator:
return results return results
except Exception as e: except Exception as e:
logger.warning(f"Failed to parse genai-perf results: {e}") logger.warning(f"Failed to parse aiperf results: {e}")
return {} return {}
async def run_scaling_test(self) -> Dict[str, Any]: async def run_scaling_test(self) -> Dict[str, Any]:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment