Unverified commit 333ee983, authored by Anthony Casagrande, committed by GitHub
Browse files

fix: update invalid AIPerf scripts and parsing logic (#3675)


Signed-off-by: Anthony Casagrande <acasagrande@nvidia.com>
parent 2f793b45
...@@ -235,9 +235,8 @@ for concurrency in "${concurrency_array[@]}"; do ...@@ -235,9 +235,8 @@ for concurrency in "${concurrency_array[@]}"; do
--num-dataset-entries $(($concurrency*12)) \ --num-dataset-entries $(($concurrency*12)) \
--random-seed 100 \ --random-seed 100 \
--artifact-dir ${artifact_dir} \ --artifact-dir ${artifact_dir} \
-- \ --ui simple \
-v \ -v \
--max-threads ${concurrency} \
-H 'Authorization: Bearer NOT USED' \ -H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream' -H 'Accept: text/event-stream'
......
...@@ -253,7 +253,7 @@ async def run_profile(args): ...@@ -253,7 +253,7 @@ async def run_profile(args):
base_url=base_url, base_url=base_url,
) )
if aiperf_result is not None: if aiperf_result is not None:
ttft = aiperf_result["records"]["ttft"]["avg"] ttft = aiperf_result["time_to_first_token"]["avg"]
logger.info("Cleaning up deployment...") logger.info("Cleaning up deployment...")
await client.delete_deployment() await client.delete_deployment()
...@@ -432,11 +432,9 @@ async def run_profile(args): ...@@ -432,11 +432,9 @@ async def run_profile(args):
base_url=base_url, base_url=base_url,
) )
if aiperf_result is not None: if aiperf_result is not None:
itl = aiperf_result["records"]["inter_token_latency"]["avg"] itl = aiperf_result["inter_token_latency"]["avg"]
thpt_per_gpu = ( thpt_per_gpu = (
aiperf_result["records"]["output_token_throughput"][ aiperf_result["output_token_throughput"]["avg"]
"avg"
]
/ num_gpus / num_gpus
) )
......
...@@ -124,10 +124,8 @@ def profile_decode( ...@@ -124,10 +124,8 @@ def profile_decode(
base_url=url, base_url=url,
) )
if aiperf_result is not None: if aiperf_result is not None:
itl = aiperf_result["records"]["inter_token_latency"]["avg"] itl = aiperf_result["inter_token_latency"]["avg"]
thpt_per_gpu = ( thpt_per_gpu = aiperf_result["output_token_throughput"]["avg"] / num_gpus
aiperf_result["records"]["output_token_throughput"]["avg"] / num_gpus
)
return itl, thpt_per_gpu return itl, thpt_per_gpu
return None, None return None, None
......
...@@ -90,7 +90,7 @@ def profile_prefill( ...@@ -90,7 +90,7 @@ def profile_prefill(
base_url=url, base_url=url,
) )
if aiperf_result is not None: if aiperf_result is not None:
return aiperf_result["records"]["ttft"]["avg"] return aiperf_result["time_to_first_token"]["avg"]
return None return None
return _profile_prefill_helper( return _profile_prefill_helper(
......
...@@ -232,7 +232,7 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli ...@@ -232,7 +232,7 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli
> [!Note] > [!Note]
> At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to loadgen on the trace files: > At the time of writing this documentation, you may need to install the latest aiperf from the main source branch to loadgen on the trace files:
> ```bash > ```bash
> pip install git+https://github.com/ai-dynamo/aiperf.git#subdirectory=aiperf > pip install git+https://github.com/ai-dynamo/aiperf.git
> ``` > ```
> However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is. > However, by the time of release, the aiperf version included in the vLLM runtime container should be up to date enough to use as-is.
......
...@@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0 ...@@ -5,7 +5,7 @@ SPDX-License-Identifier: Apache-2.0
# Sinusoidal Load Generator # Sinusoidal Load Generator
`sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main/aiperf). `sin_synth.py` is a simple script to generate synthetic load with sinusoidal request rate and isl/osl ratio. The output is in [mooncake-style](https://github.com/kvcache-ai/Mooncake) jsonl format, which can be directly used in [AIPerf](https://github.com/ai-dynamo/aiperf).
## Usage ## Usage
......
...@@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" ...@@ -404,7 +404,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
### Performance Testing with AIPerf ### Performance Testing with AIPerf
The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/blob/main/README.md), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment. The Dynamo container includes [AIPerf](https://github.com/ai-dynamo/aiperf/tree/main?tab=readme-ov-file#aiperf), NVIDIA's tool for benchmarking generative AI models. This tool helps measure throughput, latency, and other performance metrics for your deployment.
**Run the following benchmark from inside the container** (after completing the deployment steps above): **Run the following benchmark from inside the container** (after completing the deployment steps above):
......
...@@ -57,19 +57,20 @@ spec: ...@@ -57,19 +57,20 @@ spec:
aiperf profile --artifact-dir $ARTIFACT_DIR \ aiperf profile --artifact-dir $ARTIFACT_DIR \
--model $TARGET_MODEL \ --model $TARGET_MODEL \
--tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \ --tokenizer /model-cache/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a \
--endpoint-type chat --endpoint /v1/chat/completions \ --endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \ --streaming \
--url http://$ENDPOINT \ --url http://$ENDPOINT \
--synthetic-input-tokens-mean $isl \ --synthetic-input-tokens-mean $isl \
--synthetic-input-tokens-stddev 0 \ --synthetic-input-tokens-stddev 0 \
--output-tokens-mean $osl \ --output-tokens-mean $osl \
--output-tokens-stddev 0 \ --output-tokens-stddev 0 \
--extra-inputs "{\"max_tokens\":$osl}" \ --extra-inputs "max_tokens:$osl" \
--extra-inputs "{\"min_tokens\":$osl}" \ --extra-inputs "min_tokens:$osl" \
--extra-inputs "{\"ignore_eos\":true}" \ --extra-inputs "ignore_eos:true" \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \ --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--extra-inputs "{\"repetition_penalty\":1.0}" \ --extra-inputs "repetition_penalty:1.0" \
--extra-inputs "{\"temperature\": 0.0}" \ --extra-inputs "temperature: 0.0" \
--concurrency $concurrency \ --concurrency $concurrency \
--request-count $((10*concurrency)) \ --request-count $((10*concurrency)) \
--warmup-request-count $concurrency \ --warmup-request-count $concurrency \
......
...@@ -49,7 +49,8 @@ spec: ...@@ -49,7 +49,8 @@ spec:
aiperf profile --artifact-dir $ARTIFACT_DIR \ aiperf profile --artifact-dir $ARTIFACT_DIR \
--model $TARGET_MODEL \ --model $TARGET_MODEL \
--tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --endpoint /v1/chat/completions \ --endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \ --streaming \
--url http://$ENDPOINT \ --url http://$ENDPOINT \
--synthetic-input-tokens-mean $isl \ --synthetic-input-tokens-mean $isl \
......
...@@ -49,7 +49,8 @@ spec: ...@@ -49,7 +49,8 @@ spec:
aiperf profile --artifact-dir $ARTIFACT_DIR \ aiperf profile --artifact-dir $ARTIFACT_DIR \
--model $TARGET_MODEL \ --model $TARGET_MODEL \
--tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --endpoint /v1/chat/completions \ --endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \ --streaming \
--url http://$ENDPOINT \ --url http://$ENDPOINT \
--synthetic-input-tokens-mean $isl \ --synthetic-input-tokens-mean $isl \
......
...@@ -49,7 +49,8 @@ spec: ...@@ -49,7 +49,8 @@ spec:
aiperf profile --artifact-dir $ARTIFACT_DIR \ aiperf profile --artifact-dir $ARTIFACT_DIR \
--model $TARGET_MODEL \ --model $TARGET_MODEL \
--tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \ --tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat --endpoint /v1/chat/completions \ --endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \ --streaming \
--url http://$ENDPOINT \ --url http://$ENDPOINT \
--synthetic-input-tokens-mean $isl \ --synthetic-input-tokens-mean $isl \
......
...@@ -383,27 +383,19 @@ def log_summary_metrics( ...@@ -383,27 +383,19 @@ def log_summary_metrics(
with open(profile_json) as f: with open(profile_json) as f:
metrics = json.load(f) metrics = json.load(f)
# Extract key metrics from AI-Perf format # Request count
records = metrics.get("records", {}) request_count = int(metrics.get("request_count", {}).get("avg", 0))
# Request count from request_count record
request_count_record = records.get("request_count", {})
request_count = (
int(request_count_record.get("avg", 0)) if request_count_record else 0
)
# Check for errors # Check for errors
error_summary = metrics.get("error_summary", []) error_count = len(metrics.get("error_summary", []))
error_count = len(error_summary)
# Latency metrics (in milliseconds) # Latency metrics (in milliseconds)
request_latency = records.get("request_latency", {}) request_latency = metrics.get("request_latency", {})
avg_latency = request_latency.get("avg", 0) / 1000.0 # Convert to seconds avg_latency = request_latency.get("avg", 0) / 1000.0 # Convert to seconds
p99_latency = request_latency.get("p99", 0) / 1000.0 # Convert to seconds p99_latency = request_latency.get("p99", 0) / 1000.0 # Convert to seconds
# Throughput metrics # Throughput metrics
request_throughput = records.get("request_throughput", {}) throughput = metrics.get("request_throughput", {}).get("avg", 0)
throughput = request_throughput.get("avg", 0)
# Log summary # Log summary
logger.info( logger.info(
...@@ -417,7 +409,7 @@ def log_summary_metrics( ...@@ -417,7 +409,7 @@ def log_summary_metrics(
# Log success rate # Log success rate
if request_count > 0: if request_count > 0:
success_rate = (request_count - error_count) / request_count * 100 success_rate = ((request_count - error_count) / request_count) * 100
logger.info(f"Success rate: {success_rate:.1f}%") logger.info(f"Success rate: {success_rate:.1f}%")
# Also write summary to CSV file for aggregation # Also write summary to CSV file for aggregation
......
...@@ -293,63 +293,44 @@ def parse_aiperf_client_results(log_dir: str) -> Dict[str, Any]: ...@@ -293,63 +293,44 @@ def parse_aiperf_client_results(log_dir: str) -> Dict[str, Any]:
with open(profile_json) as f: with open(profile_json) as f:
client_metrics = json.load(f) client_metrics = json.load(f)
# AI-Perf format has "records" dictionary at the top level # Extract request count (this is the total successful requests made)
records = client_metrics.get("records", {}) request_count = int(
client_metrics.get("request_count", {}).get("avg", 0)
# Extract request count (this is the total requests made)
request_count_record = records.get("request_count", {})
request_count = (
int(request_count_record.get("avg", 0))
if request_count_record
else 0
) )
# Check for errors in error_summary # Check for errors in error_summary
error_summary = client_metrics.get("error_summary", []) error_count = len(client_metrics.get("error_summary", []))
error_count = len(error_summary)
# Check if test was cancelled # Check if test was cancelled
was_cancelled = client_metrics.get("was_cancelled", False) if client_metrics.get("was_cancelled", False):
if was_cancelled:
error_count = request_count # Mark all as failed if cancelled error_count = request_count # Mark all as failed if cancelled
all_metrics["total_requests"] += request_count all_metrics["total_requests"] += request_count
all_metrics["successful_requests"] += request_count - error_count all_metrics["successful_requests"] += request_count - error_count
all_metrics["failed_requests"] += error_count all_metrics["failed_requests"] += error_count
# Extract latency from request_latency record # Extract latency metrics
request_latency = records.get("request_latency", {}) request_latency = client_metrics.get("request_latency", None)
if request_latency: if request_latency:
# Convert milliseconds to seconds for consistency all_metrics["latencies"].append(request_latency["avg"] / 1000.0)
if "avg" in request_latency: all_metrics["p50_latencies"].append(request_latency["p50"] / 1000.0)
all_metrics["latencies"].append(request_latency["avg"] / 1000.0) all_metrics["p90_latencies"].append(request_latency["p90"] / 1000.0)
if "p50" in request_latency: all_metrics["p99_latencies"].append(request_latency["p99"] / 1000.0)
all_metrics["p50_latencies"].append(
request_latency["p50"] / 1000.0 # Time to first token
) ttft = client_metrics.get("time_to_first_token", {}).get("avg", None)
if "p90" in request_latency: if ttft:
all_metrics["p90_latencies"].append( all_metrics["ttft"].append(ttft / 1000.0) # Convert ms to s
request_latency["p90"] / 1000.0
) # Inter-token latency
if "p99" in request_latency: itl = client_metrics.get("inter_token_latency", {}).get("avg", None)
all_metrics["p99_latencies"].append( if itl:
request_latency["p99"] / 1000.0 all_metrics["itl"].append(itl / 1000.0) # Convert ms to s
)
# Time to first token (if available in records)
ttft = records.get("time_to_first_token", {}) or records.get("ttft", {})
if ttft and "avg" in ttft:
all_metrics["ttft"].append(ttft["avg"] / 1000.0) # Convert ms to s
# Inter-token latency (if available in records)
itl = records.get("inter_token_latency", {}) or records.get("itl", {})
if itl and "avg" in itl:
all_metrics["itl"].append(itl["avg"] / 1000.0) # Convert ms to s
# Throughput from request_throughput record # Throughput from request_throughput record
request_throughput = records.get("request_throughput", {}) req_throughput = client_metrics.get("request_throughput", {}).get(
req_throughput = request_throughput.get("avg", 0) "avg", 0
)
if req_throughput: if req_throughput:
all_metrics["throughputs"].append(req_throughput) all_metrics["throughputs"].append(req_throughput)
......
...@@ -227,7 +227,7 @@ aiperf profile \ ...@@ -227,7 +227,7 @@ aiperf profile \
--input-file payload:/workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \ --input-file payload:/workspace/rr-5-45_i3000o300.jsonl \ # path to the generated load dataset \
--fixed-schedule True \ --fixed-schedule True \
--goodput time_to_first_token:200 inter_token_latency:10 \ --goodput time_to_first_token:200 inter_token_latency:10 \
-v \ -v
``` ```
> [!NOTE] > [!NOTE]
......
...@@ -116,8 +116,6 @@ class LoadGenerator: ...@@ -116,8 +116,6 @@ class LoadGenerator:
str(params["request_rate"]), str(params["request_rate"]),
"--request-count", "--request-count",
str(request_count), # Use request count to limit test duration str(request_count), # Use request count to limit test duration
"--stability-percentage",
"50",
"--num-dataset-entries", "--num-dataset-entries",
str( str(
max(20, int(params["request_rate"] * 10)) max(20, int(params["request_rate"] * 10))
...@@ -210,35 +208,16 @@ class LoadGenerator: ...@@ -210,35 +208,16 @@ class LoadGenerator:
logger.info(f"Parsing results from: {results_file}") logger.info(f"Parsing results from: {results_file}")
with open(results_file, "r") as f: with open(results_file, "r") as f:
data = json.load(f) metrics = json.load(f)
results = {} results = {
if "experiments" in data and data["experiments"]: "throughput": metrics.get("output_token_throughput", {}).get("avg", 0),
exp = data["experiments"][0] "ttft_mean": metrics.get("time_to_first_token", {}).get("avg", 0),
if "perf_metrics" in exp: "itl_mean": metrics.get("inter_token_latency", {}).get("avg", 0),
metrics = exp["perf_metrics"] "end_to_end_latency_mean": metrics.get("request_latency", {}).get(
results.update( "avg", 0
{ ),
"throughput": metrics.get("throughput", {}).get("avg", 0), }
"ttft_mean": metrics.get("ttft", {}).get("avg", 0),
"itl_mean": metrics.get("inter_token_latency", {}).get(
"avg", 0
),
"end_to_end_latency_mean": metrics.get(
"request_latency", {}
).get("avg", 0),
}
)
if not results and "profile_export_aiperf" in data:
summary = data.get("summary", {})
results.update(
{
"throughput": summary.get("throughput", 0),
"ttft_mean": summary.get("time_to_first_token_ms", 0),
"itl_mean": summary.get("inter_token_latency_ms", 0),
}
)
logger.info(f"Parsed results: {results}") logger.info(f"Parsed results: {results}")
return results return results
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment