"...ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "704c1dad84a97ce045a7819db9c0f816ced5b180"
Unverified Commit fc5ddd2f authored by Jacky's avatar Jacky Committed by GitHub
Browse files

docs: Benchmarking guide interpreting results (#701)


Co-authored-by: default avatarZiqi Fan <ziqif@nvidia.com>
parent d0d364e3
......@@ -14,7 +14,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
model=neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
# Parse command line arguments
model="neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic"
url="http://localhost:8000"
mode="aggregated"
artifacts_root_dir="artifacts_root"
deployment_kind="dynamo"
# Input Sequence Length (isl) 3000 and Output Sequence Length (osl) 150 are
# selected for chat use case. Note that for other use cases, the results and
......@@ -22,6 +28,135 @@ model=neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
isl=3000
osl=150
tp=0
dp=0
prefill_tp=0
prefill_dp=0
decode_tp=0
decode_dp=0
# The defaults can be overridden by command line arguments.

#######################################
# Print the supported command line options.
# Outputs: writes the usage text to stdout
#######################################
print_usage() {
  printf 'Usage: %s [OPTIONS]\n' "${0##*/}"
  cat << 'USAGE'

Options (every option except --help takes exactly one value):
  --mode                          aggregated | disaggregated
  --deployment-kind               recorded as "kind" in deployment_config.json
  --model                         model name
  --url                           URL of the service being benchmarked
  --input-sequence-length         input sequence length (isl)
  --output-sequence-length        output sequence length (osl)
  --artifacts-root-dir            directory that collects the artifacts_<N> runs
  --tensor-parallelism            aggregated mode
  --data-parallelism              aggregated mode
  --prefill-tensor-parallelism    disaggregated mode
  --prefill-data-parallelism      disaggregated mode
  --decode-tensor-parallelism     disaggregated mode
  --decode-data-parallelism       disaggregated mode
  -h, --help                      show this help and exit
USAGE
}

#######################################
# Parse "--option value" pairs into the script's configuration variables.
# Globals:   tp, dp, prefill_tp, prefill_dp, decode_tp, decode_dp, model,
#            isl, osl, url, mode, artifacts_root_dir, deployment_kind (written)
# Arguments: the script's command line ("$@")
# Returns:   exits 0 after --help; exits 1 on an unknown option or when an
#            option is given without a value
#######################################
parse_args() {
  local target
  while [[ $# -gt 0 ]]; do
    # Map each option to the name of the variable it overrides.
    case "$1" in
      -h | --help)
        print_usage
        exit 0
        ;;
      --tensor-parallelism) target="tp" ;;
      --data-parallelism) target="dp" ;;
      --prefill-tensor-parallelism) target="prefill_tp" ;;
      --prefill-data-parallelism) target="prefill_dp" ;;
      --decode-tensor-parallelism) target="decode_tp" ;;
      --decode-data-parallelism) target="decode_dp" ;;
      --model) target="model" ;;
      --input-sequence-length) target="isl" ;;
      --output-sequence-length) target="osl" ;;
      --url) target="url" ;;
      --mode) target="mode" ;;
      --artifacts-root-dir) target="artifacts_root_dir" ;;
      --deployment-kind) target="deployment_kind" ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
    # "shift 2" fails WITHOUT shifting when only one argument is left, which
    # would make this loop spin forever; reject the dangling option instead.
    if [[ $# -lt 2 ]]; then
      echo "Missing value for option: $1" >&2
      exit 1
    fi
    printf -v "${target}" '%s' "$2"
    shift 2
  done
}

parse_args "$@"
# Validate the parallelism flags required by the selected mode and echo the
# configuration that will be recorded for this run.
# A value of 0 means "not provided". Every value required by the mode must be
# non-zero: plot_pareto.py multiplies tensor and data parallelism to derive the
# GPU count, so a single forgotten flag would silently yield 0 GPUs.
if [[ "${mode}" == "aggregated" ]]; then
  if [[ "${tp}" == "0" || "${dp}" == "0" ]]; then
    echo "--tensor-parallelism and --data-parallelism must be set for aggregated mode." >&2
    exit 1
  fi
  echo "Starting benchmark for the deployment service with the following configuration:"
  echo " - Tensor Parallelism: ${tp}"
  echo " - Data Parallelism: ${dp}"
  parallelism_summary="tensor_parallelism=${tp} and data_parallelism=${dp}"
elif [[ "${mode}" == "disaggregated" ]]; then
  if [[ "${prefill_tp}" == "0" || "${prefill_dp}" == "0" || "${decode_tp}" == "0" || "${decode_dp}" == "0" ]]; then
    echo "--prefill-tensor-parallelism, --prefill-data-parallelism, --decode-tensor-parallelism and --decode-data-parallelism must be set for disaggregated mode." >&2
    exit 1
  fi
  echo "Starting benchmark for the deployment service with the following configuration:"
  echo " - Prefill Tensor Parallelism: ${prefill_tp}"
  echo " - Prefill Data Parallelism: ${prefill_dp}"
  echo " - Decode Tensor Parallelism: ${decode_tp}"
  echo " - Decode Data Parallelism: ${decode_dp}"
  parallelism_summary="prefill_tensor_parallelism=${prefill_tp}, prefill_data_parallelism=${prefill_dp}, decode_tensor_parallelism=${decode_tp} and decode_data_parallelism=${decode_dp}"
else
  echo "Unknown mode: ${mode}. Only 'aggregated' and 'disaggregated' modes are supported." >&2
  exit 1
fi

# The values are taken on trust; report the ones that apply to the chosen mode
# so the warning never shows the unused (zero) settings of the other mode.
echo "--------------------------------"
echo "WARNING: This script does not validate ${parallelism_summary} settings."
echo " The user is responsible for ensuring these match the deployment configuration being benchmarked."
echo " Incorrect settings may lead to misleading benchmark results."
echo "--------------------------------"
# Make sure the root directory that collects all benchmark runs is present.
[ -d "${artifacts_root_dir}" ] || mkdir -p "${artifacts_root_dir}"

# Probe artifacts_0, artifacts_1, ... until an unused index turns up.
index=0
until [ ! -d "${artifacts_root_dir}/artifacts_${index}" ]; do
  index=$((index + 1))
done

# Reserve that slot for the current run.
artifact_dir="${artifacts_root_dir}/artifacts_${index}"
mkdir -p "${artifact_dir}"

# When earlier runs are present, show the deployment config each of them
# recorded so the user can see what this run will sit next to.
if ((index > 0)); then
  echo "--------------------------------"
  echo "WARNING: Found ${index} existing artifacts directories:"
  i=0
  while ((i < index)); do
    previous_config="${artifacts_root_dir}/artifacts_${i}/deployment_config.json"
    if [ -f "${previous_config}" ]; then
      echo "artifacts_${i}:"
      cat "${previous_config}"
      echo "--------------------------------"
    fi
    i=$((i + 1))
  done
  echo "Creating new artifacts directory: artifacts_${index}"
  echo "--------------------------------"
fi
# Concurrency levels to test
for concurrency in 1 2 4 8 16 32 64 128 256; do
......@@ -33,7 +168,7 @@ for concurrency in 1 2 4 8 16 32 64 128 256; do
--endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \
--url http://localhost:8000 \
--url ${url} \
--synthetic-input-tokens-mean ${isl} \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean ${osl} \
......@@ -47,6 +182,7 @@ for concurrency in 1 2 4 8 16 32 64 128 256; do
--warmup-request-count $(($concurrency*2)) \
--num-dataset-entries $(($concurrency*12)) \
--random-seed 100 \
--artifact-dir ${artifact_dir} \
-- \
-v \
--max-threads 256 \
......@@ -54,3 +190,31 @@ for concurrency in 1 2 4 8 16 32 64 128 256; do
-H 'Accept: text/event-stream'
done
# Record what was benchmarked: the JSON below holds the details of the OAI
# service under test, exactly as they were supplied to this script.
deployment_config="$(
  cat << EOF
{
"kind": "${deployment_kind}",
"model": "${model}",
"input_sequence_length": ${isl},
"output_sequence_length": ${osl},
"tensor_parallelism": ${tp},
"data_parallelism": ${dp},
"prefill_tensor_parallelism": ${prefill_tp},
"prefill_data_parallelism": ${prefill_dp},
"decode_tensor_parallelism": ${decode_tp},
"decode_data_parallelism": ${decode_dp},
"mode": "${mode}"
}
EOF
)"

# Store the config next to the benchmark artifacts, replacing any older copy.
config_path="${artifact_dir}/deployment_config.json"
mkdir -p "${artifact_dir}"
if [[ -f "${config_path}" ]]; then
  echo "Deployment configuration already exists. Overwriting..."
  rm -f "${config_path}"
fi
printf '%s\n' "${deployment_config}" > "${config_path}"
echo "Benchmarking Successful!!!"
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import MultipleLocator
def get_json_paths(search_paths):
    """Collect GenAI-Perf profile exports and their deployment configs.

    Every directory in ``search_paths`` must contain a
    ``deployment_config.json``. The directory is walked recursively and each
    ``profile_export_genai_perf.json`` found is paired with that directory's
    deployment config.

    Args:
        search_paths: iterable of directories to search.

    Returns:
        Tuple ``(profile_paths, config_paths)`` of equal-length lists, where
        ``config_paths[i]`` is the deployment config for ``profile_paths[i]``.

    Raises:
        Exception: if a search path has no ``deployment_config.json``.
    """
    profile_paths = []
    config_paths = []
    for search_path in search_paths:
        config_path = os.path.join(search_path, "deployment_config.json")
        if not os.path.exists(config_path):
            raise Exception(f"deployment_config.json not found in {search_path}")
        found = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(search_path)
            for name in names
            if name == "profile_export_genai_perf.json"
        ]
        profile_paths.extend(found)
        # One config entry per profile keeps the two lists aligned.
        config_paths.extend([config_path] * len(found))
    return profile_paths, config_paths
def parse_concurrency(name):
    """Return the concurrency encoded as ``-concurrency<number>`` in ``name``.

    Args:
        name: string (here, a file path) expected to contain exactly one
            ``-concurrency<number>`` marker.

    Returns:
        The number as an ``int``.

    Raises:
        Exception: if the marker is absent or appears more than once.
    """
    found = re.findall(r"-concurrency(\d+)", name)
    if len(found) != 1:
        raise Exception(f"non-unique matches: {found}")
    (only_match,) = found
    return int(only_match)
def parse_gpus(deployment_config_json_path):
    """Compute the number of GPUs described by a deployment config file.

    For ``"mode": "aggregated"`` the result is
    ``tensor_parallelism * data_parallelism``. For any other mode it is
    ``prefill_tensor_parallelism * prefill_data_parallelism +
    decode_tensor_parallelism * decode_data_parallelism``.

    Args:
        deployment_config_json_path: path to a ``deployment_config.json``.

    Returns:
        The GPU count derived from the parallelism settings.
    """
    with open(deployment_config_json_path, "r") as f:
        config = json.load(f)

    if config.get("mode") != "aggregated":
        prefill_gpus = config.get("prefill_tensor_parallelism") * config.get(
            "prefill_data_parallelism"
        )
        decode_gpus = config.get("decode_tensor_parallelism") * config.get(
            "decode_data_parallelism"
        )
        return prefill_gpus + decode_gpus

    return config.get("tensor_parallelism") * config.get("data_parallelism")
def parse_kind_and_mode(deployment_config_json_path):
    """Read the ``kind`` and ``mode`` entries of a deployment config file.

    Args:
        deployment_config_json_path: path to a ``deployment_config.json``.

    Returns:
        Tuple ``(kind, mode)``; an entry missing from the file is ``None``.
    """
    with open(deployment_config_json_path, "r") as f:
        config = json.load(f)
    kind = config.get("kind")
    mode = config.get("mode")
    return kind, mode
def extract_val_and_concurrency(
    genai_perf_profile_export_json_paths, deployment_config_json_paths, stat_value="avg"
):
    """Build one result record per GenAI-Perf profile export.

    The two path lists are consumed pairwise: each profile export is combined
    with the deployment config at the same position.

    Args:
        genai_perf_profile_export_json_paths: paths to
            ``profile_export_genai_perf.json`` files.
        deployment_config_json_paths: matching ``deployment_config.json`` paths.
        stat_value: statistic read for the per-user throughput, time to first
            token and inter-token latency (e.g. ``"avg"``).

    Returns:
        List of dicts holding the configuration path, kind, mode, GPU count,
        concurrency and the extracted metrics. Per-GPU values are ``0.0`` when
        the GPU count is not positive or the underlying metric is missing.
    """

    def per_gpu(value, gpus):
        # Guard against num_gpus=0 (division by zero) and absent metrics.
        if gpus > 0 and value is not None:
            return value / gpus
        return 0.0

    results = []
    path_pairs = zip(genai_perf_profile_export_json_paths, deployment_config_json_paths)
    for profile_path, config_path in path_pairs:
        with open(profile_path, "r") as f:
            data = json.load(f)

        def metric(name, stat):
            return data.get(name, {}).get(stat)

        # output_token_throughput and request_throughput contain only avg
        output_token_throughput = metric("output_token_throughput", "avg")
        output_token_throughput_per_user = metric(
            "output_token_throughput_per_user", stat_value
        )
        time_to_first_token = metric("time_to_first_token", stat_value)
        inter_token_latency = metric("inter_token_latency", stat_value)
        request_throughput = metric("request_throughput", "avg")

        concurrency = parse_concurrency(profile_path)
        num_gpus = parse_gpus(config_path)
        kind, mode = parse_kind_and_mode(config_path)

        record = {
            "configuration": profile_path,
            "kind": kind,
            "mode": mode,
            "num_gpus": num_gpus,
            "concurrency": float(concurrency),
            "output_token_throughput_avg": output_token_throughput,
            f"output_token_throughput_per_user_{stat_value}": output_token_throughput_per_user,
            "output_token_throughput_per_gpu_avg": per_gpu(
                output_token_throughput, num_gpus
            ),
            f"time_to_first_token_{stat_value}": time_to_first_token,
            f"inter_token_latency_{stat_value}": inter_token_latency,
            "request_throughput_per_gpu_avg": per_gpu(request_throughput, num_gpus),
        }
        results.append(record)
    return results
def create_pareto_graph(results, title="", stat_value="avg"):
    """Plot per-user throughput against per-GPU throughput with Pareto frontiers.

    Results are grouped by ``"<kind>_<mode>"``. For every group all points are
    drawn as a scatter series and the group's Pareto-efficient points are
    connected with a dashed line.

    Side effects (both written to the current working directory):
        * ``results.csv`` (``results_<stat_value>.csv`` when ``stat_value`` is
          not ``"avg"``) with the plotted data and an ``is_pareto_efficient``
          flag per row.
        * ``pareto_plot.png`` (``pareto_plot_<stat_value>.png`` when
          ``stat_value`` is not ``"avg"``).

    Args:
        results: list of dicts as returned by ``extract_val_and_concurrency``.
        title: text appended to ``"Pareto - "`` in the plot title.
        stat_value: statistic suffix of the per-user throughput, time to first
            token and inter-token latency keys to read from ``results``.
    """
    x_key = f"output_token_throughput_per_user_{stat_value}"
    y_key = "output_token_throughput_per_gpu_avg"

    data_points = [
        {
            "label": f"{result['kind']}_{result['mode']}",
            "configuration": result["configuration"],
            "concurrency": float(result["concurrency"]),
            x_key: result[x_key],
            y_key: result[y_key],
            f"time_to_first_token_{stat_value}": result[
                f"time_to_first_token_{stat_value}"
            ],
            f"inter_token_latency_{stat_value}": result[
                f"inter_token_latency_{stat_value}"
            ],
            "is_pareto_efficient": False,
        }
        for result in results
    ]

    df = pd.DataFrame(data_points)

    def pareto_efficient(ids, points):
        """
        Mark Pareto-efficient points in ``df`` and return them as an array.
        A point p is dominated if there's another q such that q is >= p in
        all dimensions AND strictly > p in at least one. The strict part
        matters: without it two identical points would dominate each other
        and both would vanish from the frontier.
        """
        points = np.array(points)
        pareto_points = []
        for i, (point_id, point) in enumerate(zip(ids, points)):
            dominated = any(
                i != j and np.all(other_point >= point) and np.any(other_point > point)
                for j, other_point in enumerate(points)
            )
            if not dominated:
                pareto_points.append(point)
                df.at[point_id, "is_pareto_efficient"] = True
        return np.array(pareto_points)

    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(14, 6), constrained_layout=True)

    for label in df["label"].unique():
        group = df[df["label"] == label]

        # Scatter all points
        ax.scatter(
            group[x_key],
            group[y_key],
            label=f"Label {label}",
        )

        # Identify and mark Pareto frontier
        pareto_points = pareto_efficient(group.index, group[[x_key, y_key]].values)

        # Sort by x-value for a clean line
        pareto_points = pareto_points[np.argsort(pareto_points[:, 0])]
        ax.plot(
            pareto_points[:, 0],
            pareto_points[:, 1],
            linestyle="--",
            label=f"Pareto Frontier {label}",
        )

    # Save CSV only after every group has been processed, so the
    # is_pareto_efficient column is complete.
    if stat_value == "avg":
        df_file_name = "results.csv"
    else:
        df_file_name = f"results_{stat_value}.csv"
    df.to_csv(df_file_name)

    # Axis labels and tick intervals
    ax.set_xlabel(f"tokens/s/user {stat_value}")
    ax.set_ylabel("tokens/s/gpu avg")
    ax.set_title(f"Pareto - {title}")
    ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left")
    ax.grid(True)

    # One major tick every 5 units on both axes.
    x_interval = 5
    y_interval = 5
    ax.xaxis.set_major_locator(MultipleLocator(x_interval))
    ax.yaxis.set_major_locator(MultipleLocator(y_interval))

    if stat_value == "avg":
        file_name = "pareto_plot.png"
    else:
        file_name = f"pareto_plot_{stat_value}.png"
    plt.savefig(file_name, dpi=300)
    plt.close()
if __name__ == "__main__":
    # Command line entry point: gather every artifacts_* run below the given
    # root directory and render a single Pareto graph from them.
    import argparse
    import glob
    import os

    cli = argparse.ArgumentParser(
        description="Plot Pareto graph from GenAI-Perf artifacts"
    )
    cli.add_argument(
        "--artifacts-root-dir",
        required=True,
        help="Root directory containing artifact directories to search for profile_export_genai_perf.json files",
    )
    cli.add_argument(
        "--title",
        default="Single Node",
        help="Title for the Pareto graph",
    )
    args = cli.parse_args()

    # Find all artifacts directories under the root
    artifacts_pattern = os.path.join(args.artifacts_root_dir, "artifacts_*")
    artifacts_dirs = glob.glob(artifacts_pattern)
    if len(artifacts_dirs) == 0:
        raise ValueError(f"No artifacts directories found in {args.artifacts_root_dir}")

    profile_paths, config_paths = get_json_paths(artifacts_dirs)

    # The two lists are consumed pairwise below, so they must line up.
    profile_count = len(profile_paths)
    config_count = len(config_paths)
    if profile_count != config_count:
        raise ValueError(
            f"Number of genai_perf_profile_export_json_paths ({profile_count}) does not match number of deployment_config_json_paths ({config_count})"
        )

    extracted_values = extract_val_and_concurrency(profile_paths, config_paths)
    create_pareto_graph(extracted_values, title=args.title)
......@@ -98,8 +98,16 @@ With the Dynamo repository, benchmarking image and model available, and **NATS a
> [!Tip]
> Check the `disagg.log` to make sure the service is fully started before collecting performance numbers.
3. Collect the performance numbers as shown on the [Collecting Performance Numbers](#collecting-performance-numbers) section below.
3. Collect the performance numbers:
```bash
bash -x /workspace/benchmarks/llm/perf.sh --mode disaggregated --deployment-kind dynamo_vllm --prefill-tensor-parallelism 1 --prefill-data-parallelism 4 --decode-tensor-parallelism 4 --decode-data-parallelism 1
```
> [!Important]
> We should be careful when specifying these options in the `perf.sh` script. They should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. In the above command, we described that our deployment is using disaggregated serving in dynamo with the vLLM backend. We have also accurately described that we have 4 prefill workers with TP=1 and 1 decode worker with TP=4.
For more information see [Collecting Performance Numbers](#collecting-performance-numbers) section below.
## Disaggregated Multinode Benchmarking
......@@ -155,7 +163,16 @@ With the Dynamo repository, benchmarking image and model available, and **NATS a
> [!Tip]
> Check the `prefill_multinode.log` to make sure the service is fully started before collecting performance numbers.
5. Collect the performance numbers as shown on the [Collecting Performance Numbers](#collecting-performance-numbers) section above.
5. Collect the performance numbers:
```bash
bash -x /workspace/benchmarks/llm/perf.sh --mode disaggregated --deployment-kind dynamo_vllm --prefill-tensor-parallelism 1 --prefill-data-parallelism 8 --decode-tensor-parallelism 8 --decode-data-parallelism 1
```
> [!Important]
> We should be careful when specifying these options in the `perf.sh` script. They should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. In the above command, we described that our deployment is using disaggregated serving in dynamo with the vLLM backend. We have also accurately described that we have 8 prefill workers with TP=1 and 1 decode worker with TP=8.
For more information see [Collecting Performance Numbers](#collecting-performance-numbers) section below.
## vLLM Aggregated Baseline Benchmarking
......@@ -211,22 +228,79 @@ With the Dynamo repository and the benchmarking image available, perform the fol
> [!Note]
> If benchmarking over 2 nodes, the `upstream` configuration will need to be updated to link to the `vllm serve` on the second node.
4. Collect the performance numbers as shown on the [Collecting Performance Numbers](#collecting-performance-numbers) section below.
4. Collect the performance numbers:
Single-Node
```bash
bash -x /workspace/benchmarks/llm/perf.sh --mode aggregated --deployment-kind vllm_serve --tensor-parallelism 4 --data-parallelism 2
```
Two Nodes
```bash
bash -x /workspace/benchmarks/llm/perf.sh --mode aggregated --deployment-kind vllm_serve --tensor-parallelism 8 --data-parallelism 2
```
> [!Important]
> We should be careful when specifying these options in the `perf.sh` script. They should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. In the above commands, we described that our deployment is using aggregated serving in `vllm serve`. We have also accurately described that we have 2 workers with TP=4 (or TP=8 for two nodes).
For more information see [Collecting Performance Numbers](#collecting-performance-numbers) section below.
## Collecting Performance Numbers
Run the benchmarking script
Currently, there is no consistent way of obtaining the configuration of the deployment service. Hence, we need to provide this information to the script in the form of command line arguments. The benchmarking script `/workspace/benchmarks/llm/perf.sh` uses the GenAI-Perf tool to collect the performance numbers at various request concurrencies. The `perf.sh` script can be run multiple times to collect numbers for different deployments. Each script execution will create a new artifacts directory in `artifacts_root` and dump these numbers in it. See [Plotting Pareto Graphs](#plotting-pareto-graphs) to learn how to turn the data from this `artifacts_root` into Pareto graphs of the performance.
```bash
bash -x /workspace/benchmarks/llm/perf.sh
```
Note: Because every `perf.sh` run always adds a new artifacts directory to `artifacts_root`, start each experiment with a clean `artifacts_root` so that only the runs you want to compare are included.
> [!Tip]
> See [GenAI-Perf tutorial](https://github.com/triton-inference-server/perf_analyzer/blob/main/genai-perf/docs/tutorial.md)
> @ [GitHub](https://github.com/triton-inference-server/perf_analyzer) for additional information about how to run GenAI-Perf
> and how to interpret results.
## Interpreting Results
### Plotting Pareto Graphs
The `artifacts` directory generated by GenAI-Perf contains the raw performance numbers from the benchmarking.
Using the benchmarking image, install the dependencies for plotting Pareto graph
```bash
pip3 install matplotlib seaborn
```
At the directory where the artifacts are located, plot the Pareto graph
Single-Node:
```bash
python3 /workspace/benchmarks/llm/plot_pareto.py --artifacts-root-dir artifacts_root
```
Two Nodes:
```bash
python3 /workspace/benchmarks/llm/plot_pareto.py --artifacts-root-dir artifacts_root --title "Two Nodes"
```
The graph will be saved to the current directory and named `pareto_plot.png`.
### Interpreting Pareto Graphs
The question we want to answer in this comparison is how much Output Token Throughput can be improved by switching from
aggregated to disaggregated serving when both are performing under similar Inter Token Latency.
Each benchmarked concurrency produces one latency and throughput value pair. The x-axis on the Pareto graph is
latency, expressed as per-user throughput (tokens/s/user): the higher the value, the lower the latency. The y-axis on the
Pareto graph is throughput (tokens/s/gpu). Each latency and throughput value pair forms a dot on the Pareto graph. A line
(Pareto Frontier) is formed by connecting the Pareto-efficient dots from the different concurrency values.
With the Pareto Frontiers of the baseline and the disaggregated results plotted on the graph, we can look for the
greatest increase in throughput (along the y-axis) between the baseline and the disaggregated result Pareto Frontier,
over different latencies (along the x-axis).
For example, at 45 tokens/s/user, the increase in tokens/s/gpu is `145 - 80 = 65`, from the orange baseline to the
blue disaggregated line, so the improvement is around a 1.81x speed up (`145 / 80`):
![Example Pareto Plot](./example_plots/single_node_pareto_plot.png)
Note: The above example was collected over a single benchmarking run; the actual numbers may vary between runs, configurations and hardware.
## Supporting Additional Models
......
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment