Unverified commit 0b284b63, authored by Hongkuan Zhou, committed by GitHub
Browse files

feat: add cost plot to profiler (#4003)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent 4765d880
...@@ -27,6 +27,7 @@ from benchmarks.profiler.utils.dgd_generation import generate_dgd_config_with_pl ...@@ -27,6 +27,7 @@ from benchmarks.profiler.utils.dgd_generation import generate_dgd_config_with_pl
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.plot import ( from benchmarks.profiler.utils.plot import (
plot_decode_performance, plot_decode_performance,
plot_pd_joint_results,
plot_prefill_performance, plot_prefill_performance,
) )
from benchmarks.profiler.utils.profile_cache import ( from benchmarks.profiler.utils.profile_cache import (
...@@ -280,14 +281,10 @@ async def run_profile(args): ...@@ -280,14 +281,10 @@ async def run_profile(args):
prefill_thpt_per_gpu.append(args.isl / ttft / num_gpus * 1000) prefill_thpt_per_gpu.append(args.isl / ttft / num_gpus * 1000)
# Plot the results as a 2D scatter plot # Plot the results as a 2D scatter plot
prefill_results = None
if prefill_num_gpus and prefill_ttft and prefill_thpt_per_gpu: if prefill_num_gpus and prefill_ttft and prefill_thpt_per_gpu:
plot_prefill_performance( prefill_results = (prefill_num_gpus, prefill_ttft, prefill_thpt_per_gpu)
prefill_num_gpus, plot_prefill_performance(prefill_results, args.ttft, args.output_dir)
prefill_ttft,
prefill_thpt_per_gpu,
args.ttft,
args.output_dir,
)
# then profile decode # then profile decode
decode_num_gpus = [] decode_num_gpus = []
...@@ -476,6 +473,11 @@ async def run_profile(args): ...@@ -476,6 +473,11 @@ async def run_profile(args):
if decode_results: if decode_results:
plot_decode_performance(decode_results, args.itl, args.output_dir) plot_decode_performance(decode_results, args.itl, args.output_dir)
if prefill_results and decode_results:
plot_pd_joint_results(
args.isl, args.osl, prefill_results, decode_results, args.output_dir
)
if args.dry_run: if args.dry_run:
logger.info("Skipping recommendations in dry run mode") logger.info("Skipping recommendations in dry run mode")
else: else:
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
def compute_pareto(x, y):
    """
    Return the pareto front of a set of (x, y) points where a smaller x and a
    larger y are both preferred (i.e. "top-left is better").

    Args:
        x: sequence of x values (or None)
        y: sequence of y values (or None), same length as x

    Returns:
        A pair of lists (xs, ys) holding the coordinates of the non-dominated
        points, ordered by ascending x. Both lists are empty when either input
        is None or the inputs are empty.

    Raises:
        ValueError: if x and y differ in length.
    """
    # Missing inputs yield an empty front rather than an error.
    if x is None or y is None:
        return [], []
    if len(x) != len(y):
        raise ValueError("x and y must have the same length")
    if len(x) == 0:
        return [], []

    # Order candidates by ascending x; among equal x the largest y comes first,
    # so the best point at each x is the first one examined.
    candidates = sorted(zip(x, y), key=lambda pt: (pt[0], -pt[1]))

    # Sweep left to right: a point belongs to the front only if its y strictly
    # exceeds every y seen so far (anything else is dominated by an earlier point).
    front = []
    best_y = float("-inf")
    for cand_x, cand_y in candidates:
        if cand_y > best_y:
            front.append((cand_x, cand_y))
            best_y = cand_y

    # Present the front ordered by x (then y) ascending.
    front = sorted(front, key=lambda pt: (pt[0], pt[1]))
    return [pt[0] for pt in front], [pt[1] for pt in front]
...@@ -20,6 +20,8 @@ import numpy as np ...@@ -20,6 +20,8 @@ import numpy as np
from matplotlib import cm from matplotlib import cm
from scipy.interpolate import griddata from scipy.interpolate import griddata
from benchmarks.profiler.utils.pareto import compute_pareto
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler() console_handler = logging.StreamHandler()
...@@ -31,19 +33,16 @@ console_handler.setFormatter(formatter) ...@@ -31,19 +33,16 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
def plot_prefill_performance( def plot_prefill_performance(prefill_results, target_ttft, output_dir):
prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu, target_ttft, output_dir
):
""" """
Plot prefill performance as a 2D scatter plot with GPU count annotations. Plot prefill performance as a 2D scatter plot with GPU count annotations.
Args: Args:
prefill_num_gpu: list of GPU counts prefill_results: tuple of (prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu)
prefill_ttft: list of time to first token values
prefill_thpt_per_gpu: list of throughput per GPU values
target_ttft: target TTFT value for the vertical line target_ttft: target TTFT value for the vertical line
output_dir: directory to save the plot output_dir: directory to save the plot
""" """
prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu = prefill_results
plt.figure(figsize=(10, 6)) plt.figure(figsize=(10, 6))
plt.scatter(prefill_ttft, prefill_thpt_per_gpu, s=100) plt.scatter(prefill_ttft, prefill_thpt_per_gpu, s=100)
for i, num_gpu in enumerate(prefill_num_gpu): for i, num_gpu in enumerate(prefill_num_gpu):
...@@ -252,3 +251,47 @@ def plot_decode_3d_surface( ...@@ -252,3 +251,47 @@ def plot_decode_3d_surface(
logger.info(f"Saving throughput surface plot to {thpt_plot_path}") logger.info(f"Saving throughput surface plot to {thpt_plot_path}")
plt.savefig(thpt_plot_path, dpi=300, bbox_inches="tight") plt.savefig(thpt_plot_path, dpi=300, bbox_inches="tight")
plt.close() plt.close()
def plot_pd_joint_results(
    isl, osl, prefill_results, decode_results, output_dir, gpu_cost_per_hour=3.0
):
    """
    Plot the GPU cost of serving 1000 requests against per-user decode speed.

    One curve is drawn for every point on the prefill pareto front (labelled
    with that point's TTFT); each curve sweeps all points on the decode pareto
    front. The figure is saved as ``cost_sla.png`` inside ``output_dir``.

    Args:
        isl: input sequence length, used as the prefill token count per request
        osl: output sequence length, used as the decode token count per request
        prefill_results: tuple of (prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu)
        decode_results: iterable of decode results; index 1 of each entry is
            read as a list of ITL values and index 2 as a list of throughput
            values (presumably per GPU -- confirm against the caller)
        output_dir: directory to save the plot
        gpu_cost_per_hour: price of one GPU for one hour in dollars; defaults
            to 3.0, the value that was previously hard-coded
    """
    # Pareto front for prefill (lower TTFT, higher throughput is better).
    p_ttft, p_thpt = compute_pareto(prefill_results[1], prefill_results[2])
    p_ttft = np.array(p_ttft)
    p_thpt = np.array(p_thpt)

    # Pool the decode measurements of every result before taking the pareto
    # front (lower ITL, higher throughput is better).
    all_itl, all_thpt = [], []
    for decode_result in decode_results:
        all_itl.extend(decode_result[1])
        all_thpt.extend(decode_result[2])
    d_itl, d_thpt = compute_pareto(all_itl, all_thpt)
    d_itl = np.array(d_itl)
    d_thpt = np.array(d_thpt)

    # These depend only on the decode front, so compute them once instead of
    # once per prefill point.
    # 1000 / ITL: tokens per second per user, assuming ITL is in milliseconds.
    tokens_per_user = 1000 / d_itl
    # Decode cost of 1000 requests: tokens / throughput gives GPU-seconds,
    # which is then priced at the hourly rate.
    decode_cost = osl * 1000 / d_thpt * gpu_cost_per_hour / 3600

    plt.figure(figsize=(12, 10))
    plt.title(
        f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${gpu_cost_per_hour}) Under Different SLA"
    )
    for ttft, thpt in zip(p_ttft, p_thpt):
        # Prefill cost of 1000 requests at this prefill operating point.
        prefill_cost = isl * 1000 / thpt * gpu_cost_per_hour / 3600
        total_cost = decode_cost + prefill_cost
        line = plt.plot(tokens_per_user, total_cost, label=f"TTFT: {ttft:.2f}ms")[0]
        # Mark the individual decode operating points in the curve's colour.
        plt.scatter(
            tokens_per_user, total_cost, marker="x", s=100, color=line.get_color()
        )
    plt.xlabel("Tokens per User")
    plt.ylabel("Cost ($)")
    plt.grid(True)
    plt.legend()
    plt.savefig(f"{output_dir}/cost_sla.png", dpi=300)
    plt.close()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment