feat: add cost plot to profiler (#4003)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

feat: add cost plot to profiler (#4003)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
0b284b63 · Hongkuan Zhou · GitHub · 4765d880 · 0b284b63 · 0b284b63
Unverified Commit 0b284b63 authored Nov 05, 2025 by Hongkuan Zhou Committed by GitHub Nov 05, 2025
3 changed files
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -27,6 +27,7 @@ from benchmarks.profiler.utils.dgd_generation import generate_dgd_config_with_pl
 from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
 from benchmarks.profiler.utils.plot import (
    plot_decode_performance,
+    plot_pd_joint_results,
    plot_prefill_performance,
 )
 from benchmarks.profiler.utils.profile_cache import (
@@ -280,14 +281,10 @@ async def run_profile(args):
                prefill_thpt_per_gpu.append(args.isl / ttft / num_gpus * 1000)
        # Plot the results as a 2D scatter plot
+        prefill_results = None
        if prefill_num_gpus and prefill_ttft and prefill_thpt_per_gpu:
-            plot_prefill_performance(
+            prefill_results = (prefill_num_gpus, prefill_ttft, prefill_thpt_per_gpu)
-                prefill_num_gpus,
+            plot_prefill_performance(prefill_results, args.ttft, args.output_dir)
-                prefill_ttft,
-                prefill_thpt_per_gpu,
-                args.ttft,
-                args.output_dir,
-            )
        # then profile decode
        decode_num_gpus = []
@@ -476,6 +473,11 @@ async def run_profile(args):
        if decode_results:
            plot_decode_performance(decode_results, args.itl, args.output_dir)
+        if prefill_results and decode_results:
+            plot_pd_joint_results(
+                args.isl, args.osl, prefill_results, decode_results, args.output_dir
+            )
        if args.dry_run:
            logger.info("Skipping recommendations in dry run mode")
        else:

--- a/benchmarks/profiler/utils/pareto.py
+++ b/benchmarks/profiler/utils/pareto.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+def compute_pareto(x, y):
+    """
+    Extract the pareto front from paired x/y samples, where a smaller x and a
+    larger y are both preferred (i.e. the top-left of an x-y plot is better).
+    Args:
+        x: sequence of x values (lower is better), or None
+        y: sequence of y values (higher is better), or None
+    Returns:
+        (xs, ys): two lists with the coordinates of the front, ordered by ascending x;
+        both are empty when either input is None or the inputs are empty
+    Raises:
+        ValueError: if x and y differ in length
+    """
+    if x is None or y is None:
+        return [], []
+    if len(x) != len(y):
+        raise ValueError("x and y must have the same length")
+    if len(x) == 0:
+        return [], []
+    # Visit candidates from the smallest x upward; among equal x, the largest y comes first.
+    candidates = sorted(zip(x, y), key=lambda pt: (pt[0], -pt[1]))
+    front = []
+    best_y = float("-inf")
+    for candidate in candidates:
+        # A point survives only if its y beats every y already seen at a smaller-or-equal x.
+        if candidate[1] > best_y:
+            front.append(candidate)
+            best_y = candidate[1]
+    # Tuples compare element-wise, so this orders the front by x and then by y.
+    front.sort()
+    return [pt[0] for pt in front], [pt[1] for pt in front]
--- a/benchmarks/profiler/utils/plot.py
+++ b/benchmarks/profiler/utils/plot.py
@@ -20,6 +20,8 @@ import numpy as np
 from matplotlib import cm
 from scipy.interpolate import griddata
+from benchmarks.profiler.utils.pareto import compute_pareto
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 console_handler = logging.StreamHandler()
@@ -31,19 +33,16 @@ console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)
-def plot_prefill_performance(
+def plot_prefill_performance(prefill_results, target_ttft, output_dir):
-    prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu, target_ttft, output_dir
-):
    """
    Plot prefill performance as a 2D scatter plot with GPU count annotations.
    Args:
-        prefill_num_gpu: list of GPU counts
+        prefill_results: tuple of (prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu)
-        prefill_ttft: list of time to first token values
-        prefill_thpt_per_gpu: list of throughput per GPU values
        target_ttft: target TTFT value for the vertical line
        output_dir: directory to save the plot
    """
+    prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu = prefill_results
    plt.figure(figsize=(10, 6))
    plt.scatter(prefill_ttft, prefill_thpt_per_gpu, s=100)
    for i, num_gpu in enumerate(prefill_num_gpu):
@@ -252,3 +251,47 @@ def plot_decode_3d_surface(
    logger.info(f"Saving throughput surface plot to {thpt_plot_path}")
    plt.savefig(thpt_plot_path, dpi=300, bbox_inches="tight")
    plt.close()
+def plot_pd_joint_results(isl, osl, prefill_results, decode_results, output_dir):
+    """
+    Plot the estimated GPU cost of serving 1000 requests against per-user decode speed.
+    One curve is drawn per point on the prefill pareto front (labelled with its TTFT);
+    each curve sweeps the points of the decode pareto front. The figure is saved as
+    cost_sla.png inside output_dir. Nothing is saved when either front is empty.
+    Args:
+        isl: input sequence length of a request
+        osl: output sequence length of a request
+        prefill_results: tuple of (prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu)
+        decode_results: iterable of per-configuration results; index 1 of each entry is
+            read as a list of ITL values and index 2 as the matching list of
+            throughput per GPU values
+        output_dir: directory to save the plot
+    """
+    GPU_COST_PER_HOUR = 3.0  # $3/hour
+    # compute pareto front for prefill (lower TTFT, higher throughput per GPU)
+    p_ttft, p_thpt = compute_pareto(prefill_results[1], prefill_results[2])
+    # pool every decode configuration, then compute a single pareto front for decode
+    pooled_itl, pooled_thpt = [], []
+    for decode_result in decode_results:
+        pooled_itl.extend(decode_result[1])
+        pooled_thpt.extend(decode_result[2])
+    d_itl, d_thpt = compute_pareto(pooled_itl, pooled_thpt)
+    if len(p_ttft) == 0 or len(d_itl) == 0:
+        # Without points on both fronts there is no curve to draw; skip instead of
+        # saving a blank figure and calling legend() with no labelled artists.
+        logger.warning("No pareto points for prefill or decode, skipping cost plot")
+        return
+    # numpy arrays keep the arithmetic element-wise (and a zero throughput yields inf
+    # instead of raising ZeroDivisionError)
+    p_ttft = np.array(p_ttft)
+    p_thpt = np.array(p_thpt)
+    d_itl = np.array(d_itl)
+    d_thpt = np.array(d_thpt)
+    # The decode side does not depend on the prefill point, so compute it once.
+    # NOTE(review): 1000 / ITL reads as tokens/s per user if ITL is in milliseconds,
+    # and the cost formulas read as dollars if throughput is tokens/s/GPU - confirm units.
+    tokens_per_user = 1000 / d_itl
+    decode_cost = osl * 1000 / d_thpt * GPU_COST_PER_HOUR / 3600
+    # plot
+    plt.figure(figsize=(12, 10))
+    plt.title(
+        f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${GPU_COST_PER_HOUR}) Under Different SLA"
+    )
+    for ttft, thpt in zip(p_ttft, p_thpt):
+        # cost per thousand requests = decode cost sweep + this prefill point's fixed cost
+        prefill_cost = isl * 1000 / thpt * GPU_COST_PER_HOUR / 3600
+        total_cost = decode_cost + prefill_cost
+        line = plt.plot(tokens_per_user, total_cost, label=f"TTFT: {ttft:.2f}ms")[0]
+        # reuse the line colour so the markers belong visibly to their curve
+        plt.scatter(tokens_per_user, total_cost, marker="x", s=100, color=line.get_color())
+    plt.xlabel("Tokens per User")
+    plt.ylabel("Cost ($)")
+    plt.grid(True)
+    plt.legend()
+    cost_plot_path = f"{output_dir}/cost_sla.png"
+    logger.info(f"Saving cost plot to {cost_plot_path}")
+    plt.savefig(cost_plot_path, dpi=300)
+    plt.close()