feat: standalone profiling script for a given endpoint (#2386)

fd358991 · Hongkuan Zhou · GitHub · dabd2267 · fd358991 · fd358991
Unverified Commit fd358991 authored Aug 11, 2025 by Hongkuan Zhou Committed by GitHub Aug 11, 2025
5 changed files
--- a/benchmarks/profiler/profile_endpoint.py
+++ b/benchmarks/profiler/profile_endpoint.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import logging
+import os
+
+from utils.profile_prefill import profile_prefill
+
+from benchmarks.profiler.utils.profile_decode import profile_decode
+
+# Module-level logger for this script, emitting INFO and above through a
+# dedicated stream handler with a timestamped format.
+# NOTE(review): the handler is attached at import time and propagation is left
+# at its default, so records may be emitted twice if the root logger also has
+# handlers configured — confirm this is intended.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.INFO)
+# Format: "<timestamp> - <logger name> - <level> - <message>"
+formatter = logging.Formatter(
+    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
+)
+console_handler.setFormatter(formatter)
+logger.addHandler(console_handler)
+
+if __name__ == "__main__":
+    # Standalone entry point: profile an already-running endpoint in either
+    # prefill or decode mode and store the results under --work_dir.
+    parser = argparse.ArgumentParser(
+        description="profile a given endpoint's performance for prefill or decode"
+    )
+    parser.add_argument(
+        "--mode",
+        type=str,
+        required=True,
+        choices=["prefill", "decode"],
+        help="mode to profile",
+    )
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        required=True,
+        help="model name",
+    )
+    parser.add_argument(
+        "--url",
+        type=str,
+        required=True,
+        help="base url of the endpoint",
+    )
+    parser.add_argument(
+        "--num_gpus",
+        type=int,
+        required=True,
+        help="number of gpus",
+    )
+    parser.add_argument(
+        "--max_kv_tokens",
+        type=int,
+        required=False,
+        default=0,
+        help="max kv tokens of the endpoint (only used for decode)",
+    )
+    parser.add_argument(
+        "--work_dir",
+        type=str,
+        default="endpoint_profiling_results/",
+        help="work directory to save the results",
+    )
+    parser.add_argument(
+        "--max_context_length",
+        type=int,
+        default=16384,
+        help="max context length of the endpoint",
+    )
+    parser.add_argument(
+        "--interpolation_granularity",
+        type=int,
+        default=8,
+        help="interpolation granularity for the results",
+    )
+    args = parser.parse_args()
+
+    # Validate with parser.error() instead of assert: asserts are stripped under
+    # `python -O`, while parser.error() prints the usage text and exits with
+    # status 2. Checks run before any directory is created or benchmark started.
+    if args.num_gpus <= 0:
+        # num_gpus is used as a divisor when normalizing throughput per GPU.
+        parser.error("num_gpus must be a positive integer")
+    if args.interpolation_granularity <= 0:
+        # interpolation_granularity is used as a divisor for the sweep steps.
+        parser.error("interpolation_granularity must be a positive integer")
+    if args.mode == "decode" and args.max_kv_tokens <= 0:
+        parser.error("max_kv_tokens must be provided for decode")
+
+    os.makedirs(args.work_dir, exist_ok=True)
+    if args.mode == "prefill":
+        profile_prefill(
+            args.work_dir,
+            args.model_name,
+            args.url,
+            args.num_gpus,
+            args.max_context_length,
+            args.interpolation_granularity,
+        )
+    elif args.mode == "decode":
+        profile_decode(
+            args.work_dir,
+            args.model_name,
+            args.url,
+            args.num_gpus,
+            args.max_kv_tokens,
+            args.max_context_length,
+            args.interpolation_granularity,
+        )
+    else:
+        # Defensive only: argparse `choices` already rejects any other mode.
+        raise ValueError(f"Invalid mode: {args.mode}")
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -28,18 +28,16 @@ from utils.dynamo_deployment import (
    cleanup_remaining_deployments,
 )
 from utils.genai_perf import benchmark_decode, benchmark_prefill
-from utils.plot import (
-    plot_decode_3d_surface,
-    plot_decode_performance,
-    plot_prefill_interpolation,
-    plot_prefill_performance,
-)
+from utils.plot import plot_decode_performance, plot_prefill_performance
 from utils.profile_cache import (
    check_decode_results_exist,
    check_prefill_results_exist,
    load_existing_decode_results,
    load_existing_prefill_results,
 )
+from utils.profile_prefill import profile_prefill
+
+from benchmarks.profiler.utils.profile_decode import profile_decode

 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -373,9 +371,6 @@ async def run_profile(args):

        # interpolate ISL - TTFT with best prefill TP
        best_prefill_tp = prefill_tp_size[selected_prefill_idx]
-        prefill_isl = []
-        prefill_ttft = []
-        prefill_thpt_per_gpu = []
        logger.info(
            f"Profiling prefill under best TP {best_prefill_tp} with different ISL..."
        )
@@ -420,58 +415,22 @@ async def run_profile(args):
            )

        base_url = client.get_service_url()
-        for isl in range(
-            100,
+
+        profile_prefill(
+            work_dir,
+            model_name,
+            base_url,
+            best_prefill_tp,
            args.max_context_length,
-            (args.max_context_length - 100) // args.prefill_interpolation_granularity,
-        ):
-            # run genai-perf
-            genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
-            gap_result = benchmark_prefill(
-                isl, genai_perf_artifact_dir, model_name, base_url=base_url
-            )
-            if gap_result is not None:
-                ttft = gap_result["time_to_first_token"]["avg"]
-                prefill_isl.append(isl)
-                prefill_ttft.append(ttft)
-                prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000)
+            args.prefill_interpolation_granularity,
+        )

        print("Cleaning up deployment...")
        await client.delete_deployment()
        deployment_clients.remove(client)
        print("Deployment deleted")

-        # Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c)
-        if len(prefill_isl) > 2:
-            logger.info("Interpolating prefill TTFT and throughput vs ISL...")
-
-            # Convert to numpy arrays for easier manipulation
-            prefill_isl_np = np.array(prefill_isl)
-            prefill_ttft_np = np.array(prefill_ttft)
-            prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu)
-
-            save_path = f"{work_dir}/raw_data.npz"
-            np.savez(
-                save_path,
-                prefill_isl=prefill_isl_np,
-                prefill_ttft=prefill_ttft_np,
-                prefill_thpt_per_gpu=prefill_thpt_per_gpu_np,
-            )
-
-            # Call the plotting function
-            plot_prefill_interpolation(
-                prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir
-            )
-        else:
-            logger.warning(
-                "Not enough data points to perform interpolation (need at least 3 points)"
-            )
-
        # interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode TP
-        x_kv_usage = []
-        y_context_length = []
-        z_itl = []
-        z_thpt_per_gpu = []
        best_decode_tp = decode_tp_size[selected_decode_idx]
        logger.info(f"Profiling decode with TP size {best_decode_tp}...")
        decode_config = config_modifier.set_config_tp_size(
@@ -508,64 +467,23 @@ async def run_profile(args):
            f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log"
        )

-        osl = 500  # not too large to reduce ITL variance, not too small to have stable measurement
        base_url = client.get_service_url()
-        for isl in range(
-            100,
-            args.max_context_length - osl,
-            (args.max_context_length - osl) // args.decode_interpolation_granularity,
-        ):
-            max_concurrency = max_kv_tokens // (isl + osl)
-            sweep_num_request = list(
-                range(
-                    1,
-                    max_concurrency,
-                    max_concurrency // args.decode_interpolation_granularity,
-                )
-            )
-            for num_request in sweep_num_request:
-                genai_perf_artifact_dir = (
-                    f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
-                )
-                gap_result = benchmark_decode(
-                    isl,
-                    osl,
-                    num_request,
-                    genai_perf_artifact_dir,
-                    model_name,
-                    base_url=base_url,
-                )
-                if gap_result is not None:
-                    itl = gap_result["inter_token_latency"]["avg"]
-                    x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens)
-                    y_context_length.append(isl + osl / 2)
-                    z_itl.append(itl)
-                    z_thpt_per_gpu.append(
-                        gap_result["output_token_throughput"]["avg"] / best_decode_tp
-                    )
+
+        profile_decode(
+            work_dir,
+            model_name,
+            base_url,
+            best_decode_tp,
+            max_kv_tokens,
+            args.max_context_length,
+            args.decode_interpolation_granularity,
+        )

        print("Cleaning up deployment...")
        await client.delete_deployment()
        deployment_clients.remove(client)
        print("Deployment deleted")

-        # Save the data points to a .npz file
-        save_path = f"{work_dir}/raw_data.npz"
-        np.savez(
-            save_path,
-            x_kv_usage=np.array(x_kv_usage),
-            y_context_length=np.array(y_context_length),
-            z_itl=np.array(z_itl),
-            z_thpt_per_gpu=np.array(z_thpt_per_gpu),
-            max_kv_tokens=np.array([max_kv_tokens]),
-        )
-        logger.info(f"Saved data points to {save_path}")
-
-        # Plot 3D surface
-        plot_decode_3d_surface(
-            x_kv_usage, y_context_length, z_itl, best_decode_tp, work_dir
-        )
-
    except Exception as e:
        logger.error(f"Profile job failed with error: {e}")
        raise

--- a/benchmarks/profiler/utils/plot.py
+++ b/benchmarks/profiler/utils/plot.py
@@ -114,16 +114,13 @@ def plot_prefill_interpolation(
    """
    # Fit quadratic functions
    ttft_coeffs = np.polyfit(prefill_isl_np, prefill_ttft_np, 2)
-    thpt_coeffs = np.polyfit(prefill_isl_np, prefill_thpt_per_gpu_np, 2)

    # Create interpolation functions
    ttft_poly = np.poly1d(ttft_coeffs)
-    thpt_poly = np.poly1d(thpt_coeffs)

    # Generate points for smooth curves
    x_interp = np.linspace(min(prefill_isl_np), max(prefill_isl_np), 100)
    ttft_interp = ttft_poly(x_interp)
-    thpt_interp = thpt_poly(x_interp)

    # Plot TTFT vs ISL
    plt.figure(figsize=(10, 6))
@@ -148,14 +145,7 @@ def plot_prefill_interpolation(

    # Plot Throughput vs ISL
    plt.figure(figsize=(10, 6))
-    plt.scatter(prefill_isl_np, prefill_thpt_per_gpu_np, s=100, label="Measured data")
-    plt.plot(
-        x_interp,
-        thpt_interp,
-        "g-",
-        label=f"Quadratic fit: {thpt_coeffs[0]:.2e}x² + {thpt_coeffs[1]:.2e}x + {thpt_coeffs[2]:.2e}",
-    )
-
+    plt.scatter(prefill_isl_np, prefill_thpt_per_gpu_np, s=100, label="Throughput/GPU")
    plt.title("Prefill Throughput vs Input Sequence Length")
    plt.xlabel("Input Sequence Length (tokens)")
    plt.ylabel("Prefill throughput per GPU (tokens/s/GPU)")
@@ -170,7 +160,9 @@ def plot_prefill_interpolation(
    plt.close()


-def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_dir):
+def plot_decode_3d_surface(
+    x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir
+):
    """
    Plot 3D surface for decode interpolation with KV usage, context length, and ITL.

@@ -178,14 +170,18 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di
        x_kv_usage: list of KV usage percentages
        y_context_length: list of context lengths
        z_itl: list of ITL values
-        tp_size: TP size for the plot filename
+        z_thpt_per_gpu: list of throughput per GPU values
        work_dir: directory to save the plot
    """
    xi = np.linspace(min(x_kv_usage), max(x_kv_usage), 100)
    yi = np.linspace(min(y_context_length), max(y_context_length), 100)
    X, Y = np.meshgrid(xi, yi)
-    Z = griddata((x_kv_usage, y_context_length), z_itl, (X, Y), method="cubic")
+    Z_itl = griddata((x_kv_usage, y_context_length), z_itl, (X, Y), method="cubic")
+    Z_thpt = griddata(
+        (x_kv_usage, y_context_length), z_thpt_per_gpu, (X, Y), method="cubic"
+    )

+    # Plot ITL surface
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111, projection="3d")  # type: ignore

@@ -193,7 +189,7 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di
    surf = ax.plot_surface(  # type: ignore
        X,
        Y,
-        Z,
+        Z_itl,
        cmap=cm.coolwarm,  # type: ignore
        linewidth=0.2,
        antialiased=True,
@@ -202,20 +198,57 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di

    # Add a color bar with custom settings
    cbar = fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5)
-    cbar.set_label("Z Value", fontsize=12)
+    cbar.set_label("ITL (ms)", fontsize=12)
    cbar.ax.tick_params(labelsize=10)

    # Add labels with custom font sizes
    ax.set_xlabel("Active KV Percentage", fontsize=12)
    ax.set_ylabel("Decode Context Length", fontsize=12)
    ax.set_zlabel("ITL", fontsize=12)  # type: ignore
+    ax.set_title("Decode ITL Interpolation", fontsize=14)

    # Set viewing angle
    ax.view_init(elev=30, azim=45)  # type: ignore
    ax.grid(True)
    ax.tick_params(axis="both", which="major", labelsize=10)

-    plot_path = f"{work_dir}/decode_tp{tp_size}.png"
+    plot_path = f"{work_dir}/decode_itl_interpolation.png"
    logger.info(f"Saving ITL surface plot to {plot_path}")
    plt.savefig(plot_path, dpi=300, bbox_inches="tight")
    plt.close()
+
+    # Plot Throughput surface
+    fig = plt.figure(figsize=(12, 10))
+    ax = fig.add_subplot(111, projection="3d")  # type: ignore
+
+    # Create the throughput surface plot with customizations
+    surf = ax.plot_surface(  # type: ignore
+        X,
+        Y,
+        Z_thpt,
+        cmap=cm.viridis,  # type: ignore
+        linewidth=0.2,
+        antialiased=True,
+        alpha=0.8,
+    )
+
+    # Add a color bar with custom settings
+    cbar = fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5)
+    cbar.set_label("Throughput per GPU (tokens/s/GPU)", fontsize=12)
+    cbar.ax.tick_params(labelsize=10)
+
+    # Add labels with custom font sizes
+    ax.set_xlabel("Active KV Percentage", fontsize=12)
+    ax.set_ylabel("Decode Context Length", fontsize=12)
+    ax.set_zlabel("Throughput per GPU", fontsize=12)  # type: ignore
+    ax.set_title("Decode Throughput Interpolation", fontsize=14)
+
+    # Set viewing angle
+    ax.view_init(elev=30, azim=45)  # type: ignore
+    ax.grid(True)
+    ax.tick_params(axis="both", which="major", labelsize=10)
+
+    thpt_plot_path = f"{work_dir}/decode_throughput_interpolation.png"
+    logger.info(f"Saving throughput surface plot to {thpt_plot_path}")
+    plt.savefig(thpt_plot_path, dpi=300, bbox_inches="tight")
+    plt.close()
--- a/benchmarks/profiler/utils/profile_decode.py
+++ b/benchmarks/profiler/utils/profile_decode.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+
+import numpy as np
+from utils.genai_perf import benchmark_decode
+from utils.plot import plot_decode_3d_surface
+
+# Module-level logger for the decode profiler, emitting INFO and above through
+# a dedicated stream handler with a timestamped format.
+# NOTE(review): the handler is attached at import time and propagation is left
+# at its default, so records may be emitted twice if the importing application
+# also configures root-logger handlers — confirm this is intended.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.INFO)
+# Format: "<timestamp> - <logger name> - <level> - <message>"
+formatter = logging.Formatter(
+    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
+)
+console_handler.setFormatter(formatter)
+logger.addHandler(console_handler)
+
+
+def profile_decode(
+    work_dir,
+    model_name,
+    url,
+    num_gpus,
+    max_kv_tokens,
+    max_context_length,
+    interpolation_granularity,
+):
+    """Sample decode ITL and throughput per GPU over ISL and concurrency.
+
+    For every sampled input sequence length (ISL) and request concurrency,
+    ``benchmark_decode`` is run against ``url``. The collected points are saved
+    to ``{work_dir}/raw_data.npz`` and then passed to ``plot_decode_3d_surface``.
+
+    Args:
+        work_dir: directory for benchmark artifacts, raw data and plots
+        model_name: model name forwarded to ``benchmark_decode``
+        url: base url of the endpoint, forwarded as ``base_url``
+        num_gpus: divisor used to normalize output token throughput per GPU
+        max_kv_tokens: max kv tokens of the endpoint; bounds the request
+            concurrency and normalizes the KV usage axis
+        max_context_length: exclusive upper bound for ISL + OSL
+        interpolation_granularity: divisor that sets the sweep step on both the
+            ISL axis and the concurrency axis
+
+    Raises:
+        ValueError: if ``num_gpus`` or ``interpolation_granularity`` is not positive
+    """
+    if interpolation_granularity <= 0:
+        raise ValueError("interpolation_granularity must be a positive integer")
+    if num_gpus <= 0:
+        raise ValueError("num_gpus must be a positive integer")
+
+    x_kv_usage = []
+    y_context_length = []
+    z_itl = []
+    z_thpt_per_gpu = []
+
+    osl = 500  # not too large to reduce ITL variance, not too small to have stable measurement
+
+    # Clamp the step to >= 1: floor division yields 0 (range() raises ValueError)
+    # when the span is smaller than the granularity, and a negative step (range()
+    # would count downward from 100) when max_context_length < osl.
+    isl_step = max(1, (max_context_length - osl) // interpolation_granularity)
+    for isl in range(100, max_context_length - osl, isl_step):
+        max_concurrency = max_kv_tokens // (isl + osl)
+        # Same clamp: max_concurrency may be smaller than the granularity.
+        concurrency_step = max(1, max_concurrency // interpolation_granularity)
+        # Halfway through generation a request holds isl + osl / 2 tokens.
+        avg_context_length = isl + osl / 2
+        for num_request in range(1, max_concurrency, concurrency_step):
+            genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
+            gap_result = benchmark_decode(
+                isl,
+                osl,
+                num_request,
+                genai_perf_artifact_dir,
+                model_name,
+                base_url=url,
+            )
+            # A None result is skipped so the remaining sweep still runs.
+            if gap_result is not None:
+                itl = gap_result["inter_token_latency"]["avg"]
+                x_kv_usage.append(avg_context_length * num_request / max_kv_tokens)
+                y_context_length.append(avg_context_length)
+                z_itl.append(itl)
+                z_thpt_per_gpu.append(
+                    gap_result["output_token_throughput"]["avg"] / num_gpus
+                )
+
+    # Save the data points to a .npz file
+    save_path = f"{work_dir}/raw_data.npz"
+    np.savez(
+        save_path,
+        x_kv_usage=np.array(x_kv_usage),
+        y_context_length=np.array(y_context_length),
+        z_itl=np.array(z_itl),
+        z_thpt_per_gpu=np.array(z_thpt_per_gpu),
+        max_kv_tokens=np.array([max_kv_tokens]),
+    )
+    logger.info(f"Saved data points to {save_path}")
+
+    # The plot helper calls min()/max() on the point lists, which raise on
+    # empty input, so skip plotting when nothing was measured.
+    # NOTE(review): the helper interpolates with griddata(method="cubic"), which
+    # may also fail on very few or collinear points — confirm whether a higher
+    # minimum point count is needed here.
+    if not z_itl:
+        logger.warning("No decode data points collected; skipping decode surface plots")
+        return
+
+    # Plot 3D surface
+    plot_decode_3d_surface(
+        x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir
+    )
--- a/benchmarks/profiler/utils/profile_prefill.py
+++ b/benchmarks/profiler/utils/profile_prefill.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+
+import numpy as np
+from utils.genai_perf import benchmark_prefill
+from utils.plot import plot_prefill_interpolation
+
+# Module-level logger for the prefill profiler, emitting INFO and above through
+# a dedicated stream handler with a timestamped format.
+# NOTE(review): the handler is attached at import time and propagation is left
+# at its default, so records may be emitted twice if the importing application
+# also configures root-logger handlers — confirm this is intended.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.INFO)
+# Format: "<timestamp> - <logger name> - <level> - <message>"
+formatter = logging.Formatter(
+    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
+)
+console_handler.setFormatter(formatter)
+logger.addHandler(console_handler)
+
+
+def profile_prefill(
+    work_dir, model_name, url, num_gpus, max_context_length, interpolation_granularity
+):
+    """Sample prefill TTFT and throughput per GPU over input sequence length.
+
+    ``benchmark_prefill`` is run against ``url`` for each sampled ISL starting
+    at 100. When at least three points were measured, they are saved to
+    ``{work_dir}/raw_data.npz`` and passed to ``plot_prefill_interpolation``;
+    otherwise only a warning is logged.
+
+    Args:
+        work_dir: directory for benchmark artifacts, raw data and plots
+        model_name: model name forwarded to ``benchmark_prefill``
+        url: base url of the endpoint, forwarded as ``base_url``
+        num_gpus: divisor used to normalize prefill throughput per GPU
+        max_context_length: exclusive upper bound of the ISL sweep
+        interpolation_granularity: divisor that sets the ISL sweep step
+
+    Raises:
+        ValueError: if ``num_gpus`` or ``interpolation_granularity`` is not positive
+    """
+    if interpolation_granularity <= 0:
+        raise ValueError("interpolation_granularity must be a positive integer")
+    if num_gpus <= 0:
+        raise ValueError("num_gpus must be a positive integer")
+
+    prefill_isl = []
+    prefill_ttft = []
+    prefill_thpt_per_gpu = []
+
+    # Clamp the step to >= 1: floor division yields 0 (range() raises ValueError)
+    # when the span is smaller than the granularity, and a negative step when
+    # max_context_length < 100, which would sweep ISLs above the limit.
+    isl_step = max(1, (max_context_length - 100) // interpolation_granularity)
+    for isl in range(100, max_context_length, isl_step):
+        # run genai-perf
+        genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
+        gap_result = benchmark_prefill(
+            isl, genai_perf_artifact_dir, model_name, base_url=url
+        )
+        # A None result is skipped so the remaining sweep still runs.
+        if gap_result is not None:
+            ttft = gap_result["time_to_first_token"]["avg"]
+            prefill_isl.append(isl)
+            prefill_ttft.append(ttft)
+            # NOTE(review): the * 1000 presumably converts a TTFT in
+            # milliseconds into tokens/s/GPU — confirm the unit against the
+            # genai-perf output.
+            prefill_thpt_per_gpu.append(isl / ttft / num_gpus * 1000)
+
+    # Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c)
+    # A quadratic fit needs at least three points.
+    if len(prefill_isl) > 2:
+        logger.info("Interpolating prefill TTFT and throughput vs ISL...")
+
+        # Convert to numpy arrays for easier manipulation
+        prefill_isl_np = np.array(prefill_isl)
+        prefill_ttft_np = np.array(prefill_ttft)
+        prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu)
+
+        save_path = f"{work_dir}/raw_data.npz"
+        np.savez(
+            save_path,
+            prefill_isl=prefill_isl_np,
+            prefill_ttft=prefill_ttft_np,
+            prefill_thpt_per_gpu=prefill_thpt_per_gpu_np,
+        )
+
+        # Call the plotting function
+        plot_prefill_interpolation(
+            prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir
+        )
+    else:
+        logger.warning(
+            "Not enough data points to perform interpolation (need at least 3 points)"
+        )