Unverified Commit fd358991 authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

feat: standalone profiling script for a given endpoint (#2386)

parent dabd2267
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import argparse
import logging
import os
from utils.profile_prefill import profile_prefill
from benchmarks.profiler.utils.profile_decode import profile_decode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


def _build_parser():
    """Build the command-line parser for the standalone endpoint profiler."""
    parser = argparse.ArgumentParser(
        description="profile a given endpoint's performance for prefill or decode"
    )
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        choices=["prefill", "decode"],
        help="mode to profile",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        required=True,
        help="model name",
    )
    parser.add_argument(
        "--url",
        type=str,
        required=True,
        help="base url of the endpoint",
    )
    parser.add_argument(
        "--num_gpus",
        type=int,
        required=True,
        help="number of gpus",
    )
    parser.add_argument(
        "--max_kv_tokens",
        type=int,
        required=False,
        default=0,
        help="max kv tokens of the endpoint (only used for decode)",
    )
    parser.add_argument(
        "--work_dir",
        type=str,
        default="endpoint_profiling_results/",
        help="work directory to save the results",
    )
    parser.add_argument(
        "--max_context_length",
        type=int,
        default=16384,
        help="max context length of the endpoint",
    )
    parser.add_argument(
        "--interpolation_granularity",
        type=int,
        default=8,
        help="interpolation granularity for the results",
    )
    return parser


def main(argv=None):
    """Parse the command line and profile the endpoint in the requested mode.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including decode
            mode without a positive ``--max_kv_tokens``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Validate with parser.error rather than ``assert``: assertions are
    # stripped under ``python -O`` and would surface as a traceback instead of
    # a usage message. Checking before makedirs also means a rejected
    # invocation does not leave an empty work directory behind.
    if args.mode == "decode" and args.max_kv_tokens <= 0:
        parser.error("max_kv_tokens must be provided for decode")

    os.makedirs(args.work_dir, exist_ok=True)

    if args.mode == "prefill":
        profile_prefill(
            args.work_dir,
            args.model_name,
            args.url,
            args.num_gpus,
            args.max_context_length,
            args.interpolation_granularity,
        )
    else:
        # ``choices`` on --mode guarantees the only other value is "decode".
        profile_decode(
            args.work_dir,
            args.model_name,
            args.url,
            args.num_gpus,
            args.max_kv_tokens,
            args.max_context_length,
            args.interpolation_granularity,
        )


if __name__ == "__main__":
    main()
......@@ -28,18 +28,16 @@ from utils.dynamo_deployment import (
cleanup_remaining_deployments,
)
from utils.genai_perf import benchmark_decode, benchmark_prefill
from utils.plot import (
plot_decode_3d_surface,
plot_decode_performance,
plot_prefill_interpolation,
plot_prefill_performance,
)
from utils.plot import plot_decode_performance, plot_prefill_performance
from utils.profile_cache import (
check_decode_results_exist,
check_prefill_results_exist,
load_existing_decode_results,
load_existing_prefill_results,
)
from utils.profile_prefill import profile_prefill
from benchmarks.profiler.utils.profile_decode import profile_decode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
......@@ -373,9 +371,6 @@ async def run_profile(args):
# interpolate ISL - TTFT with best prefill TP
best_prefill_tp = prefill_tp_size[selected_prefill_idx]
prefill_isl = []
prefill_ttft = []
prefill_thpt_per_gpu = []
logger.info(
f"Profiling prefill under best TP {best_prefill_tp} with different ISL..."
)
......@@ -420,58 +415,22 @@ async def run_profile(args):
)
base_url = client.get_service_url()
for isl in range(
100,
profile_prefill(
work_dir,
model_name,
base_url,
best_prefill_tp,
args.max_context_length,
(args.max_context_length - 100) // args.prefill_interpolation_granularity,
):
# run genai-perf
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
gap_result = benchmark_prefill(
isl, genai_perf_artifact_dir, model_name, base_url=base_url
)
if gap_result is not None:
ttft = gap_result["time_to_first_token"]["avg"]
prefill_isl.append(isl)
prefill_ttft.append(ttft)
prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000)
args.prefill_interpolation_granularity,
)
print("Cleaning up deployment...")
await client.delete_deployment()
deployment_clients.remove(client)
print("Deployment deleted")
# Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c)
if len(prefill_isl) > 2:
logger.info("Interpolating prefill TTFT and throughput vs ISL...")
# Convert to numpy arrays for easier manipulation
prefill_isl_np = np.array(prefill_isl)
prefill_ttft_np = np.array(prefill_ttft)
prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu)
save_path = f"{work_dir}/raw_data.npz"
np.savez(
save_path,
prefill_isl=prefill_isl_np,
prefill_ttft=prefill_ttft_np,
prefill_thpt_per_gpu=prefill_thpt_per_gpu_np,
)
# Call the plotting function
plot_prefill_interpolation(
prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir
)
else:
logger.warning(
"Not enough data points to perform interpolation (need at least 3 points)"
)
# interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode TP
x_kv_usage = []
y_context_length = []
z_itl = []
z_thpt_per_gpu = []
best_decode_tp = decode_tp_size[selected_decode_idx]
logger.info(f"Profiling decode with TP size {best_decode_tp}...")
decode_config = config_modifier.set_config_tp_size(
......@@ -508,64 +467,23 @@ async def run_profile(args):
f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log"
)
osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement
base_url = client.get_service_url()
for isl in range(
100,
args.max_context_length - osl,
(args.max_context_length - osl) // args.decode_interpolation_granularity,
):
max_concurrency = max_kv_tokens // (isl + osl)
sweep_num_request = list(
range(
1,
max_concurrency,
max_concurrency // args.decode_interpolation_granularity,
)
)
for num_request in sweep_num_request:
genai_perf_artifact_dir = (
f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
)
gap_result = benchmark_decode(
isl,
osl,
num_request,
genai_perf_artifact_dir,
model_name,
base_url=base_url,
)
if gap_result is not None:
itl = gap_result["inter_token_latency"]["avg"]
x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens)
y_context_length.append(isl + osl / 2)
z_itl.append(itl)
z_thpt_per_gpu.append(
gap_result["output_token_throughput"]["avg"] / best_decode_tp
)
profile_decode(
work_dir,
model_name,
base_url,
best_decode_tp,
max_kv_tokens,
args.max_context_length,
args.decode_interpolation_granularity,
)
print("Cleaning up deployment...")
await client.delete_deployment()
deployment_clients.remove(client)
print("Deployment deleted")
# Save the data points to a .npz file
save_path = f"{work_dir}/raw_data.npz"
np.savez(
save_path,
x_kv_usage=np.array(x_kv_usage),
y_context_length=np.array(y_context_length),
z_itl=np.array(z_itl),
z_thpt_per_gpu=np.array(z_thpt_per_gpu),
max_kv_tokens=np.array([max_kv_tokens]),
)
logger.info(f"Saved data points to {save_path}")
# Plot 3D surface
plot_decode_3d_surface(
x_kv_usage, y_context_length, z_itl, best_decode_tp, work_dir
)
except Exception as e:
logger.error(f"Profile job failed with error: {e}")
raise
......
......@@ -114,16 +114,13 @@ def plot_prefill_interpolation(
"""
# Fit quadratic functions
ttft_coeffs = np.polyfit(prefill_isl_np, prefill_ttft_np, 2)
thpt_coeffs = np.polyfit(prefill_isl_np, prefill_thpt_per_gpu_np, 2)
# Create interpolation functions
ttft_poly = np.poly1d(ttft_coeffs)
thpt_poly = np.poly1d(thpt_coeffs)
# Generate points for smooth curves
x_interp = np.linspace(min(prefill_isl_np), max(prefill_isl_np), 100)
ttft_interp = ttft_poly(x_interp)
thpt_interp = thpt_poly(x_interp)
# Plot TTFT vs ISL
plt.figure(figsize=(10, 6))
......@@ -148,14 +145,7 @@ def plot_prefill_interpolation(
# Plot Throughput vs ISL
plt.figure(figsize=(10, 6))
plt.scatter(prefill_isl_np, prefill_thpt_per_gpu_np, s=100, label="Measured data")
plt.plot(
x_interp,
thpt_interp,
"g-",
label=f"Quadratic fit: {thpt_coeffs[0]:.2e}x² + {thpt_coeffs[1]:.2e}x + {thpt_coeffs[2]:.2e}",
)
plt.scatter(prefill_isl_np, prefill_thpt_per_gpu_np, s=100, label="Throughput/GPU")
plt.title("Prefill Throughput vs Input Sequence Length")
plt.xlabel("Input Sequence Length (tokens)")
plt.ylabel("Prefill throughput per GPU (tokens/s/GPU)")
......@@ -170,7 +160,9 @@ def plot_prefill_interpolation(
plt.close()
def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_dir):
def plot_decode_3d_surface(
x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir
):
"""
Plot 3D surface for decode interpolation with KV usage, context length, and ITL.
......@@ -178,14 +170,18 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di
x_kv_usage: list of KV usage percentages
y_context_length: list of context lengths
z_itl: list of ITL values
tp_size: TP size for the plot filename
z_thpt_per_gpu: list of throughput per GPU values
work_dir: directory to save the plot
"""
xi = np.linspace(min(x_kv_usage), max(x_kv_usage), 100)
yi = np.linspace(min(y_context_length), max(y_context_length), 100)
X, Y = np.meshgrid(xi, yi)
Z = griddata((x_kv_usage, y_context_length), z_itl, (X, Y), method="cubic")
Z_itl = griddata((x_kv_usage, y_context_length), z_itl, (X, Y), method="cubic")
Z_thpt = griddata(
(x_kv_usage, y_context_length), z_thpt_per_gpu, (X, Y), method="cubic"
)
# Plot ITL surface
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection="3d") # type: ignore
......@@ -193,7 +189,7 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di
surf = ax.plot_surface( # type: ignore
X,
Y,
Z,
Z_itl,
cmap=cm.coolwarm, # type: ignore
linewidth=0.2,
antialiased=True,
......@@ -202,20 +198,57 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di
# Add a color bar with custom settings
cbar = fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5)
cbar.set_label("Z Value", fontsize=12)
cbar.set_label("ITL (ms)", fontsize=12)
cbar.ax.tick_params(labelsize=10)
# Add labels with custom font sizes
ax.set_xlabel("Active KV Percentage", fontsize=12)
ax.set_ylabel("Decode Context Length", fontsize=12)
ax.set_zlabel("ITL", fontsize=12) # type: ignore
ax.set_title("Decode ITL Interpolation", fontsize=14)
# Set viewing angle
ax.view_init(elev=30, azim=45) # type: ignore
ax.grid(True)
ax.tick_params(axis="both", which="major", labelsize=10)
plot_path = f"{work_dir}/decode_tp{tp_size}.png"
plot_path = f"{work_dir}/decode_itl_interpolation.png"
logger.info(f"Saving ITL surface plot to {plot_path}")
plt.savefig(plot_path, dpi=300, bbox_inches="tight")
plt.close()
# Plot Throughput surface
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection="3d") # type: ignore
# Create the throughput surface plot with customizations
surf = ax.plot_surface( # type: ignore
X,
Y,
Z_thpt,
cmap=cm.viridis, # type: ignore
linewidth=0.2,
antialiased=True,
alpha=0.8,
)
# Add a color bar with custom settings
cbar = fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5)
cbar.set_label("Throughput per GPU (tokens/s/GPU)", fontsize=12)
cbar.ax.tick_params(labelsize=10)
# Add labels with custom font sizes
ax.set_xlabel("Active KV Percentage", fontsize=12)
ax.set_ylabel("Decode Context Length", fontsize=12)
ax.set_zlabel("Throughput per GPU", fontsize=12) # type: ignore
ax.set_title("Decode Throughput Interpolation", fontsize=14)
# Set viewing angle
ax.view_init(elev=30, azim=45) # type: ignore
ax.grid(True)
ax.tick_params(axis="both", which="major", labelsize=10)
thpt_plot_path = f"{work_dir}/decode_throughput_interpolation.png"
logger.info(f"Saving throughput surface plot to {thpt_plot_path}")
plt.savefig(thpt_plot_path, dpi=300, bbox_inches="tight")
plt.close()
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import logging
import numpy as np
from utils.genai_perf import benchmark_decode
from utils.plot import plot_decode_3d_surface
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


def profile_decode(
    work_dir,
    model_name,
    url,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
):
    """Interpolate ITL - Active_KV_Cache - Decode_Context_Length.

    Sweeps input sequence length (ISL) and request concurrency against the
    endpoint, records the average inter-token latency and the per-GPU output
    token throughput of every successful benchmark run, saves the samples to
    ``{work_dir}/raw_data.npz`` and plots the interpolated surfaces.

    Args:
        work_dir: Existing directory that receives benchmark artifacts, the
            ``raw_data.npz`` file and the plots.
        model_name: Model name forwarded to ``benchmark_decode``.
        url: Base URL of the endpoint, forwarded as ``base_url``.
        num_gpus: Divisor used to turn output token throughput into
            throughput per GPU.
        max_kv_tokens: KV-cache capacity in tokens; bounds the concurrency
            sweep and normalises the KV usage axis.
        max_context_length: Upper bound used to derive the ISL sweep.
        interpolation_granularity: Approximate number of sweep points per
            axis.

    Returns:
        None. Results are written to ``work_dir``.
    """
    x_kv_usage = []
    y_context_length = []
    z_itl = []
    z_thpt_per_gpu = []

    osl = 500  # not too large to reduce ITL variance, not too small to have stable measurement

    isl_stop = max_context_length - osl
    # Floor division yields 0 when the span is smaller than the granularity
    # (range() then raises ValueError) and a negative value when
    # max_context_length < osl (range() would then walk *downward* from 100).
    # Clamping to 1 keeps the sweep ascending and simply empty when there is
    # nothing valid to sweep.
    isl_step = max(1, isl_stop // interpolation_granularity)

    for isl in range(100, isl_stop, isl_step):
        max_concurrency = max_kv_tokens // (isl + osl)
        # Same zero-step hazard when fewer than `interpolation_granularity`
        # concurrent requests fit in the KV cache.
        concurrency_step = max(1, max_concurrency // interpolation_granularity)

        for num_request in range(1, max_concurrency, concurrency_step):
            genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
            gap_result = benchmark_decode(
                isl,
                osl,
                num_request,
                genai_perf_artifact_dir,
                model_name,
                base_url=url,
            )
            if gap_result is None:
                continue

            # Use the mid-generation context length (isl + osl / 2) as the
            # representative context for the whole decode run.
            context_length = isl + osl / 2
            x_kv_usage.append(context_length * num_request / max_kv_tokens)
            y_context_length.append(context_length)
            z_itl.append(gap_result["inter_token_latency"]["avg"])
            z_thpt_per_gpu.append(
                gap_result["output_token_throughput"]["avg"] / num_gpus
            )

    # Save the data points to a .npz file
    save_path = f"{work_dir}/raw_data.npz"
    np.savez(
        save_path,
        x_kv_usage=np.array(x_kv_usage),
        y_context_length=np.array(y_context_length),
        z_itl=np.array(z_itl),
        z_thpt_per_gpu=np.array(z_thpt_per_gpu),
        max_kv_tokens=np.array([max_kv_tokens]),
    )
    logger.info("Saved data points to %s", save_path)

    if not z_itl:
        # The plotting helper takes min()/max() of these lists, which fails on
        # empty input; report the situation instead of crashing after the sweep.
        logger.warning(
            "No decode data points were collected; skipping decode surface plots"
        )
        return

    # Plot 3D surface
    plot_decode_3d_surface(
        x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir
    )
    return
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import logging
import numpy as np
from utils.genai_perf import benchmark_prefill
from utils.plot import plot_prefill_interpolation
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


def profile_prefill(
    work_dir, model_name, url, num_gpus, max_context_length, interpolation_granularity
):
    """Interpolate prefill TTFT and per-GPU throughput against ISL.

    Benchmarks the endpoint at a sweep of input sequence lengths (ISL)
    starting at 100, records the average time to first token (TTFT) of every
    successful run and, when at least three points were collected, saves them
    to ``{work_dir}/raw_data.npz`` and plots the interpolation.

    Args:
        work_dir: Existing directory that receives benchmark artifacts, the
            ``raw_data.npz`` file and the plots.
        model_name: Model name forwarded to ``benchmark_prefill``.
        url: Base URL of the endpoint, forwarded as ``base_url``.
        num_gpus: Divisor used to compute throughput per GPU.
        max_context_length: Exclusive upper bound of the ISL sweep.
        interpolation_granularity: Approximate number of ISL sweep points.

    Returns:
        None. Results are written to ``work_dir``.
    """
    prefill_isl = []
    prefill_ttft = []
    prefill_thpt_per_gpu = []

    # Floor division yields 0 when the span is smaller than the granularity
    # (range() then raises ValueError) and a negative value when
    # max_context_length < 100 (range() would then sweep downward from 100,
    # i.e. above the limit). Clamping to 1 keeps the sweep ascending.
    isl_step = max(1, (max_context_length - 100) // interpolation_granularity)

    for isl in range(100, max_context_length, isl_step):
        # run genai-perf
        genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
        gap_result = benchmark_prefill(
            isl, genai_perf_artifact_dir, model_name, base_url=url
        )
        if gap_result is None:
            continue

        ttft = gap_result["time_to_first_token"]["avg"]
        prefill_isl.append(isl)
        prefill_ttft.append(ttft)
        # The factor 1000 presumably converts a TTFT in milliseconds into
        # tokens per second -- TODO confirm the unit reported by genai-perf.
        prefill_thpt_per_gpu.append(isl / ttft / num_gpus * 1000)

    # Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c)
    if len(prefill_isl) <= 2:
        logger.warning(
            "Not enough data points to perform interpolation (need at least 3 points)"
        )
        return

    logger.info("Interpolating prefill TTFT and throughput vs ISL...")

    # Convert to numpy arrays for easier manipulation
    prefill_isl_np = np.array(prefill_isl)
    prefill_ttft_np = np.array(prefill_ttft)
    prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu)

    save_path = f"{work_dir}/raw_data.npz"
    np.savez(
        save_path,
        prefill_isl=prefill_isl_np,
        prefill_ttft=prefill_ttft_np,
        prefill_thpt_per_gpu=prefill_thpt_per_gpu_np,
    )

    # Call the plotting function
    plot_prefill_interpolation(
        prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir
    )
    return
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment