Unverified Commit 19948b7f authored by Ilya Sherstyuk's avatar Ilya Sherstyuk Committed by GitHub
Browse files

feat: Add --use-ai-configurator to profile_sla.py (#3079)


Signed-off-by: default avatarIlya Sherstyuk <isherstyuk@nvidia.com>
Signed-off-by: default avatarIlya Sherstyuk <46343317+ilyasher@users.noreply.github.com>
Co-authored-by: default avatarHongkuan Zhou <tedzhouhk@gmail.com>
parent 8534d170
...@@ -24,6 +24,7 @@ import yaml ...@@ -24,6 +24,7 @@ import yaml
from benchmarks.profiler.utils.config import CONFIG_MODIFIERS, WORKER_COMPONENT_NAMES from benchmarks.profiler.utils.config import CONFIG_MODIFIERS, WORKER_COMPONENT_NAMES
from benchmarks.profiler.utils.defaults import DECODE_NUM_REQUESTS_RANGE from benchmarks.profiler.utils.defaults import DECODE_NUM_REQUESTS_RANGE
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.genai_perf import benchmark_decode, benchmark_prefill from benchmarks.profiler.utils.genai_perf import benchmark_decode, benchmark_prefill
from benchmarks.profiler.utils.plot import ( from benchmarks.profiler.utils.plot import (
plot_decode_performance, plot_decode_performance,
...@@ -35,8 +36,14 @@ from benchmarks.profiler.utils.profile_cache import ( ...@@ -35,8 +36,14 @@ from benchmarks.profiler.utils.profile_cache import (
load_existing_decode_results, load_existing_decode_results,
load_existing_prefill_results, load_existing_prefill_results,
) )
from benchmarks.profiler.utils.profile_decode import profile_decode from benchmarks.profiler.utils.profile_decode import (
from benchmarks.profiler.utils.profile_prefill import profile_prefill profile_decode,
profile_decode_aiconfigurator,
)
from benchmarks.profiler.utils.profile_prefill import (
profile_prefill,
profile_prefill_aiconfigurator,
)
from deploy.utils.dynamo_deployment import ( from deploy.utils.dynamo_deployment import (
DynamoDeploymentClient, DynamoDeploymentClient,
cleanup_remaining_deployments, cleanup_remaining_deployments,
...@@ -86,6 +93,34 @@ async def run_profile(args): ...@@ -86,6 +93,34 @@ async def run_profile(args):
else: else:
logger.info("Skip existing results disabled - will re-run all tests") logger.info("Skip existing results disabled - will re-run all tests")
if args.use_ai_configurator:
if not args.aic_system:
raise ValueError(
"Must provide --aic-system when using --use-ai-configurator."
)
if not args.aic_model_name:
raise ValueError(
"Must provide --aic-model-name when using --use-ai-configurator."
)
if not args.backend_version:
raise ValueError(
"Must provide --backend-version when using --use-ai-configurator."
)
logger.info("Will use aiconfigurator to estimate perf.")
ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
args.aic_model_name,
args.aic_system.lower(),
args.backend,
args.backend_version,
)
else:
if args.aic_system or args.aic_model_name or args.backend_version:
logger.warning(
"Will ignore --aic-system, --aic-model-name, and/or --backend-version "
"when not using --use-ai-configurator."
)
# first profile prefill # first profile prefill
prefill_tp_size = [] prefill_tp_size = []
prefill_ttft = [] prefill_ttft = []
...@@ -93,6 +128,8 @@ async def run_profile(args): ...@@ -93,6 +128,8 @@ async def run_profile(args):
logger.info("Profiling prefill...") logger.info("Profiling prefill...")
prefill_config = config_modifier.convert_config(config, "prefill") prefill_config = config_modifier.convert_config(config, "prefill")
frontend_port = config_modifier.get_port(config) frontend_port = config_modifier.get_port(config)
itl: float | None = None
thpt_per_gpu: float | None = None
for tp_size in profile_tp_size: for tp_size in profile_tp_size:
logger.info(f"Profiling prefill with TP size {tp_size}...") logger.info(f"Profiling prefill with TP size {tp_size}...")
...@@ -125,8 +162,17 @@ async def run_profile(args): ...@@ -125,8 +162,17 @@ async def run_profile(args):
with open(prefill_config_fn, "w") as f: with open(prefill_config_fn, "w") as f:
yaml.dump(prefill_config, f) yaml.dump(prefill_config, f)
ttft = None
if args.dry_run: if args.dry_run:
logger.info("Skipping deployment creation in dry run mode") logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator:
logger.info("Using ai-configurator to estimate prefill latency.")
perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf(
args.isl,
tp_size=tp_size,
)
ttft = perf_dict["context_latency"]
logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms")
else: else:
client = DynamoDeploymentClient( client = DynamoDeploymentClient(
namespace=args.namespace, namespace=args.namespace,
...@@ -161,15 +207,17 @@ async def run_profile(args): ...@@ -161,15 +207,17 @@ async def run_profile(args):
) )
if gap_result is not None: if gap_result is not None:
ttft = gap_result["time_to_first_token"]["avg"] ttft = gap_result["time_to_first_token"]["avg"]
prefill_tp_size.append(tp_size)
prefill_ttft.append(ttft)
prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000)
logger.info("Cleaning up deployment...") logger.info("Cleaning up deployment...")
await client.delete_deployment() await client.delete_deployment()
deployment_clients.remove(client) deployment_clients.remove(client)
logger.info("Deployment deleted") logger.info("Deployment deleted")
if ttft is not None:
prefill_tp_size.append(tp_size)
prefill_ttft.append(ttft)
prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000)
# Plot the results as a 2D scatter plot # Plot the results as a 2D scatter plot
if prefill_tp_size and prefill_ttft and prefill_thpt_per_gpu: if prefill_tp_size and prefill_ttft and prefill_thpt_per_gpu:
plot_prefill_performance( plot_prefill_performance(
...@@ -242,6 +290,15 @@ async def run_profile(args): ...@@ -242,6 +290,15 @@ async def run_profile(args):
if args.dry_run: if args.dry_run:
logger.info("Skipping deployment creation in dry run mode") logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator:
# Compute max_concurrency and max_kv_tokens to know which
# num_request to sweep over.
max_concurrency = ai_configurator_perf_estimator.get_max_batch_size(
args.isl, args.osl, tp_size=tp_size
)
max_kv_tokens = max_concurrency * (args.isl + args.osl)
else: else:
client = DynamoDeploymentClient( client = DynamoDeploymentClient(
namespace=args.namespace, namespace=args.namespace,
...@@ -263,10 +320,14 @@ async def run_profile(args): ...@@ -263,10 +320,14 @@ async def run_profile(args):
f"Logs have been saved to {client.base_log_dir / client.deployment_name}" f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
) )
# Compute max_concurrency and max_kv_tokens to know which
# num_request to sweep over.
max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log" f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log"
) )
max_concurrency = max_kv_tokens // (args.isl + args.osl) max_concurrency = max_kv_tokens // (args.isl + args.osl)
if not args.dry_run:
sweep_num_request = [ sweep_num_request = [
num for num in DECODE_NUM_REQUESTS_RANGE if num <= max_concurrency num for num in DECODE_NUM_REQUESTS_RANGE if num <= max_concurrency
] ]
...@@ -276,8 +337,26 @@ async def run_profile(args): ...@@ -276,8 +337,26 @@ async def run_profile(args):
engine_decode_itl = [] engine_decode_itl = []
engine_decode_thpt_per_gpu = [] engine_decode_thpt_per_gpu = []
base_url = client.get_service_url()
for num_request in sweep_num_request: for num_request in sweep_num_request:
itl = thpt_per_gpu = None
if args.use_ai_configurator:
logger.info("Using ai-configurator to estimate decode latency.")
perf_dict = ai_configurator_perf_estimator.estimate_perf(
args.isl,
args.osl,
num_request,
mode="decode",
tp_size=tp_size,
)
itl = perf_dict["tpot"]
thpt_per_gpu = perf_dict["tokens/s/gpu"]
logger.info(f"Estimated decode ITL: {itl:.2f}ms")
logger.info(
f"Estimated decode throughput per GPU: {thpt_per_gpu:.2f} tokens/s/GPU"
)
else:
base_url = client.get_service_url()
genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}"
gap_result = benchmark_decode( gap_result = benchmark_decode(
args.isl, args.isl,
...@@ -293,6 +372,8 @@ async def run_profile(args): ...@@ -293,6 +372,8 @@ async def run_profile(args):
thpt_per_gpu = ( thpt_per_gpu = (
gap_result["output_token_throughput"]["avg"] / tp_size gap_result["output_token_throughput"]["avg"] / tp_size
) )
if itl is not None and thpt_per_gpu is not None:
engine_decode_itl.append(itl) engine_decode_itl.append(itl)
engine_decode_thpt_per_gpu.append(thpt_per_gpu) engine_decode_thpt_per_gpu.append(thpt_per_gpu)
decode_tp_size.append(tp_size) decode_tp_size.append(tp_size)
...@@ -301,16 +382,17 @@ async def run_profile(args): ...@@ -301,16 +382,17 @@ async def run_profile(args):
decode_concurrency.append(num_request) decode_concurrency.append(num_request)
decode_kv_cache_size.append(max_kv_tokens) decode_kv_cache_size.append(max_kv_tokens)
logger.info("Cleaning up deployment...")
await client.delete_deployment()
deployment_clients.remove(client)
logger.info("Deployment deleted")
# Store partial results for plotting later # Store partial results for plotting later
decode_results.append( decode_results.append(
(tp_size, engine_decode_itl, engine_decode_thpt_per_gpu) (tp_size, engine_decode_itl, engine_decode_thpt_per_gpu)
) )
if not args.dry_run and not args.use_ai_configurator:
logger.info("Cleaning up deployment...")
await client.delete_deployment()
deployment_clients.remove(client)
logger.info("Deployment deleted")
# Plot all decode results after profiling is complete # Plot all decode results after profiling is complete
if decode_results: if decode_results:
plot_decode_performance(decode_results, args.itl, args.output_dir) plot_decode_performance(decode_results, args.itl, args.output_dir)
...@@ -418,6 +500,15 @@ async def run_profile(args): ...@@ -418,6 +500,15 @@ async def run_profile(args):
if args.dry_run: if args.dry_run:
logger.info("Skipping deployment creation in dry run mode") logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator:
profile_prefill_aiconfigurator(
work_dir,
best_prefill_tp, # num_gpus
args.max_context_length,
args.prefill_interpolation_granularity,
ai_configurator_perf_estimator,
tp_size=best_prefill_tp,
)
else: else:
client = DynamoDeploymentClient( client = DynamoDeploymentClient(
namespace=args.namespace, namespace=args.namespace,
...@@ -481,6 +572,19 @@ async def run_profile(args): ...@@ -481,6 +572,19 @@ async def run_profile(args):
if args.dry_run: if args.dry_run:
logger.info("Skipping deployment creation in dry run mode") logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator:
max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens(
args.isl, args.osl, tp_size=best_decode_tp
)
profile_decode_aiconfigurator(
work_dir,
best_decode_tp, # num_gpus
max_kv_tokens,
args.max_context_length,
args.decode_interpolation_granularity,
ai_configurator_perf_estimator,
tp_size=best_decode_tp,
)
else: else:
client = DynamoDeploymentClient( client = DynamoDeploymentClient(
namespace=args.namespace, namespace=args.namespace,
...@@ -627,6 +731,26 @@ if __name__ == "__main__": ...@@ -627,6 +731,26 @@ if __name__ == "__main__":
action="store_true", action="store_true",
help="Dry run the profile job", help="Dry run the profile job",
) )
parser.add_argument(
"--use-ai-configurator",
action="store_true",
help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
)
parser.add_argument(
"--aic-system",
type=str,
help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
)
parser.add_argument(
"--aic-model-name",
type=str,
help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
)
parser.add_argument(
"--backend-version",
type=str,
help="Specify backend version when using aiconfigurator to estimate perf.",
)
args = parser.parse_args() args = parser.parse_args()
asyncio.run(run_profile(args)) asyncio.run(run_profile(args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import logging
from typing import Any
# Module-level logging setup: records at INFO and above from this module's logger
# go through a dedicated stream handler that prefixes each message with a
# timestamp, the logger name and the level.
formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)
def _try_import_aiconfigurator():
# Lazy-import aiconfigurator because it's an optional dependency in profile_sla.py
import aiconfigurator.sdk.backends.factory
import aiconfigurator.sdk.config
import aiconfigurator.sdk.inference_session
import aiconfigurator.sdk.models
import aiconfigurator.sdk.perf_database
return aiconfigurator
class AIConfiguratorPerfEstimator:
    """
    Performance estimator backed by aiconfigurator.

    An instance is tied to one aiconfigurator model name, one perf database
    (looked up by system, backend and backend version) and one backend object.
    Its methods estimate prefill and/or decode perf for a given ISL, OSL,
    batch size and parallelism config.
    """

    def __init__(
        self,
        model_name: str,  # e.g. "QWEN3_32B"
        system: str,  # e.g. "h200_sxm"
        backend: str,  # e.g. "trtllm"
        version: str,  # e.g. "0.20.0"
    ):
        """
        Load the perf database and backend object for the given target.

        Args:
            model_name: aiconfigurator model name (not the HF model name).
            system: System name used to look up the perf database.
            backend: Backend name used for both the database and the backend object.
            version: Backend version used to look up the perf database.

        Raises:
            ValueError: If the database lookup returns a falsy value.
        """
        aic = _try_import_aiconfigurator()

        logger.info("Loading aiconfigurator database. This might take a few seconds...")
        self.database = aic.sdk.perf_database.get_database(
            system=system, backend=backend, version=version
        )
        if not self.database:
            raise ValueError(
                f"Database not found for system: {system}, backend: {backend}, version: {version}"
            )
        logger.info("aiconfigurator database loaded.")

        self.backend = aic.sdk.backends.factory.get_backend(backend)

        # Stored name is the aiconfigurator identifier (such as QWEN3_32B or
        # DEEPSEEK_V3), not the HF model name.
        self.model_name = model_name

    def _get_model(self, **model_config_kwargs):
        """
        Build the aiconfigurator model for ``self.model_name``.

        Args:
            **model_config_kwargs: Forwarded unchanged to aiconfigurator's ModelConfig.

        Returns:
            The object returned by aiconfigurator's ``get_model``.
        """
        aic = _try_import_aiconfigurator()

        # NOTE: MOE models error out unless moe_tp_size and moe_ep_size are provided.
        config = aic.sdk.config.ModelConfig(**model_config_kwargs)
        return aic.sdk.models.get_model(self.model_name, config)

    def estimate_perf(
        self,
        isl: int,
        osl: int,
        batch_size: int,
        mode: str = "full",
        **model_config_kwargs,
    ) -> dict[str, Any]:
        """
        Run one static aiconfigurator estimate for this model, database and backend.

        Args:
            isl: Input sequence length
            osl: Output sequence length
            batch_size: Batch size
            mode: Which perf data to estimate.
                "full": prefill and decode perf.
                "prefill": context perf only.
                "decode": decode perf only.
            **model_config_kwargs: aiconfigurator model config kwargs
                (such as tp_size, moe_tp_size, etc).

        Returns:
            dict: First row of the summary dataframe returned by aiconfigurator.

        Raises:
            ValueError: If ``mode`` is not one of the supported values.
        """
        aic = _try_import_aiconfigurator()

        # Public mode names -> aiconfigurator static-run mode names.
        aic_mode_by_mode = {
            "full": "static",
            "prefill": "static_ctx",
            "decode": "static_gen",
        }
        aic_mode = aic_mode_by_mode.get(mode)
        if aic_mode is None:
            raise ValueError(
                f"Invalid mode: {mode}. Must be one of {list(aic_mode_by_mode)}."
            )

        # The runtime config is also kept on the instance.
        self.runtime_config = aic.sdk.config.RuntimeConfig(
            batch_size=batch_size,
            beam_width=1,
            isl=isl,
            osl=osl,
        )

        model = self._get_model(**model_config_kwargs)
        session = aic.sdk.inference_session.InferenceSession(
            model, self.database, self.backend
        )
        summary = session.run_static(
            mode=aic_mode, runtime_config=self.runtime_config, stride=32
        )

        # The summary dataframe holds a single row; hand it back as a plain dict.
        rows = summary.get_summary_df().to_dict(orient="records")
        return rows[0]

    def estimate_prefill_perf(
        self,
        isl: int,
        **model_config_kwargs,
    ) -> dict[str, Any]:
        """
        Estimate perf for this configuration when it acts as a prefill worker.

        Delegates to ``estimate_perf`` in "prefill" mode with a fixed small OSL
        and a single request.

        Args:
            isl: Input sequence length
            **model_config_kwargs: aiconfigurator model config kwargs
                (such as tp_size, moe_tp_size, etc).

        Returns:
            dict: Perf metrics returned by aiconfigurator
        """
        small_osl = 5
        single_request = 1
        return self.estimate_perf(
            isl,
            small_osl,
            single_request,
            mode="prefill",
            **model_config_kwargs,
        )

    def get_max_batch_size(
        self,
        isl: int,
        osl: int,
        **model_config_kwargs,
    ) -> int:
        """
        Estimate the largest batch size whose memory usage stays below GPU capacity.

        Args:
            isl: Input sequence length
            osl: Output sequence length
            **model_config_kwargs: aiconfigurator model config kwargs
                (such as tp_size, moe_tp_size, etc).

        Returns:
            int: Estimated largest batch size that fits, or 0 when even a batch
            size of 1 exceeds the capacity.
        """
        model = self._get_model(**model_config_kwargs)

        # mem_capacity is scaled by 1024**3 -- presumably bytes -> GiB so that it
        # is comparable with the backend's memory estimate; TODO confirm units.
        capacity = self.database.system_spec["gpu"]["mem_capacity"] / (1024**3)

        def total_mem(batch_size: int):
            # TODO: _get_memory_usage might be underestimating because
            # 1. it doesn't account for runtime buffers
            # 2. it calculates num_tokens = isl*bs which ignores osl
            usage = self.backend._get_memory_usage(
                model, self.database, batch_size, 1, isl, osl
            )
            return usage["total"]

        if total_mem(1) > capacity:
            # Not even a single request fits with this model config.
            return 0

        # Phase 1: double until we reach a batch size that no longer fits.
        upper = 1
        while total_mem(upper) < capacity:
            upper *= 2

        # Now upper // 2 fits and upper does not.
        lower = upper // 2

        # Phase 2: bisect between the two bounds.
        while lower < upper:
            middle = (lower + upper) // 2
            if total_mem(middle) < capacity:
                # middle fits, but middle + 1 may not; this overshoot by one is
                # undone by the "- 1" in the return statement.
                lower = middle + 1
            else:
                # upper always stays on a value that does not fit.
                upper = middle

        return lower - 1

    def get_max_kv_tokens(
        self,
        isl: int,
        osl: int,
        **model_config_kwargs,
    ) -> int:
        """
        Estimate how many KV cache tokens fit for the given ISL, OSL and model config.

        Computed as the estimated max batch size times ``isl + osl``.

        Args:
            isl: Input sequence length
            osl: Output sequence length
            **model_config_kwargs: aiconfigurator model config kwargs
                (such as tp_size, moe_tp_size, etc).

        Returns:
            int: Estimated number of KV cache tokens that will fit on the system.
        """
        tokens_per_request = isl + osl
        return self.get_max_batch_size(isl, osl, **model_config_kwargs) * tokens_per_request
...@@ -2,9 +2,11 @@ ...@@ -2,9 +2,11 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import logging import logging
from typing import Callable, Optional, Tuple
import numpy as np import numpy as np
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.genai_perf import benchmark_decode from benchmarks.profiler.utils.genai_perf import benchmark_decode
from benchmarks.profiler.utils.plot import plot_decode_3d_surface from benchmarks.profiler.utils.plot import plot_decode_3d_surface
...@@ -19,15 +21,15 @@ console_handler.setFormatter(formatter) ...@@ -19,15 +21,15 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
def profile_decode( def _profile_decode_helper(
work_dir, work_dir,
model_name,
tokenizer,
url,
num_gpus, num_gpus,
max_kv_tokens, max_kv_tokens,
max_context_length, max_context_length,
interpolation_granularity, interpolation_granularity,
get_itl_and_thpt_per_gpu: Callable[
[int, int, int], Tuple[Optional[float], Optional[float]]
],
): ):
"""interpolate ITL - Active_KV_Cache - Decode_Context_Length""" """interpolate ITL - Active_KV_Cache - Decode_Context_Length"""
x_kv_usage = [] x_kv_usage = []
...@@ -63,24 +65,13 @@ def profile_decode( ...@@ -63,24 +65,13 @@ def profile_decode(
max_concurrency // interpolation_granularity, max_concurrency // interpolation_granularity,
) )
for num_request in sweep_num_request: for num_request in sweep_num_request:
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" itl, thpt_per_gpu = get_itl_and_thpt_per_gpu(isl, osl, num_request)
gap_result = benchmark_decode(
isl, if itl is not None and thpt_per_gpu is not None:
osl,
num_request,
genai_perf_artifact_dir,
model_name,
tokenizer,
base_url=url,
)
if gap_result is not None:
itl = gap_result["inter_token_latency"]["avg"]
x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens)
y_context_length.append(isl + osl / 2) y_context_length.append(isl + osl / 2)
z_itl.append(itl) z_itl.append(itl)
z_thpt_per_gpu.append( z_thpt_per_gpu.append(thpt_per_gpu)
gap_result["output_token_throughput"]["avg"] / num_gpus
)
# Save the data points to a .npz file # Save the data points to a .npz file
save_path = f"{work_dir}/raw_data.npz" save_path = f"{work_dir}/raw_data.npz"
...@@ -100,3 +91,69 @@ def profile_decode( ...@@ -100,3 +91,69 @@ def profile_decode(
) )
return return
def profile_decode(
    work_dir,
    model_name,
    tokenizer,
    url,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
):
    """
    Collect decode interpolation data by benchmarking a live endpoint.

    Builds a measurement callback around ``benchmark_decode`` and passes it to
    ``_profile_decode_helper``, which calls it for each (isl, osl, num_request)
    point it wants measured.

    Args:
        work_dir: Directory under which per-point genai-perf artifact dirs are placed.
        model_name: Forwarded to ``benchmark_decode``.
        tokenizer: Forwarded to ``benchmark_decode``.
        url: Passed to ``benchmark_decode`` as ``base_url``.
        num_gpus: Divisor used to turn measured output throughput into a per-GPU value.
        max_kv_tokens: Forwarded to ``_profile_decode_helper``.
        max_context_length: Forwarded to ``_profile_decode_helper``.
        interpolation_granularity: Forwarded to ``_profile_decode_helper``.

    Returns:
        Whatever ``_profile_decode_helper`` returns.
    """

    def get_itl_and_thpt_per_gpu(isl, osl, num_request):
        # One benchmark run per sweep point; (None, None) signals "no result".
        artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
        result = benchmark_decode(
            isl,
            osl,
            num_request,
            artifact_dir,
            model_name,
            tokenizer,
            base_url=url,
        )
        if result is None:
            return None, None

        avg_itl = result["inter_token_latency"]["avg"]
        avg_thpt_per_gpu = result["output_token_throughput"]["avg"] / num_gpus
        return avg_itl, avg_thpt_per_gpu

    return _profile_decode_helper(
        work_dir,
        num_gpus,
        max_kv_tokens,
        max_context_length,
        interpolation_granularity,
        get_itl_and_thpt_per_gpu,
    )
def profile_decode_aiconfigurator(
    work_dir,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
    ai_configurator_perf_estimator: AIConfiguratorPerfEstimator,
    **model_config_kwargs,
):
    """
    Collect decode interpolation data from aiconfigurator estimates.

    Same flow as the benchmark-based variant, except that each sweep point is
    answered by ``ai_configurator_perf_estimator.estimate_perf`` in "decode" mode.

    Args:
        work_dir: Forwarded to ``_profile_decode_helper``.
        num_gpus: Forwarded to ``_profile_decode_helper``.
        max_kv_tokens: Forwarded to ``_profile_decode_helper``.
        max_context_length: Forwarded to ``_profile_decode_helper``.
        interpolation_granularity: Forwarded to ``_profile_decode_helper``.
        ai_configurator_perf_estimator: Estimator queried for every sweep point.
        **model_config_kwargs: Passed through to the estimator (e.g. tp_size).

    Returns:
        Whatever ``_profile_decode_helper`` returns.
    """

    def get_itl_and_thpt_per_gpu(isl, osl, num_request):
        estimate = ai_configurator_perf_estimator.estimate_perf(
            isl,
            osl,
            num_request,
            mode="decode",
            **model_config_kwargs,
        )
        # "tpot" is used as the ITL value and "tokens/s/gpu" as per-GPU throughput.
        estimated_itl = estimate["tpot"]
        estimated_thpt_per_gpu = estimate["tokens/s/gpu"]
        return estimated_itl, estimated_thpt_per_gpu

    return _profile_decode_helper(
        work_dir,
        num_gpus,
        max_kv_tokens,
        max_context_length,
        interpolation_granularity,
        get_itl_and_thpt_per_gpu,
    )
...@@ -2,9 +2,11 @@ ...@@ -2,9 +2,11 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import logging import logging
from typing import Callable, Optional
import numpy as np import numpy as np
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.genai_perf import benchmark_prefill from benchmarks.profiler.utils.genai_perf import benchmark_prefill
from benchmarks.profiler.utils.plot import plot_prefill_interpolation from benchmarks.profiler.utils.plot import plot_prefill_interpolation
...@@ -19,14 +21,12 @@ console_handler.setFormatter(formatter) ...@@ -19,14 +21,12 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
def profile_prefill( def _profile_prefill_helper(
work_dir, work_dir,
model_name,
tokenizer,
url,
num_gpus, num_gpus,
max_context_length, max_context_length,
interpolation_granularity, interpolation_granularity,
get_ttft: Callable[[int], Optional[float]],
): ):
prefill_isl = [] prefill_isl = []
prefill_ttft = [] prefill_ttft = []
...@@ -36,17 +36,8 @@ def profile_prefill( ...@@ -36,17 +36,8 @@ def profile_prefill(
max_context_length, max_context_length,
(max_context_length - 100) // interpolation_granularity, (max_context_length - 100) // interpolation_granularity,
): ):
# run genai-perf ttft = get_ttft(isl)
genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" if ttft is not None:
gap_result = benchmark_prefill(
isl,
genai_perf_artifact_dir,
model_name,
tokenizer,
base_url=url,
)
if gap_result is not None:
ttft = gap_result["time_to_first_token"]["avg"]
prefill_isl.append(isl) prefill_isl.append(isl)
prefill_ttft.append(ttft) prefill_ttft.append(ttft)
prefill_thpt_per_gpu.append(isl / ttft / num_gpus * 1000) prefill_thpt_per_gpu.append(isl / ttft / num_gpus * 1000)
...@@ -78,3 +69,61 @@ def profile_prefill( ...@@ -78,3 +69,61 @@ def profile_prefill(
) )
return return
def profile_prefill(
    work_dir,
    model_name,
    tokenizer,
    url,
    num_gpus,
    max_context_length,
    interpolation_granularity,
):
    """
    Collect prefill interpolation data by benchmarking a live endpoint.

    Builds a TTFT callback around ``benchmark_prefill`` and passes it to
    ``_profile_prefill_helper``, which calls it once per ISL it wants measured.

    Args:
        work_dir: Directory under which per-ISL genai-perf artifact dirs are placed.
        model_name: Forwarded to ``benchmark_prefill``.
        tokenizer: Forwarded to ``benchmark_prefill``.
        url: Passed to ``benchmark_prefill`` as ``base_url``.
        num_gpus: Forwarded to ``_profile_prefill_helper``.
        max_context_length: Forwarded to ``_profile_prefill_helper``.
        interpolation_granularity: Forwarded to ``_profile_prefill_helper``.

    Returns:
        Whatever ``_profile_prefill_helper`` returns.
    """

    def get_ttft(isl):
        # One benchmark run per ISL; None signals "no result".
        artifact_dir = f"{work_dir}/gap_isl{isl}"
        result = benchmark_prefill(
            isl,
            artifact_dir,
            model_name,
            tokenizer,
            base_url=url,
        )
        if result is None:
            return None
        return result["time_to_first_token"]["avg"]

    return _profile_prefill_helper(
        work_dir,
        num_gpus,
        max_context_length,
        interpolation_granularity,
        get_ttft,
    )
def profile_prefill_aiconfigurator(
    work_dir,
    num_gpus,
    max_context_length,
    interpolation_granularity,
    ai_configurator_perf_estimator: AIConfiguratorPerfEstimator,
    **model_config_kwargs,
):
    """
    Collect prefill interpolation data from aiconfigurator estimates.

    Same flow as the benchmark-based variant, except that the TTFT for each ISL
    comes from ``ai_configurator_perf_estimator.estimate_prefill_perf``.

    Args:
        work_dir: Forwarded to ``_profile_prefill_helper``.
        num_gpus: Forwarded to ``_profile_prefill_helper``.
        max_context_length: Forwarded to ``_profile_prefill_helper``.
        interpolation_granularity: Forwarded to ``_profile_prefill_helper``.
        ai_configurator_perf_estimator: Estimator queried for every ISL.
        **model_config_kwargs: Passed through to the estimator (e.g. tp_size).

    Returns:
        Whatever ``_profile_prefill_helper`` returns.
    """

    def get_ttft(isl):
        estimate = ai_configurator_perf_estimator.estimate_prefill_perf(
            isl,
            **model_config_kwargs,
        )
        # The estimator's "context_latency" entry is used as the TTFT.
        ttft = estimate["context_latency"]
        logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms")
        return ttft

    return _profile_prefill_helper(
        work_dir,
        num_gpus,
        max_context_length,
        interpolation_granularity,
        get_ttft,
    )
...@@ -2,9 +2,10 @@ ...@@ -2,9 +2,10 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
accelerate==1.6.0 accelerate==1.6.0
aiconfigurator==0.2.0
aiofiles aiofiles
av==15.0.0 av==15.0.0
fastapi==0.115.6 fastapi==0.115.12
ftfy ftfy
genai-perf==0.0.15 genai-perf==0.0.15
grpcio-tools==1.66.0 grpcio-tools==1.66.0
...@@ -26,7 +27,7 @@ prometheus-api-client ...@@ -26,7 +27,7 @@ prometheus-api-client
prometheus_client prometheus_client
prophet prophet
protobuf==5.29.5 protobuf==5.29.5
pydantic==2.7.1 pydantic==2.10.6
pyright pyright
PyYAML PyYAML
scikit-learn scikit-learn
......
...@@ -66,7 +66,7 @@ In prefill engine, prefills are usually done with batch size=1 and only the ISL ...@@ -66,7 +66,7 @@ In prefill engine, prefills are usually done with batch size=1 and only the ISL
### Decode Interpolation Data ### Decode Interpolation Data
In decode engine, decode requests are added inflight and iteration time (or ITL) depends on both the context length and the real-time load of the engine. We capture the real-time load of the engine with active kv usage and average context length. The active kv usage determines the complexity of the memory-bounded attention kernel while the active kv usage divided the average context length determines the complexity of the computation bound MLP kernel. For example, the below figure shows the ITL of DS-Distilled Llama 8b model on H100 TP4. The ITL grows near-linearly with active kv usage under a fixed context length. And the slope increases as the context length decreases. In decode engine, decode requests are added inflight and iteration time (or ITL) depends on both the context length and the real-time load of the engine. We capture the real-time load of the engine with active kv usage and average context length. The active kv usage determines the complexity of the memory-bounded attention kernel while the active kv usage divided the average context length determines the complexity of the computation bound MLP kernel. For example, the below figure shows the ITL of DS-Distilled Llama 8b model on H100 TP4. The ITL grows near-linearly with active kv usage under a fixed context length. And the slope increases as the context length decreases.
![images](../images/itl_interpolation.png) ![images](../../docs/images/itl_interpolation.png)
The script profiles the selected decode TP configuration across different active kv blocks and average context length. The script profiles the selected decode TP configuration across different active kv blocks and average context length.
...@@ -266,7 +266,66 @@ If you see `ErrImagePull` or `ImagePullBackOff` errors with 401 unauthorized mes ...@@ -266,7 +266,66 @@ If you see `ErrImagePull` or `ImagePullBackOff` errors with 401 unauthorized mes
2. Verify the service account was created with the image pull secret: 2. Verify the service account was created with the image pull secret:
```bash ```bash
kubectl get serviceaccount dynamo-sa -n $NAMESPACE -o yaml kubectl get serviceaccount dynamo-sa -n $NAMESPACE -o yaml
``` ```
3. The service account should show `imagePullSecrets` containing `nvcr-imagepullsecret`. 3. The service account should show `imagePullSecrets` containing `nvcr-imagepullsecret`.
## Running the Profiling Script with `aiconfigurator`
The profiling script can be run much more quickly by using `aiconfigurator` to estimate perf numbers instead of running and benchmarking real dynamo deployments. To enable estimation using `aiconfigurator`, pass the `--use-ai-configurator` flag to the profiling script.
**Advantages** of `--use-ai-configurator`:
* Script will finish in seconds rather than hours.
* No k8s or GPU access is required.
**Disadvantages**:
* Estimated perf could contain some error, especially when the input dimensions are out-of-distribution compared to the sampled values in `aiconfigurator`.
* `aiconfigurator` has a limited list of supported models.
* `aiconfigurator`'s database has a limited list of systems and backends.
### Prerequisites
You will need a virtual environment with `dynamo` installed. Either use the local dev environment or the docker images. If using the local environment, install the required dependencies:
```bash
pip install -r deploy/utils/requirements.txt
```
Additionally, install `aiconfigurator`:
```bash
pip install aiconfigurator
```
### Available Models, Systems, and Backends
`aiconfigurator` supports a limited list of models, systems, and backends.
You can use the `aiconfigurator` CLI to see the support matrix:
```bash
aiconfigurator cli --help
```
This will display:
```
...options...
--model {GPT_7B,GPT_13B,GPT_30B,GPT_66B,GPT_175B,LLAMA2_7B,LLAMA2_13B,LLAMA2_70B,LLAMA3.1_8B,LLAMA3.1_70B,LLAMA3.1_405B,MOE_Mixtral8x7B,MOE_Mixtral8x22B,DEEPSEEK_V3,KIMI_K2,QWEN2.5_1.5B,QWEN2.5_7B,QWEN2.5_32B,QWEN2.5_72B,QWEN3_32B,QWEN3_235B,QWEN3_480B,Nemotron_super_v1.1}
Model name
--system {h100_sxm,h200_sxm}
System name
--backend {trtllm,sglang,vllm}
Backend name, suport trtllm for now
--version VERSION Version, 0.20.0,1.0.0rc3 for trtllm
...more options...
```
### Running the Script
In addition to passing the `--use-ai-configurator` flag, you must also provide the `--aic-system`, `--aic-model-name`, and `--backend-version` arguments.
Example command:
```bash
python3 profile_sla.py \
--config ../../components/backends/trtllm/deploy/disagg.yaml \
--use-ai-configurator \
--aic-system h200_sxm \
--aic-model-name QWEN3_32B \
--backend trtllm \
--backend-version 0.20.0
```
The output will be written to `./profiling_results/`.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Test suite for profile_sla aiconfigurator functionality.
profile_sla should be able to use aiconfigurator functionality
even without access to any GPU system.
"""
import sys
from pathlib import Path
import pytest
# Put the directory three levels above this file at the front of sys.path so the
# `benchmarks` package imported below can be resolved without installing the
# project. NOTE(review): assumes that directory is the repository root -- confirm
# against this test file's location.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
from benchmarks.profiler.profile_sla import run_profile # noqa: E402
class TestProfileSlaAiconfigurator:
    """Test class for profile_sla aiconfigurator functionality."""

    @pytest.fixture
    def trtllm_args(self, tmp_path):
        """
        Build an argparse-like namespace for a trtllm run with aiconfigurator enabled.

        The output directory lives under pytest's per-test ``tmp_path`` so that
        parametrized cases and parallel runs never share or leave behind
        profiling artefacts.
        """

        class Args:
            backend = "trtllm"
            config = "components/backends/trtllm/deploy/disagg.yaml"
            # Unique per test invocation; not created here, same as the
            # previously hard-coded directory on a first run.
            output_dir = str(tmp_path / "test_profiling_results")
            namespace = "test-namespace"
            min_num_gpus_per_engine = 1
            max_num_gpus_per_engine = 8
            skip_existing_results = False
            force_rerun = False
            isl = 3000
            osl = 500
            ttft = 50
            itl = 10
            max_context_length = 16384
            prefill_interpolation_granularity = 16
            decode_interpolation_granularity = 6
            service_name = ""
            dry_run = False
            use_ai_configurator = True
            aic_system = "h200_sxm"
            aic_model_name = "QWEN3_32B"
            backend_version = "0.20.0"

        return Args()

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "missing_arg", ["aic_system", "aic_model_name", "backend_version"]
    )
    async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg):
        """A ValueError is raised when any aiconfigurator-required arg is unset."""
        # Check that validation error happens when a required arg is missing.
        setattr(trtllm_args, missing_arg, None)
        with pytest.raises(ValueError):
            await run_profile(trtllm_args)

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "arg_name, bad_value",
        [
            # these values don't exist in the aiconfigurator database.
            ("aic_system", "fake_gpu_system"),
            ("backend_version", "0.1.0"),
        ],
    )
    async def test_aiconfiguator_no_data(self, trtllm_args, arg_name, bad_value):
        """A "Database not found" ValueError is raised for unknown system/version."""
        # Check that an appropriate error is raised when the system/model/backend
        # is not found in the aiconfigurator database.
        setattr(trtllm_args, arg_name, bad_value)
        with pytest.raises(ValueError, match="Database not found"):
            await run_profile(trtllm_args)

    @pytest.mark.pre_merge
    @pytest.mark.asyncio
    async def test_trtllm_aiconfigurator_single_model(self, trtllm_args):
        """run_profile completes with the default fixture configuration."""
        # Test that profile_sla works with the model & backend in the trtllm_args fixture.
        await run_profile(trtllm_args)

    @pytest.mark.asyncio
    @pytest.mark.parametrize(
        "backend, backend_version",
        [
            ("trtllm", "0.20.0"),
            ("trtllm", "1.0.0rc3"),
        ],
    )
    @pytest.mark.parametrize("model_name", ["QWEN3_32B", "GPT_7B", "LLAMA3.1_405B"])
    async def test_trtllm_aiconfigurator_many(
        self, trtllm_args, model_name, backend, backend_version
    ):
        """run_profile completes for each backend version / model name combination."""
        # Test that profile_sla works with a variety of backend versions and model names.
        trtllm_args.aic_model_name = model_name
        trtllm_args.backend = backend
        trtllm_args.backend_version = backend_version
        await run_profile(trtllm_args)
...@@ -45,6 +45,10 @@ class TestProfileSLADryRun: ...@@ -45,6 +45,10 @@ class TestProfileSLADryRun:
decode_interpolation_granularity = 6 decode_interpolation_granularity = 6
service_name = "" service_name = ""
dry_run = True dry_run = True
use_ai_configurator = False
aic_system = None
aic_model_name = None
backend_version = None
return Args() return Args()
...@@ -70,6 +74,10 @@ class TestProfileSLADryRun: ...@@ -70,6 +74,10 @@ class TestProfileSLADryRun:
decode_interpolation_granularity = 6 decode_interpolation_granularity = 6
service_name = "" service_name = ""
dry_run = True dry_run = True
use_ai_configurator = False
aic_system = None
aic_model_name = None
backend_version = None
return Args() return Args()
...@@ -109,6 +117,10 @@ class TestProfileSLADryRun: ...@@ -109,6 +117,10 @@ class TestProfileSLADryRun:
decode_interpolation_granularity = 6 decode_interpolation_granularity = 6
service_name = "" service_name = ""
dry_run = True dry_run = True
use_ai_configurator = False
aic_system = None
aic_model_name = None
backend_version = None
return Args() return Args()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment