Unverified Commit 7750ed1a authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

feat: add parallelization filters (#4144)


Signed-off-by: default avatarhongkuanz <hongkuanz@nvidia.com>
Signed-off-by: default avatarHongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: default avatarcoderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent e1b0edb9
...@@ -15,10 +15,6 @@ spec: ...@@ -15,10 +15,6 @@ spec:
profilingConfig: profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1" profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
config: config:
# Engine configuration
engine:
is_moe_model: true # Enable MoE model support (uses TEP/DEP instead of TP)
# Sweep/profiling configuration # Sweep/profiling configuration
sweep: sweep:
# Standard online profiling (not using AI Configurator) # Standard online profiling (not using AI Configurator)
......
...@@ -5,6 +5,7 @@ import argparse ...@@ -5,6 +5,7 @@ import argparse
import logging import logging
import os import os
from benchmarks.profiler.utils.defaults import EngineType
from benchmarks.profiler.utils.profile_decode import profile_decode from benchmarks.profiler.utils.profile_decode import profile_decode
from benchmarks.profiler.utils.profile_prefill import profile_prefill from benchmarks.profiler.utils.profile_prefill import profile_prefill
...@@ -91,7 +92,11 @@ if __name__ == "__main__": ...@@ -91,7 +92,11 @@ if __name__ == "__main__":
os.makedirs(args.work_dir, exist_ok=True) os.makedirs(args.work_dir, exist_ok=True)
if args.tokenizer_path == "": if args.tokenizer_path == "":
args.tokenizer_path = args.model_name args.tokenizer_path = args.model_name
if args.mode == "prefill":
# Convert string mode to EngineType
mode = EngineType(args.mode)
if mode == EngineType.PREFILL:
profile_prefill( profile_prefill(
args.work_dir, args.work_dir,
args.model_name, args.model_name,
...@@ -101,7 +106,7 @@ if __name__ == "__main__": ...@@ -101,7 +106,7 @@ if __name__ == "__main__":
args.max_context_length, args.max_context_length,
args.interpolation_granularity, args.interpolation_granularity,
) )
elif args.mode == "decode": elif mode == EngineType.DECODE:
assert args.max_kv_tokens > 0, "max_kv_tokens must be provided for decode" assert args.max_kv_tokens > 0, "max_kv_tokens must be provided for decode"
profile_decode( profile_decode(
args.work_dir, args.work_dir,
...@@ -115,4 +120,4 @@ if __name__ == "__main__": ...@@ -115,4 +120,4 @@ if __name__ == "__main__":
args.attention_dp_size, args.attention_dp_size,
) )
else: else:
raise ValueError(f"Invalid mode: {args.mode}") raise ValueError(f"Invalid mode: {mode}")
...@@ -17,12 +17,19 @@ import asyncio ...@@ -17,12 +17,19 @@ import asyncio
import logging import logging
import math import math
import os import os
from dataclasses import dataclass, field
import numpy as np import numpy as np
import yaml import yaml
from benchmarks.profiler.utils.aiperf import benchmark_decode, benchmark_prefill from benchmarks.profiler.utils.aiperf import benchmark_decode, benchmark_prefill
from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS
from benchmarks.profiler.utils.config_modifiers.parallelization_mapping import (
ParallelizationMapping,
apply_parallel_mapping_to_config,
get_candidate_parallel_mappings,
)
from benchmarks.profiler.utils.defaults import EngineType
from benchmarks.profiler.utils.dgd_generation import generate_dgd_config_with_planner from benchmarks.profiler.utils.dgd_generation import generate_dgd_config_with_planner
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.plot import ( from benchmarks.profiler.utils.plot import (
...@@ -30,12 +37,6 @@ from benchmarks.profiler.utils.plot import ( ...@@ -30,12 +37,6 @@ from benchmarks.profiler.utils.plot import (
plot_pd_joint_results, plot_pd_joint_results,
plot_prefill_performance, plot_prefill_performance,
) )
from benchmarks.profiler.utils.profile_cache import (
check_decode_results_exist,
check_prefill_results_exist,
load_existing_decode_results,
load_existing_prefill_results,
)
from benchmarks.profiler.utils.profile_decode import ( from benchmarks.profiler.utils.profile_decode import (
get_num_request_range, get_num_request_range,
profile_decode, profile_decode,
...@@ -52,6 +53,65 @@ from deploy.utils.dynamo_deployment import ( ...@@ -52,6 +53,65 @@ from deploy.utils.dynamo_deployment import (
) )
from dynamo.planner.defaults import WORKER_COMPONENT_NAMES from dynamo.planner.defaults import WORKER_COMPONENT_NAMES
@dataclass
class PrefillProfileData:
"""Container for prefill profiling results."""
num_gpus: list[int] = field(default_factory=list)
ttft: list[float] = field(default_factory=list)
thpt_per_gpu: list[float] = field(default_factory=list)
parallel_mapping_labels: list[str] = field(default_factory=list)
parallel_mappings: list[ParallelizationMapping] = field(default_factory=list)
def add_data(
self,
num_gpus: int,
ttft: float,
thpt_per_gpu: float,
parallel_mapping_label: str,
parallel_mapping: ParallelizationMapping,
) -> None:
"""Add a complete data point to the profile data."""
self.num_gpus.append(num_gpus)
self.ttft.append(ttft)
self.thpt_per_gpu.append(thpt_per_gpu)
self.parallel_mapping_labels.append(parallel_mapping_label)
self.parallel_mappings.append(parallel_mapping)
@dataclass
class DecodeProfileData:
"""Container for decode profiling results."""
num_gpus: list[int] = field(default_factory=list)
itl: list[float] = field(default_factory=list)
thpt_per_gpu: list[float] = field(default_factory=list)
concurrency: list[int] = field(default_factory=list)
kv_cache_size: list[int] = field(default_factory=list)
parallel_mapping_labels: list[str] = field(default_factory=list)
parallel_mappings: list[ParallelizationMapping] = field(default_factory=list)
def add_data(
self,
num_gpus: int,
itl: float,
thpt_per_gpu: float,
concurrency: int,
kv_cache_size: int,
parallel_mapping_label: str,
parallel_mapping: ParallelizationMapping,
) -> None:
"""Add a complete data point to the profile data."""
self.num_gpus.append(num_gpus)
self.itl.append(itl)
self.thpt_per_gpu.append(thpt_per_gpu)
self.concurrency.append(concurrency)
self.kv_cache_size.append(kv_cache_size)
self.parallel_mapping_labels.append(parallel_mapping_label)
self.parallel_mappings.append(parallel_mapping)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler() console_handler = logging.StreamHandler()
...@@ -73,7 +133,7 @@ async def run_profile(args): ...@@ -73,7 +133,7 @@ async def run_profile(args):
try: try:
# Log MoE model support # Log MoE model support
if args.is_moe_model: if args.model_info.is_moe:
logger.info( logger.info(
"MoE (Mixture of Experts) model profiling, sweeping TEP size for prefill and DEP size for decode" "MoE (Mixture of Experts) model profiling, sweeping TEP size for prefill and DEP size for decode"
) )
...@@ -102,28 +162,7 @@ async def run_profile(args): ...@@ -102,28 +162,7 @@ async def run_profile(args):
for i in range(int(math.log2(args.max_num_gpus_per_engine)) + 1) for i in range(int(math.log2(args.max_num_gpus_per_engine)) + 1)
if args.min_num_gpus_per_engine <= 2**i <= args.max_num_gpus_per_engine if args.min_num_gpus_per_engine <= 2**i <= args.max_num_gpus_per_engine
] ]
if args.is_moe_model: if args.model_info.is_moe:
# Filter GPU counts to only include divisors of num_experts
if hasattr(args, "num_experts") and args.num_experts is not None:
original_counts = profile_num_gpus.copy()
profile_num_gpus = [
gpu_count
for gpu_count in profile_num_gpus
if args.num_experts % gpu_count == 0
]
if not profile_num_gpus:
error_msg = (
f"No valid GPU counts found that divide evenly into num_experts={args.num_experts}. "
f"Original candidates were {original_counts}. "
f"Valid divisors in range would be: {[d for d in range(args.min_num_gpus_per_engine, args.max_num_gpus_per_engine + 1) if args.num_experts % d == 0]}"
)
logger.error(error_msg)
raise ValueError(error_msg)
if len(profile_num_gpus) < len(original_counts):
logger.info(
f"Filtered GPU counts from {original_counts} to {profile_num_gpus} "
f"(only divisors of num_experts={args.num_experts})"
)
logger.info(f"Profiling MoE GPU counts (TEP/DEP): {profile_num_gpus}") logger.info(f"Profiling MoE GPU counts (TEP/DEP): {profile_num_gpus}")
else: else:
logger.info(f"Profiling dense model GPU counts (TP): {profile_num_gpus}") logger.info(f"Profiling dense model GPU counts (TP): {profile_num_gpus}")
...@@ -132,17 +171,30 @@ async def run_profile(args): ...@@ -132,17 +171,30 @@ async def run_profile(args):
model_name = config_modifier.get_model_name(config) model_name = config_modifier.get_model_name(config)
# Log skip behavior # Determine sweep max context length: allow user-provided cap to override model's if smaller
if args.force_rerun: use_specified_max_context_len = getattr(args, "max_context_length", None)
model_max_context_len = args.model_info.max_context_length
if not use_specified_max_context_len and not model_max_context_len:
raise ValueError(
"No max_context_length available from args.max_context_length or model_info from HF config"
)
elif not use_specified_max_context_len:
sweep_max_context_length = model_max_context_len
logger.info( logger.info(
"Force rerun enabled - will re-run all tests even if results exist" f"Using model's maximum context length: {model_max_context_len}"
) )
elif args.skip_existing_results: elif not model_max_context_len:
sweep_max_context_length = use_specified_max_context_len
logger.info( logger.info(
"Skip existing results enabled - will skip TP sizes with existing results" f"Using user-provided max_context_length: {use_specified_max_context_len}"
) )
else: else:
logger.info("Skip existing results disabled - will re-run all tests") sweep_max_context_length = min(
use_specified_max_context_len, model_max_context_len
)
logger.info(
f"Using minimum of user-provided and model's maximum context length: {sweep_max_context_length}"
)
if args.use_ai_configurator: if args.use_ai_configurator:
if not args.aic_system: if not args.aic_system:
...@@ -173,51 +225,38 @@ async def run_profile(args): ...@@ -173,51 +225,38 @@ async def run_profile(args):
) )
# first profile prefill # first profile prefill
prefill_num_gpus = [] prefill_data = PrefillProfileData()
prefill_ttft = []
prefill_thpt_per_gpu = []
logger.info("Profiling prefill...") logger.info("Profiling prefill...")
prefill_config = config_modifier.convert_config( base_prefill_config = config_modifier.convert_config(
config, "prefill", is_moe_model=args.is_moe_model config, EngineType.PREFILL, is_moe_model=args.model_info.is_moe
) )
frontend_port = config_modifier.get_port(config) frontend_port = config_modifier.get_port(config)
itl: float | None = None itl: float | None = None
thpt_per_gpu: float | None = None thpt_per_gpu: float | None = None
for num_gpus in profile_num_gpus: for num_gpus in profile_num_gpus:
logger.info(f"Profiling prefill with {num_gpus} GPUs...") logger.info(f"Profiling prefill with {num_gpus} GPUs...")
candidate_mappings = get_candidate_parallel_mappings(
# Check if results already exist for this GPU count num_gpus, args.model_info, EngineType.PREFILL
if (
args.skip_existing_results
and not args.force_rerun
and check_prefill_results_exist(args.output_dir, num_gpus, args.isl)
):
logger.info(
f"Skipping prefill {num_gpus} GPU(s) - results already exist"
) )
ttft, thpt_per_gpu = load_existing_prefill_results(
args.output_dir, num_gpus, args.isl
)
if ttft is not None and thpt_per_gpu is not None:
prefill_num_gpus.append(num_gpus)
prefill_ttft.append(ttft)
prefill_thpt_per_gpu.append(thpt_per_gpu)
logger.info(
f"Loaded existing prefill results: {num_gpus} GPU TTFT={ttft:.2f}ms, throughput={thpt_per_gpu:.2f} tokens/s/GPU"
)
continue
if args.is_moe_model: for mapping in candidate_mappings:
prefill_config = config_modifier.set_config_tep_size( # Apply parallel mapping to config
prefill_config, num_gpus, args.num_gpus_per_node prefill_config = apply_parallel_mapping_to_config(
) base_prefill_config,
else: mapping,
prefill_config = config_modifier.set_config_tp_size( EngineType.PREFILL,
prefill_config, num_gpus config_modifier,
args.num_gpus_per_node,
) )
logger.info(f"Dynamo config: {prefill_config}") logger.info(f"Dynamo config: {prefill_config}")
work_dir = f"{args.output_dir}/prefill_{num_gpus}gpus" # Work dir includes mapping label (safe chars only)
parallel_mapping_tag = (
mapping.label().replace("=", "").replace("/", "_")
)
work_dir = (
f"{args.output_dir}/prefill_{num_gpus}gpus_{parallel_mapping_tag}"
)
os.makedirs(work_dir, exist_ok=True) os.makedirs(work_dir, exist_ok=True)
prefill_config_fn = f"{work_dir}/config.yaml" prefill_config_fn = f"{work_dir}/config.yaml"
...@@ -231,7 +270,7 @@ async def run_profile(args): ...@@ -231,7 +270,7 @@ async def run_profile(args):
logger.info("Using ai-configurator to estimate prefill latency.") logger.info("Using ai-configurator to estimate prefill latency.")
perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf( perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf(
args.isl, args.isl,
tp_size=num_gpus, tp_size=(mapping.tp or num_gpus),
) )
ttft = perf_dict["context_latency"] ttft = perf_dict["context_latency"]
logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms") logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms")
...@@ -244,7 +283,9 @@ async def run_profile(args): ...@@ -244,7 +283,9 @@ async def run_profile(args):
frontend_port=frontend_port, frontend_port=frontend_port,
deployment_name=prefill_config["metadata"]["name"], deployment_name=prefill_config["metadata"]["name"],
) )
logger.info(f"Created client with service_name: {client.service_name}") logger.info(
f"Created client with service_name: {client.service_name}"
)
deployment_clients.append(client) # Track for cleanup deployment_clients.append(client) # Track for cleanup
await client.create_deployment(prefill_config_fn) await client.create_deployment(prefill_config_fn)
logger.info("Waiting for deployment to be ready...") logger.info("Waiting for deployment to be ready...")
...@@ -276,81 +317,49 @@ async def run_profile(args): ...@@ -276,81 +317,49 @@ async def run_profile(args):
logger.info("Deployment deleted") logger.info("Deployment deleted")
if ttft is not None: if ttft is not None:
prefill_num_gpus.append(num_gpus) prefill_data.add_data(
prefill_ttft.append(ttft) num_gpus=num_gpus,
prefill_thpt_per_gpu.append(args.isl / ttft / num_gpus * 1000) ttft=ttft,
thpt_per_gpu=args.isl / ttft / num_gpus * 1000,
parallel_mapping_label=mapping.label(),
parallel_mapping=mapping,
)
# Plot the results as a 2D scatter plot # Plot the results as a 2D scatter plot
prefill_results = None if prefill_data.num_gpus and prefill_data.ttft and prefill_data.thpt_per_gpu:
if prefill_num_gpus and prefill_ttft and prefill_thpt_per_gpu: plot_prefill_performance(prefill_data, args.ttft, args.output_dir)
prefill_results = (prefill_num_gpus, prefill_ttft, prefill_thpt_per_gpu)
plot_prefill_performance(prefill_results, args.ttft, args.output_dir)
# then profile decode # then profile decode
decode_num_gpus = [] decode_data = DecodeProfileData()
decode_itl = []
decode_thpt_per_gpu = []
decode_concurrency = []
decode_kv_cache_size = []
decode_results = [] # Store partial results for plotting later
logger.info("Profiling decode...") logger.info("Profiling decode...")
decode_config = config_modifier.convert_config( base_decode_config = config_modifier.convert_config(
config, "decode", is_moe_model=args.is_moe_model config, EngineType.DECODE, is_moe_model=args.model_info.is_moe
) )
for num_gpus in profile_num_gpus: for num_gpus in profile_num_gpus:
logger.info(f"Profiling decode with {num_gpus} GPUs...") logger.info(f"Profiling decode with {num_gpus} GPUs...")
candidate_mappings = get_candidate_parallel_mappings(
# Check if results already exist for this GPU count num_gpus, args.model_info, EngineType.DECODE
if (
args.skip_existing_results
and not args.force_rerun
and check_decode_results_exist(
args.output_dir, num_gpus, args.isl, args.osl
) )
):
logger.info(
f"Skipping decode {num_gpus} GPU(s) - results already exist"
)
existing_results = load_existing_decode_results(
args.output_dir, num_gpus, args.isl, args.osl
)
if existing_results:
# Add existing results to our arrays
engine_decode_itl = []
engine_decode_thpt_per_gpu = []
for itl, thpt_per_gpu, concurrency in existing_results:
decode_num_gpus.append(num_gpus)
decode_itl.append(itl)
decode_thpt_per_gpu.append(thpt_per_gpu)
decode_concurrency.append(concurrency)
# We need to get kv_cache_size from existing logs or estimate it
estimated_kv_cache = max(
100000, concurrency * (args.isl + args.osl) * 2
) # Conservative estimate
decode_kv_cache_size.append(estimated_kv_cache)
engine_decode_itl.append(itl)
engine_decode_thpt_per_gpu.append(thpt_per_gpu)
# Store results for plotting
decode_results.append(
(num_gpus, engine_decode_itl, engine_decode_thpt_per_gpu)
)
logger.info(
f"Loaded {len(existing_results)} existing decode results for {num_gpus} GPU(s)"
)
continue
if args.is_moe_model: for mapping in candidate_mappings:
decode_config = config_modifier.set_config_dep_size( # Apply parallel mapping to config
decode_config, num_gpus, args.num_gpus_per_node decode_config = apply_parallel_mapping_to_config(
) base_decode_config,
else: mapping,
decode_config = config_modifier.set_config_tp_size( EngineType.DECODE,
decode_config, num_gpus config_modifier,
args.num_gpus_per_node,
) )
logger.info(f"Dynamo config: {decode_config}") logger.info(f"Dynamo config: {decode_config}")
work_dir = f"{args.output_dir}/decode_{num_gpus}gpus" parallel_mapping_tag = (
mapping.label()
.replace("=", "")
.replace("/", "_") # safe chars for directory
)
work_dir = (
f"{args.output_dir}/decode_{num_gpus}gpus_{parallel_mapping_tag}"
)
os.makedirs(work_dir, exist_ok=True) os.makedirs(work_dir, exist_ok=True)
decode_config_fn = f"{work_dir}/config.yaml" decode_config_fn = f"{work_dir}/config.yaml"
...@@ -364,7 +373,7 @@ async def run_profile(args): ...@@ -364,7 +373,7 @@ async def run_profile(args):
# Compute max_concurrency and max_kv_tokens to know which # Compute max_concurrency and max_kv_tokens to know which
# num_request to sweep over. # num_request to sweep over.
max_concurrency = ai_configurator_perf_estimator.get_max_batch_size( max_concurrency = ai_configurator_perf_estimator.get_max_batch_size(
args.isl, args.osl, tp_size=num_gpus args.isl, args.osl, tp_size=(mapping.tp or num_gpus)
) )
max_kv_tokens = max_concurrency * (args.isl + args.osl) max_kv_tokens = max_concurrency * (args.isl + args.osl)
...@@ -391,8 +400,7 @@ async def run_profile(args): ...@@ -391,8 +400,7 @@ async def run_profile(args):
# Compute max_concurrency and max_kv_tokens to know which # Compute max_concurrency and max_kv_tokens to know which
# num_request to sweep over. # num_request to sweep over.
# For MoE models, attention_dp_size = DEP size (num_gpus), for dense models = 1 attention_dp_size = mapping.get_attn_dp_size(num_gpus)
attention_dp_size = num_gpus if args.is_moe_model else 1
max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log", f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log",
attention_dp_size=attention_dp_size, attention_dp_size=attention_dp_size,
...@@ -400,7 +408,7 @@ async def run_profile(args): ...@@ -400,7 +408,7 @@ async def run_profile(args):
max_concurrency = max_kv_tokens // (args.isl + args.osl) max_concurrency = max_kv_tokens // (args.isl + args.osl)
if not args.dry_run: if not args.dry_run:
attention_dp_size = num_gpus if args.is_moe_model else 1 attention_dp_size = mapping.get_attn_dp_size(num_gpus)
sweep_num_request = get_num_request_range( sweep_num_request = get_num_request_range(
attention_dp_size, attention_dp_size,
max_concurrency, max_concurrency,
...@@ -410,18 +418,18 @@ async def run_profile(args): ...@@ -410,18 +418,18 @@ async def run_profile(args):
f"Sweeping num_request range based on maximum number of kv tokens: {sweep_num_request}" f"Sweeping num_request range based on maximum number of kv tokens: {sweep_num_request}"
) )
engine_decode_itl = []
engine_decode_thpt_per_gpu = []
for num_request in sweep_num_request: for num_request in sweep_num_request:
itl = thpt_per_gpu = None itl = thpt_per_gpu = None
if args.use_ai_configurator: if args.use_ai_configurator:
logger.info("Using ai-configurator to estimate decode latency.") logger.info(
"Using ai-configurator to estimate decode latency."
)
perf_dict = ai_configurator_perf_estimator.estimate_perf( perf_dict = ai_configurator_perf_estimator.estimate_perf(
args.isl, args.isl,
args.osl, args.osl,
num_request, num_request,
mode="decode", mode=EngineType.DECODE,
tp_size=num_gpus, tp_size=(mapping.tp or num_gpus),
) )
itl = perf_dict["tpot"] itl = perf_dict["tpot"]
...@@ -450,17 +458,14 @@ async def run_profile(args): ...@@ -450,17 +458,14 @@ async def run_profile(args):
) )
if itl is not None and thpt_per_gpu is not None: if itl is not None and thpt_per_gpu is not None:
engine_decode_itl.append(itl) decode_data.add_data(
engine_decode_thpt_per_gpu.append(thpt_per_gpu) num_gpus=num_gpus,
decode_num_gpus.append(num_gpus) itl=itl,
decode_itl.append(itl) thpt_per_gpu=thpt_per_gpu,
decode_thpt_per_gpu.append(thpt_per_gpu) concurrency=num_request,
decode_concurrency.append(num_request) kv_cache_size=max_kv_tokens,
decode_kv_cache_size.append(max_kv_tokens) parallel_mapping_label=mapping.label(),
parallel_mapping=mapping,
# Store partial results for plotting later
decode_results.append(
(num_gpus, engine_decode_itl, engine_decode_thpt_per_gpu)
) )
if not args.dry_run and not args.use_ai_configurator: if not args.dry_run and not args.use_ai_configurator:
...@@ -470,12 +475,12 @@ async def run_profile(args): ...@@ -470,12 +475,12 @@ async def run_profile(args):
logger.info("Deployment deleted") logger.info("Deployment deleted")
# Plot all decode results after profiling is complete # Plot all decode results after profiling is complete
if decode_results: if decode_data.num_gpus:
plot_decode_performance(decode_results, args.itl, args.output_dir) plot_decode_performance(decode_data, args.itl, args.output_dir)
if prefill_results and decode_results: if prefill_data.num_gpus and decode_data.num_gpus:
plot_pd_joint_results( plot_pd_joint_results(
args.isl, args.osl, prefill_results, decode_results, args.output_dir args.isl, args.osl, prefill_data, decode_data, args.output_dir
) )
if args.dry_run: if args.dry_run:
...@@ -483,99 +488,76 @@ async def run_profile(args): ...@@ -483,99 +488,76 @@ async def run_profile(args):
else: else:
logger.info("Analyzing results and generate recommendations...") logger.info("Analyzing results and generate recommendations...")
# Safety guards: no results → exit early with a clear message # Safety guards: no results → exit early with a clear message
if not (prefill_num_gpus and prefill_ttft and prefill_thpt_per_gpu): if not prefill_data.num_gpus:
logger.error("No prefill results produced; skipping recommendations.") logger.error("No prefill results produced; skipping recommendations.")
# select best tp size for prefill # select best parallel mapping for prefill
if min(prefill_ttft) > args.ttft: if min(prefill_data.ttft) > args.ttft:
logger.info( logger.info(
"No TP size satisfies the TTFT requirement, please try a smaller model or a more powerful GPU SKU" "No TP size satisfies the TTFT requirement, please try a smaller model or a more powerful GPU SKU"
) )
selected_prefill_idx = int(np.argmin(np.array(prefill_ttft))) selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft)))
else: else:
valid_indices = [ valid_indices = [
i for i, ttft in enumerate(prefill_ttft) if ttft <= args.ttft i for i, ttft in enumerate(prefill_data.ttft) if ttft <= args.ttft
] ]
# Among valid TP sizes, select the one with highest throughput per GPU # Among valid TP sizes, select the one with highest throughput per GPU
valid_thpts = [prefill_thpt_per_gpu[i] for i in valid_indices] valid_thpts = [prefill_data.thpt_per_gpu[i] for i in valid_indices]
max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))] max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
selected_prefill_idx = max_thpt_idx selected_prefill_idx = max_thpt_idx
logger.info( logger.info(
f"Suggested number of GPUs for prefill: {prefill_num_gpus[selected_prefill_idx]} (TTFT {prefill_ttft[selected_prefill_idx]:.2f} ms, throughput {prefill_thpt_per_gpu[selected_prefill_idx]:.2f} tokens/s/GPU)" f"Suggested prefill parallel mapping: {prefill_data.parallel_mapping_labels[selected_prefill_idx]} on {prefill_data.num_gpus[selected_prefill_idx]} GPU(s) (TTFT {prefill_data.ttft[selected_prefill_idx]:.2f} ms, throughput {prefill_data.thpt_per_gpu[selected_prefill_idx]:.2f} tokens/s/GPU)"
)
# scale up if estimated TTFT is 120% of target TTFT
prefill_queue_size_upper_bound = max(
0.1, args.ttft * 1.2 / prefill_ttft[selected_prefill_idx] - 1
)
# scale down if estimated TTFT is 80% of target TTFT
prefill_queue_size_lower_bound = max(
0.1, args.ttft * 0.8 / prefill_ttft[selected_prefill_idx] - 1
)
logger.info(
f"Suggested planner upper/lower bound for prefill queue size: {prefill_queue_size_upper_bound:.2f}/{prefill_queue_size_lower_bound:.2f}"
) )
# select best gpu count for decode # select best parallel mapping for decode
if not ( if not decode_data.num_gpus:
decode_num_gpus
and decode_itl
and decode_thpt_per_gpu
and decode_concurrency
and decode_kv_cache_size
):
logger.error("No decode results produced; skipping recommendations.") logger.error("No decode results produced; skipping recommendations.")
return return
if min(decode_itl) > args.itl: if min(decode_data.itl) > args.itl:
logger.info( logger.info(
"No TP size satisfies the ITL requirement, please try a smaller model or a more powerful GPU SKU" "No TP size satisfies the ITL requirement, please try a smaller model or a more powerful GPU SKU"
) )
selected_decode_idx = int(np.argmin(np.array(decode_itl))) selected_decode_idx = int(np.argmin(np.array(decode_data.itl)))
else: else:
valid_indices = [ valid_indices = [
i for i, itl in enumerate(decode_itl) if itl <= args.itl i for i, itl in enumerate(decode_data.itl) if itl <= args.itl
] ]
# Among valid TP sizes, select the one with highest throughput per GPU # Among valid TP sizes, select the one with highest throughput per GPU
valid_thpts = [decode_thpt_per_gpu[i] for i in valid_indices] valid_thpts = [decode_data.thpt_per_gpu[i] for i in valid_indices]
max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))] max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
selected_decode_idx = max_thpt_idx selected_decode_idx = max_thpt_idx
logger.info( logger.info(
f"Suggested number of GPUs for decode: {decode_num_gpus[selected_decode_idx]} (ITL {decode_itl[selected_decode_idx]:.2f} ms, throughput {decode_thpt_per_gpu[selected_decode_idx]:.2f} tokens/s/GPU)" f"Suggested decode parallel mapping: {decode_data.parallel_mapping_labels[selected_decode_idx]} on {decode_data.num_gpus[selected_decode_idx]} GPU(s) (ITL {decode_data.itl[selected_decode_idx]:.2f} ms, throughput {decode_data.thpt_per_gpu[selected_decode_idx]:.2f} tokens/s/GPU)"
)
# calculate kv cache utlization for the selected TP and concurrency
selected_decode_kv_cache_utilization = (
decode_concurrency[selected_decode_idx]
* (args.isl + (args.osl / 2))
/ decode_kv_cache_size[selected_decode_idx]
)
# set a +- 20% range for the kv cache utilization
logger.info(
f"Suggested planner upper/lower bound for decode kv cache utilization: {min(1, selected_decode_kv_cache_utilization + 0.2):.2f}/{max(0.1, selected_decode_kv_cache_utilization - 0.2):.2f}"
) )
if args.dry_run: if args.dry_run:
# use min value for prefill and decode GPU counts # use min value for prefill and decode GPU counts
prefill_num_gpus = [args.min_num_gpus_per_engine] prefill_data.num_gpus = [args.min_num_gpus_per_engine]
decode_num_gpus = [args.min_num_gpus_per_engine] decode_data.num_gpus = [args.min_num_gpus_per_engine]
prefill_data.parallel_mappings = [
ParallelizationMapping(tp=args.min_num_gpus_per_engine)
]
decode_data.parallel_mappings = [
ParallelizationMapping(tp=args.min_num_gpus_per_engine)
]
selected_prefill_idx = 0 selected_prefill_idx = 0
selected_decode_idx = 0 selected_decode_idx = 0
# interpolate ISL - TTFT with best prefill GPU count # interpolate ISL - TTFT with best prefill parallel mapping
best_prefill_gpus = prefill_num_gpus[selected_prefill_idx] best_prefill_gpus = prefill_data.num_gpus[selected_prefill_idx]
best_prefill_mapping = prefill_data.parallel_mappings[selected_prefill_idx]
logger.info( logger.info(
f"Profiling prefill under best {best_prefill_gpus} GPU(s) with different ISL..." f"Profiling prefill under best {best_prefill_gpus} GPU(s) with parallel mapping [{best_prefill_mapping.label()}] with different ISL..."
) )
prefill_config = config_modifier.convert_config( prefill_config = config_modifier.convert_config(
config, "prefill", is_moe_model=args.is_moe_model config, EngineType.PREFILL, is_moe_model=args.model_info.is_moe
)
if args.is_moe_model:
prefill_config = config_modifier.set_config_tep_size(
prefill_config, best_prefill_gpus, args.num_gpus_per_node
) )
else: prefill_config = apply_parallel_mapping_to_config(
prefill_config = config_modifier.set_config_tp_size( prefill_config,
prefill_config, best_prefill_gpus best_prefill_mapping,
EngineType.PREFILL,
config_modifier,
args.num_gpus_per_node,
) )
logger.info(f"Dynamo config: {prefill_config}") logger.info(f"Dynamo config: {prefill_config}")
...@@ -592,10 +574,10 @@ async def run_profile(args): ...@@ -592,10 +574,10 @@ async def run_profile(args):
profile_prefill_aiconfigurator( profile_prefill_aiconfigurator(
work_dir, work_dir,
best_prefill_gpus, # num_gpus best_prefill_gpus, # num_gpus
args.max_context_length, sweep_max_context_length,
args.prefill_interpolation_granularity, args.prefill_interpolation_granularity,
ai_configurator_perf_estimator, ai_configurator_perf_estimator,
tp_size=best_prefill_gpus, tp_size=(best_prefill_mapping.tp or best_prefill_gpus),
) )
else: else:
client = DynamoDeploymentClient( client = DynamoDeploymentClient(
...@@ -635,7 +617,7 @@ async def run_profile(args): ...@@ -635,7 +617,7 @@ async def run_profile(args):
model_name, model_name,
base_url, base_url,
best_prefill_gpus, best_prefill_gpus,
args.max_context_length, sweep_max_context_length,
args.prefill_interpolation_granularity, args.prefill_interpolation_granularity,
) )
...@@ -644,16 +626,21 @@ async def run_profile(args): ...@@ -644,16 +626,21 @@ async def run_profile(args):
deployment_clients.remove(client) deployment_clients.remove(client)
logger.info("Deployment deleted") logger.info("Deployment deleted")
# interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode GPU count # interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode parallel mapping
best_decode_gpus = decode_num_gpus[selected_decode_idx] best_decode_gpus = decode_data.num_gpus[selected_decode_idx]
logger.info(f"Profiling decode with {best_decode_gpus} GPUs...") best_decode_mapping = decode_data.parallel_mappings[selected_decode_idx]
if args.is_moe_model: logger.info(
decode_config = config_modifier.set_config_dep_size( f"Profiling decode with {best_decode_gpus} GPUs with parallel mapping [{best_decode_mapping.label()}]..."
decode_config, best_decode_gpus, args.num_gpus_per_node
) )
else: decode_config = config_modifier.convert_config(
decode_config = config_modifier.set_config_tp_size( config, EngineType.DECODE, is_moe_model=args.model_info.is_moe
decode_config, best_decode_gpus )
decode_config = apply_parallel_mapping_to_config(
decode_config,
best_decode_mapping,
EngineType.DECODE,
config_modifier,
args.num_gpus_per_node,
) )
logger.info(f"Dynamo config: {decode_config}") logger.info(f"Dynamo config: {decode_config}")
...@@ -667,20 +654,19 @@ async def run_profile(args): ...@@ -667,20 +654,19 @@ async def run_profile(args):
if args.dry_run: if args.dry_run:
logger.info("Skipping deployment creation in dry run mode") logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator: elif args.use_ai_configurator:
# For MoE models, attention_dp_size = DEP size (best_decode_gpus), for dense models = 1 attention_dp_size = best_decode_mapping.get_attn_dp_size(best_decode_gpus)
attention_dp_size = best_decode_gpus if args.is_moe_model else 1
max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens( max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens(
args.isl, args.osl, tp_size=best_decode_gpus args.isl, args.osl, tp_size=(best_decode_mapping.tp or best_decode_gpus)
) )
profile_decode_aiconfigurator( profile_decode_aiconfigurator(
work_dir, work_dir,
best_decode_gpus, # num_gpus best_decode_gpus, # num_gpus
max_kv_tokens, max_kv_tokens,
args.max_context_length, sweep_max_context_length,
args.decode_interpolation_granularity, args.decode_interpolation_granularity,
ai_configurator_perf_estimator, ai_configurator_perf_estimator,
attention_dp_size, attention_dp_size,
tp_size=best_decode_gpus, tp_size=(best_decode_mapping.tp or best_decode_gpus),
) )
else: else:
client = DynamoDeploymentClient( client = DynamoDeploymentClient(
...@@ -703,8 +689,7 @@ async def run_profile(args): ...@@ -703,8 +689,7 @@ async def run_profile(args):
f"Logs have been saved to {client.base_log_dir / client.deployment_name}" f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
) )
# For MoE models, attention_dp_size = DEP size (best_decode_gpus), for dense models = 1 attention_dp_size = best_decode_mapping.get_attn_dp_size(best_decode_gpus)
attention_dp_size = best_decode_gpus if args.is_moe_model else 1
max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log", f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log",
attention_dp_size=attention_dp_size, attention_dp_size=attention_dp_size,
...@@ -719,7 +704,7 @@ async def run_profile(args): ...@@ -719,7 +704,7 @@ async def run_profile(args):
base_url, base_url,
best_decode_gpus, best_decode_gpus,
max_kv_tokens, max_kv_tokens,
args.max_context_length, sweep_max_context_length,
args.decode_interpolation_granularity, args.decode_interpolation_granularity,
attention_dp_size, attention_dp_size,
) )
...@@ -737,7 +722,7 @@ async def run_profile(args): ...@@ -737,7 +722,7 @@ async def run_profile(args):
best_decode_gpus=best_decode_gpus, best_decode_gpus=best_decode_gpus,
output_dir=args.output_dir, output_dir=args.output_dir,
args=args, args=args,
is_moe_model=args.is_moe_model, is_moe_model=args.model_info.is_moe,
num_gpus_per_node=args.num_gpus_per_node, num_gpus_per_node=args.num_gpus_per_node,
) )
logger.info(f"Final DGD config with planner: {config}") logger.info(f"Final DGD config with planner: {config}")
......
...@@ -17,7 +17,7 @@ import json ...@@ -17,7 +17,7 @@ import json
import logging import logging
import math import math
import shlex import shlex
from typing import Literal, Optional, Protocol from typing import Optional
from pydantic import BaseModel from pydantic import BaseModel
...@@ -378,69 +378,3 @@ def update_image(config: dict, image: str) -> dict: ...@@ -378,69 +378,3 @@ def update_image(config: dict, image: str) -> dict:
logger.debug(f"Updated image for {service_name} to {image}") logger.debug(f"Updated image for {service_name} to {image}")
return cfg.model_dump() return cfg.model_dump()
class ConfigModifierProtocol(Protocol):
@classmethod
def convert_config(
cls,
config: dict,
target: Literal["prefill", "decode"],
is_moe_model: bool = False,
) -> dict:
...
@classmethod
def set_config_tp_size(
cls,
config: dict,
tp_size: int,
component_type: SubComponentType = SubComponentType.DECODE,
) -> dict:
...
@classmethod
def set_config_tep_size(
cls,
config: dict,
tep_size: int,
num_gpus_per_node: int,
component_type: SubComponentType = SubComponentType.DECODE,
) -> dict:
...
@classmethod
def set_config_dep_size(
cls,
config: dict,
dep_size: int,
num_gpus_per_node: int,
component_type: SubComponentType = SubComponentType.DECODE,
) -> dict:
...
@classmethod
def get_model_name(cls, config: dict) -> str:
...
@classmethod
def get_port(cls, config: dict) -> int:
...
@classmethod
def get_kv_cache_size_from_dynamo_log(
cls, dynamo_log_fn: str, attention_dp_size: int = 1
) -> int:
...
@classmethod
def load_default_config(cls) -> dict:
...
@classmethod
def update_model(cls, config: dict, model_name: str) -> dict:
...
@classmethod
def update_image(cls, config: dict, image: str) -> dict:
...
...@@ -16,7 +16,9 @@ ...@@ -16,7 +16,9 @@
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
if TYPE_CHECKING: if TYPE_CHECKING:
from benchmarks.profiler.utils.config import ConfigModifierProtocol from benchmarks.profiler.utils.config_modifiers.protocol import (
ConfigModifierProtocol,
)
from benchmarks.profiler.utils.config_modifiers.sglang import SGLangConfigModifier from benchmarks.profiler.utils.config_modifiers.sglang import SGLangConfigModifier
from benchmarks.profiler.utils.config_modifiers.trtllm import TrtllmConfigModifier from benchmarks.profiler.utils.config_modifiers.trtllm import TrtllmConfigModifier
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import copy
import logging
from dataclasses import dataclass
from enum import Enum
from benchmarks.profiler.utils.defaults import EngineType
from benchmarks.profiler.utils.model_info import ModelInfo
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
class ParallelizationStrategy(Enum):
"""Enum for parallelization strategy types."""
TP = "TP"
TEP = "TEP"
DEP = "DEP"
@dataclass(frozen=True)
class ParallelizationMapping:
"""
Represents parallelization mapping of configs
"""
tp: int | None = None
tep: int | None = None
dep: int | None = None
def label(self) -> str:
if self.tp is not None:
return f"{ParallelizationStrategy.TP.value}={self.tp}"
if self.tep is not None:
return f"{ParallelizationStrategy.TEP.value}={self.tep}"
if self.dep is not None:
return f"{ParallelizationStrategy.DEP.value}={self.dep}"
return "default"
def get_tp_size(self) -> int:
"""
Get the effective TP size for KV heads splitting.
Both TP and TEP split KV heads, DEP doesn't (returns 1).
"""
if self.tp is not None:
return self.tp
if self.tep is not None:
return self.tep
return 1 # DEP has TP split of 1
def get_expert_split(self) -> int:
"""
Get the effective expert split size.
Both TEP and DEP split experts, TP doesn't (returns 1).
"""
if self.tep is not None:
return self.tep
if self.dep is not None:
return self.dep
return 1 # TP has expert split of 1
def get_attn_dp_size(self, num_gpus: int) -> int:
"""
Get the attention data parallelism size.
DEP uses data parallelism for attention (returns num_gpus).
TP and TEP don't use data parallelism for attention (returns 1).
Args:
num_gpus: Total number of GPUs being used
Returns:
The attention data parallelism size
"""
if self.dep is not None:
return num_gpus
return 1 # TP and TEP have attention DP size of 1
def _check_divisibility(
value: int | None,
divisor: int,
value_name: str,
divisor_name: str,
mapping_label: str,
) -> bool:
"""
Check if value is divisible by divisor.
Returns True if valid (or value is None), False if invalid.
Args:
value: The value to check (e.g., num_kv_heads, num_experts)
divisor: The divisor to check against
value_name: Name of the value for error messages
divisor_name: Name of the divisor for error messages (e.g., "tp_size", "expert_split")
mapping_label: Label of the mapping for error messages
"""
if value is None:
logger.warning(
f"Skipping {value_name} divisibility check for {mapping_label}: {value_name} is unknown"
)
return True
if divisor > 1 and int(value) % divisor != 0:
logger.warning(
f"Invalid mapping {mapping_label}: {value_name}={value} not divisible by {divisor_name}={divisor}"
)
return False
return True
def _validate_intermediate_size(
mapping: ParallelizationMapping,
intermediate_size: int | None,
quant_block: int | None,
) -> bool:
"""
Validate intermediate size and quantization block for TP and TEP strategies.
Checks:
- intermediate_size % tp_size == 0
- (intermediate_size // tp_size) divides quant_block (if quant_block is known)
"""
tp_size = mapping.get_tp_size()
# Check basic divisibility
if not _check_divisibility(
intermediate_size, tp_size, "intermediate_size", "tp_size", mapping.label()
):
return False
# Additional check for quantization block constraint
if intermediate_size is not None and quant_block is not None and tp_size > 1:
per_shard = int(intermediate_size) // tp_size
if not _check_divisibility(
per_shard, quant_block, "per_shard", "quant_block", mapping.label()
):
return False
return True
def get_candidate_parallel_mappings(
num_gpus: int, model_info: ModelInfo, phase: str
) -> list[ParallelizationMapping]:
"""
Return a list of candidate parallelization mappings for a given GPU count and phase,
verified against model properties.
Verification rules:
- TP and TEP must divide num_kv_heads (if available)
- TEP and DEP must divide num_experts (if available)
"""
is_moe = bool(model_info.is_moe)
num_kv_heads = model_info.num_kv_heads
num_experts = model_info.num_experts
intermediate_size = model_info.intermediate_size
quant_block = model_info.quantization_block_size
candidates: list[ParallelizationMapping] = []
if is_moe:
if phase == EngineType.PREFILL:
candidates = [ParallelizationMapping(tep=num_gpus)]
elif phase == EngineType.DECODE:
candidates = [
ParallelizationMapping(dep=num_gpus),
ParallelizationMapping(tep=num_gpus),
]
else:
candidates = [ParallelizationMapping(tp=num_gpus)]
# Verify candidates against model constraints
verified: list[ParallelizationMapping] = []
for m in candidates:
# Check KV heads divisibility
if not _check_divisibility(
num_kv_heads, m.get_tp_size(), "num_kv_heads", "tp_size", m.label()
):
continue
# Check experts divisibility
if not _check_divisibility(
num_experts, m.get_expert_split(), "num_experts", "expert_split", m.label()
):
continue
# Check intermediate size and quantization block
if not _validate_intermediate_size(m, intermediate_size, quant_block):
continue
verified.append(m)
return verified
def apply_parallel_mapping_to_config(
base_config: dict,
mapping: ParallelizationMapping,
phase: str,
config_modifier,
num_gpus_per_node: int | None,
) -> dict:
cfg = copy.deepcopy(base_config)
if mapping.tp is not None:
cfg = config_modifier.set_config_tp_size(cfg, mapping.tp)
elif phase == EngineType.PREFILL and mapping.tep is not None:
cfg = config_modifier.set_config_tep_size(cfg, mapping.tep, num_gpus_per_node)
elif phase == EngineType.DECODE and mapping.dep is not None:
cfg = config_modifier.set_config_dep_size(cfg, mapping.dep, num_gpus_per_node)
else:
pass
return cfg
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Protocol
from benchmarks.profiler.utils.defaults import EngineType
from dynamo.planner.defaults import SubComponentType
class ConfigModifierProtocol(Protocol):
@classmethod
def convert_config(
cls,
config: dict,
target: EngineType,
is_moe_model: bool = False,
) -> dict:
...
@classmethod
def set_config_tp_size(
cls,
config: dict,
tp_size: int,
component_type: SubComponentType = SubComponentType.DECODE,
) -> dict:
...
@classmethod
def set_config_tep_size(
cls,
config: dict,
tep_size: int,
num_gpus_per_node: int,
component_type: SubComponentType = SubComponentType.DECODE,
) -> dict:
...
@classmethod
def set_config_dep_size(
cls,
config: dict,
dep_size: int,
num_gpus_per_node: int,
component_type: SubComponentType = SubComponentType.DECODE,
) -> dict:
...
@classmethod
def get_model_name(cls, config: dict) -> str:
...
@classmethod
def get_port(cls, config: dict) -> int:
...
@classmethod
def get_kv_cache_size_from_dynamo_log(
cls, dynamo_log_fn: str, attention_dp_size: int = 1
) -> int:
...
@classmethod
def load_default_config(cls) -> dict:
...
@classmethod
def update_model(cls, config: dict, model_name: str) -> dict:
...
@classmethod
def update_image(cls, config: dict, image: str) -> dict:
...
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
import logging import logging
import re import re
from typing import Literal
import yaml import yaml
...@@ -22,6 +21,7 @@ from benchmarks.profiler.utils.config import ( ...@@ -22,6 +21,7 @@ from benchmarks.profiler.utils.config import (
from benchmarks.profiler.utils.defaults import ( from benchmarks.profiler.utils.defaults import (
DEFAULT_MODEL_NAME, DEFAULT_MODEL_NAME,
DYNAMO_RUN_DEFAULT_PORT, DYNAMO_RUN_DEFAULT_PORT,
EngineType,
) )
from dynamo.planner.defaults import SubComponentType from dynamo.planner.defaults import SubComponentType
...@@ -82,7 +82,7 @@ class SGLangConfigModifier: ...@@ -82,7 +82,7 @@ class SGLangConfigModifier:
def convert_config( def convert_config(
cls, cls,
config: dict, config: dict,
target: Literal["prefill", "decode"], target: EngineType,
is_moe_model: bool = False, is_moe_model: bool = False,
) -> dict: ) -> dict:
cfg = Config.model_validate(config) cfg = Config.model_validate(config)
...@@ -94,7 +94,7 @@ class SGLangConfigModifier: ...@@ -94,7 +94,7 @@ class SGLangConfigModifier:
if "Planner" in cfg.spec.services: if "Planner" in cfg.spec.services:
del cfg.spec.services["Planner"] del cfg.spec.services["Planner"]
if target == "prefill": if target == EngineType.PREFILL:
# Get service names by inferring from subComponentType first # Get service names by inferring from subComponentType first
prefill_service_name = get_service_name_by_type( prefill_service_name = get_service_name_by_type(
cfg, "sglang", SubComponentType.PREFILL cfg, "sglang", SubComponentType.PREFILL
...@@ -131,7 +131,7 @@ class SGLangConfigModifier: ...@@ -131,7 +131,7 @@ class SGLangConfigModifier:
worker_service.extraPodSpec.mainContainer.args = args worker_service.extraPodSpec.mainContainer.args = args
elif target == "decode": elif target == EngineType.DECODE:
# Get service names by inferring from subComponentType first # Get service names by inferring from subComponentType first
prefill_service_name = get_service_name_by_type( prefill_service_name = get_service_name_by_type(
cfg, "sglang", SubComponentType.PREFILL cfg, "sglang", SubComponentType.PREFILL
...@@ -292,6 +292,12 @@ class SGLangConfigModifier: ...@@ -292,6 +292,12 @@ class SGLangConfigModifier:
return DEFAULT_MODEL_NAME return DEFAULT_MODEL_NAME
args = break_arguments(args) args = break_arguments(args)
# Check for --model-path first (primary argument for SGLang)
for i, arg in enumerate(args):
if arg == "--model-path" and i + 1 < len(args):
return args[i + 1]
# Fall back to --served-model-name if --model-path not found
for i, arg in enumerate(args): for i, arg in enumerate(args):
if arg == "--served-model-name" and i + 1 < len(args): if arg == "--served-model-name" and i + 1 < len(args):
return args[i + 1] return args[i + 1]
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
import json import json
import logging import logging
import re import re
from typing import Literal
import yaml import yaml
...@@ -24,6 +23,7 @@ from benchmarks.profiler.utils.config import ( ...@@ -24,6 +23,7 @@ from benchmarks.profiler.utils.config import (
from benchmarks.profiler.utils.defaults import ( from benchmarks.profiler.utils.defaults import (
DEFAULT_MODEL_NAME, DEFAULT_MODEL_NAME,
DYNAMO_RUN_DEFAULT_PORT, DYNAMO_RUN_DEFAULT_PORT,
EngineType,
) )
from dynamo.planner.defaults import SubComponentType from dynamo.planner.defaults import SubComponentType
...@@ -84,7 +84,7 @@ class TrtllmConfigModifier: ...@@ -84,7 +84,7 @@ class TrtllmConfigModifier:
def convert_config( def convert_config(
cls, cls,
config: dict, config: dict,
target: Literal["prefill", "decode"], target: EngineType,
is_moe_model: bool = False, is_moe_model: bool = False,
) -> dict: ) -> dict:
if is_moe_model: if is_moe_model:
...@@ -101,7 +101,7 @@ class TrtllmConfigModifier: ...@@ -101,7 +101,7 @@ class TrtllmConfigModifier:
if "Planner" in cfg.spec.services: if "Planner" in cfg.spec.services:
del cfg.spec.services["Planner"] del cfg.spec.services["Planner"]
if target == "prefill": if target == EngineType.PREFILL:
# Get service names by inferring from subComponentType first # Get service names by inferring from subComponentType first
prefill_service_name = get_service_name_by_type( prefill_service_name = get_service_name_by_type(
cfg, "trtllm", SubComponentType.PREFILL cfg, "trtllm", SubComponentType.PREFILL
...@@ -157,7 +157,7 @@ class TrtllmConfigModifier: ...@@ -157,7 +157,7 @@ class TrtllmConfigModifier:
worker_service.extraPodSpec.mainContainer.args = args worker_service.extraPodSpec.mainContainer.args = args
elif target == "decode": elif target == EngineType.DECODE:
# Get service names by inferring from subComponentType first # Get service names by inferring from subComponentType first
prefill_service_name = get_service_name_by_type( prefill_service_name = get_service_name_by_type(
cfg, "trtllm", SubComponentType.PREFILL cfg, "trtllm", SubComponentType.PREFILL
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import logging import logging
from typing import Literal
import yaml import yaml
...@@ -20,6 +19,7 @@ from benchmarks.profiler.utils.config import ( ...@@ -20,6 +19,7 @@ from benchmarks.profiler.utils.config import (
from benchmarks.profiler.utils.defaults import ( from benchmarks.profiler.utils.defaults import (
DEFAULT_MODEL_NAME, DEFAULT_MODEL_NAME,
DYNAMO_RUN_DEFAULT_PORT, DYNAMO_RUN_DEFAULT_PORT,
EngineType,
) )
from dynamo.planner.defaults import SubComponentType from dynamo.planner.defaults import SubComponentType
...@@ -79,7 +79,7 @@ class VllmV1ConfigModifier: ...@@ -79,7 +79,7 @@ class VllmV1ConfigModifier:
def convert_config( def convert_config(
cls, cls,
config: dict, config: dict,
target: Literal["prefill", "decode"], target: EngineType,
is_moe_model: bool = False, is_moe_model: bool = False,
) -> dict: ) -> dict:
if is_moe_model: if is_moe_model:
...@@ -96,7 +96,7 @@ class VllmV1ConfigModifier: ...@@ -96,7 +96,7 @@ class VllmV1ConfigModifier:
if "Planner" in cfg.spec.services: if "Planner" in cfg.spec.services:
del cfg.spec.services["Planner"] del cfg.spec.services["Planner"]
if target == "prefill": if target == EngineType.PREFILL:
# Get service names by inferring from subComponentType first # Get service names by inferring from subComponentType first
prefill_service_name = get_service_name_by_type( prefill_service_name = get_service_name_by_type(
cfg, "vllm", SubComponentType.PREFILL cfg, "vllm", SubComponentType.PREFILL
...@@ -133,7 +133,7 @@ class VllmV1ConfigModifier: ...@@ -133,7 +133,7 @@ class VllmV1ConfigModifier:
worker_service.extraPodSpec.mainContainer.args = args worker_service.extraPodSpec.mainContainer.args = args
elif target == "decode": elif target == EngineType.DECODE:
# Get service names by inferring from subComponentType first # Get service names by inferring from subComponentType first
prefill_service_name = get_service_name_by_type( prefill_service_name = get_service_name_by_type(
cfg, "vllm", SubComponentType.PREFILL cfg, "vllm", SubComponentType.PREFILL
......
...@@ -13,9 +13,16 @@ ...@@ -13,9 +13,16 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from enum import Enum
DEFAULT_MODEL_NAME = "Qwen/Qwen3-0.6B" DEFAULT_MODEL_NAME = "Qwen/Qwen3-0.6B"
DYNAMO_RUN_DEFAULT_PORT = 8000 DYNAMO_RUN_DEFAULT_PORT = 8000
# set a decode maximum concurrency due to limits of profiling tools # set a decode maximum concurrency due to limits of profiling tools
# for MoE models with attn-dp, we might hit this limit # for MoE models with attn-dp, we might hit this limit
DECODE_MAX_CONCURRENCY = 2000 DECODE_MAX_CONCURRENCY = 2000
class EngineType(str, Enum):
PREFILL = "prefill"
DECODE = "decode"
...@@ -5,6 +5,7 @@ from pathlib import Path ...@@ -5,6 +5,7 @@ from pathlib import Path
from typing import Optional, Union from typing import Optional, Union
from huggingface_hub import model_info from huggingface_hub import model_info
from pydantic import BaseModel
from transformers import AutoConfig from transformers import AutoConfig
DTYPE_BYTES_MAP = { DTYPE_BYTES_MAP = {
...@@ -103,10 +104,21 @@ def get_model_weight_size( ...@@ -103,10 +104,21 @@ def get_model_weight_size(
return get_model_weight_size_from_hub(str(model_name_or_path)) return get_model_weight_size_from_hub(str(model_name_or_path))
class ModelInfo(BaseModel):
model_size: float
architecture: str
is_moe: bool
max_context_length: Optional[int] = None
num_experts: Optional[int] = None
intermediate_size: Optional[int] = None
num_kv_heads: Optional[int] = None
quantization_block_size: Optional[int] = None
def get_model_info( def get_model_info(
model_name_or_path: Union[str, Path], model_name_or_path: Union[str, Path],
trust_remote_code: bool = False, trust_remote_code: bool = False,
) -> dict: ) -> ModelInfo:
model_size = get_model_weight_size(model_name_or_path) model_size = get_model_weight_size(model_name_or_path)
config = AutoConfig.from_pretrained( config = AutoConfig.from_pretrained(
...@@ -114,10 +126,11 @@ def get_model_info( ...@@ -114,10 +126,11 @@ def get_model_info(
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
) )
if config.architectures[0] in MOE_ARCHITECTURES: architecture = config.architectures[0]
config.is_moe = True if architecture in MOE_ARCHITECTURES:
is_moe = True
else: else:
config.is_moe = False is_moe = False
# Detect max context length from config # Detect max context length from config
# Different models use different attribute names for max context length # Different models use different attribute names for max context length
...@@ -132,7 +145,7 @@ def get_model_info( ...@@ -132,7 +145,7 @@ def get_model_info(
# Detect number of experts for MoE models # Detect number of experts for MoE models
# Different models use different attribute names # Different models use different attribute names
num_experts = None num_experts = None
if config.is_moe: if is_moe:
expert_attrs = [ expert_attrs = [
"n_routed_experts", # DeepSeek V3/R1 "n_routed_experts", # DeepSeek V3/R1
"num_local_experts", # Mixtral, Qwen "num_local_experts", # Mixtral, Qwen
...@@ -145,12 +158,78 @@ def get_model_info( ...@@ -145,12 +158,78 @@ def get_model_info(
num_experts = value num_experts = value
break break
return { # Detect intermediate size (FFN hidden dimension)
"model_size": model_size, intermediate_size = None
"is_moe": config.is_moe, intermediate_attrs = [
"max_context_length": max_context_length, "intermediate_size", # Most common (BERT, LLaMA, etc.)
"num_experts": num_experts, "ffn_dim", # Some transformer models
} ]
for attr in intermediate_attrs:
if hasattr(config, attr):
value = getattr(config, attr)
if value is not None:
intermediate_size = value
break
# Detect number of key-value heads (for GQA)
num_kv_heads = None
kv_head_attrs = [
"num_key_value_heads", # LLaMA 2/3, Mistral, etc.
"num_kv_heads", # Alternative name
]
for attr in kv_head_attrs:
if hasattr(config, attr):
value = getattr(config, attr)
if value is not None:
num_kv_heads = value
break
# If not found, check if it equals num_attention_heads (standard MHA)
if num_kv_heads is None and hasattr(config, "num_attention_heads"):
num_kv_heads = config.num_attention_heads
# Detect quantization block size
quantization_block_size = None
if hasattr(config, "quantization_config"):
quant_config = config.quantization_config
if isinstance(quant_config, dict):
# Check for common quantization block size attributes
quantization_block_size = (
quant_config.get("weight_block_size")
or quant_config.get("block_size")
or quant_config.get("group_size")
or quant_config.get("q_group_size")
)
elif quant_config is not None:
# Handle object-based quantization config
for attr in [
"weight_block_size",
"block_size",
"group_size",
"q_group_size",
]:
if hasattr(quant_config, attr):
value = getattr(quant_config, attr)
if value is not None:
quantization_block_size = value
break
# Handle case where block size is a list (e.g., [128, 128] for [input, output] block sizes)
if (
isinstance(quantization_block_size, list)
and len(quantization_block_size) > 0
):
quantization_block_size = max(quantization_block_size)
return ModelInfo(
model_size=model_size,
architecture=architecture,
is_moe=is_moe,
max_context_length=max_context_length,
num_experts=num_experts,
intermediate_size=intermediate_size,
num_kv_heads=num_kv_heads,
quantization_block_size=quantization_block_size,
)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
# limitations under the License. # limitations under the License.
import logging import logging
from collections import defaultdict
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
...@@ -33,22 +34,27 @@ console_handler.setFormatter(formatter) ...@@ -33,22 +34,27 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
def plot_prefill_performance(prefill_results, target_ttft, output_dir): def plot_prefill_performance(prefill_data, target_ttft, output_dir):
""" """
Plot prefill performance as a 2D scatter plot with GPU count annotations. Plot prefill performance as a 2D scatter plot with GPU count and mapping annotations.
Args: Args:
prefill_results: tuple of (prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu) prefill_data: PrefillProfileData instance containing profiling results
target_ttft: target TTFT value for the vertical line target_ttft: target TTFT value for the vertical line
output_dir: directory to save the plot output_dir: directory to save the plot
""" """
prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu = prefill_results
plt.figure(figsize=(10, 6)) plt.figure(figsize=(10, 6))
plt.scatter(prefill_ttft, prefill_thpt_per_gpu, s=100) plt.scatter(prefill_data.ttft, prefill_data.thpt_per_gpu, s=100)
for i, num_gpu in enumerate(prefill_num_gpu): for i, num_gpu in enumerate(prefill_data.num_gpus):
label_suffix = (
f" [{prefill_data.parallel_mapping_labels[i]}]"
if prefill_data.parallel_mapping_labels
and i < len(prefill_data.parallel_mapping_labels)
else ""
)
plt.annotate( plt.annotate(
f"{num_gpu} GPU(s)", f"{num_gpu} GPU(s){label_suffix}",
(prefill_ttft[i], prefill_thpt_per_gpu[i]), (prefill_data.ttft[i], prefill_data.thpt_per_gpu[i]),
xytext=(10, 0), xytext=(10, 0),
textcoords="offset points", textcoords="offset points",
fontsize=10, fontsize=10,
...@@ -70,19 +76,46 @@ def plot_prefill_performance(prefill_results, target_ttft, output_dir): ...@@ -70,19 +76,46 @@ def plot_prefill_performance(prefill_results, target_ttft, output_dir):
plt.close() plt.close()
def plot_decode_performance(decode_results, target_itl, output_dir): def plot_decode_performance(decode_data, target_itl, output_dir):
""" """
Plot decode performance with multiple GPU count lines. Plot decode performance with multiple GPU count lines.
Args: Args:
decode_results: list of tuples (num_gpu, itl_list, thpt_per_gpu_list) decode_data: DecodeProfileData instance containing profiling results
target_itl: target ITL value for the vertical line target_itl: target ITL value for the vertical line
output_dir: directory to save the plot output_dir: directory to save the plot
""" """
plt.figure(figsize=(10, 6)) plt.figure(figsize=(10, 6))
for num_gpu, itl_list, thpt_per_gpu_list in decode_results: # Group data by (num_gpus, parallel_mapping_label) combination
plt.plot(itl_list, thpt_per_gpu_list, label=f"{num_gpu} GPU(s)") grouped_data: defaultdict[tuple[int, str], dict[str, list[float]]] = defaultdict(
lambda: {"itl": [], "thpt": []}
)
for i in range(len(decode_data.num_gpus)):
num_gpu = decode_data.num_gpus[i]
label = (
decode_data.parallel_mapping_labels[i]
if decode_data.parallel_mapping_labels
else ""
)
key = (num_gpu, label)
grouped_data[key]["itl"].append(decode_data.itl[i])
grouped_data[key]["thpt"].append(decode_data.thpt_per_gpu[i])
# Plot each group as a line
for (num_gpu, parallel_mapping_label), data in sorted(grouped_data.items()):
if parallel_mapping_label:
label = f"{num_gpu} GPU(s) [{parallel_mapping_label}]"
else:
label = f"{num_gpu} GPU(s)"
# Sort by ITL for proper line plotting
sorted_pairs = sorted(zip(data["itl"], data["thpt"]))
itl_sorted = [x[0] for x in sorted_pairs]
thpt_sorted = [x[1] for x in sorted_pairs]
plt.plot(itl_sorted, thpt_sorted, label=label, marker="o")
plt.axvline( plt.axvline(
x=target_itl, color="r", linestyle="--", label=f"Target ITL: {target_itl} ms" x=target_itl, color="r", linestyle="--", label=f"Target ITL: {target_itl} ms"
...@@ -253,18 +286,24 @@ def plot_decode_3d_surface( ...@@ -253,18 +286,24 @@ def plot_decode_3d_surface(
plt.close() plt.close()
def plot_pd_joint_results(isl, osl, prefill_results, decode_results, output_dir): def plot_pd_joint_results(isl, osl, prefill_data, decode_data, output_dir):
"""
Plot joint prefill and decode results showing cost per 1000 requests under different SLA.
Args:
isl: input sequence length
osl: output sequence length
prefill_data: PrefillProfileData instance containing profiling results
decode_data: DecodeProfileData instance containing profiling results
output_dir: directory to save the plot
"""
GPU_COST_PER_HOUR = 3.0 # $3/hour GPU_COST_PER_HOUR = 3.0 # $3/hour
# compute pareto front for prefill # compute pareto front for prefill
p_ttft, p_thpt = compute_pareto(prefill_results[1], prefill_results[2]) p_ttft, p_thpt = compute_pareto(prefill_data.ttft, prefill_data.thpt_per_gpu)
# compute pareto front for decode # compute pareto front for decode
_d_itl, _d_thpt = [], [] d_itl, d_thpt = compute_pareto(decode_data.itl, decode_data.thpt_per_gpu)
for _d_result in decode_results:
_d_itl.extend(_d_result[1])
_d_thpt.extend(_d_result[2])
d_itl, d_thpt = compute_pareto(_d_itl, _d_thpt)
# convert to cost per thousand requests # convert to cost per thousand requests
p_ttft = np.array(p_ttft) p_ttft = np.array(p_ttft)
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import json
import logging
import os
import re
from typing import List, Optional, Tuple
logger = logging.getLogger(__name__)
def check_prefill_results_exist(output_dir: str, tp_size: int, isl: int) -> bool:
"""Check if prefill results already exist for a given TP size."""
work_dir = f"{output_dir}/prefill_tp{tp_size}"
result_file = f"{work_dir}/aiperf_isl{isl}/*/profile_export_aiperf.json"
# Check if the work directory exists
if not os.path.exists(work_dir):
return False
# Look for the aiperf result file
result_files = glob.glob(result_file)
if not result_files:
return False
# Verify the result file has valid data
try:
with open(result_files[0], "r") as f:
data = json.load(f)
# Check if it has the required metrics
if "time_to_first_token" in data and "avg" in data["time_to_first_token"]:
logger.info(
f"Found existing prefill results for TP{tp_size} at {result_files[0]}"
)
return True
except (json.JSONDecodeError, KeyError, FileNotFoundError):
pass
return False
def check_decode_results_exist(
output_dir: str, tp_size: int, isl: int, osl: int
) -> bool:
"""Check if decode results already exist for a given TP size."""
work_dir = f"{output_dir}/decode_tp{tp_size}"
# Check if the work directory exists
if not os.path.exists(work_dir):
return False
# Look for at least one decode result file
result_pattern = (
f"{work_dir}/aiperf_request*_isl{isl}_osl{osl}_n*/*/profile_export_aiperf.json"
)
result_files = glob.glob(result_pattern)
if not result_files:
return False
# Verify at least one result file has valid data
try:
with open(result_files[0], "r") as f:
data = json.load(f)
# Check if it has the required metrics
if "inter_token_latency" in data and "avg" in data["inter_token_latency"]:
logger.info(
f"Found existing decode results for TP{tp_size} at {result_files[0]} (and {len(result_files)-1} others)"
)
return True
except (json.JSONDecodeError, KeyError, FileNotFoundError):
pass
return False
def load_existing_prefill_results(
output_dir: str, tp_size: int, isl: int
) -> Tuple[Optional[float], Optional[float]]:
"""Load existing prefill results from disk."""
work_dir = f"{output_dir}/prefill_tp{tp_size}"
result_file = f"{work_dir}/aiperf_isl{isl}/*/profile_export_aiperf.json"
result_files = glob.glob(result_file)
if result_files:
try:
with open(result_files[0], "r") as f:
data = json.load(f)
ttft = data["time_to_first_token"]["avg"]
thpt_per_gpu = isl / ttft / tp_size * 1000
return ttft, thpt_per_gpu
except (json.JSONDecodeError, KeyError, FileNotFoundError):
pass
return None, None
def load_existing_decode_results(
output_dir: str, tp_size: int, isl: int, osl: int
) -> List[Tuple[float, float, int]]:
"""Load existing decode results from disk."""
work_dir = f"{output_dir}/decode_tp{tp_size}"
result_pattern = (
f"{work_dir}/aiperf_request*_isl{isl}_osl{osl}_n*/*/profile_export_aiperf.json"
)
result_files = glob.glob(result_pattern)
decode_results = []
for result_file in result_files:
try:
with open(result_file, "r") as f:
data = json.load(f)
itl = data["inter_token_latency"]["avg"]
thpt_per_gpu = data["output_token_throughput"]["avg"] / tp_size
# Extract concurrency from filename
match = re.search(r"aiperf_request(\d+)_", result_file)
if match:
concurrency = int(match.group(1))
decode_results.append((itl, thpt_per_gpu, concurrency))
except (json.JSONDecodeError, KeyError, FileNotFoundError):
continue
return decode_results
...@@ -76,8 +76,6 @@ def create_profiler_parser() -> argparse.Namespace: ...@@ -76,8 +76,6 @@ def create_profiler_parser() -> argparse.Namespace:
max_num_gpus_per_engine: Int (maximum number of GPUs per engine, default: 0) max_num_gpus_per_engine: Int (maximum number of GPUs per engine, default: 0)
num_gpus_per_node: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 0) num_gpus_per_node: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 0)
sweep: sweep:
skip_existing_results: Boolean (skip TP sizes that already have results in the output directory, default: False)
force_rerun: Boolean (force re-running all tests even if results already exist (overrides --skip-existing-results), default: False)
prefill_interpolation_granularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16) prefill_interpolation_granularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
decode_interpolation_granularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6) decode_interpolation_granularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
use_ai_configurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False) use_ai_configurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
...@@ -158,26 +156,20 @@ def create_profiler_parser() -> argparse.Namespace: ...@@ -158,26 +156,20 @@ def create_profiler_parser() -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"--min-num-gpus-per-engine", "--min-num-gpus-per-engine",
type=int, type=int,
default=config.get("hardware", {}).get("min_num_gpus_per_engine", 1), default=config.get("hardware", {}).get("min_num_gpus_per_engine", 0),
help="minimum number of GPUs per engine", help="minimum number of GPUs per engine",
) )
parser.add_argument( parser.add_argument(
"--max-num-gpus-per-engine", "--max-num-gpus-per-engine",
type=int, type=int,
default=config.get("hardware", {}).get("max_num_gpus_per_engine", 8), default=config.get("hardware", {}).get("max_num_gpus_per_engine", 0),
help="maximum number of GPUs per engine", help="maximum number of GPUs per engine",
) )
parser.add_argument( parser.add_argument(
"--skip-existing-results", "--num-gpus-per-node",
action="store_true", type=int,
default=config.get("sweep", {}).get("skip_existing_results", False), default=config.get("hardware", {}).get("num_gpus_per_node", 0),
help="Skip TP sizes that already have results in the output directory", help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
)
parser.add_argument(
"--force-rerun",
action="store_true",
default=config.get("sweep", {}).get("force_rerun", False),
help="Force re-running all tests even if results already exist (overrides --skip-existing-results)",
) )
parser.add_argument( parser.add_argument(
"--isl", "--isl",
...@@ -235,19 +227,6 @@ def create_profiler_parser() -> argparse.Namespace: ...@@ -235,19 +227,6 @@ def create_profiler_parser() -> argparse.Namespace:
default=config.get("sweep", {}).get("dry_run", False), default=config.get("sweep", {}).get("dry_run", False),
help="Dry run the profile job", help="Dry run the profile job",
) )
parser.add_argument(
"--is-moe-model",
action="store_true",
dest="is_moe_model",
default=config.get("engine", {}).get("is_moe_model", False),
help="Enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode",
)
parser.add_argument(
"--num-gpus-per-node",
type=int,
default=config.get("hardware", {}).get("num_gpus_per_node", 8),
help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
)
parser.add_argument( parser.add_argument(
"--enable-gpu-discovery", "--enable-gpu-discovery",
action="store_true", action="store_true",
...@@ -311,9 +290,5 @@ def create_profiler_parser() -> argparse.Namespace: ...@@ -311,9 +290,5 @@ def create_profiler_parser() -> argparse.Namespace:
if not args.model and not args.config: if not args.model and not args.config:
parser.error("--model or --config is required (provide at least one)") parser.error("--model or --config is required (provide at least one)")
# Run auto-generation if GPU discovery is enabled
# This will override any manually specified hardware parameters
if args.enable_gpu_discovery:
auto_generate_search_space(args) auto_generate_search_space(args)
return args return args
...@@ -9,7 +9,7 @@ import os ...@@ -9,7 +9,7 @@ import os
import yaml import yaml
from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS
from benchmarks.profiler.utils.model_info import get_model_info from benchmarks.profiler.utils.model_info import ModelInfo, get_model_info
from deploy.utils.gpu_inventory import get_gpu_summary from deploy.utils.gpu_inventory import get_gpu_summary
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -23,7 +23,9 @@ console_handler.setFormatter(formatter) ...@@ -23,7 +23,9 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
MODEL_GPU_MEM_FRAC_MAX = 0.9 MODEL_GPU_MEM_FRAC_MAX = 0.9
MOE_MODEL_MAX_NUM_GPUS = 32
# for MoE models, we sweep up to number of GPUs that can hold 8x the model weights
MOE_MODEL_MAX_NUM_GPU_FACTOR = 8
def auto_generate_search_space(args: argparse.Namespace) -> None: def auto_generate_search_space(args: argparse.Namespace) -> None:
...@@ -31,17 +33,16 @@ def auto_generate_search_space(args: argparse.Namespace) -> None: ...@@ -31,17 +33,16 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:
args.backend args.backend
] # args.backend is already validated in argparse ] # args.backend is already validated in argparse
# first check if config file exists # first get the config
if args.model:
if not args.config: if not args.config:
# modify config file from default config file # modify config file from default config file
logger.info("DGD config file not provided, using default config file") logger.info("DGD config file not provided, using default config file")
config = config_modifier.load_default_config() config = config_modifier.load_default_config()
else: else:
with open(args.config, "r") as f: with open(args.config, "r") as f:
config = yaml.safe_load(f) config = yaml.safe_load(f)
if args.model:
logger.info(f"Updating model in DGD config file to {args.model}") logger.info(f"Updating model in DGD config file to {args.model}")
config = config_modifier.update_model(config, args.model) config = config_modifier.update_model(config, args.model)
if args.dgd_image: if args.dgd_image:
...@@ -55,23 +56,26 @@ def auto_generate_search_space(args: argparse.Namespace) -> None: ...@@ -55,23 +56,26 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:
yaml.dump(config, f) yaml.dump(config, f)
args.config = config_fn args.config = config_fn
# now determine the search space # get model info and update args
model_info = None model_info: ModelInfo | None = None
if args.model: if not args.model:
# get the model name from config
args.model = config_modifier.get_model_name(config)
logger.info(f"Getting model info for {args.model}...") logger.info(f"Getting model info for {args.model}...")
model_info = get_model_info(args.model) model_info = get_model_info(args.model)
num_experts_str = ( num_experts_str = (
f", num_experts={model_info['num_experts']}" f", num_experts={model_info.num_experts}"
if model_info.get("num_experts") if model_info.num_experts is not None
else "" else ""
) )
logger.info( logger.info(
f"Model {args.model} has size {model_info['model_size']}, is_moe={model_info['is_moe']}, and max_context_length={model_info['max_context_length']}{num_experts_str}" f"Model {args.model} has size {model_info.model_size}, is_moe={model_info.is_moe}, and max_context_length={model_info.max_context_length}{num_experts_str}"
) )
args.is_moe_model = model_info["is_moe"] # type: ignore[assignment] args.model_info = model_info
args.max_context_length = model_info["max_context_length"] # type: ignore[assignment]
# now determine the search space
if args.enable_gpu_discovery:
if ( if (
args.min_num_gpus_per_engine == 0 args.min_num_gpus_per_engine == 0
or args.max_num_gpus_per_engine == 0 or args.max_num_gpus_per_engine == 0
...@@ -90,16 +94,20 @@ def auto_generate_search_space(args: argparse.Namespace) -> None: ...@@ -90,16 +94,20 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:
) )
# model_info should be set by now (checked above), but mypy needs explicit verification # model_info should be set by now (checked above), but mypy needs explicit verification
assert model_info is not None, "model_info must be set when model is provided" assert (
model_info is not None
), "model_info must be set when model is provided"
vram_mib = int(gpu_info["vram"]) # type: ignore[call-overload]
gpus_per_node = int(gpu_info["gpus_per_node"]) # type: ignore[call-overload]
min_gpu = math.ceil( min_gpu = math.ceil(
model_info["model_size"] / MODEL_GPU_MEM_FRAC_MAX / gpu_info["vram"] # type: ignore[operator] model_info.model_size / MODEL_GPU_MEM_FRAC_MAX / vram_mib
)
max_gpu = (
gpu_info["gpus_per_node"] # type: ignore[misc]
if not model_info["is_moe"]
else MOE_MODEL_MAX_NUM_GPUS
) )
if not model_info.is_moe:
max_gpu = gpus_per_node
else:
max_gpu = max(min_gpu * MOE_MODEL_MAX_NUM_GPU_FACTOR, gpus_per_node)
if min_gpu > max_gpu: if min_gpu > max_gpu:
error_msg = f"No valid GPU configuration found for model {args.model} on the cluster with {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node" error_msg = f"No valid GPU configuration found for model {args.model} on the cluster with {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node"
logger.error(error_msg) logger.error(error_msg)
...@@ -110,7 +118,22 @@ def auto_generate_search_space(args: argparse.Namespace) -> None: ...@@ -110,7 +118,22 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:
) )
args.min_num_gpus_per_engine = min_gpu args.min_num_gpus_per_engine = min_gpu
args.max_num_gpus_per_engine = max_gpu args.max_num_gpus_per_engine = max_gpu
args.num_gpus_per_node = gpu_info["gpus_per_node"] # type: ignore[assignment] args.num_gpus_per_node = gpus_per_node # type: ignore[assignment]
args.num_experts = model_info.get("num_experts") # type: ignore[assignment] else:
# use default values for GPUs
if args.min_num_gpus_per_engine == 0:
logger.warning(
"GPU discover is disabled and min_num_gpus_per_engine is not specified, setting to 1"
)
args.min_num_gpus_per_engine = 1
if args.max_num_gpus_per_engine == 0:
logger.warning(
"GPU discover is disabled and max_num_gpus_per_engine is not specified, setting to 4"
)
args.max_num_gpus_per_engine = 4
if args.num_gpus_per_node == 0:
logger.warning(
"GPU discover is disabled and num_gpus_per_node is not specified, setting to 8"
)
args.num_gpus_per_node = 8
return return
...@@ -37,8 +37,7 @@ spec: ...@@ -37,8 +37,7 @@ spec:
# Engine configuration # Engine configuration
engine: engine:
max_context_length: 16384 # Maximum context length supported by the model max_context_length: 16384 # will override max context length of the model if provided
is_moe_model: false # Enable MoE model support (uses TEP/DEP instead of TP)
# Hardware configuration # Hardware configuration
hardware: hardware:
......
...@@ -852,7 +852,7 @@ var _ = Describe("DGDR Helper Functions", func() { ...@@ -852,7 +852,7 @@ var _ = Describe("DGDR Helper Functions", func() {
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"sweep": map[string]interface{}{ "sweep": map[string]interface{}{
"force_rerun": true, "prefill_interpolation_granularity": 16,
}, },
}), }),
}, },
......
...@@ -315,7 +315,8 @@ profilingConfig: ...@@ -315,7 +315,8 @@ profilingConfig:
# Profiling sweep settings (optional) # Profiling sweep settings (optional)
sweep: sweep:
force_rerun: false prefill_interpolation_granularity: 16 # Number of samples for prefill ISL sweep
decode_interpolation_granularity: 6 # Number of samples for decode sweep
``` ```
> **Note**: `engine.config` is a **file path** to a DGD YAML file, not inline configuration. Use ConfigMapRef (recommended) or leave it unset to auto-generate. > **Note**: `engine.config` is a **file path** to a DGD YAML file, not inline configuration. Use ConfigMapRef (recommended) or leave it unset to auto-generate.
......
...@@ -18,6 +18,19 @@ project_root = Path(__file__).parent.parent.parent ...@@ -18,6 +18,19 @@ project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root)) sys.path.insert(0, str(project_root))
from benchmarks.profiler.profile_sla import run_profile # noqa: E402 from benchmarks.profiler.profile_sla import run_profile # noqa: E402
from benchmarks.profiler.utils.model_info import ModelInfo # noqa: E402
# Override the logger fixture from conftest.py to prevent directory creation
@pytest.fixture(autouse=True)
def logger(request):
"""Override the logger fixture to prevent test directory creation.
This replaces the logger fixture from tests/conftest.py that creates
directories named after each test.
"""
# Simply do nothing - no directories created, no file handlers added
yield
class TestProfileSlaAiconfigurator: class TestProfileSlaAiconfigurator:
...@@ -41,11 +54,9 @@ class TestProfileSlaAiconfigurator: ...@@ -41,11 +54,9 @@ class TestProfileSlaAiconfigurator:
self.osl = 500 self.osl = 500
self.ttft = 50 self.ttft = 50
self.itl = 10 self.itl = 10
self.max_context_length = 16384
self.prefill_interpolation_granularity = 16 self.prefill_interpolation_granularity = 16
self.decode_interpolation_granularity = 6 self.decode_interpolation_granularity = 6
self.service_name = "" self.service_name = ""
self.is_moe_model = False
self.dry_run = False self.dry_run = False
self.use_ai_configurator = True self.use_ai_configurator = True
self.aic_system = "h200_sxm" self.aic_system = "h200_sxm"
...@@ -54,6 +65,13 @@ class TestProfileSlaAiconfigurator: ...@@ -54,6 +65,13 @@ class TestProfileSlaAiconfigurator:
self.aic_backend_version = "0.20.0" self.aic_backend_version = "0.20.0"
self.num_gpus_per_node = 8 self.num_gpus_per_node = 8
self.deploy_after_profile = False self.deploy_after_profile = False
# Provide minimal model_info to avoid HF queries
self.model_info = ModelInfo(
model_size=16384.0,
architecture="TestArchitecture",
is_moe=False,
max_context_length=16384,
)
return Args() return Args()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment