Unverified Commit 4c648b11 authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

refactor: move core logics of DPP -> AIC and support static profiling (#6285)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
parent f6d4351f
......@@ -40,7 +40,7 @@ classifiers = [
]
dependencies = [
"aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@7a24afd98714af13f061cffe784d4808f5356d45",
"aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@168a948d5bc32209728fe8639191a9e0d9083d18",
"aiperf @ git+https://github.com/ai-dynamo/aiperf.git@c3fc969e9e30e9ddad35b2f613aa7c1d418f2de9",
"matplotlib",
"networkx",
......
......@@ -70,7 +70,7 @@ class SLAPlannerDefaults(BasePlannerDefaults):
kalman_r = 10.0
kalman_min_points = 5
no_correction = False # disable correction factor, might be useful under some conditions like long cold start time
no_correction = True
mode: Literal["disagg", "prefill", "decode", "agg"] = "disagg"
throughput_metrics_source = "frontend" # "frontend" | "router"
......
......@@ -133,6 +133,18 @@ class PlannerConfig(BaseModel):
"(enable_throughput_scaling or enable_load_scaling)"
)
if self.enable_throughput_scaling:
if (
self.pre_deployment_sweeping_mode is None
or self.pre_deployment_sweeping_mode
== PlannerPreDeploymentSweepMode.None_
):
raise ValueError(
"pre_deployment_sweeping_mode cannot be 'none' when "
"enable_throughput_scaling is True. Throughput-based scaling "
"requires pre-deployment sweeping to profile engine performance."
)
if self.enable_load_scaling:
# Router metrics URL is required outside kubernetes mode
if not self.load_router_metrics_url and self.environment != "kubernetes":
......@@ -212,6 +224,9 @@ class PlannerConfig(BaseModel):
return cls.model_validate(data)
def scaling_enabled(self) -> bool:
    """Report whether at least one scaling mode is switched on.

    Yields ``enable_throughput_scaling`` when that flag is truthy and
    ``enable_load_scaling`` otherwise.
    """
    throughput_flag = self.enable_throughput_scaling
    if throughput_flag:
        return throughput_flag
    return self.enable_load_scaling
if __name__ == "__main__":
from pathlib import Path
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Entry point for the Dynamo profiler.
Usage::
python -m dynamo.profiler --config <json string or path to json/yaml>
python -m dynamo.profiler --config '{"model": "Qwen/Qwen3-32B", ...}'
python -m dynamo.profiler --config /path/to/dgdr_spec.yaml
"""
import argparse
import asyncio
import json
import logging
import os
from pathlib import Path
import yaml
from dynamo.profiler.utils.dgdr_v1beta1_types import DynamoGraphDeploymentRequestSpec
from .profile_sla import run_profile
from .utils.profile_common import (
DEFAULT_DECODE_INTERPOLATION_GRANULARITY,
DEFAULT_DEPLOYMENT_TIMEOUT,
DEFAULT_DRY_RUN,
DEFAULT_OUTPUT_DIR,
DEFAULT_PREFILL_INTERPOLATION_GRANULARITY,
ProfilerOperationalConfig,
)
logger = logging.getLogger(__name__)
def _parse_dgdr_spec(config_arg: str) -> DynamoGraphDeploymentRequestSpec:
"""Parse a DGDR spec from a CLI ``--config`` argument.
Accepts a file path (JSON/YAML) or an inline JSON string.
"""
path = Path(config_arg)
if path.is_file():
text = path.read_text()
suffix = path.suffix.lower()
if suffix in (".yaml", ".yml"):
data = yaml.safe_load(text)
else:
try:
data = json.loads(text)
except json.JSONDecodeError:
data = yaml.safe_load(text)
return DynamoGraphDeploymentRequestSpec.model_validate(data)
try:
data = json.loads(config_arg)
except json.JSONDecodeError as e:
raise ValueError(
f"--config value is neither a valid file path nor valid JSON. "
f"File not found: '{config_arg}'. JSON parse error: {e}"
) from e
return DynamoGraphDeploymentRequestSpec.model_validate(data)
def _parse_args() -> tuple[DynamoGraphDeploymentRequestSpec, ProfilerOperationalConfig]:
    """Read the profiler command line.

    Returns:
        A ``(spec, operational_config)`` pair: the DGDR spec decoded from
        ``--config`` and a ``ProfilerOperationalConfig`` built from the
        remaining flags.
    """
    parser = argparse.ArgumentParser(description="Dynamo Profiler")
    parser.add_argument(
        "--config",
        required=True,
        help="DynamoGraphDeploymentRequestSpec as JSON string or path to JSON/YAML file",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=DEFAULT_OUTPUT_DIR,
        help=f"Path to the output results directory (default: {DEFAULT_OUTPUT_DIR})",
    )

    # Integer-valued knobs share the same shape: flag, default, help summary.
    integer_flags = (
        (
            "--deployment-timeout",
            DEFAULT_DEPLOYMENT_TIMEOUT,
            "Max seconds to wait for deployment readiness",
        ),
        (
            "--prefill-interpolation-granularity",
            DEFAULT_PREFILL_INTERPOLATION_GRANULARITY,
            "Number of ISL samples for prefill interpolation",
        ),
        (
            "--decode-interpolation-granularity",
            DEFAULT_DECODE_INTERPOLATION_GRANULARITY,
            "Number of samples for decode interpolation",
        ),
    )
    for flag, fallback, summary in integer_flags:
        parser.add_argument(
            flag,
            type=int,
            default=fallback,
            help=f"{summary} (default: {fallback})",
        )

    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=DEFAULT_DRY_RUN,
        help="Skip deployments and benchmarking (dev mode)",
    )

    cli = parser.parse_args()
    spec = _parse_dgdr_spec(cli.config)
    operational = ProfilerOperationalConfig(
        output_dir=cli.output_dir,
        deployment_timeout=cli.deployment_timeout,
        prefill_interpolation_granularity=cli.prefill_interpolation_granularity,
        decode_interpolation_granularity=cli.decode_interpolation_granularity,
        dry_run=cli.dry_run,
    )
    return spec, operational
def main():
    """Configure logging, parse the CLI, and run the profiler.

    Logging goes to the root logger's default handler and, once the output
    directory exists, also to ``<output_dir>/profile_sla.log``.

    Raises:
        SystemExit: With status 1 if the command line / config cannot be
            parsed into a spec and operational config.
    """
    # One definition of the record layout, shared by console and file output.
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    date_format = "%Y-%m-%d %H:%M:%S"

    logging.basicConfig(
        level=logging.INFO,
        format=log_format,
        datefmt=date_format,
    )

    try:
        dgdr, ops = _parse_args()
    except Exception as e:
        # ``Exception`` already covers ``ValueError``; argparse's own
        # ``SystemExit`` is not an ``Exception`` and propagates untouched.
        logger.error("Failed to parse profiler config: %s", e)
        raise SystemExit(1) from e

    os.makedirs(ops.output_dir, exist_ok=True)
    log_file_handler = logging.FileHandler(f"{ops.output_dir}/profile_sla.log")
    log_file_handler.setLevel(logging.INFO)
    log_file_handler.setFormatter(logging.Formatter(log_format, date_format))
    logging.getLogger().addHandler(log_file_handler)

    asyncio.run(run_profile(dgdr, ops))


if __name__ == "__main__":
    main()
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Interpolation curve generation for planner pre-deployment sweeping."""
import logging
import os
import yaml
from deploy.utils.dynamo_deployment import DynamoDeploymentClient
from dynamo.planner.defaults import SubComponentType
from dynamo.planner.utils.planner_config import PlannerPreDeploymentSweepMode
from dynamo.profiler.utils.config import Config, get_service_name_by_type
from dynamo.profiler.utils.config_modifiers import CONFIG_MODIFIERS
from dynamo.profiler.utils.config_modifiers.parallelization_mapping import (
PickedParallelConfig,
)
from dynamo.profiler.utils.defaults import EngineType
from dynamo.profiler.utils.dgdr_v1beta1_types import DynamoGraphDeploymentRequestSpec
from dynamo.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from dynamo.profiler.utils.profile_common import ProfilerOperationalConfig
from dynamo.profiler.utils.profile_decode import (
profile_decode,
profile_decode_aiconfigurator,
)
from dynamo.profiler.utils.profile_prefill import (
profile_prefill,
profile_prefill_aiconfigurator,
)
logger = logging.getLogger(__name__)
async def run_interpolation(
    dgdr: DynamoGraphDeploymentRequestSpec,
    ops: ProfilerOperationalConfig,
    disagg_config: dict,
    best_prefill_config: PickedParallelConfig,
    best_decode_config: PickedParallelConfig,
    model: str,
    system: str,
    backend: str,
    isl: int,
    osl: int,
    sweep_max_context_length: int,
    deployment_clients: list,
) -> None:
    """Generate interpolation curves for the planner based on sweep mode.

    Takes the output disagg DGD config and uses ``convert_config`` to strip
    it down to standalone prefill / decode engines for profiling.

    The sweep mode is read from ``dgdr.features.planner``; when it is missing
    or ``None_`` the function logs and returns without doing anything. In
    ``Rapid`` mode the curves come from ``AIConfiguratorPerfEstimator``; in
    ``Thorough`` mode a deployment is created, profiled and deleted for each
    of the prefill and decode phases.

    Args:
        dgdr: Request spec; only the planner's pre-deployment sweeping mode
            is read here.
        ops: Operational settings (output directory, interpolation
            granularities, k8s namespace, deployment timeout).
        disagg_config: Disaggregated DGD config to convert into standalone
            prefill and decode configs.
        best_prefill_config: Picked parallelization for prefill; ``num_gpus``,
            ``tp_size`` and ``dp`` are read.
        best_decode_config: Picked parallelization for decode; ``num_gpus``,
            ``tp_size`` and ``dp`` are read.
        model: Passed to the estimator as ``hf_id`` (Rapid mode only).
        system: Passed lower-cased to the estimator (Rapid mode only).
        backend: Key into ``CONFIG_MODIFIERS``; also passed to the estimator.
        isl: Used only for ``estimator.get_max_kv_tokens`` in Rapid mode.
        osl: Used only for ``estimator.get_max_kv_tokens`` in Rapid mode.
        sweep_max_context_length: Forwarded to the profiling helpers.
        deployment_clients: List that each created client is appended to and
            removed from once its deployment is deleted — presumably so the
            caller can clean up deployments left behind by a failure; confirm
            against the caller.
    """
    planner_cfg = (
        dgdr.features.planner if (dgdr.features and dgdr.features.planner) else None
    )
    # Default to "no sweeping" unless the planner config explicitly sets a mode.
    sweep_mode = PlannerPreDeploymentSweepMode.None_
    if planner_cfg and planner_cfg.pre_deployment_sweeping_mode:
        sweep_mode = planner_cfg.pre_deployment_sweeping_mode
    if sweep_mode == PlannerPreDeploymentSweepMode.None_:
        logger.info(
            "Planner pre-deployment sweeping is disabled — skipping interpolation."
        )
        return
    config_modifier = CONFIG_MODIFIERS[backend]
    model_name, model_path = config_modifier.get_model_name(disagg_config)
    best_prefill_gpus = best_prefill_config.num_gpus
    best_decode_gpus = best_decode_config.num_gpus
    # --- Prefill interpolation ---
    prefill_config = config_modifier.convert_config(disagg_config, EngineType.PREFILL)
    work_dir = f"{ops.output_dir}/selected_prefill_interpolation"
    os.makedirs(work_dir, exist_ok=True)
    # The converted config is written to disk in every mode; Thorough mode
    # then deploys from this file.
    prefill_config_fn = f"{work_dir}/config.yaml"
    with open(prefill_config_fn, "w") as f:
        yaml.dump(prefill_config, f)
    if sweep_mode == PlannerPreDeploymentSweepMode.Rapid:
        logger.info("Using AIC simulation for prefill interpolation.")
        estimator = AIConfiguratorPerfEstimator(
            hf_id=model,
            system=system.lower(),
            backend=backend,
        )
        profile_prefill_aiconfigurator(
            work_dir,
            best_prefill_gpus,
            sweep_max_context_length,
            ops.prefill_interpolation_granularity,
            estimator,
            tp_size=best_prefill_config.tp_size,
        )
    elif sweep_mode == PlannerPreDeploymentSweepMode.Thorough:
        logger.info("Using real GPUs for prefill interpolation.")
        frontend_port = config_modifier.get_port(prefill_config)
        client = DynamoDeploymentClient(
            namespace=ops.k8s_namespace,
            base_log_dir=work_dir,
            model_name=model_name,
            frontend_port=frontend_port,
            deployment_name=prefill_config["metadata"]["name"],
        )
        # Register the client before creating the deployment so it is
        # tracked for as long as the deployment may exist.
        deployment_clients.append(client)
        await client.create_deployment(prefill_config_fn)
        logger.info("Waiting for prefill interpolation deployment...")
        try:
            await client.wait_for_deployment_ready(timeout=ops.deployment_timeout)
        except TimeoutError:
            logger.error("Prefill interpolation deployment timed out, skipping.")
            await client.delete_deployment()
            deployment_clients.remove(client)
            # NOTE: this returns from the whole function, so the decode
            # interpolation below is not attempted after a prefill timeout.
            return
        await client.get_deployment_logs()
        base_url = client.get_service_url()
        profile_prefill(
            work_dir,
            model_name,
            model_path,
            base_url,
            best_prefill_gpus,
            sweep_max_context_length,
            ops.prefill_interpolation_granularity,
            attention_dp_size=best_prefill_config.dp,
        )
        await client.delete_deployment()
        deployment_clients.remove(client)
    # --- Decode interpolation ---
    decode_config = config_modifier.convert_config(disagg_config, EngineType.DECODE)
    work_dir = f"{ops.output_dir}/selected_decode_interpolation"
    os.makedirs(work_dir, exist_ok=True)
    decode_config_fn = f"{work_dir}/config.yaml"
    with open(decode_config_fn, "w") as f:
        yaml.dump(decode_config, f)
    if sweep_mode == PlannerPreDeploymentSweepMode.Rapid:
        logger.info("Using AIC simulation for decode interpolation.")
        estimator = AIConfiguratorPerfEstimator(
            hf_id=model,
            system=system.lower(),
            backend=backend,
        )
        attention_dp_size = best_decode_config.dp
        # In Rapid mode the KV-token budget comes from the estimator rather
        # than from a deployment log.
        max_kv_tokens = estimator.get_max_kv_tokens(
            isl,
            osl,
            tp_size=best_decode_config.tp_size,
        )
        profile_decode_aiconfigurator(
            work_dir,
            best_decode_gpus,
            max_kv_tokens,
            sweep_max_context_length,
            ops.decode_interpolation_granularity,
            estimator,
            attention_dp_size,
            tp_size=best_decode_config.tp_size,
        )
    elif sweep_mode == PlannerPreDeploymentSweepMode.Thorough:
        logger.info("Using real GPUs for decode interpolation.")
        frontend_port = config_modifier.get_port(decode_config)
        client = DynamoDeploymentClient(
            namespace=ops.k8s_namespace,
            base_log_dir=work_dir,
            model_name=model_name,
            frontend_port=frontend_port,
            deployment_name=decode_config["metadata"]["name"],
        )
        deployment_clients.append(client)
        await client.create_deployment(decode_config_fn)
        logger.info("Waiting for decode interpolation deployment...")
        try:
            await client.wait_for_deployment_ready(timeout=ops.deployment_timeout)
        except TimeoutError:
            logger.error("Decode interpolation deployment timed out, skipping.")
            await client.delete_deployment()
            deployment_clients.remove(client)
            return
        await client.get_deployment_logs()
        attention_dp_size = best_decode_config.dp
        decode_cfg = Config.model_validate(decode_config)
        decode_service_name = get_service_name_by_type(
            decode_cfg, backend, SubComponentType.DECODE
        ).lower()
        # The KV cache size is parsed from the decode service's "0.log" under
        # the deployment's log directory.
        max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
            f"{work_dir}/{client.deployment_name}/{decode_service_name}/0.log",
            attention_dp_size=attention_dp_size,
        )
        base_url = client.get_service_url()
        profile_decode(
            work_dir,
            model_name,
            model_path,
            base_url,
            best_decode_gpus,
            max_kv_tokens,
            sweep_max_context_length,
            ops.decode_interpolation_granularity,
            attention_dp_size,
        )
        await client.delete_deployment()
        deployment_clients.remove(client)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RAPID search strategy: AIC simulation + picking + DGD generation."""
import logging
import pandas as pd
import yaml
from aiconfigurator.cli.main import _execute_task_configs, build_default_task_configs
from aiconfigurator.generator.api import (
generate_backend_artifacts,
generate_naive_config,
)
from aiconfigurator.generator.module_bridge import task_config_to_generator_config
from aiconfigurator.sdk.task import TaskConfig, TaskRunner
from dynamo.profiler.utils.config_modifiers import CONFIG_MODIFIERS
from dynamo.profiler.utils.dgdr_v1beta1_types import DynamoGraphDeploymentRequestSpec
from dynamo.profiler.utils.profile_common import derive_backend_image
logger = logging.getLogger(__name__)
def _generate_dgd_from_pick(
dgdr: DynamoGraphDeploymentRequestSpec,
best_config_df: pd.DataFrame,
chosen_exp: str,
task_configs: dict[str, TaskConfig],
) -> dict | None:
"""Generate a DGD config dict from the rank-1 picked result via AIC's generator."""
if best_config_df is None or best_config_df.empty:
return None
row = best_config_df.iloc[0]
tc = task_configs.get(chosen_exp)
# TODO: temporary workaround — when backend="auto", AIC's
# merge_experiment_results_by_mode collapses e.g. "agg_vllm" into "agg",
# but task_configs retains the original keys. Reconstruct the key from
# the winning row's backend column. Proper fix: AIC should return the
# original task config key alongside the merged chosen experiment name.
if tc is None and "backend" in row.index:
tc = task_configs.get(f"{chosen_exp}_{row['backend']}")
if tc is None:
return None
original_total_gpus = tc.total_gpus
if "total_gpus_needed" in row.index and row["total_gpus_needed"] > 0:
tc.total_gpus = int(row["total_gpus_needed"])
generator_overrides: dict = {}
k8s_overrides: dict = {}
k8s_overrides["k8s_image"] = derive_backend_image(dgdr.image, tc.backend_name)
if dgdr.modelCache:
if dgdr.modelCache.pvcName:
k8s_overrides["k8s_pvc_name"] = dgdr.modelCache.pvcName
if dgdr.modelCache.pvcMountPath:
k8s_overrides["k8s_pvc_mount_path"] = dgdr.modelCache.pvcMountPath
if dgdr.modelCache.pvcModelPath:
k8s_overrides["k8s_model_path_in_pvc"] = dgdr.modelCache.pvcModelPath
if k8s_overrides:
generator_overrides["K8sConfig"] = k8s_overrides
cfg = task_config_to_generator_config(
task_config=tc,
result_df=row,
generator_overrides=generator_overrides or None,
)
tc.total_gpus = original_total_gpus
artifacts = generate_backend_artifacts(
params=cfg,
backend=tc.backend_name,
backend_version=tc.backend_version,
use_dynamo_generator=True,
)
dgd_yaml = artifacts.get("k8s_deploy.yaml", "")
if dgd_yaml:
return yaml.safe_load(dgd_yaml)
return None
# in naive mode, use vllm as the default backend
_DEFAULT_NAIVE_BACKEND = "vllm"


def _run_naive_fallback(
    dgdr: DynamoGraphDeploymentRequestSpec,
    model: str,
    total_gpus: int,
    system: str,
    backend: str,
) -> dict:
    """Produce a result dict through naive config generation.

    Used when AIC cannot simulate the requested combination. An ``"auto"``
    backend is replaced by ``_DEFAULT_NAIVE_BACKEND``. The returned dict has
    an empty ``best_config_df``, zeroed ``best_latencies``, the generated
    ``dgd_config`` (or ``None``) and ``chosen_exp`` set to ``None``.
    """
    if backend == "auto":
        backend = _DEFAULT_NAIVE_BACKEND
        logger.info("Auto backend resolved to '%s' for naive fallback.", backend)
    logger.info(
        "AIC does not support this combo — falling back to naive config generation."
    )

    generated = generate_naive_config(model, total_gpus, system, backend)
    deploy_yaml = generated.get("artifacts", {}).get("k8s_deploy.yaml", "")

    dgd_config = None
    if deploy_yaml:
        dgd_config = yaml.safe_load(deploy_yaml)

    if dgd_config:
        modifier = CONFIG_MODIFIERS[backend]
        image = derive_backend_image(dgdr.image, backend)
        dgd_config = modifier.update_image(dgd_config, image)

        cache = dgdr.modelCache
        if cache and cache.pvcName:
            dgd_config = modifier.update_model_from_pvc(
                dgd_config,
                model_name=model,
                pvc_name=cache.pvcName,
                pvc_mount_path=cache.pvcMountPath,
                pvc_path=cache.pvcModelPath or "",
            )

    zero_latencies = {"ttft": 0.0, "tpot": 0.0, "request_latency": 0.0}
    return {
        "best_config_df": pd.DataFrame(),
        "best_latencies": zero_latencies,
        "dgd_config": dgd_config,
        "chosen_exp": None,
    }
def _run_autoscale_sim(
    dgdr: DynamoGraphDeploymentRequestSpec,
    model: str,
    system: str,
    backend: str,
    total_gpus: int,
    isl: int,
    osl: int,
    target_ttft: float,
    target_tpot: float,
    request_latency: float | None,
) -> dict:
    """Run the autoscale simulation for a single disagg task.

    Creates one ``TaskConfig`` in ``"disagg"`` serving mode, runs it through
    ``TaskRunner`` with ``autoscale=True``, reads the latencies from the first
    row of the returned pareto frame, and generates the DGD config from that
    frame.
    """
    planner = dgdr.features.planner if dgdr.features else None
    if planner and planner.enable_throughput_scaling:
        logger.warning(
            "Throughput-based scaling enabled — only disagg mode is supported."
        )

    disagg_task = TaskConfig(
        serving_mode="disagg",
        model_path=model,
        system_name=system,
        backend_name=backend,
        total_gpus=total_gpus,
        isl=isl,
        osl=osl,
        ttft=target_ttft,
        tpot=target_tpot,
        request_latency=request_latency,
    )
    outcome = TaskRunner().run(disagg_task, autoscale=True)
    pareto_df = outcome.get("pareto_df", pd.DataFrame())

    # Latencies stay at zero unless the simulation produced at least one row.
    latency_keys = ("ttft", "tpot", "request_latency")
    best_latencies = dict.fromkeys(latency_keys, 0.0)
    if pareto_df is not None and not pareto_df.empty:
        top_row = pareto_df.iloc[0]
        for key in latency_keys:
            best_latencies[key] = float(top_row.get(key, 0.0))

    task_configs = {"disagg": disagg_task}
    return {
        "best_config_df": pareto_df,
        "best_latencies": best_latencies,
        "dgd_config": _generate_dgd_from_pick(
            dgdr, pareto_df, "disagg", task_configs
        ),
        "chosen_exp": "disagg",
        "task_configs": task_configs,
    }
def _run_default_sim(
    dgdr: DynamoGraphDeploymentRequestSpec,
    model: str,
    system: str,
    backend: str,
    total_gpus: int,
    isl: int,
    osl: int,
    target_ttft: float,
    target_tpot: float,
    request_latency: float | None,
    picking_mode: str,
) -> dict:
    """Simulate the default task configs and pick the best experiment.

    In ``"load_match"`` mode with a workload on the spec, the workload's
    request rate and concurrency plus the GPU budget are forwarded to the
    simulation as extra keyword arguments. The DGD config is generated from
    the chosen experiment's best configs.
    """
    task_configs = build_default_task_configs(
        model_path=model,
        total_gpus=total_gpus,
        system=system,
        backend=backend,
        isl=isl,
        osl=osl,
        ttft=target_ttft,
        tpot=target_tpot,
        request_latency=request_latency,
    )

    if picking_mode == "load_match" and dgdr.workload is not None:
        extra_kwargs: dict = {
            "target_request_rate": dgdr.workload.requestRate,
            "target_concurrency": dgdr.workload.concurrency,
            "max_total_gpus": total_gpus,
        }
    else:
        extra_kwargs = {}

    # Only the chosen name, per-experiment best configs and latencies are used.
    chosen, configs_by_exp, _, _, latencies_by_exp = _execute_task_configs(
        task_configs,
        mode="default",
        top_n=5,
        **extra_kwargs,
    )

    best_config_df = configs_by_exp.get(chosen, pd.DataFrame())
    zero_latencies = {"ttft": 0.0, "tpot": 0.0, "request_latency": 0.0}
    best_latencies = latencies_by_exp.get(chosen, zero_latencies)

    return {
        "best_config_df": best_config_df,
        "best_latencies": best_latencies,
        "dgd_config": _generate_dgd_from_pick(
            dgdr, best_config_df, chosen, task_configs
        ),
        "chosen_exp": chosen,
        "task_configs": task_configs,
    }
def run_rapid(
    dgdr: DynamoGraphDeploymentRequestSpec,
    picking_mode: str,
    aic_supported: bool,
    model: str,
    system: str,
    backend: str,
    total_gpus: int,
    isl: int,
    osl: int,
    target_ttft: float,
    target_tpot: float,
    request_latency: float | None,
) -> dict:
    """Dispatch the RAPID strategy to the matching simulation path.

    Unsupported combinations go to the naive fallback, ``"autoscale"``
    picking goes to the autoscale simulation, and everything else goes to the
    default simulation. The result dict carries ``best_config_df``,
    ``best_latencies`` and ``dgd_config``.
    """
    if not aic_supported:
        return _run_naive_fallback(dgdr, model, total_gpus, system, backend)

    # Both simulation paths take the same leading positional arguments.
    shared_args = (
        dgdr,
        model,
        system,
        backend,
        total_gpus,
        isl,
        osl,
        target_ttft,
        target_tpot,
        request_latency,
    )
    if picking_mode == "autoscale":
        return _run_autoscale_sim(*shared_args)
    return _run_default_sim(*shared_args, picking_mode)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""THOROUGH search strategy: enumerate candidates, deploy, benchmark, pick."""
import logging
import os
import pandas as pd
import yaml
from aiconfigurator.generator.enumerate import enumerate_profiling_configs
from aiconfigurator.sdk.picking import pick_autoscale, pick_default, pick_load_match
from aiconfigurator.sdk.task import TaskConfig
from deploy.utils.dynamo_deployment import DynamoDeploymentClient
from dynamo.planner.defaults import SubComponentType
from dynamo.profiler.rapid import _generate_dgd_from_pick
from dynamo.profiler.utils.aic_dataframe import (
build_decode_row,
build_disagg_df_from_static,
build_prefill_row,
make_parallel_label,
)
from dynamo.profiler.utils.aiperf import (
get_decode_itl_and_thpt_per_gpu,
get_prefill_ttft,
)
from dynamo.profiler.utils.config import Config, get_service_name_by_type
from dynamo.profiler.utils.config_modifiers import CONFIG_MODIFIERS
from dynamo.profiler.utils.config_modifiers.protocol import apply_dgd_overrides
from dynamo.profiler.utils.dgdr_v1beta1_types import (
DynamoGraphDeploymentRequestSpec,
ModelCacheSpec,
)
from dynamo.profiler.utils.profile_common import (
ProfilerOperationalConfig,
derive_backend_image,
)
from dynamo.profiler.utils.profile_decode import get_num_request_range
logger = logging.getLogger(__name__)
async def _benchmark_prefill_candidates(
    prefill_candidates,
    ops: ProfilerOperationalConfig,
    isl: int,
    osl: int,
    model: str,
    system: str,
    backend: str,
    deployment_clients: list,
    config_modifier,
) -> pd.DataFrame:
    """Benchmark every prefill candidate and collect one row per success.

    Each candidate's DGD config is written to its own work directory,
    deployed, measured for TTFT at ``isl`` and torn down again. Candidates
    that time out while deploying, or whose TTFT comes back as ``None``,
    contribute no row. An empty frame is returned when nothing succeeded.
    """
    rows: list[dict] = []

    for cand in prefill_candidates:
        gpu_count = cand.num_gpus
        parallel = {
            "tp": cand.tp,
            "pp": cand.pp,
            "dp": cand.dp,
            "moe_tp": cand.moe_tp,
            "moe_ep": cand.moe_ep,
        }
        label = make_parallel_label(*parallel.values())
        dir_tag = label.replace("=", "").replace("/", "_")
        work_dir = f"{ops.output_dir}/prefill_{gpu_count}gpus_{dir_tag}"
        os.makedirs(work_dir, exist_ok=True)

        dgd = cand.dgd_config
        config_path = f"{work_dir}/config.yaml"
        with open(config_path, "w") as handle:
            yaml.dump(dgd, handle)

        model_name, model_path = config_modifier.get_model_name(dgd)
        port = config_modifier.get_port(dgd)
        logger.info("Profiling prefill candidate %s with %d GPUs...", label, gpu_count)

        client = DynamoDeploymentClient(
            namespace=ops.k8s_namespace,
            base_log_dir=work_dir,
            model_name=model_name,
            frontend_port=port,
            deployment_name=dgd["metadata"]["name"],
        )
        # Track the client for as long as its deployment may exist.
        deployment_clients.append(client)
        await client.create_deployment(config_path)

        logger.info("Waiting for prefill deployment to be ready...")
        try:
            await client.wait_for_deployment_ready(timeout=ops.deployment_timeout)
        except TimeoutError:
            logger.error("Prefill %s with %d GPUs timed out", label, gpu_count)
            await client.delete_deployment()
            deployment_clients.remove(client)
            continue
        logger.info("Prefill deployment ready")

        await client.get_deployment_logs()
        service_url = client.get_service_url()
        ttft = get_prefill_ttft(
            isl,
            f"{work_dir}/aiperf_isl{isl}",
            model_name,
            model_path,
            service_url,
            attention_dp_size=parallel["dp"],
        )

        await client.delete_deployment()
        deployment_clients.remove(client)

        if ttft is None:
            continue
        rows.append(
            build_prefill_row(
                model=model,
                isl=isl,
                osl=osl,
                ttft=ttft,
                **parallel,
                backend=backend,
                system=system,
            )
        )

    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(rows)
async def _benchmark_decode_candidates(
    decode_candidates,
    ops: ProfilerOperationalConfig,
    isl: int,
    osl: int,
    model: str,
    system: str,
    backend: str,
    deployment_clients: list,
    config_modifier,
) -> pd.DataFrame:
    """Benchmark every decode candidate across a sweep of request counts.

    Each candidate's DGD config is written to its own work directory and
    deployed. The KV cache size read from the decode service log bounds the
    concurrency, which determines the request counts to sweep. Every sweep
    point that yields both an ITL and a per-GPU throughput adds one row.
    Candidates that time out while deploying are skipped. An empty frame is
    returned when no row was collected.
    """
    rows: list[dict] = []

    for cand in decode_candidates:
        gpu_count = cand.num_gpus
        parallel = {
            "tp": cand.tp,
            "pp": cand.pp,
            "dp": cand.dp,
            "moe_tp": cand.moe_tp,
            "moe_ep": cand.moe_ep,
        }
        label = make_parallel_label(*parallel.values())
        dir_tag = label.replace("=", "").replace("/", "_")
        work_dir = f"{ops.output_dir}/decode_{gpu_count}gpus_{dir_tag}"
        os.makedirs(work_dir, exist_ok=True)

        dgd = cand.dgd_config
        config_path = f"{work_dir}/config.yaml"
        with open(config_path, "w") as handle:
            yaml.dump(dgd, handle)

        model_name, model_path = config_modifier.get_model_name(dgd)
        port = config_modifier.get_port(dgd)
        logger.info("Profiling decode candidate %s with %d GPUs...", label, gpu_count)

        client = DynamoDeploymentClient(
            namespace=ops.k8s_namespace,
            base_log_dir=work_dir,
            model_name=model_name,
            frontend_port=port,
            deployment_name=dgd["metadata"]["name"],
        )
        # Track the client for as long as its deployment may exist.
        deployment_clients.append(client)
        await client.create_deployment(config_path)

        logger.info("Waiting for decode deployment to be ready...")
        try:
            await client.wait_for_deployment_ready(timeout=ops.deployment_timeout)
        except TimeoutError:
            logger.error("Decode %s with %d GPUs timed out", label, gpu_count)
            await client.delete_deployment()
            deployment_clients.remove(client)
            continue
        logger.info("Decode deployment ready")

        await client.get_deployment_logs()

        validated = Config.model_validate(dgd)
        service_name = get_service_name_by_type(
            validated, backend, SubComponentType.DECODE
        ).lower()
        kv_log_path = f"{work_dir}/{client.deployment_name}/{service_name}/0.log"
        max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
            kv_log_path,
            attention_dp_size=parallel["dp"],
        )

        # Each request is budgeted isl + osl tokens of KV cache.
        max_concurrency = max_kv_tokens // (isl + osl)
        request_counts = get_num_request_range(
            parallel["dp"],
            max_concurrency,
            ops.decode_interpolation_granularity,
        )
        logger.info("Sweeping num_request: %s", request_counts)

        service_url = client.get_service_url()
        for num_request in request_counts:
            perf_dir = f"{work_dir}/aiperf_request{num_request}_isl{isl}_osl{osl}_n{num_request}"
            itl, thpt_per_gpu = get_decode_itl_and_thpt_per_gpu(
                isl,
                osl,
                num_request,
                perf_dir,
                model_name,
                model_path,
                base_url=service_url,
                num_gpus=gpu_count,
                attention_dp_size=parallel["dp"],
            )
            if itl is None or thpt_per_gpu is None:
                continue
            rows.append(
                build_decode_row(
                    tpot=itl,
                    thpt_per_gpu=thpt_per_gpu,
                    num_request=num_request,
                    num_gpus=gpu_count,
                    osl=osl,
                    **parallel,
                    backend=backend,
                    system=system,
                )
            )

        await client.delete_deployment()
        deployment_clients.remove(client)

    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(rows)
def _pick_thorough_best_config(
    prefill_df: pd.DataFrame,
    decode_df: pd.DataFrame,
    picking_mode: str,
    target_ttft: float,
    target_tpot: float,
    request_latency: float | None,
    total_gpus: int,
    dgdr: DynamoGraphDeploymentRequestSpec,
) -> dict:
    """Choose the picker for ``picking_mode`` and return its result dict.

    ``"autoscale"`` works on the prefill and decode frames directly. The
    other modes first combine both frames into a disagg frame, then call
    ``pick_load_match`` (for ``"load_match"``) or ``pick_default``. The
    latency target is the request latency when one is given, otherwise the
    TPOT target.
    """
    if picking_mode == "autoscale":
        return pick_autoscale(prefill_df, decode_df, target_ttft, target_tpot)

    disagg_df = build_disagg_df_from_static(prefill_df, decode_df)

    if request_latency is not None:
        latency_target = {"target_request_latency": request_latency}
    else:
        latency_target = {"target_tpot": target_tpot}

    if picking_mode == "load_match":
        load_match_kwargs: dict = {
            "pareto_df": disagg_df,
            "serving_mode": "disagg",
            "top_n": 5,
            **latency_target,
        }
        workload = dgdr.workload
        # Workload targets and the GPU cap are forwarded only when set.
        if workload and workload.requestRate is not None:
            load_match_kwargs["target_request_rate"] = workload.requestRate
        if workload and workload.concurrency is not None:
            load_match_kwargs["target_concurrency"] = workload.concurrency
        if total_gpus:
            load_match_kwargs["max_total_gpus"] = total_gpus
        return pick_load_match(**load_match_kwargs)

    return pick_default(
        pareto_df=disagg_df,
        total_gpus=total_gpus,
        serving_mode="disagg",
        top_n=5,
        **latency_target,
    )
async def run_thorough(
    dgdr: DynamoGraphDeploymentRequestSpec,
    ops: ProfilerOperationalConfig,
    picking_mode: str,
    model: str,
    system: str,
    backend: str,
    total_gpus: int,
    isl: int,
    osl: int,
    target_ttft: float,
    target_tpot: float,
    request_latency: float | None,
    deployment_clients: list,
) -> dict:
    """Run the THOROUGH flow: enumerate, benchmark, pick, then build a DGD.

    Only disaggregated configurations are handled (a warning says so on every
    call). Prefill candidates are benchmarked before decode candidates.

    Returns:
        A dict with the keys ``best_config_df``, ``best_latencies``,
        ``dgd_config`` and ``chosen_exp``. When either benchmark stage yields
        an empty frame, an empty result is returned instead
        (empty DataFrame, zeroed latencies, ``dgd_config``/``chosen_exp`` of
        ``None``).
    """
    logger.warning("THOROUGH mode: only disagg configurations are supported.")

    # --- Stage 1: Enumeration ---
    cache_spec = dgdr.modelCache or ModelCacheSpec()
    prefill_candidates, decode_candidates = enumerate_profiling_configs(
        model_path=model,
        system=system,
        backend=backend,
        image=derive_backend_image(dgdr.image, backend),
        isl=isl,
        osl=osl,
        num_gpus_per_node=dgdr.hardware.numGpusPerNode,
        k8s_pvc_name=cache_spec.pvcName,
        k8s_pvc_mount_path=cache_spec.pvcMountPath,
        k8s_model_path_in_pvc=cache_spec.pvcModelPath,
    )
    logger.info(
        "Enumerated %d prefill candidates, %d decode candidates",
        len(prefill_candidates),
        len(decode_candidates),
    )

    # User-supplied DGD overrides are layered onto every candidate config,
    # prefill candidates first, then decode candidates.
    dgd_overrides = dgdr.overrides and dgdr.overrides.dgd
    if dgd_overrides:
        for candidate in [*prefill_candidates, *decode_candidates]:
            candidate.dgd_config = apply_dgd_overrides(
                candidate.dgd_config, dgd_overrides
            )
        logger.info(
            "Applied DGD overrides to %d prefill + %d decode candidates.",
            len(prefill_candidates),
            len(decode_candidates),
        )
    config_modifier = CONFIG_MODIFIERS[backend]

    # --- Stage 2: Benchmarking ---
    # Both benchmark helpers take the same trailing positional arguments.
    shared_args = (
        ops,
        isl,
        osl,
        model,
        system,
        backend,
        deployment_clients,
        config_modifier,
    )
    prefill_df = await _benchmark_prefill_candidates(prefill_candidates, *shared_args)
    decode_df = await _benchmark_decode_candidates(decode_candidates, *shared_args)

    # --- Stage 3: Picking ---
    # Bail out with an empty result as soon as one stage produced no rows.
    for stage_df, failure_message in (
        (prefill_df, "No prefill results produced in THOROUGH mode."),
        (decode_df, "No decode results produced in THOROUGH mode."),
    ):
        if stage_df.empty:
            logger.error(failure_message)
            return {
                "best_config_df": pd.DataFrame(),
                "best_latencies": {"ttft": 0.0, "tpot": 0.0, "request_latency": 0.0},
                "dgd_config": None,
                "chosen_exp": None,
            }

    pick = _pick_thorough_best_config(
        prefill_df,
        decode_df,
        picking_mode,
        target_ttft,
        target_tpot,
        request_latency,
        total_gpus,
        dgdr,
    )
    best_config_df = pick.get("best_config_df", pd.DataFrame())

    # --- Stage 4: DGD generation ---
    disagg_task = TaskConfig(
        serving_mode="disagg",
        model_path=model,
        system_name=system,
        backend_name=backend,
        total_gpus=total_gpus,
        isl=isl,
        osl=osl,
        ttft=target_ttft,
        tpot=target_tpot,
        request_latency=request_latency,
    )
    dgd_config = _generate_dgd_from_pick(
        dgdr, best_config_df, "disagg", {"disagg": disagg_task}
    )
    return {
        "best_config_df": best_config_df,
        "best_latencies": pick.get(
            "best_latencies", {"ttft": 0.0, "tpot": 0.0, "request_latency": 0.0}
        ),
        "dgd_config": dgd_config,
        "chosen_exp": "disagg",
    }
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helpers to build AIC-compatible DataFrames from real-GPU benchmark results.
The picking functions in ``aiconfigurator.sdk.picking`` expect DataFrames
whose columns match the ``ColumnsStatic`` schema. Only a subset of columns
are actually accessed; this module populates exactly those columns.
"""
from __future__ import annotations
import pandas as pd
from aiconfigurator.sdk import common
from aiconfigurator.sdk.picking import _build_disagg_summary_dict
def make_parallel_label(tp: int, pp: int, dp: int, moe_tp: int, moe_ep: int) -> str:
    """Return the ``parallel`` label string used for dedup in picking.

    The label is ``dep<moe_ep>`` when ``moe_ep > 1``, otherwise
    ``tep<moe_tp>`` when ``moe_tp > 1``, otherwise ``tp<tp>``.  ``pp`` and
    ``dp`` are accepted but do not influence the label.
    """
    # MoE dimensions are checked in priority order; the first one above 1 wins.
    for prefix, degree in (("dep", moe_ep), ("tep", moe_tp)):
        if degree > 1:
            return f"{prefix}{degree}"
    return f"tp{tp}"
def build_prefill_row(
    *,
    model: str,
    isl: int,
    osl: int,
    ttft: float,
    tp: int,
    pp: int,
    dp: int,
    moe_tp: int,
    moe_ep: int,
    backend: str = "",
    system: str = "",
) -> dict:
    """Assemble one prefill result row for the AIC picking functions.

    The GPU count is ``tp * pp * dp``.  Throughput is derived from the
    measured TTFT as ``1000 / ttft * dp`` (ttft is presumably in
    milliseconds — confirm against the benchmark caller) and is ``0.0`` when
    ``ttft`` is not positive.  Batch size is fixed at 1 per DP rank, and the
    columns this module has no measurement for are filled with empty strings
    or zeros.

    Returns:
        A dict keyed by column name, in the column order expected downstream.
    """
    batch_size = 1
    num_gpus = tp * pp * dp

    if ttft > 0:
        seq_s = 1000.0 / ttft * dp
    else:
        seq_s = 0.0
    seq_s_per_gpu = seq_s / num_gpus if num_gpus > 0 else 0.0

    row: dict = {
        "ttft": ttft,
        "seq/s": seq_s,
        "seq/s/gpu": seq_s_per_gpu,
        "global_bs": batch_size * dp,
        "parallel": make_parallel_label(tp, pp, dp, moe_tp, moe_ep),
        "tp": tp,
        "pp": pp,
        "dp": dp,
        "osl": osl,
        "model": model,
        "isl": isl,
        "bs": batch_size,
        "moe_tp": moe_tp,
        "moe_ep": moe_ep,
        "prefix": 0,
    }
    # Columns with no benchmark-derived value are left as empty placeholders.
    row.update(
        dict.fromkeys(("gemm", "kvcache", "fmha", "moe", "comm", "memory"), "")
    )
    row.update(backend=backend, version="", system=system, power_w=0.0)
    return row
def build_decode_row(
    *,
    tpot: float,
    thpt_per_gpu: float,
    num_request: int,
    num_gpus: int,
    osl: int,
    tp: int,
    pp: int,
    dp: int,
    moe_tp: int,
    moe_ep: int,
    backend: str = "",
    system: str = "",
) -> dict:
    """Assemble one decode result row for the AIC picking functions.

    Sequence throughput is derived from the per-GPU throughput and the output
    length: ``seq/s = thpt_per_gpu * num_gpus / osl`` and
    ``seq/s/gpu = thpt_per_gpu / osl`` (``thpt_per_gpu`` is presumably in
    tokens/s/GPU — confirm against the benchmark caller).  Both are ``0.0``
    when ``osl`` is not positive.  ``num_request`` is reported as both the
    global batch size and the concurrency; the per-rank batch size is
    ``num_request // dp``.

    Returns:
        A dict keyed by column name, in the column order expected downstream.
    """
    if osl > 0:
        seq_s = thpt_per_gpu * num_gpus / osl
        seq_s_per_gpu = thpt_per_gpu / osl
    else:
        seq_s = 0.0
        seq_s_per_gpu = 0.0

    per_rank_bs = num_request // dp if dp > 0 else num_request
    tokens_per_user = 1000.0 / tpot if tpot > 0 else 0.0

    row: dict = {
        "tpot": tpot,
        "seq/s": seq_s,
        "seq/s/gpu": seq_s_per_gpu,
        "global_bs": num_request,
        "parallel": make_parallel_label(tp, pp, dp, moe_tp, moe_ep),
        "tp": tp,
        "pp": pp,
        "dp": dp,
        "concurrency": num_request,
        "bs": per_rank_bs,
        "tokens/s/user": tokens_per_user,
        "moe_tp": moe_tp,
        "moe_ep": moe_ep,
    }
    # Columns with no benchmark-derived value are left as empty placeholders.
    row.update(
        dict.fromkeys(("gemm", "kvcache", "fmha", "moe", "comm", "memory"), "")
    )
    row.update(backend=backend, version="", system=system, power_w=0.0)
    return row
def build_disagg_df_from_static(
    prefill_df: pd.DataFrame,
    decode_df: pd.DataFrame,
) -> pd.DataFrame:
    """Pair every prefill row with every decode row in a ColumnsDisagg frame.

    Each (prefill, decode) pair is summarised by
    ``_build_disagg_summary_dict`` with one worker on each side.  The result
    always has the ``common.ColumnsDisagg`` columns, including when either
    input is empty and no pairs exist.
    """
    pair_summaries: list[dict] = [
        _build_disagg_summary_dict(
            prefill_summary_dict=prefill_row.to_dict(),
            prefill_num_worker=1,
            decode_summary_dict=decode_row.to_dict(),
            decode_num_worker=1,
        )
        for _, prefill_row in prefill_df.iterrows()
        for _, decode_row in decode_df.iterrows()
    ]
    if pair_summaries:
        return pd.DataFrame(pair_summaries, columns=common.ColumnsDisagg)
    return pd.DataFrame(columns=common.ColumnsDisagg)
......@@ -102,6 +102,51 @@ class ParallelizationMapping:
)
@dataclass(frozen=True)
class PickedParallelConfig:
    """Immutable record of a picked parallelization config.

    All five dimensions (tp, pp, dp, moe_tp, moe_ep) are stored explicitly and
    default to 1.  Use :meth:`to_parallelization_mapping` to obtain the
    :class:`ParallelizationMapping` form, which carries only the dominant
    dimension.
    """

    tp: int = 1
    pp: int = 1
    dp: int = 1
    moe_tp: int = 1
    moe_ep: int = 1

    @property
    def num_gpus(self) -> int:
        """Total GPU count, computed as ``tp * pp * dp``."""
        return self.tp * self.pp * self.dp

    @property
    def tp_size(self) -> int:
        """Effective TP for KV-head splitting (TP or TEP; 1 for DEP)."""
        if self.moe_ep > 1:
            return 1
        return self.moe_tp if self.moe_tp > 1 else self.tp

    def label(self) -> str:
        """Return ``dep<N>``, ``tep<N>`` or ``tp<N>`` for the dominant dimension."""
        # MoE dimensions take priority, expert parallelism first.
        for prefix, degree in (("dep", self.moe_ep), ("tep", self.moe_tp)):
            if degree > 1:
                return f"{prefix}{degree}"
        return f"tp{self.tp}"

    def to_parallelization_mapping(self) -> ParallelizationMapping:
        """Convert to :class:`ParallelizationMapping`."""
        if self.moe_ep > 1:
            mapping_kwargs = {"dep": self.moe_ep}
        elif self.moe_tp > 1:
            mapping_kwargs = {"tep": self.moe_tp}
        else:
            mapping_kwargs = {"tp": self.tp}
        return ParallelizationMapping(**mapping_kwargs)
def _check_divisibility(
value: int | None,
divisor: int,
......
......@@ -15,6 +15,7 @@
from __future__ import annotations
import copy
import logging
from typing import Any, Protocol, Tuple
......@@ -401,18 +402,9 @@ class BaseConfigModifier:
cls._ensure_spec_pvc(cfg, pvc_name)
# Mount to Frontend + prefill + decode services if present.
if "Frontend" in cfg.spec.services:
cls._ensure_service_volume_mount(
cfg.spec.services["Frontend"], pvc_name, pvc_mount_path
)
for sct in (SubComponentType.PREFILL, SubComponentType.DECODE):
svc_name = get_service_name_by_type(cfg, cls.BACKEND, sct)
if svc_name in cfg.spec.services:
cls._ensure_service_volume_mount(
cfg.spec.services[svc_name], pvc_name, pvc_mount_path
)
# Mount PVC to all services (Frontend + workers)
for svc_name, svc in cfg.spec.services.items():
cls._ensure_service_volume_mount(svc, pvc_name, pvc_mount_path)
# Patch workers + frontend with PVC model path.
cls._apply_model_update_to_cfg(
......@@ -515,12 +507,16 @@ class BaseConfigModifier:
# Update model (handles worker args + frontend patching)
effective_model_path = model_path or model_name
if pvc_name and pvc_mount_path:
# Derive pvc_path from effective_model_path by stripping the mount prefix
pvc_path = ""
if effective_model_path and effective_model_path.startswith(pvc_mount_path):
pvc_path = effective_model_path[len(pvc_mount_path) :].strip("/")
result = cls.update_model_from_pvc(
cfg.model_dump(),
model_name=model_name,
pvc_name=pvc_name,
pvc_mount_path=pvc_mount_path,
pvc_path="",
pvc_path=pvc_path,
)
else:
result = cls.update_model(
......@@ -629,3 +625,96 @@ class BaseConfigModifier:
cls._apply_worker_config(
cfg.spec.services[svc_name], agg_cli_args, agg_replicas, agg_gpus
)
# ---------------------------------------------------------------------------
# DGD override merging (module-level, backend-agnostic)
# ---------------------------------------------------------------------------
# Services whose CLI args are fully replaced by overrides.
# For engine-worker services (everything else), the main container args
# are *appended* because they contain profiler-generated sweep results.
_OVERRIDE_NON_WORKER_SERVICES = frozenset({"Frontend", "Planner"})
# The exact path suffix where profiler-generated CLI args live inside a
# service dict. Only this specific location gets append semantics.
_WORKER_ARGS_SUFFIX = ("extraPodSpec", "mainContainer", "args")
def _is_worker_main_container_args(path: list[str]) -> bool:
"""True when *path* is ``spec.services.<worker>.extraPodSpec.mainContainer.args``."""
if len(path) != 6:
return False
return (
path[0] == "spec"
and path[1] == "services"
and path[2] not in _OVERRIDE_NON_WORKER_SERVICES
and tuple(path[3:]) == _WORKER_ARGS_SUFFIX
)
def _deep_merge_overrides(
target: dict,
overrides: dict,
path: list[str],
) -> None:
"""Recursively merge *overrides* into *target* (mutates *target* in-place).
Rules:
- Dicts are merged recursively; missing intermediate keys are created.
- ``spec.services.<name>`` that does not exist in *target* is skipped
with a warning (all nested overrides under that service are dropped).
- Only ``spec.services.<worker>.extraPodSpec.mainContainer.args`` is
*appended* to the existing list (preserving profiler-generated CLI
args). ``args`` at any other path is replaced normally.
- All other leaf values replace the target value.
"""
for key, value in overrides.items():
current_path = path + [key]
# Guard: skip overrides for services that don't exist in the DGD
if (
len(current_path) == 3
and current_path[0] == "spec"
and current_path[1] == "services"
):
services = target.get("services", target) if path == ["spec"] else target
if key not in services:
logger.warning(
"Service '%s' does not exist in the generated DGD config; "
"overrides for this service will not be applied.",
key,
)
continue
if isinstance(value, dict) and isinstance(target.get(key), dict):
_deep_merge_overrides(target[key], value, current_path)
elif isinstance(value, dict) and key not in target:
target[key] = copy.deepcopy(value)
elif (
key == "args"
and isinstance(value, list)
and _is_worker_main_container_args(current_path)
):
existing = target.get(key) or []
target[key] = list(existing) + list(value)
else:
target[key] = (
copy.deepcopy(value) if isinstance(value, (dict, list)) else value
)
def apply_dgd_overrides(dgd_config: dict, overrides: dict) -> dict:
    """Return a copy of *dgd_config* with an ``overrides.dgd`` dict merged in.

    The merge is delegated to ``_deep_merge_overrides``: nested dicts are
    merged key by key, leaf values replace their counterparts, and worker
    main-container ``args`` lists are appended to rather than replaced.

    Args:
        dgd_config: The generated DynamoGraphDeployment config dict.
        overrides: A partial DGD dict with the same structure.

    Returns:
        A new dict with the overrides applied (the original is not mutated).
    """
    # Work on a deep copy so the caller's config stays untouched.
    merged = copy.deepcopy(dgd_config)
    _deep_merge_overrides(merged, overrides, [])
    return merged
......@@ -223,7 +223,7 @@ class DynamoGraphDeploymentRequestSpec(BaseModel):
)
image: Optional[str] = Field(
default=None,
description='Image is the container image reference for the profiling job (frontend image). Example: "nvcr.io/nvidia/dynamo-runtime:latest" TODO: In a future MR, the operator will derive the backend inference image from the backend type automatically; backend images can be overridden via overrides.dgd.',
description='Image is the container image reference for the profiling job (frontend image). Example: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0".',
)
modelCache: Optional[ModelCacheSpec] = Field(
default=None,
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Profiler-side validation for DynamoGraphDeploymentRequestSpec.
The auto-generated Pydantic types in ``dgdr_v1beta1_types.py`` mirror the
Go API and mark most fields as ``Optional``. The profiler requires a
stricter contract. This module validates those requirements and normalises
fields (e.g. populating defaults, resolving SLA modes) so that downstream
code can access them without ``None`` checks.
"""
from __future__ import annotations
import logging
from dynamo.planner.utils.planner_config import PlannerPreDeploymentSweepMode
from dynamo.profiler.utils.defaults import SearchStrategy
from dynamo.profiler.utils.dgdr_v1beta1_types import (
DynamoGraphDeploymentRequestSpec,
SLASpec,
WorkloadSpec,
)
from dynamo.profiler.utils.profile_common import is_planner_enabled
logger = logging.getLogger(__name__)
def validate_dgdr_for_profiler(
    dgdr: DynamoGraphDeploymentRequestSpec,
) -> DynamoGraphDeploymentRequestSpec:
    """Validate a DGDR spec for the profiler and normalise it in place.

    On successful return the following hold:

    - ``dgdr.image`` and ``dgdr.hardware.gpuSku`` are non-empty
    - ``dgdr.hardware.numGpusPerNode`` is a positive number
    - ``dgdr.workload`` and ``dgdr.sla`` are not ``None`` (defaults are
      created when missing) and the workload does not set both
      ``concurrency`` and ``requestRate``
    - the SLA carries either ``e2eLatency`` (in which case ``ttft`` and
      ``itl`` have been reset to ``None``) or both ``ttft`` and ``itl``;
      every SLA value that is set is positive

    NOTE(review): ``workload.isl`` / ``workload.osl`` are not checked here;
    they presumably rely on ``WorkloadSpec`` defaults — confirm.

    Returns:
        The same ``dgdr`` object, for call chaining.

    Raises:
        ValueError: If a required field is missing or invalid.
    """
    # Required-field validation runs first: it fills in the workload and sla
    # defaults that the next two validators dereference.
    _validate_required_fields(dgdr)
    _validate_workload(dgdr.workload)
    _validate_sla(dgdr.sla)
    _validate_features(dgdr)
    return dgdr
# ---------------------------------------------------------------------------
# Internal validators
# ---------------------------------------------------------------------------
def _validate_required_fields(dgdr: DynamoGraphDeploymentRequestSpec) -> None:
"""Check fields the profiler treats as required."""
if not dgdr.image:
raise ValueError("'image' is required in the DGDR spec.")
if not dgdr.hardware:
raise ValueError("'hardware' is required in the DGDR spec.")
if not dgdr.hardware.gpuSku:
raise ValueError("'hardware.gpuSku' is required in the DGDR spec.")
if not dgdr.hardware.numGpusPerNode or dgdr.hardware.numGpusPerNode <= 0:
raise ValueError("'hardware.numGpusPerNode' must be a positive integer.")
# Populate defaults for optional sub-objects so callers don't need None checks
if dgdr.workload is None:
dgdr.workload = WorkloadSpec()
if dgdr.sla is None:
dgdr.sla = SLASpec()
def _validate_workload(workload: WorkloadSpec) -> None:
"""Concurrency and requestRate are mutually exclusive."""
if workload.concurrency is not None and workload.requestRate is not None:
raise ValueError(
"Only one of 'concurrency' or 'requestRate' can be provided, not both."
)
def _validate_sla(sla: SLASpec) -> None:
"""Validate SLA targets and normalise e2eLatency mode."""
for name, val in [
("ttft", sla.ttft),
("itl", sla.itl),
("e2eLatency", sla.e2eLatency),
]:
if val is not None and val <= 0:
raise ValueError(f"SLA '{name}' must be positive (got {val}).")
has_e2e = sla.e2eLatency is not None
# When e2eLatency is provided it takes precedence — null out the per-token defaults
if has_e2e:
sla.ttft = None
sla.itl = None
return
has_ttft_itl = sla.ttft is not None and sla.itl is not None
if not has_ttft_itl:
raise ValueError(
"Either both 'ttft' and 'itl', or 'e2eLatency', must be provided in the SLA spec."
)
def run_gate_checks(
    dgdr: DynamoGraphDeploymentRequestSpec,
    aic_supported: bool,
    search_strategy: SearchStrategy,
    backend: str,
) -> None:
    """Reject or downgrade feature combinations the profiler cannot serve.

    Must be called after ``validate_dgdr_for_profiler``.

    When planner scaling is enabled but AIC does not support the
    model/system/backend combination:

    - throughput-based scaling is rejected with ``ValueError``;
    - a ``rapid`` pre-deployment sweep is downgraded to ``none`` in place,
      with a warning.

    Independently, the THOROUGH search strategy is rejected for the
    ``"auto"`` backend.

    Raises:
        ValueError: For either unsupported combination described above.
    """
    planner_without_aic = is_planner_enabled(dgdr) and not aic_supported
    if planner_without_aic:
        planner_cfg = dgdr.features.planner
        model = dgdr.model
        system = dgdr.hardware.gpuSku.lower()

        if planner_cfg.enable_throughput_scaling:
            raise ValueError(
                "Throughput-based planner scaling requires AIC support, but "
                f"{model} on {system}/{backend} is not supported by AIC. "
                "Use a supported model/hardware/backend combination or disable throughput scaling."
            )

        sweep_is_rapid = (
            planner_cfg.pre_deployment_sweeping_mode
            == PlannerPreDeploymentSweepMode.Rapid
        )
        if sweep_is_rapid:
            logger.warning(
                "Planner pre-deployment sweeping mode is 'rapid' but AIC does not support "
                "%s on %s/%s. Falling back to 'none' (no pre-deployment sweeping).",
                model,
                system,
                backend,
            )
            planner_cfg.pre_deployment_sweeping_mode = (
                PlannerPreDeploymentSweepMode.None_
            )

    if search_strategy == SearchStrategy.THOROUGH and backend == "auto":
        raise ValueError(
            "THOROUGH search strategy does not support 'auto' backend. "
            "Please specify a concrete backend (trtllm, vllm, sglang)."
        )
def _validate_features(dgdr: DynamoGraphDeploymentRequestSpec) -> None:
"""Cross-field validation for features."""
if not dgdr.features:
return
# Mocker requires pre-deployment sweeping
if dgdr.features.mocker and dgdr.features.mocker.enabled and dgdr.features.planner:
sweep_mode = dgdr.features.planner.pre_deployment_sweeping_mode
if sweep_mode is None or sweep_mode == PlannerPreDeploymentSweepMode.None_:
raise ValueError(
"pre_deployment_sweeping_mode cannot be 'none' when mocker is enabled. "
"Mocker backend requires pre-deployment sweeping to generate simulated "
"performance profiles."
)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared helpers and configuration for the profiler pipeline."""
import logging
import os
from dataclasses import dataclass
import pandas as pd
from dynamo.profiler.utils.config_modifiers.parallelization_mapping import (
PickedParallelConfig,
)
from dynamo.profiler.utils.dgdr_v1beta1_types import DynamoGraphDeploymentRequestSpec
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Published container image naming conventions
# ---------------------------------------------------------------------------
# Image-name component of the published runtime image for each backend.
# e.g. vllm → nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.0.0
BACKEND_IMAGE_NAMES: dict[str, str] = {
    "vllm": "vllm-runtime",
    "sglang": "sglang-runtime",
    "trtllm": "tensorrtllm-runtime",
}


def derive_backend_image(profiler_image: str, backend: str) -> str:
    """Derive the backend worker image from the profiler image.

    The image name (the last ``/``-delimited component, up to its first
    ``:``) is swapped for the backend's runtime image name.  The registry
    path in front of it and everything from that ``:`` onwards are kept
    verbatim.

    Examples::

        derive_backend_image(
            "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0", "vllm"
        )
        # → "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.0.0"

        derive_backend_image("myregistry.io/sglang-runtime:1.0.0", "sglang")
        # → "myregistry.io/sglang-runtime:1.0.0"

    NOTE(review): digest references (``name@sha256:...``) get no special
    handling; the text after the first ``:`` is treated as the tag.

    Args:
        profiler_image: Any Docker image reference of the form
            ``[REGISTRY/]NAME[:TAG]``.
        backend: The resolved backend type (``'vllm'``, ``'sglang'``, or
            ``'trtllm'``).

    Returns:
        The backend container image string.

    Raises:
        ValueError: If *backend* is not a recognised backend.
    """
    backend_image_name = BACKEND_IMAGE_NAMES.get(backend)
    if backend_image_name is None:
        raise ValueError(
            f"Cannot derive backend image for unknown backend '{backend}'. "
            f"Supported backends: {list(BACKEND_IMAGE_NAMES.keys())}"
        )

    # "registry/path/name:tag" → ("registry/path", "/", "name:tag"); without
    # a slash the first two parts are empty and the whole string is the name.
    registry_path, slash, name_and_tag = profiler_image.rpartition("/")
    # "name:tag" → (name, ":", "tag"); without a colon both trailing parts
    # are empty, so no tag is appended.
    _, colon, tag = name_and_tag.partition(":")
    return f"{registry_path}{slash}{backend_image_name}{colon}{tag}"
# ---------------------------------------------------------------------------
# Operational defaults not part of DynamoGraphDeploymentRequestSpec
# ---------------------------------------------------------------------------
# Directory that profiling results are written to unless overridden.
DEFAULT_OUTPUT_DIR = "profiling_results"
# Kubernetes namespace; read once at import time from the DGDR_NAMESPACE
# environment variable, falling back to "dynamo-sla-profiler".
DEFAULT_NAMESPACE = os.environ.get("DGDR_NAMESPACE", "dynamo-sla-profiler")
# Deployment readiness timeout (presumably in seconds — confirm with callers).
DEFAULT_DEPLOYMENT_TIMEOUT = 3600
# Interpolation granularities for the prefill and decode sweeps.
DEFAULT_PREFILL_INTERPOLATION_GRANULARITY = 16
DEFAULT_DECODE_INTERPOLATION_GRANULARITY = 6
# Dry-run is off by default.
DEFAULT_DRY_RUN = False


@dataclass
class ProfilerOperationalConfig:
    """Operational settings for a profiler run that the DGDR spec does not carry.

    Each field defaults to the matching module-level ``DEFAULT_*`` constant.
    """

    # Where results are written.
    output_dir: str = DEFAULT_OUTPUT_DIR
    # Kubernetes namespace used for profiling deployments.
    k8s_namespace: str = DEFAULT_NAMESPACE
    # How long to wait for a deployment before giving up.
    deployment_timeout: int = DEFAULT_DEPLOYMENT_TIMEOUT
    # Granularity settings for prefill / decode interpolation.
    prefill_interpolation_granularity: int = DEFAULT_PREFILL_INTERPOLATION_GRANULARITY
    decode_interpolation_granularity: int = DEFAULT_DECODE_INTERPOLATION_GRANULARITY
    # Dry-run flag.
    dry_run: bool = DEFAULT_DRY_RUN
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def picked_config_from_row(prefix: str, row: pd.Series) -> PickedParallelConfig:
    """Build a PickedParallelConfig from a picked ColumnsDisagg DataFrame row.

    Reads the ``<prefix>tp``, ``<prefix>pp``, ``<prefix>dp``,
    ``<prefix>moe_tp`` and ``<prefix>moe_ep`` entries of *row*, converting
    each to ``int``.  An entry that is absent from the row defaults to 1.
    """
    dimensions = ("tp", "pp", "dp", "moe_tp", "moe_ep")
    return PickedParallelConfig(
        **{dim: int(row.get(f"{prefix}{dim}", 1)) for dim in dimensions}
    )
def resolve_model_path(dgdr: DynamoGraphDeploymentRequestSpec) -> str:
    """Return the local PVC model directory when it exists, else ``dgdr.model``.

    The local path is ``<pvcMountPath>/<pvcModelPath>`` (redundant slashes at
    the join are stripped).  It is used only when the model cache names a
    PVC, a mount path and a model path, and the resulting directory exists
    on this machine.
    """
    cache = dgdr.modelCache
    cache_is_complete = (
        cache and cache.pvcName and cache.pvcMountPath and cache.pvcModelPath
    )
    if not cache_is_complete:
        return dgdr.model

    mount_root = cache.pvcMountPath.rstrip("/")
    model_subdir = cache.pvcModelPath.strip("/")
    local_path = f"{mount_root}/{model_subdir}"
    # Fall back to the model identifier when the PVC path is not visible here.
    return local_path if os.path.isdir(local_path) else dgdr.model
def is_planner_enabled(dgdr: DynamoGraphDeploymentRequestSpec) -> bool:
    """Report whether the spec carries a planner config with scaling enabled.

    Returns ``False`` when ``features`` or ``features.planner`` is ``None``;
    otherwise returns the planner's ``scaling_enabled()`` result.
    """
    features = dgdr.features
    if features is None or features.planner is None:
        return False
    return features.planner.scaling_enabled()
def determine_picking_mode(dgdr: DynamoGraphDeploymentRequestSpec) -> str:
    """Choose the picking mode for a DGDR spec.

    Returns ``"autoscale"`` when planner scaling is enabled, else
    ``"load_match"`` when the workload provides a request rate or a
    concurrency target, else ``"default"``.
    """
    workload = dgdr.workload
    target_load_provided = workload is not None and (
        workload.requestRate is not None or workload.concurrency is not None
    )
    # Planner scaling takes priority over any explicit load target.
    if is_planner_enabled(dgdr):
        return "autoscale"
    return "load_match" if target_load_provided else "default"
def warn_and_update_sla(
    best_latencies: dict,
    target_ttft: float,
    target_tpot: float,
) -> tuple[float, float]:
    """Relax SLA targets that the best achievable latencies exceed.

    Each target is compared with the matching entry of *best_latencies*
    (``"ttft"`` / ``"tpot"``, a missing entry counting as ``0.0``).  A target
    below the achieved value is replaced by that value and a warning is
    logged.

    Returns:
        The ``(ttft, tpot)`` targets after any relaxation.
    """
    best_ttft = best_latencies.get("ttft", 0.0)
    best_tpot = best_latencies.get("tpot", 0.0)
    effective_ttft, effective_tpot = target_ttft, target_tpot

    if best_ttft > target_ttft:
        logger.warning(
            "TTFT SLA %.1fms is unachievable. Best achievable: %.1fms. Updating SLA.",
            target_ttft,
            best_ttft,
        )
        effective_ttft = best_ttft

    if best_tpot > target_tpot:
        logger.warning(
            "ITL SLA %.1fms is unachievable. Best achievable: %.1fms. Updating SLA.",
            target_tpot,
            best_tpot,
        )
        effective_tpot = best_tpot

    return effective_ttft, effective_tpot
def warn_gpu_shortage(
    picking_mode: str,
    best_latencies: dict,
    total_gpus: int,
) -> None:
    """Warn when the load-match pick needs more GPUs than are available.

    Does nothing unless *picking_mode* is ``"load_match"`` and
    ``best_latencies["total_gpus_needed"]`` is present and greater than
    *total_gpus*.
    """
    if picking_mode == "load_match":
        gpus_needed = best_latencies.get("total_gpus_needed")
    else:
        gpus_needed = None

    if gpus_needed is None or not gpus_needed > total_gpus:
        return

    logger.warning(
        "Load target requires %d GPUs but only %d available. "
        "Consider adding more GPUs or reducing the load target.",
        gpus_needed,
        total_gpus,
    )
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import argparse
import ast
import os
from typing import Any, Dict
import yaml
from dynamo.profiler.utils.defaults import SearchStrategy
from dynamo.profiler.utils.planner_utils import add_planner_arguments_to_parser
from dynamo.profiler.utils.search_space_autogen import auto_generate_search_space
def _get(cfg: Dict[str, Any], camel: str, snake: str, default: Any = None) -> Any:
"""Get config value with camelCase preferred, snake_case fallback."""
if camel in cfg:
return cfg[camel]
return cfg.get(snake, default)
def _camel_to_snake(name: str) -> str:
"""Convert camelCase to snake_case."""
import re
# Insert underscore before uppercase letters and lowercase
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
def parse_config_string(config_str: str) -> Dict[str, Any]:
"""Parse configuration string as Python dict literal, YAML, or JSON.
Supports multiple input formats:
1. Python dict literal: "{'engine': {'backend': 'vllm'}, 'sla': {'isl': 3000}}"
2. YAML string: "engine:\n backend: vllm\nsla:\n isl: 3000"
3. JSON string: '{"engine": {"backend": "vllm"}, "sla": {"isl": 3000}}'
Args:
config_str: Configuration string in one of the supported formats
Returns:
Dictionary containing the configuration
Raises:
ValueError: If config cannot be parsed or is not a dictionary
"""
config = None
# Try 1: Parse as Python dict literal (most direct for CLI)
try:
config = ast.literal_eval(config_str)
if isinstance(config, dict):
return config
except (ValueError, SyntaxError):
pass
# Try 2: Parse as YAML/JSON (for K8s ConfigMaps and files)
try:
config = yaml.safe_load(config_str)
if config is not None and isinstance(config, dict):
return config
except yaml.YAMLError:
pass
# If we got here, parsing failed
raise ValueError(
"Failed to parse config string. Expected Python dict literal, YAML, or JSON format. "
f"Examples:\n"
f" Python dict: \"{'engine': {'backend': 'vllm'}}\"\n"
f' YAML: "engine:\\n backend: vllm"\n'
f' JSON: \'{{"engine": {{"backend": "vllm"}}}}\''
)
def create_profiler_parser() -> argparse.Namespace:
    """
    Build the profiler CLI, parse the command line, and return the parsed arguments.

    Despite its name, this function returns the parsed ``argparse.Namespace``,
    not the parser. It works in three steps:

    1. Pre-parse only ``--profile-config`` (unknown arguments are ignored).
    2. If given, parse that string with ``parse_config_string``.
    3. Build the full parser, using values from the config as each argument's
       default, so that an explicit CLI argument overrides the config value.

    Config structure (camelCase preferred, snake_case supported for backwards compat):
        outputDir: String (path to the output results directory, default: profiling_results)
        deployment:
            namespace: String (kubernetes namespace, default: dynamo-sla-profiler)
            serviceName: String (service name, default: "")
            model: String (served model name)
            dgdImage: String (container image to use for DGD components (frontend, planner, workers), overrides images in config file)
            deploymentTimeout: Int (maximum time to wait for deployment to become ready in seconds, default: 1800)
            modelCache:
                pvcName: String (name of the PVC to mount the model cache,
                    if not provided, model must be HF name and will download from HF, default: "")
                pvcPath: String (path to the model cache in the PVC, default: "")
                mountPath: String (path to the model cache in the container,
                    note that the PVC must be mounted to the same path for the profiling job,
                    default: "/opt/model-cache")
        engine:
            backend: String (backend type, currently support [vllm, sglang, trtllm], default: vllm)
            config: String (path to the DynamoGraphDeployment config file, default: "")
            maxContextLength: Int (maximum context length supported by the served model, default: 0)
            isMoeModel: Boolean (enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode, default: False)
        hardware:
            minNumGpusPerEngine: Int (minimum number of GPUs per engine, default: 0)
            maxNumGpusPerEngine: Int (maximum number of GPUs per engine, default: 0)
            numGpusPerNode: Int (number of GPUs per node, default: 0)
            gpuModel: String (GPU model, used for auto-calculating search space, default: "")
            gpuVramMib: Int (GPU VRAM in MiB, used for auto-calculating search space, default: 0)
            system: String (target hardware system, e.g. h100_sxm, h200_sxm, default: None)
        searchStrategy: String (search strategy for profiling: 'rapid' uses AI Configurator for quick estimation, 'thorough' runs actual deployments for comprehensive results, enum: [rapid, thorough], default: rapid)
        sweep:
            prefillInterpolationGranularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
            decodeInterpolationGranularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
            dryRun: Boolean (dry run the profile job, default: False)
            pickWithWebui: Boolean (pick the best parallelization mapping using webUI, default: False)
            webuiPort: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
        sla:
            isl: Int (target input sequence length, default: 3000)
            osl: Int (target output sequence length, default: 500)
            ttft: Float (target Time To First Token in milliseconds, default: 50)
            itl: Float (target Inter Token Latency in milliseconds, default: 10)
        planner: (planner arguments)
            e.g., plannerMinEndpoint: 2

    NOTE(review): ``modelCache`` and its ``pvcName``/``pvcPath``/``mountPath``
    keys are read by their camelCase spelling only; no snake_case fallback is
    applied to them in this function.
    NOTE(review): no argument matching ``engine.isMoeModel`` is registered in
    this function; confirm whether it is handled elsewhere or the entry is stale.

    Returns:
        The parsed arguments, with the ``profile_config`` attribute removed and
        after ``auto_generate_search_space(args)`` has been called on them.

    Raises:
        ValueError: If ``--profile-config`` cannot be parsed, or if
            ``PROFILER_WEBUI_PORT`` is set to a non-integer value.
        SystemExit: Via ``parser.error`` when neither ``--model`` nor
            ``--config`` is provided (and on any other argparse usage error).
    """
    # Step 1: Pre-parse to check if --profile-config is provided
    # parse_known_args() tolerates all the other (not yet registered) arguments.
    pre_parser = argparse.ArgumentParser(add_help=False)
    pre_parser.add_argument("--profile-config", type=str)
    pre_args, _ = pre_parser.parse_known_args()
    # Step 2: Parse config if provided
    config = {}
    if pre_args.profile_config:
        config = parse_config_string(pre_args.profile_config)
    # Step 3: Create main parser with config-aware defaults
    parser = argparse.ArgumentParser(
        description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill."
    )
    # Registered again on the main parser so it appears in --help and is not
    # rejected as an unknown argument by parse_args().
    parser.add_argument(
        "--profile-config",
        type=str,
        help="Configuration as Python dict literal, YAML, or JSON string. CLI args override config values. "
        "Example: \"{'engine': {'backend': 'vllm', 'config': '/path'}, 'sla': {'isl': 3000}}\"",
    )
    # CLI arguments with config-aware defaults (using nested .get() for cleaner code)
    # NOTE(review): a section that is present but null in the config (e.g. an
    # empty "deployment:" in YAML) would make the chained .get() fail — confirm
    # whether callers can produce such input.
    parser.add_argument(
        "--model",
        type=str,
        default=config.get("deployment", {}).get("model", ""),
        help="Served model name",
    )
    # modelCache keys are looked up in camelCase only (no snake_case fallback).
    model_cache_config = config.get("deployment", {}).get("modelCache", {})
    parser.add_argument(
        "--model-cache-pvc-name",
        type=str,
        default=model_cache_config.get("pvcName", ""),
        help="Name of the PVC that contains the model weights. If not provided, args.model must be a HF model name and will download from HF",
    )
    parser.add_argument(
        "--model-cache-pvc-path",
        type=str,
        default=model_cache_config.get("pvcPath", ""),
        help="Path to the model cache in the PVC",
    )
    parser.add_argument(
        "--model-cache-pvc-mount-path",
        type=str,
        default=model_cache_config.get("mountPath", "/opt/model-cache"),
        help="Path to the model cache in the container, note that the PVC must be mounted to the same path for the profiling job",
    )
    deployment_cfg = config.get("deployment", {})
    parser.add_argument(
        "--dgd-image",
        type=str,
        default=_get(deployment_cfg, "dgdImage", "dgd_image", ""),
        help="Container image to use for DGD components (frontend, planner, workers). Overrides images in config file.",
    )
    parser.add_argument(
        "--deployment-timeout",
        type=int,
        default=_get(deployment_cfg, "deploymentTimeout", "deployment_timeout", 1800),
        help="Maximum time to wait for deployment to become ready in seconds (default: 1800)",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        default=deployment_cfg.get("namespace", "dynamo-sla-profiler"),
        help="Kubernetes namespace to deploy the DynamoGraphDeployment",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default=config.get("engine", {}).get("backend", "vllm"),
        choices=["vllm", "sglang", "trtllm"],
        help="backend type, currently support [vllm, sglang, trtllm]",
    )
    # Not enforced by argparse (required=False); the "--model or --config"
    # check after parse_args() below is the only requirement applied here.
    parser.add_argument(
        "--config",
        type=str,
        default=config.get("engine", {}).get("config", ""),
        required=False,
        help="Path to the DynamoGraphDeployment config file (required, can be provided via CLI or config)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=_get(config, "outputDir", "output_dir", "profiling_results"),
        help="Path to the output results directory",
    )
    hardware_cfg = config.get("hardware", {})
    parser.add_argument(
        "--min-num-gpus-per-engine",
        type=int,
        default=_get(hardware_cfg, "minNumGpusPerEngine", "min_num_gpus_per_engine", 0),
        help="minimum number of GPUs per engine",
    )
    parser.add_argument(
        "--max-num-gpus-per-engine",
        type=int,
        default=_get(hardware_cfg, "maxNumGpusPerEngine", "max_num_gpus_per_engine", 0),
        help="maximum number of GPUs per engine",
    )
    parser.add_argument(
        "--num-gpus-per-node",
        type=int,
        default=_get(hardware_cfg, "numGpusPerNode", "num_gpus_per_node", 0),
        help="Number of GPUs per node",
    )
    parser.add_argument(
        "--gpu-model",
        type=str,
        default=_get(hardware_cfg, "gpuModel", "gpu_model", ""),
        help="GPU model name (used for auto-calculating search space)",
    )
    parser.add_argument(
        "--gpu-vram-mib",
        type=int,
        default=_get(hardware_cfg, "gpuVramMib", "gpu_vram_mib", 0),
        help="GPU VRAM in MiB (used for auto-calculating search space)",
    )
    parser.add_argument(
        "--system",
        type=str,
        default=_get(hardware_cfg, "system", "system", None),
        help="Target hardware system, e.g. h100_sxm, h200_sxm",
    )
    parser.add_argument(
        "--isl",
        type=int,
        default=config.get("sla", {}).get("isl", 3000),
        help="target input sequence length",
    )
    parser.add_argument(
        "--osl",
        type=int,
        default=config.get("sla", {}).get("osl", 500),
        help="target output sequence length",
    )
    parser.add_argument(
        "--ttft",
        type=float,
        default=config.get("sla", {}).get("ttft", 50.0),
        help="target Time To First Token (float, in milliseconds)",
    )
    parser.add_argument(
        "--itl",
        type=float,
        default=config.get("sla", {}).get("itl", 10.0),
        help="target Inter Token Latency (float, in milliseconds)",
    )
    # High-level profiling strategy argument
    # searchStrategy is read from the top level of the config, and the default
    # is converted with SearchStrategy(...) while the parser is being built.
    parser.add_argument(
        "--search-strategy",
        type=SearchStrategy,
        default=SearchStrategy(
            _get(config, "searchStrategy", "search_strategy", "rapid")
        ),
        choices=list(SearchStrategy),
        help="Search strategy for profiling: 'rapid' uses AI Configurator for quick estimation, 'thorough' runs actual deployments for comprehensive results",
    )
    # arguments used for interpolating TTFT and ITL under different ISL/OSL
    engine_cfg = config.get("engine", {})
    parser.add_argument(
        "--max-context-length",
        type=int,
        default=_get(engine_cfg, "maxContextLength", "max_context_length", 0),
        help="maximum context length supported by the served model",
    )
    sweep_cfg = config.get("sweep", {})
    parser.add_argument(
        "--prefill-interpolation-granularity",
        type=int,
        default=_get(
            sweep_cfg,
            "prefillInterpolationGranularity",
            "prefill_interpolation_granularity",
            16,
        ),
        help="how many samples to benchmark to interpolate TTFT under different ISL",
    )
    parser.add_argument(
        "--decode-interpolation-granularity",
        type=int,
        default=_get(
            sweep_cfg,
            "decodeInterpolationGranularity",
            "decode_interpolation_granularity",
            6,
        ),
        help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
    )
    parser.add_argument(
        "--service-name",
        type=str,
        default=_get(deployment_cfg, "serviceName", "service_name", ""),
        help="Service name for port forwarding (default: {deployment_name}-frontend)",
    )
    # store_true flags: the CLI can only switch these on, so a true value
    # coming from the config cannot be turned off from the command line.
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=_get(sweep_cfg, "dryRun", "dry_run", False),
        help="Dry run the profile job",
    )
    parser.add_argument(
        "--pick-with-webui",
        action="store_true",
        default=_get(sweep_cfg, "pickWithWebui", "pick_with_webui", False),
        help="Pick the best parallelization mapping using webUI",
    )
    # WebUI port precedence: CLI flag > config sweep.webuiPort >
    # $PROFILER_WEBUI_PORT > 8000. int() raises ValueError if the environment
    # variable is set to a non-integer string.
    default_webui_port = 8000
    webui_port_env = os.environ.get("PROFILER_WEBUI_PORT")
    if webui_port_env:
        default_webui_port = int(webui_port_env)
    parser.add_argument(
        "--webui-port",
        type=int,
        default=_get(sweep_cfg, "webuiPort", "webui_port", default_webui_port),
        help="WebUI port",
    )
    # Dynamically add all planner arguments from planner_argparse.py
    add_planner_arguments_to_parser(parser, prefix="planner-")
    # Set defaults for any planner arguments found in config.planner
    # Normalize keys: camelCase -> snake_case, hyphens -> underscores
    # NOTE(review): set_defaults() accepts any name, so a planner key that does
    # not match a registered argument is still set as an attribute on the
    # result rather than rejected.
    planner_config = config.get("planner", {})
    if planner_config:
        normalized_planner_config = {
            _camel_to_snake(key).replace("-", "_"): value
            for key, value in planner_config.items()
        }
        parser.set_defaults(**normalized_planner_config)
    # Parse arguments
    args = parser.parse_args()
    # remove --profile-config from args
    if hasattr(args, "profile_config"):
        delattr(args, "profile_config")
    # Validate required arguments
    # Either --model or --config (or both) must be provided
    if not args.model and not args.config:
        parser.error("--model or --config is required (provide at least one)")
    # Project helper; presumably fills in search-space related fields on args
    # in place (its return value is not used here) — TODO confirm.
    auto_generate_search_space(args)
    return args
......@@ -12,7 +12,7 @@
# For Multimodal EPD (required for device_map="auto" in vision model loading)
accelerate
aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@7a24afd98714af13f061cffe784d4808f5356d45
aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@168a948d5bc32209728fe8639191a9e0d9083d18
aiofiles
aiperf @ git+https://github.com/ai-dynamo/aiperf.git@54cd6dc820bff8bfebc875da104e59d745e14f75
av==15.0.0
......
......@@ -594,8 +594,7 @@ spec:
image:
description: |-
Image is the container image reference for the profiling job (frontend image).
Example: "nvcr.io/nvidia/dynamo-runtime:latest"
backend type automatically; backend images can be overridden via overrides.dgd.
Example: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0".
type: string
model:
description: |-
......
......@@ -357,9 +357,7 @@ type DynamoGraphDeploymentRequestSpec struct {
Backend BackendType `json:"backend,omitempty"`
// Image is the container image reference for the profiling job (frontend image).
// Example: "nvcr.io/nvidia/dynamo-runtime:latest"
// TODO: In a future MR, the operator will derive the backend inference image from the
// backend type automatically; backend images can be overridden via overrides.dgd.
// Example: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0".
// +optional
Image string `json:"image,omitempty"`
......
......@@ -594,8 +594,7 @@ spec:
image:
description: |-
Image is the container image reference for the profiling job (frontend image).
Example: "nvcr.io/nvidia/dynamo-runtime:latest"
backend type automatically; backend images can be overridden via overrides.dgd.
Example: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0".
type: string
model:
description: |-
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment