Unverified Commit d56439ec authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with...


feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with automatic injection (#6224)
Signed-off-by: default avatarHannah Zhang <hannahz@nvidia.com>
parent 233a1e9a
aiconfigurator @ 7c08d2f2
Subproject commit 7c08d2f2c4e289afe49f48e56d392a7d7221155d
...@@ -17,9 +17,7 @@ spec: ...@@ -17,9 +17,7 @@ spec:
# NOTE: any image built before January 10 and any release prior to 0.8.1 # NOTE: any image built before January 10 and any release prior to 0.8.1
# will need to use snake_case within profilingConfig.config # will need to use snake_case within profilingConfig.config
config: config:
sweep: searchStrategy: rapid
useAiConfigurator: true
aicSystem: h200_sxm
sla: sla:
isl: 3000 isl: 3000
osl: 150 osl: 150
......
...@@ -17,8 +17,7 @@ spec: ...@@ -17,8 +17,7 @@ spec:
# NOTE: any image built before January 10 and any release prior to 0.8.1 # NOTE: any image built before January 10 and any release prior to 0.8.1
# will need to use snake_case within profilingConfig.config # will need to use snake_case within profilingConfig.config
config: config:
sweep: searchStrategy: thorough
useAiConfigurator: false
sla: sla:
isl: 3000 isl: 3000
osl: 150 osl: 150
......
...@@ -24,14 +24,12 @@ spec: ...@@ -24,14 +24,12 @@ spec:
pvcName: "model-cache" # Name of PVC containing model weights pvcName: "model-cache" # Name of PVC containing model weights
pvcPath: "deepseek-r1" # Subpath within PVC where model is stored pvcPath: "deepseek-r1" # Subpath within PVC where model is stored
sweep: searchStrategy: rapid
useAiConfigurator: false
hardware: hardware:
# for h200, sweep over 8-16 GPUs per engine # for h200, sweep over 8-16 GPUs per engine
minNumGpusPerEngine: 8 minNumGpusPerEngine: 8
maxNumGpusPerEngine: 16 maxNumGpusPerEngine: 16
numGpusPerNode: 8 numGpusPerNode: 8 # Override auto-discovered value if different
sla: sla:
isl: 3000 isl: 3000
......
...@@ -38,7 +38,7 @@ from dynamo.profiler.utils.config_modifiers.parallelization_mapping import ( ...@@ -38,7 +38,7 @@ from dynamo.profiler.utils.config_modifiers.parallelization_mapping import (
apply_parallel_mapping_to_config, apply_parallel_mapping_to_config,
get_candidate_parallel_mappings, get_candidate_parallel_mappings,
) )
from dynamo.profiler.utils.defaults import EngineType from dynamo.profiler.utils.defaults import EngineType, SearchStrategy
from dynamo.profiler.utils.dgd_generation import generate_dgd_config_with_planner from dynamo.profiler.utils.dgd_generation import generate_dgd_config_with_planner
from dynamo.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator from dynamo.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from dynamo.profiler.utils.plot import ( from dynamo.profiler.utils.plot import (
...@@ -140,10 +140,6 @@ async def run_profile(args): ...@@ -140,10 +140,6 @@ async def run_profile(args):
# Clear any errors from previous profiling runs # Clear any errors from previous profiling runs
clear_profiling_errors() clear_profiling_errors()
# Inherit aic_backend from backend if not explicitly set
if not args.aic_backend:
args.aic_backend = args.backend
# Write initial status for external jobs to monitor # Write initial status for external jobs to monitor
os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.output_dir, exist_ok=True)
write_profiler_status( write_profiler_status(
...@@ -197,36 +193,28 @@ async def run_profile(args): ...@@ -197,36 +193,28 @@ async def run_profile(args):
f"Using minimum of user-provided and model's maximum context length: {sweep_max_context_length}" f"Using minimum of user-provided and model's maximum context length: {sweep_max_context_length}"
) )
if args.use_ai_configurator: # Initialize AI Configurator estimator (only used when search_strategy == SearchStrategy.RAPID)
if not args.aic_system: ai_configurator_perf_estimator: AIConfiguratorPerfEstimator | None = None
if args.search_strategy == SearchStrategy.RAPID:
# Use AI Configurator for rapid estimation
if not args.system:
raise ValueError( raise ValueError(
"Must provide --aic-system when using --use-ai-configurator." "Must provide --system (hardware system, e.g. h100_sxm) when using rapid search strategy."
) )
# Fallback to args.model if aic_hf_id is not provided if not args.model:
if not args.aic_hf_id:
if args.model:
logger.info(
f"--aic-hf-id not provided, using --model ({args.model}) as HuggingFace ID for AI configurator"
)
args.aic_hf_id = args.model
else:
raise ValueError( raise ValueError(
"Must provide --aic-hf-id or --model when using --use-ai-configurator." "Must provide --model (HuggingFace ID) when using rapid search strategy."
) )
logger.info("Using aiconfigurator to estimate performance...") logger.info(
ai_configurator_perf_estimator = AIConfiguratorPerfEstimator( "Using AI Configurator to estimate performance (rapid strategy)..."
args.aic_hf_id,
args.aic_system.lower(),
args.aic_backend,
args.aic_backend_version,
) )
else: ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
if args.aic_system or args.aic_hf_id or args.aic_backend_version: hf_id=args.model,
logger.warning( system=args.system.lower(),
"Ignoring --aic-system, --aic-hf-id, and/or --backend-version " backend=args.backend,
"when not using --use-ai-configurator."
) )
# first profile prefill # first profile prefill
...@@ -272,7 +260,10 @@ async def run_profile(args): ...@@ -272,7 +260,10 @@ async def run_profile(args):
ttft = None ttft = None
if args.dry_run: if args.dry_run:
logger.info("Skipping deployment creation in dry run mode") logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator: elif (
args.search_strategy == SearchStrategy.RAPID
and ai_configurator_perf_estimator
):
logger.info("Using ai-configurator to estimate prefill latency") logger.info("Using ai-configurator to estimate prefill latency")
perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf( perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf(
args.isl, args.isl,
...@@ -395,7 +386,10 @@ async def run_profile(args): ...@@ -395,7 +386,10 @@ async def run_profile(args):
if args.dry_run: if args.dry_run:
logger.info("Skipping deployment creation in dry run mode") logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator: elif (
args.search_strategy == SearchStrategy.RAPID
and ai_configurator_perf_estimator
):
# Compute max_concurrency and max_kv_tokens to know which # Compute max_concurrency and max_kv_tokens to know which
# num_request to sweep over. # num_request to sweep over.
max_concurrency = ai_configurator_perf_estimator.get_max_batch_size( max_concurrency = ai_configurator_perf_estimator.get_max_batch_size(
...@@ -467,7 +461,10 @@ async def run_profile(args): ...@@ -467,7 +461,10 @@ async def run_profile(args):
for num_request in sweep_num_request: for num_request in sweep_num_request:
itl = thpt_per_gpu = None itl = thpt_per_gpu = None
if args.use_ai_configurator: if (
args.search_strategy == SearchStrategy.RAPID
and ai_configurator_perf_estimator
):
logger.info( logger.info(
"Using ai-configurator to estimate decode latency." "Using ai-configurator to estimate decode latency."
) )
...@@ -511,7 +508,10 @@ async def run_profile(args): ...@@ -511,7 +508,10 @@ async def run_profile(args):
parallel_mapping=mapping, parallel_mapping=mapping,
) )
if not args.dry_run and not args.use_ai_configurator: if (
not args.dry_run
and not args.search_strategy == SearchStrategy.RAPID
):
logger.info("Cleaning up deployment...") logger.info("Cleaning up deployment...")
await client.delete_deployment() await client.delete_deployment()
deployment_clients.remove(client) deployment_clients.remove(client)
...@@ -642,7 +642,10 @@ async def run_profile(args): ...@@ -642,7 +642,10 @@ async def run_profile(args):
if args.dry_run: if args.dry_run:
logger.info("Skipping deployment creation in dry run mode") logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator: elif (
args.search_strategy == SearchStrategy.RAPID
and ai_configurator_perf_estimator
):
profile_prefill_aiconfigurator( profile_prefill_aiconfigurator(
work_dir, work_dir,
best_prefill_gpus, # num_gpus best_prefill_gpus, # num_gpus
...@@ -728,7 +731,10 @@ async def run_profile(args): ...@@ -728,7 +731,10 @@ async def run_profile(args):
if args.dry_run: if args.dry_run:
logger.info("Skipping deployment creation in dry run mode") logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator: elif (
args.search_strategy == SearchStrategy.RAPID
and ai_configurator_perf_estimator
):
attention_dp_size = best_decode_mapping.get_attn_dp_size() attention_dp_size = best_decode_mapping.get_attn_dp_size()
max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens( max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens(
args.isl, args.osl, tp_size=best_decode_mapping.get_tp_size() args.isl, args.osl, tp_size=best_decode_mapping.get_tp_size()
......
...@@ -154,7 +154,7 @@ class BaseConfigModifier: ...@@ -154,7 +154,7 @@ class BaseConfigModifier:
Raises: Raises:
ValueError: If neither --served-model-name nor model path arg is found ValueError: If neither --served-model-name nor model path arg is found
""" """
model_name = None model_name = ""
# Check for --served-model-name first (API model name) # Check for --served-model-name first (API model name)
for i, arg in enumerate(args): for i, arg in enumerate(args):
if arg == cls.WORKER_SERVED_MODEL_NAME_ARG and i + 1 < len(args): if arg == cls.WORKER_SERVED_MODEL_NAME_ARG and i + 1 < len(args):
...@@ -162,14 +162,14 @@ class BaseConfigModifier: ...@@ -162,14 +162,14 @@ class BaseConfigModifier:
break break
# Check for backend-specific path argument # Check for backend-specific path argument
model_path = None model_path = ""
for i, arg in enumerate(args): for i, arg in enumerate(args):
if arg == cls.WORKER_MODEL_PATH_ARG and i + 1 < len(args): if arg == cls.WORKER_MODEL_PATH_ARG and i + 1 < len(args):
model_path = args[i + 1] model_path = args[i + 1]
break break
# Require at least one to be specified # Require at least one to be specified
if model_name is None and model_path is None: if not model_name and not model_path:
raise ValueError( raise ValueError(
f"Cannot determine model: neither {cls.WORKER_MODEL_PATH_ARG} nor " f"Cannot determine model: neither {cls.WORKER_MODEL_PATH_ARG} nor "
f"{cls.WORKER_SERVED_MODEL_NAME_ARG} found in worker configuration. " f"{cls.WORKER_SERVED_MODEL_NAME_ARG} found in worker configuration. "
...@@ -177,9 +177,9 @@ class BaseConfigModifier: ...@@ -177,9 +177,9 @@ class BaseConfigModifier:
) )
# If only one is specified, use it for both # If only one is specified, use it for both
if model_path is None: if not model_path:
model_path = model_name model_path = model_name
elif model_name is None: elif not model_name:
model_name = model_path model_name = model_path
return model_name, model_path return model_name, model_path
......
...@@ -49,3 +49,8 @@ DEFAULT_GPU_COST_PER_HOUR = 3.0 # Cost per GPU per hour in dollars ...@@ -49,3 +49,8 @@ DEFAULT_GPU_COST_PER_HOUR = 3.0 # Cost per GPU per hour in dollars
class EngineType(str, Enum): class EngineType(str, Enum):
PREFILL = "prefill" PREFILL = "prefill"
DECODE = "decode" DECODE = "decode"
class SearchStrategy(str, Enum):
RAPID = "rapid"
THOROUGH = "thorough"
...@@ -39,12 +39,10 @@ class AIConfiguratorPerfEstimator: ...@@ -39,12 +39,10 @@ class AIConfiguratorPerfEstimator:
hf_id: str, # e.g. "Qwen/Qwen3-32B" hf_id: str, # e.g. "Qwen/Qwen3-32B"
system: str, # e.g. "h200_sxm" system: str, # e.g. "h200_sxm"
backend: str, # e.g. "trtllm" backend: str, # e.g. "trtllm"
version: str, # e.g. "0.20.0"
): ):
aiconfigurator = _try_import_aiconfigurator() aiconfigurator = _try_import_aiconfigurator()
logger.info("Loading aiconfigurator database. This might take a few seconds...") logger.info("Loading aiconfigurator database. This might take a few seconds...")
if not version:
version = aiconfigurator.sdk.perf_database.get_latest_database_version( version = aiconfigurator.sdk.perf_database.get_latest_database_version(
system, system,
backend, backend,
......
...@@ -8,6 +8,7 @@ from typing import Any, Dict ...@@ -8,6 +8,7 @@ from typing import Any, Dict
import yaml import yaml
from dynamo.profiler.utils.defaults import SearchStrategy
from dynamo.profiler.utils.planner_utils import add_planner_arguments_to_parser from dynamo.profiler.utils.planner_utils import add_planner_arguments_to_parser
from dynamo.profiler.utils.search_space_autogen import auto_generate_search_space from dynamo.profiler.utils.search_space_autogen import auto_generate_search_space
...@@ -100,16 +101,14 @@ def create_profiler_parser() -> argparse.Namespace: ...@@ -100,16 +101,14 @@ def create_profiler_parser() -> argparse.Namespace:
hardware: hardware:
minNumGpusPerEngine: Int (minimum number of GPUs per engine, default: 0) minNumGpusPerEngine: Int (minimum number of GPUs per engine, default: 0)
maxNumGpusPerEngine: Int (maximum number of GPUs per engine, default: 0) maxNumGpusPerEngine: Int (maximum number of GPUs per engine, default: 0)
numGpusPerNode: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 0) numGpusPerNode: Int (number of GPUs per node, default: 0)
enableGpuDiscovery: Boolean (enable automatic GPU discovery from Kubernetes cluster nodes, when enabled overrides any manually specified hardware configuration, requires cluster-wide node access permissions, default: False) gpuModel: String (GPU model, used for auto-calculating search space, default: "")
gpuVramMib: Int (GPU VRAM in MiB, used for auto-calculating search space, default: 0)
system: String (target hardware system, e.g. h100_sxm, h200_sxm, default: None)
searchStrategy: String (search strategy for profiling: 'rapid' uses AI Configurator for quick estimation, 'thorough' runs actual deployments for comprehensive results, enum: [rapid, thorough], default: rapid)
sweep: sweep:
prefillInterpolationGranularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16) prefillInterpolationGranularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
decodeInterpolationGranularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6) decodeInterpolationGranularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
useAiConfigurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
aicSystem: String (target system for use with aiconfigurator, default: None)
aicHfId: String (aiconfigurator huggingface id of the target model, default: None)
aicBackend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
aicBackendVersion: String (specify backend version when using aiconfigurator to estimate perf, default: None)
dryRun: Boolean (dry run the profile job, default: False) dryRun: Boolean (dry run the profile job, default: False)
pickWithWebui: Boolean (pick the best parallelization mapping using webUI, default: False) pickWithWebui: Boolean (pick the best parallelization mapping using webUI, default: False)
webuiPort: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000) webuiPort: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
...@@ -226,7 +225,25 @@ def create_profiler_parser() -> argparse.Namespace: ...@@ -226,7 +225,25 @@ def create_profiler_parser() -> argparse.Namespace:
"--num-gpus-per-node", "--num-gpus-per-node",
type=int, type=int,
default=_get(hardware_cfg, "numGpusPerNode", "num_gpus_per_node", 0), default=_get(hardware_cfg, "numGpusPerNode", "num_gpus_per_node", 0),
help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size", help="Number of GPUs per node",
)
parser.add_argument(
"--gpu-model",
type=str,
default=_get(hardware_cfg, "gpuModel", "gpu_model", ""),
help="GPU model name (used for auto-calculating search space)",
)
parser.add_argument(
"--gpu-vram-mib",
type=int,
default=_get(hardware_cfg, "gpuVramMib", "gpu_vram_mib", 0),
help="GPU VRAM in MiB (used for auto-calculating search space)",
)
parser.add_argument(
"--system",
type=str,
default=_get(hardware_cfg, "system", "system", None),
help="Target hardware system, e.g. h100_sxm, h200_sxm",
) )
parser.add_argument( parser.add_argument(
"--isl", "--isl",
...@@ -253,6 +270,17 @@ def create_profiler_parser() -> argparse.Namespace: ...@@ -253,6 +270,17 @@ def create_profiler_parser() -> argparse.Namespace:
help="target Inter Token Latency (float, in milliseconds)", help="target Inter Token Latency (float, in milliseconds)",
) )
# High-level profiling strategy argument
parser.add_argument(
"--search-strategy",
type=SearchStrategy,
default=SearchStrategy(
_get(config, "searchStrategy", "search_strategy", "rapid")
),
choices=list(SearchStrategy),
help="Search strategy for profiling: 'rapid' uses AI Configurator for quick estimation, 'thorough' runs actual deployments for comprehensive results",
)
# arguments used for interpolating TTFT and ITL under different ISL/OSL # arguments used for interpolating TTFT and ITL under different ISL/OSL
engine_cfg = config.get("engine", {}) engine_cfg = config.get("engine", {})
parser.add_argument( parser.add_argument(
...@@ -296,12 +324,6 @@ def create_profiler_parser() -> argparse.Namespace: ...@@ -296,12 +324,6 @@ def create_profiler_parser() -> argparse.Namespace:
default=_get(sweep_cfg, "dryRun", "dry_run", False), default=_get(sweep_cfg, "dryRun", "dry_run", False),
help="Dry run the profile job", help="Dry run the profile job",
) )
parser.add_argument(
"--enable-gpu-discovery",
action="store_true",
default=_get(hardware_cfg, "enableGpuDiscovery", "enable_gpu_discovery", False),
help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
)
parser.add_argument( parser.add_argument(
"--pick-with-webui", "--pick-with-webui",
action="store_true", action="store_true",
...@@ -332,38 +354,6 @@ def create_profiler_parser() -> argparse.Namespace: ...@@ -332,38 +354,6 @@ def create_profiler_parser() -> argparse.Namespace:
} }
parser.set_defaults(**normalized_planner_config) parser.set_defaults(**normalized_planner_config)
# arguments if using aiconfigurator
parser.add_argument(
"--use-ai-configurator",
action="store_true",
default=_get(sweep_cfg, "useAiConfigurator", "use_ai_configurator", False),
help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
)
parser.add_argument(
"--aic-system",
type=str,
default=_get(sweep_cfg, "aicSystem", "aic_system", None),
help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
)
parser.add_argument(
"--aic-hf-id",
type=str,
default=_get(sweep_cfg, "aicHfId", "aic_hf_id", None),
help="aiconfigurator name of the target model (e.g. Qwen/Qwen3-32B, meta-llama/Llama-3.1-405B)",
)
parser.add_argument(
"--aic-backend",
type=str,
default=_get(sweep_cfg, "aicBackend", "aic_backend", ""),
help="aiconfigurator backend of the target model, if not provided, will use args.backend",
)
parser.add_argument(
"--aic-backend-version",
type=str,
default=_get(sweep_cfg, "aicBackendVersion", "aic_backend_version", None),
help="Specify backend version when using aiconfigurator to estimate perf.",
)
# Parse arguments # Parse arguments
args = parser.parse_args() args = parser.parse_args()
......
...@@ -8,7 +8,6 @@ import os ...@@ -8,7 +8,6 @@ import os
import yaml import yaml
from deploy.utils.gpu_inventory import get_gpu_summary
from dynamo.profiler.utils.config_modifiers import CONFIG_MODIFIERS from dynamo.profiler.utils.config_modifiers import CONFIG_MODIFIERS
from dynamo.profiler.utils.model_info import ModelInfo, get_model_info from dynamo.profiler.utils.model_info import ModelInfo, get_model_info
...@@ -103,66 +102,79 @@ def auto_generate_search_space(args: argparse.Namespace) -> None: ...@@ -103,66 +102,79 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:
) )
args.model_info = model_info args.model_info = model_info
# now determine the search space # Determine the search space for profiling
if args.enable_gpu_discovery: # User-provided min/max values take precedence; auto-calculate missing bounds
if ( # based on GPU hardware info and model size
args.min_num_gpus_per_engine == 0 user_specified_ranges = (
or args.max_num_gpus_per_engine == 0 args.min_num_gpus_per_engine != 0 and args.max_num_gpus_per_engine != 0
or args.num_gpus_per_node == 0 )
):
if not args.model:
# TODO: get model info provided DGD config
error_msg = "No model provided, cannot auto-generate GPU search space. Please provide `--model` or GPU info"
logger.error(error_msg)
raise RuntimeError(error_msg)
logger.info("Getting GPU info from k8s cluster...") if user_specified_ranges:
gpu_info = get_gpu_summary()
logger.info( logger.info(
f"Cluster has {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node with {gpu_info['vram']} VRAM" f"Using user-specified GPU search space: {args.min_num_gpus_per_engine} to {args.max_num_gpus_per_engine}"
) )
# Ensure num_gpus_per_node is set (needed for multi-node configs)
if args.num_gpus_per_node == 0:
logger.warning("num_gpus_per_node not specified, setting to 8")
args.num_gpus_per_node = 8
else:
# Auto-calculate search space (honor partial user overrides)
# NOTE: will be handled in AIC
if args.num_gpus_per_node != 0 and args.gpu_vram_mib != 0:
# Have GPU hardware info - calculate based on model size
if not args.model:
error_msg = "No model provided, cannot auto-generate GPU search space. Please provide --model"
logger.error(error_msg)
raise RuntimeError(error_msg)
# model_info should be set by now (checked above), but mypy needs explicit verification
assert ( assert (
model_info is not None model_info is not None
), "model_info must be set when model is provided" ), "model_info must be set when model is provided"
vram_mib = int(gpu_info["vram"]) # type: ignore[call-overload] logger.info(
gpus_per_node = int(gpu_info["gpus_per_node"]) # type: ignore[call-overload] f"Auto-generating search space: {args.num_gpus_per_node}x {args.gpu_model} GPUs with {args.gpu_vram_mib} MiB VRAM per GPU"
)
if args.system:
logger.info(f"Hardware system: {args.system}")
# Calculate minimum GPUs needed for model
min_gpu = math.ceil( min_gpu = math.ceil(
model_info.model_size / MODEL_GPU_MEM_FRAC_MAX / vram_mib model_info.model_size / MODEL_GPU_MEM_FRAC_MAX / args.gpu_vram_mib
) )
# Calculate maximum GPUs to profile
if not model_info.is_moe: if not model_info.is_moe:
max_gpu = gpus_per_node max_gpu = args.num_gpus_per_node
else: else:
max_gpu = max(min_gpu * MOE_MODEL_MAX_NUM_GPU_FACTOR, gpus_per_node) # MoE models can benefit from more GPUs
if min_gpu > max_gpu: max_gpu = max(
error_msg = f"No valid GPU configuration found for model {args.model} on the cluster with {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node" min_gpu * MOE_MODEL_MAX_NUM_GPU_FACTOR, args.num_gpus_per_node
)
# Honor partial user overrides
final_min = args.min_num_gpus_per_engine or min_gpu
final_max = args.max_num_gpus_per_engine or max_gpu
# Validate final_min <= final_max
if final_min > final_max:
error_msg = f"Invalid GPU range: min_num_gpus_per_engine ({final_min}) > max_num_gpus_per_engine ({final_max})"
logger.error(error_msg) logger.error(error_msg)
raise RuntimeError(error_msg) raise RuntimeError(error_msg)
logger.info( # Clamp to valid range [1, args.num_gpus_per_node]
f"Auto-generated search space for model {args.model} on the cluster with {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node: {min_gpu} to {max_gpu}" final_min = max(1, min(final_min, args.num_gpus_per_node))
) final_max = max(1, min(final_max, args.num_gpus_per_node))
args.min_num_gpus_per_engine = min_gpu
args.max_num_gpus_per_engine = max_gpu logger.info(f"Auto-generated search space: {final_min} to {final_max} GPUs")
args.num_gpus_per_node = gpus_per_node # type: ignore[assignment] args.min_num_gpus_per_engine = final_min
args.max_num_gpus_per_engine = final_max
else: else:
# use default values for GPUs # No GPU info available - use defaults
if args.min_num_gpus_per_engine == 0: logger.warning("GPU hardware info not available, using default values")
logger.warning( args.min_num_gpus_per_engine = args.min_num_gpus_per_engine or 1
"GPU discover is disabled and min_num_gpus_per_engine is not specified, setting to 1" args.max_num_gpus_per_engine = args.max_num_gpus_per_engine or 4
) args.num_gpus_per_node = args.num_gpus_per_node or 8
args.min_num_gpus_per_engine = 1 logger.info(
if args.max_num_gpus_per_engine == 0: f"Default search space: {args.min_num_gpus_per_engine} to {args.max_num_gpus_per_engine} GPUs, {args.num_gpus_per_node} GPUs per node"
logger.warning(
"GPU discover is disabled and max_num_gpus_per_engine is not specified, setting to 4"
)
args.max_num_gpus_per_engine = 4
if args.num_gpus_per_node == 0:
logger.warning(
"GPU discover is disabled and num_gpus_per_node is not specified, setting to 8"
) )
args.num_gpus_per_node = 8
return return
...@@ -140,13 +140,13 @@ spec: ...@@ -140,13 +140,13 @@ spec:
type: string type: string
type: object type: object
enableGpuDiscovery: enableGpuDiscovery:
default: false default: true
description: |- description: |-
EnableGpuDiscovery controls whether the profiler should automatically discover GPU EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.
resources from the Kubernetes cluster nodes. When enabled, the profiler will override DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always
any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine, attempted automatically. Setting this field has no effect - the operator will always try to discover
numGpusPerNode) with values detected from the cluster. GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped
Requires cluster-wide node access permissions - only available with cluster-scoped operators. operator without permissions), manual hardware configuration is required regardless of this setting.
type: boolean type: boolean
model: model:
description: |- description: |-
...@@ -157,6 +157,12 @@ spec: ...@@ -157,6 +157,12 @@ spec:
profilingConfig: profilingConfig:
description: |- description: |-
ProfilingConfig provides the complete configuration for the profiling job. ProfilingConfig provides the complete configuration for the profiling job.
Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes
cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),
discovered GPU configuration is used as defaults when hardware configuration is not manually
specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values
always take precedence over auto-discovered values. If GPU discovery fails (e.g.,
namespace-restricted operator without node permissions), manual hardware config is required.
This configuration is passed directly to the profiler. This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema). The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
Note: deployment.model and engine.backend are automatically set from the high-level Note: deployment.model and engine.backend are automatically set from the high-level
......
...@@ -151,16 +151,13 @@ type DynamoGraphDeploymentRequestSpec struct { ...@@ -151,16 +151,13 @@ type DynamoGraphDeploymentRequestSpec struct {
// +kubebuilder:default=false // +kubebuilder:default=false
UseMocker bool `json:"useMocker,omitempty"` UseMocker bool `json:"useMocker,omitempty"`
// EnableGpuDiscovery controls whether the profiler should automatically discover GPU
// resources from the Kubernetes cluster nodes. When enabled, the profiler will override
// any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
// numGpusPerNode) with values detected from the cluster.
// Requires cluster-wide node access permissions - only available with cluster-scoped operators.
// +kubebuilder:default=false
// +kubebuilder:validation:Optional
EnableGpuDiscovery bool `json:"enableGpuDiscovery,omitempty"`
// ProfilingConfig provides the complete configuration for the profiling job. // ProfilingConfig provides the complete configuration for the profiling job.
// Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes
// cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),
// discovered GPU configuration is used as defaults when hardware configuration is not manually
// specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values
// always take precedence over auto-discovered values. If GPU discovery fails (e.g.,
// namespace-restricted operator without node permissions), manual hardware config is required.
// This configuration is passed directly to the profiler. // This configuration is passed directly to the profiler.
// The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema). // The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
// Note: deployment.model and engine.backend are automatically set from the high-level // Note: deployment.model and engine.backend are automatically set from the high-level
...@@ -168,6 +165,15 @@ type DynamoGraphDeploymentRequestSpec struct { ...@@ -168,6 +165,15 @@ type DynamoGraphDeploymentRequestSpec struct {
// +kubebuilder:validation:Required // +kubebuilder:validation:Required
ProfilingConfig ProfilingConfigSpec `json:"profilingConfig"` ProfilingConfig ProfilingConfigSpec `json:"profilingConfig"`
// EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.
// DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always
// attempted automatically. Setting this field has no effect - the operator will always try to discover
// GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped
// operator without permissions), manual hardware configuration is required regardless of this setting.
// +optional
// +kubebuilder:default=true
EnableGPUDiscovery *bool `json:"enableGpuDiscovery,omitempty"`
// AutoApply indicates whether to automatically create a DynamoGraphDeployment // AutoApply indicates whether to automatically create a DynamoGraphDeployment
// after profiling completes. If false, only the spec is generated and stored in status. // after profiling completes. If false, only the spec is generated and stored in status.
// Users can then manually create a DGD using the generated spec. // Users can then manually create a DGD using the generated spec.
......
...@@ -726,6 +726,11 @@ func (in *DynamoGraphDeploymentRequestList) DeepCopyObject() runtime.Object { ...@@ -726,6 +726,11 @@ func (in *DynamoGraphDeploymentRequestList) DeepCopyObject() runtime.Object {
func (in *DynamoGraphDeploymentRequestSpec) DeepCopyInto(out *DynamoGraphDeploymentRequestSpec) { func (in *DynamoGraphDeploymentRequestSpec) DeepCopyInto(out *DynamoGraphDeploymentRequestSpec) {
*out = *in *out = *in
in.ProfilingConfig.DeepCopyInto(&out.ProfilingConfig) in.ProfilingConfig.DeepCopyInto(&out.ProfilingConfig)
if in.EnableGPUDiscovery != nil {
in, out := &in.EnableGPUDiscovery, &out.EnableGPUDiscovery
*out = new(bool)
**out = **in
}
if in.DeploymentOverrides != nil { if in.DeploymentOverrides != nil {
in, out := &in.DeploymentOverrides, &out.DeploymentOverrides in, out := &in.DeploymentOverrides, &out.DeploymentOverrides
*out = new(DeploymentOverridesSpec) *out = new(DeploymentOverridesSpec)
......
...@@ -140,13 +140,13 @@ spec: ...@@ -140,13 +140,13 @@ spec:
type: string type: string
type: object type: object
enableGpuDiscovery: enableGpuDiscovery:
default: false default: true
description: |- description: |-
EnableGpuDiscovery controls whether the profiler should automatically discover GPU EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.
resources from the Kubernetes cluster nodes. When enabled, the profiler will override DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always
any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine, attempted automatically. Setting this field has no effect - the operator will always try to discover
numGpusPerNode) with values detected from the cluster. GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped
Requires cluster-wide node access permissions - only available with cluster-scoped operators. operator without permissions), manual hardware configuration is required regardless of this setting.
type: boolean type: boolean
model: model:
description: |- description: |-
...@@ -157,6 +157,12 @@ spec: ...@@ -157,6 +157,12 @@ spec:
profilingConfig: profilingConfig:
description: |- description: |-
ProfilingConfig provides the complete configuration for the profiling job. ProfilingConfig provides the complete configuration for the profiling job.
Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes
cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),
discovered GPU configuration is used as defaults when hardware configuration is not manually
specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values
always take precedence over auto-discovered values. If GPU discovery fails (e.g.,
namespace-restricted operator without node permissions), manual hardware config is required.
This configuration is passed directly to the profiler. This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema). The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
Note: deployment.model and engine.backend are automatically set from the high-level Note: deployment.model and engine.backend are automatically set from the high-level
......
...@@ -39,23 +39,24 @@ spec: ...@@ -39,23 +39,24 @@ spec:
engine: engine:
maxContextLength: 16384 # will override max context length of the model if provided maxContextLength: 16384 # will override max context length of the model if provided
# Search strategy: 'rapid' for AI Configurator estimation (20-30s), 'thorough' for actual deployments (2-4h)
searchStrategy: thorough
# Hardware configuration # Hardware configuration
# Note: Operator auto-discovers GPU info from cluster nodes when available
hardware: hardware:
minNumGpusPerEngine: 1 # Minimum GPUs to test minNumGpusPerEngine: 1 # Minimum GPUs to test
maxNumGpusPerEngine: 4 # Maximum GPUs to test (limited by model's num_heads/4) maxNumGpusPerEngine: 4 # Maximum GPUs to test
numGpusPerNode: 8 # GPUs per node (for MoE models) numGpusPerNode: 8 # GPUs per node (optional - auto-discovered if not specified)
system: h200_sxm # Hardware system (optional - auto-detected if not specified)
# gpuModel: "H200-SXM" # GPU model (optional - auto-discovered)
# gpuVramMib: 141557 # GPU VRAM in MiB (optional - auto-discovered)
# Sweep/profiling configuration # Sweep/profiling configuration
sweep: sweep:
prefillInterpolationGranularity: 16 # Samples for TTFT interpolation prefillInterpolationGranularity: 16 # Samples for TTFT interpolation
decodeInterpolationGranularity: 6 # Samples for ITL interpolation decodeInterpolationGranularity: 6 # Samples for ITL interpolation
# AI Configurator mode (fast simulation-based profiling, 20-30 seconds)
useAiConfigurator: false # Set to false for online profiling (2-4 hours)
aicSystem: h200_sxm # Target GPU system for AI Configurator
aicHfId: Qwen/Qwen3-0.6B # HuggingFace model ID for AI Configurator
aicBackendVersion: "0.20.0" # Backend version for AI Configurator
# SLA targets for profiling # SLA targets for profiling
sla: sla:
isl: 3000 # Input sequence length isl: 3000 # Input sequence length
......
...@@ -47,6 +47,7 @@ import ( ...@@ -47,6 +47,7 @@ import (
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common" commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability" "github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
webhookvalidation "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook/validation" webhookvalidation "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook/validation"
) )
...@@ -169,12 +170,26 @@ const ( ...@@ -169,12 +170,26 @@ const (
BackendSGLang = "sglang" BackendSGLang = "sglang"
BackendTRTLLM = "trtllm" BackendTRTLLM = "trtllm"
// Profiling config field names // Profiling config field names for v1alpha1; note: will be removed in v1beta1
ConfigKeyDeployment = "deployment" ConfigKeyDeployment = "deployment"
ConfigKeyModelCache = "modelCache" ConfigKeyModelCache = "modelCache"
ConfigKeyPVCName = "pvcName" ConfigKeyPVCName = "pvcName"
ConfigKeyPVCPath = "pvcPath" ConfigKeyPVCPath = "pvcPath"
ConfigKeyMountPath = "mountPath" ConfigKeyMountPath = "mountPath"
ConfigKeyHardware = "hardware"
ConfigKeyEngine = "engine"
ConfigKeyOutputDir = "output_dir"
ConfigKeyNumGpusPerNode = "numGpusPerNode"
ConfigKeyGPUModel = "gpuModel"
ConfigKeyGPUVramMib = "gpuVramMib"
ConfigKeySystem = "system"
ConfigKeyMinNumGpusPerEng = "minNumGpusPerEngine"
ConfigKeyMaxNumGpusPerEng = "maxNumGpusPerEngine"
ConfigKeyBackend = "backend"
ConfigKeyConfig = "config"
ConfigKeyNamespace = "namespace"
ConfigKeyModel = "model"
ConfigKeyDGDImage = "dgd_image"
) )
// shell script template for the output copier sidecar // shell script template for the output copier sidecar
...@@ -946,10 +961,105 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex ...@@ -946,10 +961,105 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
} }
} }
if err := r.validateGPUHardwareInfo(ctx, dgdr); err != nil {
return err
}
// The profiler will validate the rest of the configuration // The profiler will validate the rest of the configuration
return nil return nil
} }
// toFloat64 converts a numeric value (int or float64) to float64.
// Returns 0 if the value is neither int nor float64.
func toFloat64(val interface{}) float64 {
switch v := val.(type) {
case float64:
return v
case int:
return float64(v)
default:
return 0
}
}
// validateGPUHardwareInfo ensures GPU hardware information is available when required for profiling
func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx)
// Check for hardware info and GPU ranges
// TODO: will be cleaner once we swap to new DGDR schema (#6130)
var config map[string]interface{}
if dgdr.Spec.ProfilingConfig.Config != nil {
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught later, skip validation here
return nil
}
} else {
config = make(map[string]interface{})
}
hardwareVal, hasHardware := config[ConfigKeyHardware]
var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil {
if hardwareConfig, ok := hardwareVal.(map[string]interface{}); ok {
_, hasGPUModel := hardwareConfig[ConfigKeyGPUModel]
_, hasGPUVram := hardwareConfig[ConfigKeyGPUVramMib]
_, hasNumGPUs := hardwareConfig[ConfigKeyNumGpusPerNode]
hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
}
}
var hasExplicitGPURanges bool
if engineVal, hasEngine := config[ConfigKeyEngine]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]interface{}); ok {
minGPUs, hasMin := engineConfig[ConfigKeyMinNumGpusPerEng]
maxGPUs, hasMax := engineConfig[ConfigKeyMaxNumGpusPerEng]
if hasMin && hasMax {
minVal := toFloat64(minGPUs)
maxVal := toFloat64(maxGPUs)
// Validate that min <= max
if minVal > maxVal {
return fmt.Errorf("invalid GPU range: %s (%v) cannot be greater than %s (%v)",
ConfigKeyMinNumGpusPerEng, minVal, ConfigKeyMaxNumGpusPerEng, maxVal)
}
hasExplicitGPURanges = minVal > 0 && maxVal > 0
}
}
}
// If manual config or explicit ranges are provided, validation passes
if hasManualHardwareConfig || hasExplicitGPURanges {
return nil
}
_, err := gpu.DiscoverGPUs(ctx, r.Client)
if err == nil {
// GPU discovery is available, validation passes
return nil
}
logger.Info("GPU discovery not available", "reason", err.Error())
isNamespaceScoped := r.Config.RestrictedNamespace != ""
if isNamespaceScoped {
return fmt.Errorf(`GPU hardware info required but cannot be auto-discovered (namespace-scoped operator lacks node read permissions).
Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s.
See: https://github.com/ai-dynamo/dynamo/issues/6257`,
ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib,
ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng)
}
return fmt.Errorf(`GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s.
See profiling documentation for configuration details.`,
ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib,
ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng)
}
// createProfilingJob creates a Kubernetes Job for profiling using SyncResource // createProfilingJob creates a Kubernetes Job for profiling using SyncResource
func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error { func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
...@@ -989,13 +1099,29 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -989,13 +1099,29 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
} }
} }
// Run GPU discovery before creating job (cluster-wide and namespace-restricted operators if they have node read permissions)
var gpuInfo *gpu.GPUInfo
logger.Info("Attempting GPU discovery for profiling job")
discoveredInfo, err := gpu.DiscoverGPUs(ctx, r.Client)
if err != nil {
// This path is expected for namespace-restricted operators without node read permissions
logger.Info("GPU discovery not available, using manual hardware configuration from profiling config",
"reason", err.Error())
} else {
gpuInfo = discoveredInfo
logger.Info("GPU discovery completed successfully",
"gpusPerNode", gpuInfo.GPUsPerNode,
"model", gpuInfo.Model,
"vramMiB", gpuInfo.VRAMPerGPU,
"system", gpuInfo.System)
}
// Use SyncResource to create/update the job // Use SyncResource to create/update the job
modified, job, err := commonController.SyncResource(ctx, r, dgdr, func(ctx context.Context) (*batchv1.Job, bool, error) { modified, job, err := commonController.SyncResource(ctx, r, dgdr, func(ctx context.Context) (*batchv1.Job, bool, error) {
jobName := getProfilingJobName(dgdr) jobName := getProfilingJobName(dgdr)
outputConfigMapName := getOutputConfigMapName(dgdr) outputConfigMapName := getOutputConfigMapName(dgdr)
// Parse and prepare profiling config configYAML, err := r.prepareProfilingConfig(dgdr, gpuInfo)
configYAML, err := r.prepareProfilingConfig(dgdr)
if err != nil { if err != nil {
return nil, false, err return nil, false, err
} }
...@@ -1069,12 +1195,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1069,12 +1195,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
"--profile-config", string(configYAML), "--profile-config", string(configYAML),
} }
// Add --enable-gpu-discovery flag based on DGDR spec
// GPU discovery requires cluster-wide node access
if dgdr.Spec.EnableGpuDiscovery {
profilerArgs = append(profilerArgs, "--enable-gpu-discovery")
}
// Use profiler image from profilingConfig // Use profiler image from profilingConfig
imageName := dgdr.Spec.ProfilingConfig.ProfilerImage imageName := dgdr.Spec.ProfilingConfig.ProfilerImage
logger.Info("Using profiler image", "image", imageName) logger.Info("Using profiler image", "image", imageName)
...@@ -1250,7 +1370,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1250,7 +1370,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
} }
// prepareProfilingConfig parses and modifies the profiling config // prepareProfilingConfig parses and modifies the profiling config
func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) ([]byte, error) { func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, gpuInfo *gpu.GPUInfo) ([]byte, error) {
// Parse the profiling config from JSON // Parse the profiling config from JSON
var config map[string]interface{} var config map[string]interface{}
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil { if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
...@@ -1258,53 +1378,84 @@ func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nv ...@@ -1258,53 +1378,84 @@ func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nv
} }
// Set deployment.namespace if not already set // Set deployment.namespace if not already set
deploymentVal, hasDeployment := config["deployment"] deploymentVal, hasDeployment := config[ConfigKeyDeployment]
var deploymentConfig map[string]interface{} var deploymentConfig map[string]interface{}
if !hasDeployment || deploymentVal == nil { if !hasDeployment || deploymentVal == nil {
deploymentConfig = make(map[string]interface{}) deploymentConfig = make(map[string]interface{})
config["deployment"] = deploymentConfig config[ConfigKeyDeployment] = deploymentConfig
} else { } else {
var ok bool var ok bool
deploymentConfig, ok = deploymentVal.(map[string]interface{}) deploymentConfig, ok = deploymentVal.(map[string]interface{})
if !ok { if !ok {
return nil, fmt.Errorf("profilingConfig.config.deployment must be an object, got %T", deploymentVal) return nil, fmt.Errorf("profilingConfig.config.%s must be an object, got %T", ConfigKeyDeployment, deploymentVal)
} }
} }
if _, hasNamespace := deploymentConfig["namespace"]; !hasNamespace { if _, hasNamespace := deploymentConfig[ConfigKeyNamespace]; !hasNamespace {
deploymentConfig["namespace"] = dgdr.Namespace deploymentConfig[ConfigKeyNamespace] = dgdr.Namespace
} }
// Set deployment.model from spec.model // Set deployment.model from spec.model
deploymentConfig["model"] = dgdr.Spec.Model deploymentConfig[ConfigKeyModel] = dgdr.Spec.Model
// Set deployment.dgd_image from deploymentOverrides.workersImage if provided // Set deployment.dgd_image from deploymentOverrides.workersImage if provided
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.WorkersImage != "" { if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.WorkersImage != "" {
deploymentConfig["dgd_image"] = dgdr.Spec.DeploymentOverrides.WorkersImage deploymentConfig[ConfigKeyDGDImage] = dgdr.Spec.DeploymentOverrides.WorkersImage
} }
// Set output_dir if not already set // Set output_dir if not already set
if _, hasOutputDir := config["output_dir"]; !hasOutputDir { if _, hasOutputDir := config[ConfigKeyOutputDir]; !hasOutputDir {
config["output_dir"] = ProfilingOutputPath config[ConfigKeyOutputDir] = ProfilingOutputPath
} }
// Set engine.backend from spec.backend // Set engine.backend from spec.backend
engineVal, hasEngine := config["engine"] engineVal, hasEngine := config[ConfigKeyEngine]
var engineConfig map[string]interface{} var engineConfig map[string]interface{}
if !hasEngine || engineVal == nil { if !hasEngine || engineVal == nil {
engineConfig = make(map[string]interface{}) engineConfig = make(map[string]interface{})
config["engine"] = engineConfig config[ConfigKeyEngine] = engineConfig
} else { } else {
var ok bool var ok bool
engineConfig, ok = engineVal.(map[string]interface{}) engineConfig, ok = engineVal.(map[string]interface{})
if !ok { if !ok {
return nil, fmt.Errorf("profilingConfig.config.engine must be an object, got %T", engineVal) return nil, fmt.Errorf("profilingConfig.config.%s must be an object, got %T", ConfigKeyEngine, engineVal)
} }
} }
engineConfig["backend"] = dgdr.Spec.Backend engineConfig[ConfigKeyBackend] = dgdr.Spec.Backend
// If ConfigMapRef is provided, set engine.config path // If ConfigMapRef is provided, set engine.config path
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil { if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
engineConfig["config"] = fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile) engineConfig[ConfigKeyConfig] = fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile)
}
// User-specified values take precedence over auto-discovered values
if gpuInfo != nil {
hardwareVal, hasHardware := config["hardware"]
var hardwareConfig map[string]interface{}
if !hasHardware || hardwareVal == nil {
hardwareConfig = make(map[string]interface{})
config["hardware"] = hardwareConfig
} else {
var ok bool
hardwareConfig, ok = hardwareVal.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("profilingConfig.config.hardware must be an object, got %T", hardwareVal)
}
}
if _, hasNumGpus := hardwareConfig[ConfigKeyNumGpusPerNode]; !hasNumGpus {
hardwareConfig[ConfigKeyNumGpusPerNode] = gpuInfo.GPUsPerNode
}
if _, hasGpuModel := hardwareConfig[ConfigKeyGPUModel]; !hasGpuModel {
hardwareConfig[ConfigKeyGPUModel] = gpuInfo.Model
}
if _, hasGpuVram := hardwareConfig[ConfigKeyGPUVramMib]; !hasGpuVram {
hardwareConfig[ConfigKeyGPUVramMib] = gpuInfo.VRAMPerGPU
}
if gpuInfo.System != "" {
if _, hasSystem := hardwareConfig[ConfigKeySystem]; !hasSystem {
hardwareConfig[ConfigKeySystem] = gpuInfo.System
}
}
} }
// Serialize config to YAML for passing to profiler // Serialize config to YAML for passing to profiler
......
...@@ -55,6 +55,14 @@ func (m *MockRBACManager) EnsureServiceAccountWithRBAC(ctx context.Context, targ ...@@ -55,6 +55,14 @@ func (m *MockRBACManager) EnsureServiceAccountWithRBAC(ctx context.Context, targ
// Helper function to create JSON config for tests // Helper function to create JSON config for tests
func createTestConfig(config map[string]interface{}) *apiextensionsv1.JSON { func createTestConfig(config map[string]interface{}) *apiextensionsv1.JSON {
// Add default hardware config if not present to satisfy validation
if _, hasHardware := config["hardware"]; !hasHardware {
config["hardware"] = map[string]interface{}{
"numGpusPerNode": 8,
"gpuModel": "H100-SXM5-80GB",
"gpuVramMib": 81920,
}
}
jsonBytes, err := json.Marshal(config) jsonBytes, err := json.Marshal(config)
if err != nil { if err != nil {
panic(err) panic(err)
...@@ -114,10 +122,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -114,10 +122,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
"isl": 3000, "isl": 3000,
"osl": 5, "osl": 5,
}, },
"hardware": map[string]interface{}{
"min_num_gpus_per_engine": 1,
"max_num_gpus_per_engine": 8,
},
}), }),
}, },
}, },
...@@ -243,10 +247,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -243,10 +247,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
"isl": 3000, "isl": 3000,
"osl": 5, "osl": 5,
}, },
"hardware": map[string]interface{}{
"min_num_gpus_per_engine": 1,
"max_num_gpus_per_engine": 8,
},
}), }),
ConfigMapRef: &nvidiacomv1alpha1.ConfigMapKeySelector{ ConfigMapRef: &nvidiacomv1alpha1.ConfigMapKeySelector{
Name: "test-config", Name: "test-config",
...@@ -341,10 +341,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -341,10 +341,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
"isl": 3000, "isl": 3000,
"osl": 5, "osl": 5,
}, },
"hardware": map[string]interface{}{
"min_num_gpus_per_engine": 1,
"max_num_gpus_per_engine": 8,
},
"sweep": map[string]interface{}{ "sweep": map[string]interface{}{
"use_ai_configurator": true, "use_ai_configurator": true,
"aic_system": "h200_sxm", "aic_system": "h200_sxm",
...@@ -1236,10 +1232,6 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1236,10 +1232,6 @@ var _ = Describe("DGDR Error Handling", func() {
"isl": 3000, "isl": 3000,
"osl": 5, "osl": 5,
}, },
"hardware": map[string]interface{}{
"min_num_gpus_per_engine": 1,
"max_num_gpus_per_engine": 8,
},
}), }),
}, },
}, },
...@@ -1511,4 +1503,307 @@ spec: ...@@ -1511,4 +1503,307 @@ spec:
Expect(additionalResources[1].GetName()).Should(Equal("config2")) Expect(additionalResources[1].GetName()).Should(Equal("config2"))
}) })
}) })
Context("GPU Discovery Integration Tests", func() {
It("Should use GPU discovery when nodes have GPU labels", func() {
ctx := context.Background()
dgdrName := "test-dgdr-gpu-discovery"
namespace := defaultNamespace
// Create a node with GPU labels (simulating GFD labels)
gpuNode := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-worker-1",
Labels: map[string]string{
"nvidia.com/gpu.count": "8",
"nvidia.com/gpu.product": "H100-SXM5-80GB",
"nvidia.com/gpu.memory": "81920",
},
},
}
Expect(k8sClient.Create(ctx, gpuNode)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, gpuNode) }()
// Create DGDR WITHOUT hardware config (should use GPU discovery)
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {"minNumGpusPerEngine": 1, "maxNumGpusPerEngine": 8}
}`),
},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile - should succeed with GPU discovery
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: dgdrName,
Namespace: namespace,
},
})
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (validation passed)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
})
It("Should respect manual hardware config over GPU discovery", func() {
ctx := context.Background()
dgdrName := "test-dgdr-manual-override"
namespace := defaultNamespace
// Create a node with H100 GPUs
gpuNode := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-worker-h100",
Labels: map[string]string{
"nvidia.com/gpu.count": "8",
"nvidia.com/gpu.product": "H100-SXM5-80GB",
"nvidia.com/gpu.memory": "81920",
},
},
}
Expect(k8sClient.Create(ctx, gpuNode)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, gpuNode) }()
// Create DGDR WITH manual hardware config (A100, not H100)
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"hardware": {
"numGpusPerNode": 4,
"gpuModel": "A100-SXM4-40GB",
"gpuVramMib": 40960,
"system": "a100_sxm"
}
}`),
},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile - should succeed and use manual config
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: dgdrName,
Namespace: namespace,
},
})
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (validation passed with manual config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
})
It("Should succeed with GPU discovery when cluster has GPU nodes", func() {
ctx := context.Background()
dgdrName := "test-dgdr-with-autodiscovery"
namespace := defaultNamespace
// Create a GPU node so GPU discovery can succeed
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-worker-autodiscovery",
Labels: map[string]string{
"nvidia.com/gpu.count": "8",
"nvidia.com/gpu.product": "H100-SXM5-80GB",
"nvidia.com/gpu.memory": "81920",
},
},
}
Expect(k8sClient.Create(ctx, node)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, node) }()
// Create DGDR WITHOUT hardware config - should use GPU discovery
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0}
}`),
},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile - should succeed with GPU discovery
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: dgdrName,
Namespace: namespace,
},
})
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
})
It("Should pass validation with explicit GPU ranges without GPU discovery", func() {
ctx := context.Background()
dgdrName := "test-dgdr-explicit-ranges"
namespace := defaultNamespace
// Intentionally don't create GPU nodes to test that explicit ranges work without GPU discovery
// Create DGDR with explicit minNumGpusPerEngine/maxNumGpusPerEngine
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {
"minNumGpusPerEngine": 2,
"maxNumGpusPerEngine": 4
},
"hardware": {
"numGpusPerNode": 8
}
}`),
},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile - should succeed (explicit ranges + minimal hardware bypass GPU discovery requirement)
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: dgdrName,
Namespace: namespace,
},
})
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
})
It("Should use GPU discovery with heterogeneous nodes (picks best)", func() {
ctx := context.Background()
dgdrName := "test-dgdr-heterogeneous"
namespace := defaultNamespace
// Create nodes with different GPU configs
nodeA100 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-worker-a100",
Labels: map[string]string{
"nvidia.com/gpu.count": "4",
"nvidia.com/gpu.product": "A100-SXM4-40GB",
"nvidia.com/gpu.memory": "40960",
},
},
}
nodeH100 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-worker-h100",
Labels: map[string]string{
"nvidia.com/gpu.count": "8",
"nvidia.com/gpu.product": "H100-SXM5-80GB",
"nvidia.com/gpu.memory": "81920",
},
},
}
Expect(k8sClient.Create(ctx, nodeA100)).Should(Succeed())
Expect(k8sClient.Create(ctx, nodeH100)).Should(Succeed())
defer func() {
_ = k8sClient.Delete(ctx, nodeA100)
_ = k8sClient.Delete(ctx, nodeH100)
}()
// Create DGDR without hardware config
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {"minNumGpusPerEngine": 1, "maxNumGpusPerEngine": 8}
}`),
},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile - should pick H100 (8 GPUs > 4 GPUs)
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: dgdrName,
Namespace: namespace,
},
})
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (using H100 config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
})
})
}) })
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gpu
import (
"context"
"fmt"
"strconv"
"strings"
corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)
const (
// NVIDIA GPU Feature Discovery (GFD) label keys
LabelGPUCount = "nvidia.com/gpu.count"
LabelGPUProduct = "nvidia.com/gpu.product"
LabelGPUMemory = "nvidia.com/gpu.memory"
)
// GPUInfo contains discovered GPU configuration from cluster nodes
type GPUInfo struct {
GPUsPerNode int // Maximum GPUs per node found in the cluster
Model string // GPU product name (e.g., "H100-SXM5-80GB")
VRAMPerGPU int // VRAM in MiB per GPU
System string // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
}
// DiscoverGPUs queries Kubernetes nodes to determine GPU configuration.
// It extracts GPU information from NVIDIA GPU Feature Discovery (GFD) labels
// and returns aggregated GPU info, preferring nodes with higher GPU count,
// then higher VRAM if counts are equal.
//
// This function requires cluster-wide node read permissions and expects nodes
// to have GFD labels. If no nodes with GPU labels are found, it returns an error.
func DiscoverGPUs(ctx context.Context, k8sClient client.Client) (*GPUInfo, error) {
logger := log.FromContext(ctx)
logger.Info("Starting GPU discovery from cluster nodes")
// List all nodes in the cluster
nodeList := &corev1.NodeList{}
if err := k8sClient.List(ctx, nodeList); err != nil {
return nil, fmt.Errorf("failed to list cluster nodes: %w", err)
}
if len(nodeList.Items) == 0 {
return nil, fmt.Errorf("no nodes found in cluster")
}
logger.Info("Found cluster nodes", "count", len(nodeList.Items))
// Track the best GPU configuration found
var bestGPUInfo *GPUInfo
nodesWithGPUs := 0
for i := range nodeList.Items {
node := &nodeList.Items[i]
gpuInfo, err := extractGPUInfoFromNode(node)
if err != nil {
// Node doesn't have GPU labels or has invalid labels, skip it
logger.V(1).Info("Skipping node without valid GPU info",
"node", node.Name,
"reason", err.Error())
continue
}
nodesWithGPUs++
logger.Info("Found GPU node",
"node", node.Name,
"gpus", gpuInfo.GPUsPerNode,
"model", gpuInfo.Model,
"vram", gpuInfo.VRAMPerGPU)
// Select best configuration: prefer higher GPU count, then higher VRAM
if bestGPUInfo == nil ||
gpuInfo.GPUsPerNode > bestGPUInfo.GPUsPerNode ||
(gpuInfo.GPUsPerNode == bestGPUInfo.GPUsPerNode && gpuInfo.VRAMPerGPU > bestGPUInfo.VRAMPerGPU) {
bestGPUInfo = gpuInfo
}
}
if bestGPUInfo == nil {
return nil, fmt.Errorf("no nodes with NVIDIA GPU Feature Discovery labels found (checked %d nodes). "+
"Ensure GPU nodes have labels: %s, %s, %s",
len(nodeList.Items), LabelGPUCount, LabelGPUProduct, LabelGPUMemory)
}
// Infer hardware system from GPU model
bestGPUInfo.System = InferHardwareSystem(bestGPUInfo.Model)
logger.Info("GPU discovery completed",
"gpusPerNode", bestGPUInfo.GPUsPerNode,
"model", bestGPUInfo.Model,
"vram", bestGPUInfo.VRAMPerGPU,
"system", bestGPUInfo.System,
"nodesWithGPUs", nodesWithGPUs)
return bestGPUInfo, nil
}
// extractGPUInfoFromNode extracts GPU information from a single node's labels.
// Returns error if required labels are missing or invalid.
func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
labels := node.Labels
if labels == nil {
return nil, fmt.Errorf("node has no labels")
}
gpuCountStr, ok := labels[LabelGPUCount]
if !ok {
return nil, fmt.Errorf("missing label %s", LabelGPUCount)
}
gpuCount, err := strconv.Atoi(gpuCountStr)
if err != nil || gpuCount <= 0 {
return nil, fmt.Errorf("invalid GPU count: %s", gpuCountStr)
}
gpuModel, ok := labels[LabelGPUProduct]
if !ok || gpuModel == "" {
return nil, fmt.Errorf("missing or empty label %s", LabelGPUProduct)
}
// Extract VRAM (memory in MiB)
gpuMemoryStr, ok := labels[LabelGPUMemory]
if !ok {
return nil, fmt.Errorf("missing label %s", LabelGPUMemory)
}
gpuMemory, err := strconv.Atoi(gpuMemoryStr)
if err != nil || gpuMemory <= 0 {
return nil, fmt.Errorf("invalid GPU memory: %s", gpuMemoryStr)
}
return &GPUInfo{
GPUsPerNode: gpuCount,
Model: gpuModel,
VRAMPerGPU: gpuMemory,
}, nil
}
// InferHardwareSystem maps GPU product name to hardware system identifier.
// Returns empty string if the GPU model cannot be confidently mapped.
//
// This is a best-effort mapping based on common NVIDIA datacenter GPU naming patterns.
// The system identifier is used by the profiler for performance estimation and configuration.
//
// Limitations:
// - Cannot distinguish SXM vs. PCIe variants from labels alone (assumes SXM for datacenter GPUs)
// - New GPU models require code updates (gracefully returns empty string)
// - Non-standard SKU names may not match
//
// Users can manually override the system in their profiling config (hardware.system)
// if auto-detection is incorrect or unavailable.
func InferHardwareSystem(gpuProduct string) string {
if gpuProduct == "" {
return ""
}
// Normalize: uppercase, remove spaces/dashes for pattern matching
normalized := strings.ToUpper(strings.ReplaceAll(gpuProduct, "-", ""))
normalized = strings.ReplaceAll(normalized, " ", "")
// Map common NVIDIA datacenter GPU products to hardware system identifiers
patterns := []struct {
pattern string
system string
}{
{"GB200", "gb200_sxm"},
{"H200", "h200_sxm"},
{"H100", "h100_sxm"},
{"B200", "b200_sxm"},
{"A100", "a100_sxm"},
{"L40S", "l40s"},
}
for _, p := range patterns {
if strings.Contains(normalized, p.pattern) {
return p.system
}
}
// Unknown GPU type, return empty string
// User must specify system manually in profiling config (hardware.system)
return ""
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gpu
import (
"context"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)
// newFakeClient creates a fake Kubernetes client with the given objects
func newFakeClient(objs ...client.Object) client.Client {
scheme := runtime.NewScheme()
_ = corev1.AddToScheme(scheme)
return fake.NewClientBuilder().
WithScheme(scheme).
WithObjects(objs...).
Build()
}
func TestDiscoverGPUs_SingleNode(t *testing.T) {
ctx := context.Background()
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-1",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
k8sClient := newFakeClient(node)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
require.NoError(t, err)
require.NotNil(t, gpuInfo)
assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
assert.Equal(t, "h100_sxm", gpuInfo.System)
}
func TestDiscoverGPUs_MultipleNodesHomogeneous(t *testing.T) {
ctx := context.Background()
// Multiple nodes with same GPU configuration
node1 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-1",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
node2 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-2",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
k8sClient := newFakeClient(node1, node2)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
require.NoError(t, err)
require.NotNil(t, gpuInfo)
assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
}
func TestDiscoverGPUs_MultipleNodesHeterogeneous_HigherGPUCountWins(t *testing.T) {
ctx := context.Background()
// Node with fewer GPUs
node1 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-1",
Labels: map[string]string{
LabelGPUCount: "4",
LabelGPUProduct: "A100-SXM4-40GB",
LabelGPUMemory: "40960",
},
},
}
// Node with more GPUs (should win)
node2 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-2",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
k8sClient := newFakeClient(node1, node2)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
require.NoError(t, err)
require.NotNil(t, gpuInfo)
// Should prefer node with 8 GPUs over node with 4 GPUs
assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
}
func TestDiscoverGPUs_MultipleNodesHeterogeneous_HigherVRAMWins(t *testing.T) {
ctx := context.Background()
// Node with same GPU count but less VRAM
node1 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-1",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "A100-SXM4-40GB",
LabelGPUMemory: "40960",
},
},
}
// Node with same GPU count but more VRAM (should win)
node2 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-2",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
k8sClient := newFakeClient(node1, node2)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
require.NoError(t, err)
require.NotNil(t, gpuInfo)
// Should prefer node with higher VRAM when GPU count is equal
assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
}
func TestDiscoverGPUs_MixedNodesWithAndWithoutGPUs(t *testing.T) {
ctx := context.Background()
// CPU-only node (no GPU labels)
cpuNode := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "cpu-node-1",
Labels: map[string]string{},
},
}
// GPU node
gpuNode := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-1",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
k8sClient := newFakeClient(cpuNode, gpuNode)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
require.NoError(t, err)
require.NotNil(t, gpuInfo)
// Should find the GPU node and ignore CPU-only node
assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
}
func TestDiscoverGPUs_NoNodes(t *testing.T) {
ctx := context.Background()
k8sClient := newFakeClient() // Empty cluster
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
assert.Error(t, err)
assert.Nil(t, gpuInfo)
assert.Contains(t, err.Error(), "no nodes found")
}
func TestDiscoverGPUs_NoGPUNodes(t *testing.T) {
ctx := context.Background()
// Only CPU nodes
cpuNode1 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "cpu-node-1",
Labels: map[string]string{},
},
}
cpuNode2 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "cpu-node-2",
Labels: map[string]string{
"node-type": "cpu-only",
},
},
}
k8sClient := newFakeClient(cpuNode1, cpuNode2)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
assert.Error(t, err)
assert.Nil(t, gpuInfo)
assert.Contains(t, err.Error(), "no nodes with NVIDIA GPU Feature Discovery labels found")
}
func TestExtractGPUInfoFromNode_MissingLabels(t *testing.T) {
tests := []struct {
name string
labels map[string]string
expectError bool
errorMsg string
}{
{
name: "missing GPU count",
labels: map[string]string{LabelGPUProduct: "H100", LabelGPUMemory: "80000"},
expectError: true,
errorMsg: LabelGPUCount,
},
{
name: "missing GPU product",
labels: map[string]string{LabelGPUCount: "8", LabelGPUMemory: "80000"},
expectError: true,
errorMsg: LabelGPUProduct,
},
{
name: "missing GPU memory",
labels: map[string]string{LabelGPUCount: "8", LabelGPUProduct: "H100"},
expectError: true,
errorMsg: LabelGPUMemory,
},
{
name: "invalid GPU count",
labels: map[string]string{LabelGPUCount: "invalid", LabelGPUProduct: "H100", LabelGPUMemory: "80000"},
expectError: true,
errorMsg: "invalid GPU count",
},
{
name: "invalid GPU memory",
labels: map[string]string{LabelGPUCount: "8", LabelGPUProduct: "H100", LabelGPUMemory: "invalid"},
expectError: true,
errorMsg: "invalid GPU memory",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
Labels: tt.labels,
},
}
gpuInfo, err := extractGPUInfoFromNode(node)
if tt.expectError {
assert.Error(t, err)
assert.Nil(t, gpuInfo)
if tt.errorMsg != "" {
assert.Contains(t, err.Error(), tt.errorMsg)
}
} else {
assert.NoError(t, err)
assert.NotNil(t, gpuInfo)
}
})
}
}
func TestInferHardwareSystem(t *testing.T) {
tests := []struct {
gpuProduct string
expectedSystem string
description string
}{
{"H100-SXM5-80GB", "h100_sxm", "H100 SXM variant"},
{"H100-PCIE-80GB", "h100_sxm", "H100 PCIe variant (mapped to SXM)"},
{"H200-SXM5-141GB", "h200_sxm", "H200 SXM variant"},
{"A100-SXM4-40GB", "a100_sxm", "A100 SXM variant"},
{"A100-PCIE-80GB", "a100_sxm", "A100 PCIe variant (mapped to SXM)"},
{"L40S", "l40s", "L40S"},
{"NVIDIA L40S", "l40s", "L40S with prefix"},
{"B200-SXM", "b200_sxm", "B200 SXM"},
{"GB200", "gb200_sxm", "GB200"},
{"Tesla V100-SXM2-16GB", "", "V100 (not in mapping)"},
{"RTX 4090", "", "Consumer GPU (not in mapping)"},
{"Unknown-GPU", "", "Unknown GPU"},
{"", "", "Empty string"},
}
for _, tt := range tests {
t.Run(tt.description, func(t *testing.T) {
result := InferHardwareSystem(tt.gpuProduct)
assert.Equal(t, tt.expectedSystem, result, "Failed for GPU: %s", tt.gpuProduct)
})
}
}
func TestInferHardwareSystem_CaseInsensitive(t *testing.T) {
// Test that inference is case-insensitive
variants := []string{
"h100-sxm5-80gb",
"H100-SXM5-80GB",
"H100-sxm5-80GB",
"h100-SXM5-80gb",
}
for _, variant := range variants {
result := InferHardwareSystem(variant)
assert.Equal(t, "h100_sxm", result, "Should handle case variations: %s", variant)
}
}
func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {
// Test that spaces and dashes are normalized
variants := []string{
"H100-SXM5-80GB",
"H100 SXM5 80GB",
"H100SXM580GB",
"H100-SXM5 80GB",
}
for _, variant := range variants {
result := InferHardwareSystem(variant)
assert.Equal(t, "h100_sxm", result, "Should normalize spaces/dashes: %s", variant)
}
}
...@@ -26,6 +26,19 @@ import ( ...@@ -26,6 +26,19 @@ import (
"sigs.k8s.io/controller-runtime/pkg/webhook/admission" "sigs.k8s.io/controller-runtime/pkg/webhook/admission"
) )
// toFloat64 converts a numeric value (int or float64) to float64.
// Returns 0 if the value is neither int nor float64.
func toFloat64(val interface{}) float64 {
switch v := val.(type) {
case float64:
return v
case int:
return float64(v)
default:
return 0
}
}
// DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources. // DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
// This validator can be used by both webhooks and controllers for consistent validation. // This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentRequestValidator struct { type DynamoGraphDeploymentRequestValidator struct {
...@@ -48,6 +61,11 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, ...@@ -48,6 +61,11 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
var warnings admission.Warnings var warnings admission.Warnings
var err error var err error
// Warn about deprecated enableGpuDiscovery field
if v.request.Spec.EnableGPUDiscovery != nil {
warnings = append(warnings, "spec.enableGpuDiscovery is deprecated and will be removed in v1beta1. GPU discovery is now always attempted automatically. This field has no effect.")
}
// Validate profiler image is specified // Validate profiler image is specified
if v.request.Spec.ProfilingConfig.ProfilerImage == "" { if v.request.Spec.ProfilingConfig.ProfilerImage == "" {
err = errors.Join(err, errors.New("spec.profilingConfig.profilerImage is required")) err = errors.Join(err, errors.New("spec.profilingConfig.profilerImage is required"))
...@@ -58,10 +76,8 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, ...@@ -58,10 +76,8 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
err = errors.Join(err, errors.New("spec.profilingConfig.config is required and must not be empty")) err = errors.Join(err, errors.New("spec.profilingConfig.config is required and must not be empty"))
} }
// Validate enableGpuDiscovery is only true for cluster-wide operators // Note: GPU discovery is now automatic for cluster-wide operators
if v.request.Spec.EnableGpuDiscovery && !v.isClusterWideOperator { // Namespace-restricted operators automatically skip GPU discovery and require manual hardware config
err = errors.Join(err, errors.New("spec.enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in spec.profilingConfig.config"))
}
// Parse config to validate structure (only if config is present) // Parse config to validate structure (only if config is present)
if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 { if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 {
...@@ -83,9 +99,101 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, ...@@ -83,9 +99,101 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
} }
} }
// Validate GPU hardware information is available (last, so other errors are collected first)
if gpuErr := v.validateGPUHardwareInfo(); gpuErr != nil {
err = errors.Join(err, gpuErr)
}
return warnings, err return warnings, err
} }
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// This validation happens at admission time to fail fast before the DGDR is persisted to etcd.
func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
// Parse profiling config
var config map[string]interface{}
if v.request.Spec.ProfilingConfig.Config != nil {
if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught by other validators
return nil
}
} else {
config = make(map[string]interface{})
}
// Check if manual hardware config is provided
hardwareVal, hasHardware := config["hardware"]
var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil {
if hardwareConfig, ok := hardwareVal.(map[string]interface{}); ok {
// Check if essential hardware fields are provided
_, hasGPUModel := hardwareConfig["gpuModel"]
_, hasGPUVram := hardwareConfig["gpuVramMib"]
_, hasNumGPUs := hardwareConfig["numGpusPerNode"]
hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
}
}
// Check if explicit GPU ranges are provided
var hasExplicitGPURanges bool
if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]interface{}); ok {
minGPUs, hasMin := engineConfig["minNumGpusPerEngine"]
maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"]
// Validate explicit GPU ranges
if hasMin && hasMax {
minVal := toFloat64(minGPUs)
maxVal := toFloat64(maxGPUs)
// Validate that min <= max
if minVal > maxVal {
return fmt.Errorf("invalid GPU range: minNumGpusPerEngine (%v) cannot be greater than maxNumGpusPerEngine (%v)",
minVal, maxVal)
}
hasExplicitGPURanges = minVal > 0 && maxVal > 0
}
}
}
// If manual config or explicit ranges provided, validation passes
if hasManualHardwareConfig || hasExplicitGPURanges {
return nil
}
// Neither manual config nor explicit ranges provided
// GPU discovery will be attempted at reconcile time, but if it's unavailable
// (e.g., namespace-scoped operator), the DGDR will fail
//
// Fail at admission time to give users immediate feedback
if v.isClusterWideOperator {
// Cluster-wide operator should have GPU discovery available
// Allow DGDR to be created - GPU discovery will provide hardware info
return nil
}
// Namespace-scoped operator likely doesn't have node read permissions
// Require manual hardware config or explicit GPU ranges
return errors.New(`GPU hardware configuration required for namespace-scoped operators.
Namespace-scoped operators typically lack node read permissions for GPU auto-discovery.
Provide hardware configuration in one of these ways:
1. Add hardware config in spec.profilingConfig.config:
hardware:
numGpusPerNode: 8
gpuModel: "H100-SXM5-80GB"
gpuVramMib: 81920
2. Or specify explicit GPU search ranges:
engine:
minNumGpusPerEngine: 2
maxNumGpusPerEngine: 8
See: https://github.com/ai-dynamo/dynamo/issues/6257`)
}
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest. // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
// Returns warnings and error. // Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) { func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment