Unverified Commit d56439ec authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with...


feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with automatic injection (#6224)
Signed-off-by: default avatarHannah Zhang <hannahz@nvidia.com>
parent 233a1e9a
aiconfigurator @ 7c08d2f2
Subproject commit 7c08d2f2c4e289afe49f48e56d392a7d7221155d
......@@ -17,9 +17,7 @@ spec:
# NOTE: any image built before January 10 and any release prior to 0.8.1
# will need to use snake_case within profilingConfig.config
config:
sweep:
useAiConfigurator: true
aicSystem: h200_sxm
searchStrategy: rapid
sla:
isl: 3000
osl: 150
......
......@@ -17,8 +17,7 @@ spec:
# NOTE: any image built before January 10 and any release prior to 0.8.1
# will need to use snake_case within profilingConfig.config
config:
sweep:
useAiConfigurator: false
searchStrategy: thorough
sla:
isl: 3000
osl: 150
......
......@@ -24,14 +24,12 @@ spec:
pvcName: "model-cache" # Name of PVC containing model weights
pvcPath: "deepseek-r1" # Subpath within PVC where model is stored
sweep:
useAiConfigurator: false
searchStrategy: rapid
hardware:
# for h200, sweep over 8-16 GPUs per engine
minNumGpusPerEngine: 8
maxNumGpusPerEngine: 16
numGpusPerNode: 8
numGpusPerNode: 8 # Override auto-discovered value if different
sla:
isl: 3000
......
......@@ -38,7 +38,7 @@ from dynamo.profiler.utils.config_modifiers.parallelization_mapping import (
apply_parallel_mapping_to_config,
get_candidate_parallel_mappings,
)
from dynamo.profiler.utils.defaults import EngineType
from dynamo.profiler.utils.defaults import EngineType, SearchStrategy
from dynamo.profiler.utils.dgd_generation import generate_dgd_config_with_planner
from dynamo.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from dynamo.profiler.utils.plot import (
......@@ -140,10 +140,6 @@ async def run_profile(args):
# Clear any errors from previous profiling runs
clear_profiling_errors()
# Inherit aic_backend from backend if not explicitly set
if not args.aic_backend:
args.aic_backend = args.backend
# Write initial status for external jobs to monitor
os.makedirs(args.output_dir, exist_ok=True)
write_profiler_status(
......@@ -197,36 +193,28 @@ async def run_profile(args):
f"Using minimum of user-provided and model's maximum context length: {sweep_max_context_length}"
)
if args.use_ai_configurator:
if not args.aic_system:
# Initialize AI Configurator estimator (only used when search_strategy == SearchStrategy.RAPID)
ai_configurator_perf_estimator: AIConfiguratorPerfEstimator | None = None
if args.search_strategy == SearchStrategy.RAPID:
# Use AI Configurator for rapid estimation
if not args.system:
raise ValueError(
"Must provide --aic-system when using --use-ai-configurator."
"Must provide --system (hardware system, e.g. h100_sxm) when using rapid search strategy."
)
# Fallback to args.model if aic_hf_id is not provided
if not args.aic_hf_id:
if args.model:
logger.info(
f"--aic-hf-id not provided, using --model ({args.model}) as HuggingFace ID for AI configurator"
)
args.aic_hf_id = args.model
else:
if not args.model:
raise ValueError(
"Must provide --aic-hf-id or --model when using --use-ai-configurator."
"Must provide --model (HuggingFace ID) when using rapid search strategy."
)
logger.info("Using aiconfigurator to estimate performance...")
ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
args.aic_hf_id,
args.aic_system.lower(),
args.aic_backend,
args.aic_backend_version,
logger.info(
"Using AI Configurator to estimate performance (rapid strategy)..."
)
else:
if args.aic_system or args.aic_hf_id or args.aic_backend_version:
logger.warning(
"Ignoring --aic-system, --aic-hf-id, and/or --backend-version "
"when not using --use-ai-configurator."
ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
hf_id=args.model,
system=args.system.lower(),
backend=args.backend,
)
# first profile prefill
......@@ -272,7 +260,10 @@ async def run_profile(args):
ttft = None
if args.dry_run:
logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator:
elif (
args.search_strategy == SearchStrategy.RAPID
and ai_configurator_perf_estimator
):
logger.info("Using ai-configurator to estimate prefill latency")
perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf(
args.isl,
......@@ -395,7 +386,10 @@ async def run_profile(args):
if args.dry_run:
logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator:
elif (
args.search_strategy == SearchStrategy.RAPID
and ai_configurator_perf_estimator
):
# Compute max_concurrency and max_kv_tokens to know which
# num_request to sweep over.
max_concurrency = ai_configurator_perf_estimator.get_max_batch_size(
......@@ -467,7 +461,10 @@ async def run_profile(args):
for num_request in sweep_num_request:
itl = thpt_per_gpu = None
if args.use_ai_configurator:
if (
args.search_strategy == SearchStrategy.RAPID
and ai_configurator_perf_estimator
):
logger.info(
"Using ai-configurator to estimate decode latency."
)
......@@ -511,7 +508,10 @@ async def run_profile(args):
parallel_mapping=mapping,
)
if not args.dry_run and not args.use_ai_configurator:
if (
not args.dry_run
and not args.search_strategy == SearchStrategy.RAPID
):
logger.info("Cleaning up deployment...")
await client.delete_deployment()
deployment_clients.remove(client)
......@@ -642,7 +642,10 @@ async def run_profile(args):
if args.dry_run:
logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator:
elif (
args.search_strategy == SearchStrategy.RAPID
and ai_configurator_perf_estimator
):
profile_prefill_aiconfigurator(
work_dir,
best_prefill_gpus, # num_gpus
......@@ -728,7 +731,10 @@ async def run_profile(args):
if args.dry_run:
logger.info("Skipping deployment creation in dry run mode")
elif args.use_ai_configurator:
elif (
args.search_strategy == SearchStrategy.RAPID
and ai_configurator_perf_estimator
):
attention_dp_size = best_decode_mapping.get_attn_dp_size()
max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens(
args.isl, args.osl, tp_size=best_decode_mapping.get_tp_size()
......
......@@ -154,7 +154,7 @@ class BaseConfigModifier:
Raises:
ValueError: If neither --served-model-name nor model path arg is found
"""
model_name = None
model_name = ""
# Check for --served-model-name first (API model name)
for i, arg in enumerate(args):
if arg == cls.WORKER_SERVED_MODEL_NAME_ARG and i + 1 < len(args):
......@@ -162,14 +162,14 @@ class BaseConfigModifier:
break
# Check for backend-specific path argument
model_path = None
model_path = ""
for i, arg in enumerate(args):
if arg == cls.WORKER_MODEL_PATH_ARG and i + 1 < len(args):
model_path = args[i + 1]
break
# Require at least one to be specified
if model_name is None and model_path is None:
if not model_name and not model_path:
raise ValueError(
f"Cannot determine model: neither {cls.WORKER_MODEL_PATH_ARG} nor "
f"{cls.WORKER_SERVED_MODEL_NAME_ARG} found in worker configuration. "
......@@ -177,9 +177,9 @@ class BaseConfigModifier:
)
# If only one is specified, use it for both
if model_path is None:
if not model_path:
model_path = model_name
elif model_name is None:
elif not model_name:
model_name = model_path
return model_name, model_path
......
......@@ -49,3 +49,8 @@ DEFAULT_GPU_COST_PER_HOUR = 3.0 # Cost per GPU per hour in dollars
class EngineType(str, Enum):
PREFILL = "prefill"
DECODE = "decode"
class SearchStrategy(str, Enum):
RAPID = "rapid"
THOROUGH = "thorough"
......@@ -39,12 +39,10 @@ class AIConfiguratorPerfEstimator:
hf_id: str, # e.g. "Qwen/Qwen3-32B"
system: str, # e.g. "h200_sxm"
backend: str, # e.g. "trtllm"
version: str, # e.g. "0.20.0"
):
aiconfigurator = _try_import_aiconfigurator()
logger.info("Loading aiconfigurator database. This might take a few seconds...")
if not version:
version = aiconfigurator.sdk.perf_database.get_latest_database_version(
system,
backend,
......
......@@ -8,6 +8,7 @@ from typing import Any, Dict
import yaml
from dynamo.profiler.utils.defaults import SearchStrategy
from dynamo.profiler.utils.planner_utils import add_planner_arguments_to_parser
from dynamo.profiler.utils.search_space_autogen import auto_generate_search_space
......@@ -100,16 +101,14 @@ def create_profiler_parser() -> argparse.Namespace:
hardware:
minNumGpusPerEngine: Int (minimum number of GPUs per engine, default: 0)
maxNumGpusPerEngine: Int (maximum number of GPUs per engine, default: 0)
numGpusPerNode: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 0)
enableGpuDiscovery: Boolean (enable automatic GPU discovery from Kubernetes cluster nodes, when enabled overrides any manually specified hardware configuration, requires cluster-wide node access permissions, default: False)
numGpusPerNode: Int (number of GPUs per node, default: 0)
gpuModel: String (GPU model, used for auto-calculating search space, default: "")
gpuVramMib: Int (GPU VRAM in MiB, used for auto-calculating search space, default: 0)
system: String (target hardware system, e.g. h100_sxm, h200_sxm, default: None)
searchStrategy: String (search strategy for profiling: 'rapid' uses AI Configurator for quick estimation, 'thorough' runs actual deployments for comprehensive results, enum: [rapid, thorough], default: rapid)
sweep:
prefillInterpolationGranularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
decodeInterpolationGranularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
useAiConfigurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
aicSystem: String (target system for use with aiconfigurator, default: None)
aicHfId: String (aiconfigurator huggingface id of the target model, default: None)
aicBackend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
aicBackendVersion: String (specify backend version when using aiconfigurator to estimate perf, default: None)
dryRun: Boolean (dry run the profile job, default: False)
pickWithWebui: Boolean (pick the best parallelization mapping using webUI, default: False)
webuiPort: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
......@@ -226,7 +225,25 @@ def create_profiler_parser() -> argparse.Namespace:
"--num-gpus-per-node",
type=int,
default=_get(hardware_cfg, "numGpusPerNode", "num_gpus_per_node", 0),
help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
help="Number of GPUs per node",
)
parser.add_argument(
"--gpu-model",
type=str,
default=_get(hardware_cfg, "gpuModel", "gpu_model", ""),
help="GPU model name (used for auto-calculating search space)",
)
parser.add_argument(
"--gpu-vram-mib",
type=int,
default=_get(hardware_cfg, "gpuVramMib", "gpu_vram_mib", 0),
help="GPU VRAM in MiB (used for auto-calculating search space)",
)
parser.add_argument(
"--system",
type=str,
default=_get(hardware_cfg, "system", "system", None),
help="Target hardware system, e.g. h100_sxm, h200_sxm",
)
parser.add_argument(
"--isl",
......@@ -253,6 +270,17 @@ def create_profiler_parser() -> argparse.Namespace:
help="target Inter Token Latency (float, in milliseconds)",
)
# High-level profiling strategy argument
parser.add_argument(
"--search-strategy",
type=SearchStrategy,
default=SearchStrategy(
_get(config, "searchStrategy", "search_strategy", "rapid")
),
choices=list(SearchStrategy),
help="Search strategy for profiling: 'rapid' uses AI Configurator for quick estimation, 'thorough' runs actual deployments for comprehensive results",
)
# arguments used for interpolating TTFT and ITL under different ISL/OSL
engine_cfg = config.get("engine", {})
parser.add_argument(
......@@ -296,12 +324,6 @@ def create_profiler_parser() -> argparse.Namespace:
default=_get(sweep_cfg, "dryRun", "dry_run", False),
help="Dry run the profile job",
)
parser.add_argument(
"--enable-gpu-discovery",
action="store_true",
default=_get(hardware_cfg, "enableGpuDiscovery", "enable_gpu_discovery", False),
help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
)
parser.add_argument(
"--pick-with-webui",
action="store_true",
......@@ -332,38 +354,6 @@ def create_profiler_parser() -> argparse.Namespace:
}
parser.set_defaults(**normalized_planner_config)
# arguments if using aiconfigurator
parser.add_argument(
"--use-ai-configurator",
action="store_true",
default=_get(sweep_cfg, "useAiConfigurator", "use_ai_configurator", False),
help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
)
parser.add_argument(
"--aic-system",
type=str,
default=_get(sweep_cfg, "aicSystem", "aic_system", None),
help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
)
parser.add_argument(
"--aic-hf-id",
type=str,
default=_get(sweep_cfg, "aicHfId", "aic_hf_id", None),
help="aiconfigurator name of the target model (e.g. Qwen/Qwen3-32B, meta-llama/Llama-3.1-405B)",
)
parser.add_argument(
"--aic-backend",
type=str,
default=_get(sweep_cfg, "aicBackend", "aic_backend", ""),
help="aiconfigurator backend of the target model, if not provided, will use args.backend",
)
parser.add_argument(
"--aic-backend-version",
type=str,
default=_get(sweep_cfg, "aicBackendVersion", "aic_backend_version", None),
help="Specify backend version when using aiconfigurator to estimate perf.",
)
# Parse arguments
args = parser.parse_args()
......
......@@ -8,7 +8,6 @@ import os
import yaml
from deploy.utils.gpu_inventory import get_gpu_summary
from dynamo.profiler.utils.config_modifiers import CONFIG_MODIFIERS
from dynamo.profiler.utils.model_info import ModelInfo, get_model_info
......@@ -103,66 +102,79 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:
)
args.model_info = model_info
# now determine the search space
if args.enable_gpu_discovery:
if (
args.min_num_gpus_per_engine == 0
or args.max_num_gpus_per_engine == 0
or args.num_gpus_per_node == 0
):
if not args.model:
# TODO: get model info provided DGD config
error_msg = "No model provided, cannot auto-generate GPU search space. Please provide `--model` or GPU info"
logger.error(error_msg)
raise RuntimeError(error_msg)
# Determine the search space for profiling
# User-provided min/max values take precedence; auto-calculate missing bounds
# based on GPU hardware info and model size
user_specified_ranges = (
args.min_num_gpus_per_engine != 0 and args.max_num_gpus_per_engine != 0
)
logger.info("Getting GPU info from k8s cluster...")
gpu_info = get_gpu_summary()
if user_specified_ranges:
logger.info(
f"Cluster has {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node with {gpu_info['vram']} VRAM"
f"Using user-specified GPU search space: {args.min_num_gpus_per_engine} to {args.max_num_gpus_per_engine}"
)
# Ensure num_gpus_per_node is set (needed for multi-node configs)
if args.num_gpus_per_node == 0:
logger.warning("num_gpus_per_node not specified, setting to 8")
args.num_gpus_per_node = 8
else:
# Auto-calculate search space (honor partial user overrides)
# NOTE: will be handled in AIC
if args.num_gpus_per_node != 0 and args.gpu_vram_mib != 0:
# Have GPU hardware info - calculate based on model size
if not args.model:
error_msg = "No model provided, cannot auto-generate GPU search space. Please provide --model"
logger.error(error_msg)
raise RuntimeError(error_msg)
# model_info should be set by now (checked above), but mypy needs explicit verification
assert (
model_info is not None
), "model_info must be set when model is provided"
vram_mib = int(gpu_info["vram"]) # type: ignore[call-overload]
gpus_per_node = int(gpu_info["gpus_per_node"]) # type: ignore[call-overload]
logger.info(
f"Auto-generating search space: {args.num_gpus_per_node}x {args.gpu_model} GPUs with {args.gpu_vram_mib} MiB VRAM per GPU"
)
if args.system:
logger.info(f"Hardware system: {args.system}")
# Calculate minimum GPUs needed for model
min_gpu = math.ceil(
model_info.model_size / MODEL_GPU_MEM_FRAC_MAX / vram_mib
model_info.model_size / MODEL_GPU_MEM_FRAC_MAX / args.gpu_vram_mib
)
# Calculate maximum GPUs to profile
if not model_info.is_moe:
max_gpu = gpus_per_node
max_gpu = args.num_gpus_per_node
else:
max_gpu = max(min_gpu * MOE_MODEL_MAX_NUM_GPU_FACTOR, gpus_per_node)
if min_gpu > max_gpu:
error_msg = f"No valid GPU configuration found for model {args.model} on the cluster with {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node"
# MoE models can benefit from more GPUs
max_gpu = max(
min_gpu * MOE_MODEL_MAX_NUM_GPU_FACTOR, args.num_gpus_per_node
)
# Honor partial user overrides
final_min = args.min_num_gpus_per_engine or min_gpu
final_max = args.max_num_gpus_per_engine or max_gpu
# Validate final_min <= final_max
if final_min > final_max:
error_msg = f"Invalid GPU range: min_num_gpus_per_engine ({final_min}) > max_num_gpus_per_engine ({final_max})"
logger.error(error_msg)
raise RuntimeError(error_msg)
logger.info(
f"Auto-generated search space for model {args.model} on the cluster with {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node: {min_gpu} to {max_gpu}"
)
args.min_num_gpus_per_engine = min_gpu
args.max_num_gpus_per_engine = max_gpu
args.num_gpus_per_node = gpus_per_node # type: ignore[assignment]
# Clamp to valid range [1, args.num_gpus_per_node]
final_min = max(1, min(final_min, args.num_gpus_per_node))
final_max = max(1, min(final_max, args.num_gpus_per_node))
logger.info(f"Auto-generated search space: {final_min} to {final_max} GPUs")
args.min_num_gpus_per_engine = final_min
args.max_num_gpus_per_engine = final_max
else:
# use default values for GPUs
if args.min_num_gpus_per_engine == 0:
logger.warning(
"GPU discover is disabled and min_num_gpus_per_engine is not specified, setting to 1"
)
args.min_num_gpus_per_engine = 1
if args.max_num_gpus_per_engine == 0:
logger.warning(
"GPU discover is disabled and max_num_gpus_per_engine is not specified, setting to 4"
)
args.max_num_gpus_per_engine = 4
if args.num_gpus_per_node == 0:
logger.warning(
"GPU discover is disabled and num_gpus_per_node is not specified, setting to 8"
# No GPU info available - use defaults
logger.warning("GPU hardware info not available, using default values")
args.min_num_gpus_per_engine = args.min_num_gpus_per_engine or 1
args.max_num_gpus_per_engine = args.max_num_gpus_per_engine or 4
args.num_gpus_per_node = args.num_gpus_per_node or 8
logger.info(
f"Default search space: {args.min_num_gpus_per_engine} to {args.max_num_gpus_per_engine} GPUs, {args.num_gpus_per_node} GPUs per node"
)
args.num_gpus_per_node = 8
return
......@@ -140,13 +140,13 @@ spec:
type: string
type: object
enableGpuDiscovery:
default: false
default: true
description: |-
EnableGpuDiscovery controls whether the profiler should automatically discover GPU
resources from the Kubernetes cluster nodes. When enabled, the profiler will override
any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
numGpusPerNode) with values detected from the cluster.
Requires cluster-wide node access permissions - only available with cluster-scoped operators.
EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.
DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always
attempted automatically. Setting this field has no effect - the operator will always try to discover
GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped
operator without permissions), manual hardware configuration is required regardless of this setting.
type: boolean
model:
description: |-
......@@ -157,6 +157,12 @@ spec:
profilingConfig:
description: |-
ProfilingConfig provides the complete configuration for the profiling job.
Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes
cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),
discovered GPU configuration is used as defaults when hardware configuration is not manually
specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values
always take precedence over auto-discovered values. If GPU discovery fails (e.g.,
namespace-restricted operator without node permissions), manual hardware config is required.
This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
Note: deployment.model and engine.backend are automatically set from the high-level
......
......@@ -151,16 +151,13 @@ type DynamoGraphDeploymentRequestSpec struct {
// +kubebuilder:default=false
UseMocker bool `json:"useMocker,omitempty"`
// EnableGpuDiscovery controls whether the profiler should automatically discover GPU
// resources from the Kubernetes cluster nodes. When enabled, the profiler will override
// any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
// numGpusPerNode) with values detected from the cluster.
// Requires cluster-wide node access permissions - only available with cluster-scoped operators.
// +kubebuilder:default=false
// +kubebuilder:validation:Optional
EnableGpuDiscovery bool `json:"enableGpuDiscovery,omitempty"`
// ProfilingConfig provides the complete configuration for the profiling job.
// Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes
// cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),
// discovered GPU configuration is used as defaults when hardware configuration is not manually
// specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values
// always take precedence over auto-discovered values. If GPU discovery fails (e.g.,
// namespace-restricted operator without node permissions), manual hardware config is required.
// This configuration is passed directly to the profiler.
// The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
// Note: deployment.model and engine.backend are automatically set from the high-level
......@@ -168,6 +165,15 @@ type DynamoGraphDeploymentRequestSpec struct {
// +kubebuilder:validation:Required
ProfilingConfig ProfilingConfigSpec `json:"profilingConfig"`
// EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.
// DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always
// attempted automatically. Setting this field has no effect - the operator will always try to discover
// GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped
// operator without permissions), manual hardware configuration is required regardless of this setting.
// +optional
// +kubebuilder:default=true
EnableGPUDiscovery *bool `json:"enableGpuDiscovery,omitempty"`
// AutoApply indicates whether to automatically create a DynamoGraphDeployment
// after profiling completes. If false, only the spec is generated and stored in status.
// Users can then manually create a DGD using the generated spec.
......
......@@ -726,6 +726,11 @@ func (in *DynamoGraphDeploymentRequestList) DeepCopyObject() runtime.Object {
func (in *DynamoGraphDeploymentRequestSpec) DeepCopyInto(out *DynamoGraphDeploymentRequestSpec) {
*out = *in
in.ProfilingConfig.DeepCopyInto(&out.ProfilingConfig)
if in.EnableGPUDiscovery != nil {
in, out := &in.EnableGPUDiscovery, &out.EnableGPUDiscovery
*out = new(bool)
**out = **in
}
if in.DeploymentOverrides != nil {
in, out := &in.DeploymentOverrides, &out.DeploymentOverrides
*out = new(DeploymentOverridesSpec)
......
......@@ -140,13 +140,13 @@ spec:
type: string
type: object
enableGpuDiscovery:
default: false
default: true
description: |-
EnableGpuDiscovery controls whether the profiler should automatically discover GPU
resources from the Kubernetes cluster nodes. When enabled, the profiler will override
any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
numGpusPerNode) with values detected from the cluster.
Requires cluster-wide node access permissions - only available with cluster-scoped operators.
EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.
DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always
attempted automatically. Setting this field has no effect - the operator will always try to discover
GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped
operator without permissions), manual hardware configuration is required regardless of this setting.
type: boolean
model:
description: |-
......@@ -157,6 +157,12 @@ spec:
profilingConfig:
description: |-
ProfilingConfig provides the complete configuration for the profiling job.
Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes
cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),
discovered GPU configuration is used as defaults when hardware configuration is not manually
specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values
always take precedence over auto-discovered values. If GPU discovery fails (e.g.,
namespace-restricted operator without node permissions), manual hardware config is required.
This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
Note: deployment.model and engine.backend are automatically set from the high-level
......
......@@ -39,23 +39,24 @@ spec:
engine:
maxContextLength: 16384 # will override max context length of the model if provided
# Search strategy: 'rapid' for AI Configurator estimation (20-30s), 'thorough' for actual deployments (2-4h)
searchStrategy: thorough
# Hardware configuration
# Note: Operator auto-discovers GPU info from cluster nodes when available
hardware:
minNumGpusPerEngine: 1 # Minimum GPUs to test
maxNumGpusPerEngine: 4 # Maximum GPUs to test (limited by model's num_heads/4)
numGpusPerNode: 8 # GPUs per node (for MoE models)
maxNumGpusPerEngine: 4 # Maximum GPUs to test
numGpusPerNode: 8 # GPUs per node (optional - auto-discovered if not specified)
system: h200_sxm # Hardware system (optional - auto-detected if not specified)
# gpuModel: "H200-SXM" # GPU model (optional - auto-discovered)
# gpuVramMib: 141557 # GPU VRAM in MiB (optional - auto-discovered)
# Sweep/profiling configuration
sweep:
prefillInterpolationGranularity: 16 # Samples for TTFT interpolation
decodeInterpolationGranularity: 6 # Samples for ITL interpolation
# AI Configurator mode (fast simulation-based profiling, 20-30 seconds)
useAiConfigurator: false # Set to false for online profiling (2-4 hours)
aicSystem: h200_sxm # Target GPU system for AI Configurator
aicHfId: Qwen/Qwen3-0.6B # HuggingFace model ID for AI Configurator
aicBackendVersion: "0.20.0" # Backend version for AI Configurator
# SLA targets for profiling
sla:
isl: 3000 # Input sequence length
......
......@@ -47,6 +47,7 @@ import (
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
webhookvalidation "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook/validation"
)
......@@ -169,12 +170,26 @@ const (
BackendSGLang = "sglang"
BackendTRTLLM = "trtllm"
// Profiling config field names
// Profiling config field names for v1alpha1; note: will be removed in v1beta1
ConfigKeyDeployment = "deployment"
ConfigKeyModelCache = "modelCache"
ConfigKeyPVCName = "pvcName"
ConfigKeyPVCPath = "pvcPath"
ConfigKeyMountPath = "mountPath"
ConfigKeyHardware = "hardware"
ConfigKeyEngine = "engine"
ConfigKeyOutputDir = "output_dir"
ConfigKeyNumGpusPerNode = "numGpusPerNode"
ConfigKeyGPUModel = "gpuModel"
ConfigKeyGPUVramMib = "gpuVramMib"
ConfigKeySystem = "system"
ConfigKeyMinNumGpusPerEng = "minNumGpusPerEngine"
ConfigKeyMaxNumGpusPerEng = "maxNumGpusPerEngine"
ConfigKeyBackend = "backend"
ConfigKeyConfig = "config"
ConfigKeyNamespace = "namespace"
ConfigKeyModel = "model"
ConfigKeyDGDImage = "dgd_image"
)
// shell script template for the output copier sidecar
......@@ -946,10 +961,105 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
}
}
if err := r.validateGPUHardwareInfo(ctx, dgdr); err != nil {
return err
}
// The profiler will validate the rest of the configuration
return nil
}
// toFloat64 converts a numeric value (int or float64) to float64.
// Returns 0 if the value is neither int nor float64.
func toFloat64(val interface{}) float64 {
switch v := val.(type) {
case float64:
return v
case int:
return float64(v)
default:
return 0
}
}
// validateGPUHardwareInfo ensures GPU hardware information is available when required for profiling
func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx)
// Check for hardware info and GPU ranges
// TODO: will be cleaner once we swap to new DGDR schema (#6130)
var config map[string]interface{}
if dgdr.Spec.ProfilingConfig.Config != nil {
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught later, skip validation here
return nil
}
} else {
config = make(map[string]interface{})
}
hardwareVal, hasHardware := config[ConfigKeyHardware]
var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil {
if hardwareConfig, ok := hardwareVal.(map[string]interface{}); ok {
_, hasGPUModel := hardwareConfig[ConfigKeyGPUModel]
_, hasGPUVram := hardwareConfig[ConfigKeyGPUVramMib]
_, hasNumGPUs := hardwareConfig[ConfigKeyNumGpusPerNode]
hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
}
}
var hasExplicitGPURanges bool
if engineVal, hasEngine := config[ConfigKeyEngine]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]interface{}); ok {
minGPUs, hasMin := engineConfig[ConfigKeyMinNumGpusPerEng]
maxGPUs, hasMax := engineConfig[ConfigKeyMaxNumGpusPerEng]
if hasMin && hasMax {
minVal := toFloat64(minGPUs)
maxVal := toFloat64(maxGPUs)
// Validate that min <= max
if minVal > maxVal {
return fmt.Errorf("invalid GPU range: %s (%v) cannot be greater than %s (%v)",
ConfigKeyMinNumGpusPerEng, minVal, ConfigKeyMaxNumGpusPerEng, maxVal)
}
hasExplicitGPURanges = minVal > 0 && maxVal > 0
}
}
}
// If manual config or explicit ranges are provided, validation passes
if hasManualHardwareConfig || hasExplicitGPURanges {
return nil
}
_, err := gpu.DiscoverGPUs(ctx, r.Client)
if err == nil {
// GPU discovery is available, validation passes
return nil
}
logger.Info("GPU discovery not available", "reason", err.Error())
isNamespaceScoped := r.Config.RestrictedNamespace != ""
if isNamespaceScoped {
return fmt.Errorf(`GPU hardware info required but cannot be auto-discovered (namespace-scoped operator lacks node read permissions).
Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s.
See: https://github.com/ai-dynamo/dynamo/issues/6257`,
ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib,
ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng)
}
return fmt.Errorf(`GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s.
See profiling documentation for configuration details.`,
ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib,
ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng)
}
// createProfilingJob creates a Kubernetes Job for profiling using SyncResource
func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx)
......@@ -989,13 +1099,29 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}
}
// Run GPU discovery before creating job (cluster-wide and namespace-restricted operators if they have node read permissions)
var gpuInfo *gpu.GPUInfo
logger.Info("Attempting GPU discovery for profiling job")
discoveredInfo, err := gpu.DiscoverGPUs(ctx, r.Client)
if err != nil {
// This path is expected for namespace-restricted operators without node read permissions
logger.Info("GPU discovery not available, using manual hardware configuration from profiling config",
"reason", err.Error())
} else {
gpuInfo = discoveredInfo
logger.Info("GPU discovery completed successfully",
"gpusPerNode", gpuInfo.GPUsPerNode,
"model", gpuInfo.Model,
"vramMiB", gpuInfo.VRAMPerGPU,
"system", gpuInfo.System)
}
// Use SyncResource to create/update the job
modified, job, err := commonController.SyncResource(ctx, r, dgdr, func(ctx context.Context) (*batchv1.Job, bool, error) {
jobName := getProfilingJobName(dgdr)
outputConfigMapName := getOutputConfigMapName(dgdr)
// Parse and prepare profiling config
configYAML, err := r.prepareProfilingConfig(dgdr)
configYAML, err := r.prepareProfilingConfig(dgdr, gpuInfo)
if err != nil {
return nil, false, err
}
......@@ -1069,12 +1195,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
"--profile-config", string(configYAML),
}
// Add --enable-gpu-discovery flag based on DGDR spec
// GPU discovery requires cluster-wide node access
if dgdr.Spec.EnableGpuDiscovery {
profilerArgs = append(profilerArgs, "--enable-gpu-discovery")
}
// Use profiler image from profilingConfig
imageName := dgdr.Spec.ProfilingConfig.ProfilerImage
logger.Info("Using profiler image", "image", imageName)
......@@ -1250,7 +1370,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}
// prepareProfilingConfig parses and modifies the profiling config
func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) ([]byte, error) {
func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, gpuInfo *gpu.GPUInfo) ([]byte, error) {
// Parse the profiling config from JSON
var config map[string]interface{}
if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
......@@ -1258,53 +1378,84 @@ func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nv
}
// Set deployment.namespace if not already set
deploymentVal, hasDeployment := config["deployment"]
deploymentVal, hasDeployment := config[ConfigKeyDeployment]
var deploymentConfig map[string]interface{}
if !hasDeployment || deploymentVal == nil {
deploymentConfig = make(map[string]interface{})
config["deployment"] = deploymentConfig
config[ConfigKeyDeployment] = deploymentConfig
} else {
var ok bool
deploymentConfig, ok = deploymentVal.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("profilingConfig.config.deployment must be an object, got %T", deploymentVal)
return nil, fmt.Errorf("profilingConfig.config.%s must be an object, got %T", ConfigKeyDeployment, deploymentVal)
}
}
if _, hasNamespace := deploymentConfig["namespace"]; !hasNamespace {
deploymentConfig["namespace"] = dgdr.Namespace
if _, hasNamespace := deploymentConfig[ConfigKeyNamespace]; !hasNamespace {
deploymentConfig[ConfigKeyNamespace] = dgdr.Namespace
}
// Set deployment.model from spec.model
deploymentConfig["model"] = dgdr.Spec.Model
deploymentConfig[ConfigKeyModel] = dgdr.Spec.Model
// Set deployment.dgd_image from deploymentOverrides.workersImage if provided
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.WorkersImage != "" {
deploymentConfig["dgd_image"] = dgdr.Spec.DeploymentOverrides.WorkersImage
deploymentConfig[ConfigKeyDGDImage] = dgdr.Spec.DeploymentOverrides.WorkersImage
}
// Set output_dir if not already set
if _, hasOutputDir := config["output_dir"]; !hasOutputDir {
config["output_dir"] = ProfilingOutputPath
if _, hasOutputDir := config[ConfigKeyOutputDir]; !hasOutputDir {
config[ConfigKeyOutputDir] = ProfilingOutputPath
}
// Set engine.backend from spec.backend
engineVal, hasEngine := config["engine"]
engineVal, hasEngine := config[ConfigKeyEngine]
var engineConfig map[string]interface{}
if !hasEngine || engineVal == nil {
engineConfig = make(map[string]interface{})
config["engine"] = engineConfig
config[ConfigKeyEngine] = engineConfig
} else {
var ok bool
engineConfig, ok = engineVal.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("profilingConfig.config.engine must be an object, got %T", engineVal)
return nil, fmt.Errorf("profilingConfig.config.%s must be an object, got %T", ConfigKeyEngine, engineVal)
}
}
engineConfig["backend"] = dgdr.Spec.Backend
engineConfig[ConfigKeyBackend] = dgdr.Spec.Backend
// If ConfigMapRef is provided, set engine.config path
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
engineConfig["config"] = fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile)
engineConfig[ConfigKeyConfig] = fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile)
}
// User-specified values take precedence over auto-discovered values
if gpuInfo != nil {
hardwareVal, hasHardware := config["hardware"]
var hardwareConfig map[string]interface{}
if !hasHardware || hardwareVal == nil {
hardwareConfig = make(map[string]interface{})
config["hardware"] = hardwareConfig
} else {
var ok bool
hardwareConfig, ok = hardwareVal.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("profilingConfig.config.hardware must be an object, got %T", hardwareVal)
}
}
if _, hasNumGpus := hardwareConfig[ConfigKeyNumGpusPerNode]; !hasNumGpus {
hardwareConfig[ConfigKeyNumGpusPerNode] = gpuInfo.GPUsPerNode
}
if _, hasGpuModel := hardwareConfig[ConfigKeyGPUModel]; !hasGpuModel {
hardwareConfig[ConfigKeyGPUModel] = gpuInfo.Model
}
if _, hasGpuVram := hardwareConfig[ConfigKeyGPUVramMib]; !hasGpuVram {
hardwareConfig[ConfigKeyGPUVramMib] = gpuInfo.VRAMPerGPU
}
if gpuInfo.System != "" {
if _, hasSystem := hardwareConfig[ConfigKeySystem]; !hasSystem {
hardwareConfig[ConfigKeySystem] = gpuInfo.System
}
}
}
// Serialize config to YAML for passing to profiler
......
......@@ -55,6 +55,14 @@ func (m *MockRBACManager) EnsureServiceAccountWithRBAC(ctx context.Context, targ
// Helper function to create JSON config for tests
func createTestConfig(config map[string]interface{}) *apiextensionsv1.JSON {
// Add default hardware config if not present to satisfy validation
if _, hasHardware := config["hardware"]; !hasHardware {
config["hardware"] = map[string]interface{}{
"numGpusPerNode": 8,
"gpuModel": "H100-SXM5-80GB",
"gpuVramMib": 81920,
}
}
jsonBytes, err := json.Marshal(config)
if err != nil {
panic(err)
......@@ -114,10 +122,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
"isl": 3000,
"osl": 5,
},
"hardware": map[string]interface{}{
"min_num_gpus_per_engine": 1,
"max_num_gpus_per_engine": 8,
},
}),
},
},
......@@ -243,10 +247,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
"isl": 3000,
"osl": 5,
},
"hardware": map[string]interface{}{
"min_num_gpus_per_engine": 1,
"max_num_gpus_per_engine": 8,
},
}),
ConfigMapRef: &nvidiacomv1alpha1.ConfigMapKeySelector{
Name: "test-config",
......@@ -341,10 +341,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
"isl": 3000,
"osl": 5,
},
"hardware": map[string]interface{}{
"min_num_gpus_per_engine": 1,
"max_num_gpus_per_engine": 8,
},
"sweep": map[string]interface{}{
"use_ai_configurator": true,
"aic_system": "h200_sxm",
......@@ -1236,10 +1232,6 @@ var _ = Describe("DGDR Error Handling", func() {
"isl": 3000,
"osl": 5,
},
"hardware": map[string]interface{}{
"min_num_gpus_per_engine": 1,
"max_num_gpus_per_engine": 8,
},
}),
},
},
......@@ -1511,4 +1503,307 @@ spec:
Expect(additionalResources[1].GetName()).Should(Equal("config2"))
})
})
Context("GPU Discovery Integration Tests", func() {
It("Should use GPU discovery when nodes have GPU labels", func() {
ctx := context.Background()
dgdrName := "test-dgdr-gpu-discovery"
namespace := defaultNamespace
// Create a node with GPU labels (simulating GFD labels)
gpuNode := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-worker-1",
Labels: map[string]string{
"nvidia.com/gpu.count": "8",
"nvidia.com/gpu.product": "H100-SXM5-80GB",
"nvidia.com/gpu.memory": "81920",
},
},
}
Expect(k8sClient.Create(ctx, gpuNode)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, gpuNode) }()
// Create DGDR WITHOUT hardware config (should use GPU discovery)
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {"minNumGpusPerEngine": 1, "maxNumGpusPerEngine": 8}
}`),
},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile - should succeed with GPU discovery
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: dgdrName,
Namespace: namespace,
},
})
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (validation passed)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
})
It("Should respect manual hardware config over GPU discovery", func() {
ctx := context.Background()
dgdrName := "test-dgdr-manual-override"
namespace := defaultNamespace
// Create a node with H100 GPUs
gpuNode := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-worker-h100",
Labels: map[string]string{
"nvidia.com/gpu.count": "8",
"nvidia.com/gpu.product": "H100-SXM5-80GB",
"nvidia.com/gpu.memory": "81920",
},
},
}
Expect(k8sClient.Create(ctx, gpuNode)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, gpuNode) }()
// Create DGDR WITH manual hardware config (A100, not H100)
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"hardware": {
"numGpusPerNode": 4,
"gpuModel": "A100-SXM4-40GB",
"gpuVramMib": 40960,
"system": "a100_sxm"
}
}`),
},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile - should succeed and use manual config
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: dgdrName,
Namespace: namespace,
},
})
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (validation passed with manual config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
})
It("Should succeed with GPU discovery when cluster has GPU nodes", func() {
ctx := context.Background()
dgdrName := "test-dgdr-with-autodiscovery"
namespace := defaultNamespace
// Create a GPU node so GPU discovery can succeed
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-worker-autodiscovery",
Labels: map[string]string{
"nvidia.com/gpu.count": "8",
"nvidia.com/gpu.product": "H100-SXM5-80GB",
"nvidia.com/gpu.memory": "81920",
},
},
}
Expect(k8sClient.Create(ctx, node)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, node) }()
// Create DGDR WITHOUT hardware config - should use GPU discovery
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0}
}`),
},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile - should succeed with GPU discovery
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: dgdrName,
Namespace: namespace,
},
})
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
})
It("Should pass validation with explicit GPU ranges without GPU discovery", func() {
ctx := context.Background()
dgdrName := "test-dgdr-explicit-ranges"
namespace := defaultNamespace
// Intentionally don't create GPU nodes to test that explicit ranges work without GPU discovery
// Create DGDR with explicit minNumGpusPerEngine/maxNumGpusPerEngine
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {
"minNumGpusPerEngine": 2,
"maxNumGpusPerEngine": 4
},
"hardware": {
"numGpusPerNode": 8
}
}`),
},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile - should succeed (explicit ranges + minimal hardware bypass GPU discovery requirement)
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: dgdrName,
Namespace: namespace,
},
})
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
})
It("Should use GPU discovery with heterogeneous nodes (picks best)", func() {
ctx := context.Background()
dgdrName := "test-dgdr-heterogeneous"
namespace := defaultNamespace
// Create nodes with different GPU configs
nodeA100 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-worker-a100",
Labels: map[string]string{
"nvidia.com/gpu.count": "4",
"nvidia.com/gpu.product": "A100-SXM4-40GB",
"nvidia.com/gpu.memory": "40960",
},
},
}
nodeH100 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-worker-h100",
Labels: map[string]string{
"nvidia.com/gpu.count": "8",
"nvidia.com/gpu.product": "H100-SXM5-80GB",
"nvidia.com/gpu.memory": "81920",
},
},
}
Expect(k8sClient.Create(ctx, nodeA100)).Should(Succeed())
Expect(k8sClient.Create(ctx, nodeH100)).Should(Succeed())
defer func() {
_ = k8sClient.Delete(ctx, nodeA100)
_ = k8sClient.Delete(ctx, nodeH100)
}()
// Create DGDR without hardware config
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "test-profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(`{
"sla": {"ttft": 100.0, "itl": 1500.0},
"engine": {"minNumGpusPerEngine": 1, "maxNumGpusPerEngine": 8}
}`),
},
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile - should pick H100 (8 GPUs > 4 GPUs)
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: dgdrName,
Namespace: namespace,
},
})
Expect(err).NotTo(HaveOccurred())
// Should transition to Pending (using H100 config)
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
Expect(updated.Status.State).Should(Equal(DGDRStatePending))
})
})
})
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gpu
import (
"context"
"fmt"
"strconv"
"strings"
corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)
const (
// NVIDIA GPU Feature Discovery (GFD) label keys
LabelGPUCount = "nvidia.com/gpu.count"
LabelGPUProduct = "nvidia.com/gpu.product"
LabelGPUMemory = "nvidia.com/gpu.memory"
)
// GPUInfo contains discovered GPU configuration from cluster nodes
type GPUInfo struct {
GPUsPerNode int // Maximum GPUs per node found in the cluster
Model string // GPU product name (e.g., "H100-SXM5-80GB")
VRAMPerGPU int // VRAM in MiB per GPU
System string // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
}
// DiscoverGPUs queries Kubernetes nodes to determine GPU configuration.
// It extracts GPU information from NVIDIA GPU Feature Discovery (GFD) labels
// and returns aggregated GPU info, preferring nodes with higher GPU count,
// then higher VRAM if counts are equal.
//
// This function requires cluster-wide node read permissions and expects nodes
// to have GFD labels. If no nodes with GPU labels are found, it returns an error.
func DiscoverGPUs(ctx context.Context, k8sClient client.Client) (*GPUInfo, error) {
logger := log.FromContext(ctx)
logger.Info("Starting GPU discovery from cluster nodes")
// List all nodes in the cluster
nodeList := &corev1.NodeList{}
if err := k8sClient.List(ctx, nodeList); err != nil {
return nil, fmt.Errorf("failed to list cluster nodes: %w", err)
}
if len(nodeList.Items) == 0 {
return nil, fmt.Errorf("no nodes found in cluster")
}
logger.Info("Found cluster nodes", "count", len(nodeList.Items))
// Track the best GPU configuration found
var bestGPUInfo *GPUInfo
nodesWithGPUs := 0
for i := range nodeList.Items {
node := &nodeList.Items[i]
gpuInfo, err := extractGPUInfoFromNode(node)
if err != nil {
// Node doesn't have GPU labels or has invalid labels, skip it
logger.V(1).Info("Skipping node without valid GPU info",
"node", node.Name,
"reason", err.Error())
continue
}
nodesWithGPUs++
logger.Info("Found GPU node",
"node", node.Name,
"gpus", gpuInfo.GPUsPerNode,
"model", gpuInfo.Model,
"vram", gpuInfo.VRAMPerGPU)
// Select best configuration: prefer higher GPU count, then higher VRAM
if bestGPUInfo == nil ||
gpuInfo.GPUsPerNode > bestGPUInfo.GPUsPerNode ||
(gpuInfo.GPUsPerNode == bestGPUInfo.GPUsPerNode && gpuInfo.VRAMPerGPU > bestGPUInfo.VRAMPerGPU) {
bestGPUInfo = gpuInfo
}
}
if bestGPUInfo == nil {
return nil, fmt.Errorf("no nodes with NVIDIA GPU Feature Discovery labels found (checked %d nodes). "+
"Ensure GPU nodes have labels: %s, %s, %s",
len(nodeList.Items), LabelGPUCount, LabelGPUProduct, LabelGPUMemory)
}
// Infer hardware system from GPU model
bestGPUInfo.System = InferHardwareSystem(bestGPUInfo.Model)
logger.Info("GPU discovery completed",
"gpusPerNode", bestGPUInfo.GPUsPerNode,
"model", bestGPUInfo.Model,
"vram", bestGPUInfo.VRAMPerGPU,
"system", bestGPUInfo.System,
"nodesWithGPUs", nodesWithGPUs)
return bestGPUInfo, nil
}
// extractGPUInfoFromNode extracts GPU information from a single node's labels.
// Returns error if required labels are missing or invalid.
func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
labels := node.Labels
if labels == nil {
return nil, fmt.Errorf("node has no labels")
}
gpuCountStr, ok := labels[LabelGPUCount]
if !ok {
return nil, fmt.Errorf("missing label %s", LabelGPUCount)
}
gpuCount, err := strconv.Atoi(gpuCountStr)
if err != nil || gpuCount <= 0 {
return nil, fmt.Errorf("invalid GPU count: %s", gpuCountStr)
}
gpuModel, ok := labels[LabelGPUProduct]
if !ok || gpuModel == "" {
return nil, fmt.Errorf("missing or empty label %s", LabelGPUProduct)
}
// Extract VRAM (memory in MiB)
gpuMemoryStr, ok := labels[LabelGPUMemory]
if !ok {
return nil, fmt.Errorf("missing label %s", LabelGPUMemory)
}
gpuMemory, err := strconv.Atoi(gpuMemoryStr)
if err != nil || gpuMemory <= 0 {
return nil, fmt.Errorf("invalid GPU memory: %s", gpuMemoryStr)
}
return &GPUInfo{
GPUsPerNode: gpuCount,
Model: gpuModel,
VRAMPerGPU: gpuMemory,
}, nil
}
// InferHardwareSystem maps GPU product name to hardware system identifier.
// Returns empty string if the GPU model cannot be confidently mapped.
//
// This is a best-effort mapping based on common NVIDIA datacenter GPU naming patterns.
// The system identifier is used by the profiler for performance estimation and configuration.
//
// Limitations:
// - Cannot distinguish SXM vs. PCIe variants from labels alone (assumes SXM for datacenter GPUs)
// - New GPU models require code updates (gracefully returns empty string)
// - Non-standard SKU names may not match
//
// Users can manually override the system in their profiling config (hardware.system)
// if auto-detection is incorrect or unavailable.
func InferHardwareSystem(gpuProduct string) string {
if gpuProduct == "" {
return ""
}
// Normalize: uppercase, remove spaces/dashes for pattern matching
normalized := strings.ToUpper(strings.ReplaceAll(gpuProduct, "-", ""))
normalized = strings.ReplaceAll(normalized, " ", "")
// Map common NVIDIA datacenter GPU products to hardware system identifiers
patterns := []struct {
pattern string
system string
}{
{"GB200", "gb200_sxm"},
{"H200", "h200_sxm"},
{"H100", "h100_sxm"},
{"B200", "b200_sxm"},
{"A100", "a100_sxm"},
{"L40S", "l40s"},
}
for _, p := range patterns {
if strings.Contains(normalized, p.pattern) {
return p.system
}
}
// Unknown GPU type, return empty string
// User must specify system manually in profiling config (hardware.system)
return ""
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gpu
import (
"context"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)
// newFakeClient creates a fake Kubernetes client with the given objects
func newFakeClient(objs ...client.Object) client.Client {
scheme := runtime.NewScheme()
_ = corev1.AddToScheme(scheme)
return fake.NewClientBuilder().
WithScheme(scheme).
WithObjects(objs...).
Build()
}
func TestDiscoverGPUs_SingleNode(t *testing.T) {
ctx := context.Background()
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-1",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
k8sClient := newFakeClient(node)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
require.NoError(t, err)
require.NotNil(t, gpuInfo)
assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
assert.Equal(t, "h100_sxm", gpuInfo.System)
}
func TestDiscoverGPUs_MultipleNodesHomogeneous(t *testing.T) {
ctx := context.Background()
// Multiple nodes with same GPU configuration
node1 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-1",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
node2 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-2",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
k8sClient := newFakeClient(node1, node2)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
require.NoError(t, err)
require.NotNil(t, gpuInfo)
assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
}
func TestDiscoverGPUs_MultipleNodesHeterogeneous_HigherGPUCountWins(t *testing.T) {
ctx := context.Background()
// Node with fewer GPUs
node1 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-1",
Labels: map[string]string{
LabelGPUCount: "4",
LabelGPUProduct: "A100-SXM4-40GB",
LabelGPUMemory: "40960",
},
},
}
// Node with more GPUs (should win)
node2 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-2",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
k8sClient := newFakeClient(node1, node2)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
require.NoError(t, err)
require.NotNil(t, gpuInfo)
// Should prefer node with 8 GPUs over node with 4 GPUs
assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
}
func TestDiscoverGPUs_MultipleNodesHeterogeneous_HigherVRAMWins(t *testing.T) {
ctx := context.Background()
// Node with same GPU count but less VRAM
node1 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-1",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "A100-SXM4-40GB",
LabelGPUMemory: "40960",
},
},
}
// Node with same GPU count but more VRAM (should win)
node2 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-2",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
k8sClient := newFakeClient(node1, node2)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
require.NoError(t, err)
require.NotNil(t, gpuInfo)
// Should prefer node with higher VRAM when GPU count is equal
assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
}
func TestDiscoverGPUs_MixedNodesWithAndWithoutGPUs(t *testing.T) {
ctx := context.Background()
// CPU-only node (no GPU labels)
cpuNode := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "cpu-node-1",
Labels: map[string]string{},
},
}
// GPU node
gpuNode := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "gpu-node-1",
Labels: map[string]string{
LabelGPUCount: "8",
LabelGPUProduct: "H100-SXM5-80GB",
LabelGPUMemory: "81920",
},
},
}
k8sClient := newFakeClient(cpuNode, gpuNode)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
require.NoError(t, err)
require.NotNil(t, gpuInfo)
// Should find the GPU node and ignore CPU-only node
assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
}
func TestDiscoverGPUs_NoNodes(t *testing.T) {
ctx := context.Background()
k8sClient := newFakeClient() // Empty cluster
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
assert.Error(t, err)
assert.Nil(t, gpuInfo)
assert.Contains(t, err.Error(), "no nodes found")
}
func TestDiscoverGPUs_NoGPUNodes(t *testing.T) {
ctx := context.Background()
// Only CPU nodes
cpuNode1 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "cpu-node-1",
Labels: map[string]string{},
},
}
cpuNode2 := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "cpu-node-2",
Labels: map[string]string{
"node-type": "cpu-only",
},
},
}
k8sClient := newFakeClient(cpuNode1, cpuNode2)
gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
assert.Error(t, err)
assert.Nil(t, gpuInfo)
assert.Contains(t, err.Error(), "no nodes with NVIDIA GPU Feature Discovery labels found")
}
func TestExtractGPUInfoFromNode_MissingLabels(t *testing.T) {
tests := []struct {
name string
labels map[string]string
expectError bool
errorMsg string
}{
{
name: "missing GPU count",
labels: map[string]string{LabelGPUProduct: "H100", LabelGPUMemory: "80000"},
expectError: true,
errorMsg: LabelGPUCount,
},
{
name: "missing GPU product",
labels: map[string]string{LabelGPUCount: "8", LabelGPUMemory: "80000"},
expectError: true,
errorMsg: LabelGPUProduct,
},
{
name: "missing GPU memory",
labels: map[string]string{LabelGPUCount: "8", LabelGPUProduct: "H100"},
expectError: true,
errorMsg: LabelGPUMemory,
},
{
name: "invalid GPU count",
labels: map[string]string{LabelGPUCount: "invalid", LabelGPUProduct: "H100", LabelGPUMemory: "80000"},
expectError: true,
errorMsg: "invalid GPU count",
},
{
name: "invalid GPU memory",
labels: map[string]string{LabelGPUCount: "8", LabelGPUProduct: "H100", LabelGPUMemory: "invalid"},
expectError: true,
errorMsg: "invalid GPU memory",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
Labels: tt.labels,
},
}
gpuInfo, err := extractGPUInfoFromNode(node)
if tt.expectError {
assert.Error(t, err)
assert.Nil(t, gpuInfo)
if tt.errorMsg != "" {
assert.Contains(t, err.Error(), tt.errorMsg)
}
} else {
assert.NoError(t, err)
assert.NotNil(t, gpuInfo)
}
})
}
}
func TestInferHardwareSystem(t *testing.T) {
tests := []struct {
gpuProduct string
expectedSystem string
description string
}{
{"H100-SXM5-80GB", "h100_sxm", "H100 SXM variant"},
{"H100-PCIE-80GB", "h100_sxm", "H100 PCIe variant (mapped to SXM)"},
{"H200-SXM5-141GB", "h200_sxm", "H200 SXM variant"},
{"A100-SXM4-40GB", "a100_sxm", "A100 SXM variant"},
{"A100-PCIE-80GB", "a100_sxm", "A100 PCIe variant (mapped to SXM)"},
{"L40S", "l40s", "L40S"},
{"NVIDIA L40S", "l40s", "L40S with prefix"},
{"B200-SXM", "b200_sxm", "B200 SXM"},
{"GB200", "gb200_sxm", "GB200"},
{"Tesla V100-SXM2-16GB", "", "V100 (not in mapping)"},
{"RTX 4090", "", "Consumer GPU (not in mapping)"},
{"Unknown-GPU", "", "Unknown GPU"},
{"", "", "Empty string"},
}
for _, tt := range tests {
t.Run(tt.description, func(t *testing.T) {
result := InferHardwareSystem(tt.gpuProduct)
assert.Equal(t, tt.expectedSystem, result, "Failed for GPU: %s", tt.gpuProduct)
})
}
}
func TestInferHardwareSystem_CaseInsensitive(t *testing.T) {
// Test that inference is case-insensitive
variants := []string{
"h100-sxm5-80gb",
"H100-SXM5-80GB",
"H100-sxm5-80GB",
"h100-SXM5-80gb",
}
for _, variant := range variants {
result := InferHardwareSystem(variant)
assert.Equal(t, "h100_sxm", result, "Should handle case variations: %s", variant)
}
}
func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {
// Test that spaces and dashes are normalized
variants := []string{
"H100-SXM5-80GB",
"H100 SXM5 80GB",
"H100SXM580GB",
"H100-SXM5 80GB",
}
for _, variant := range variants {
result := InferHardwareSystem(variant)
assert.Equal(t, "h100_sxm", result, "Should normalize spaces/dashes: %s", variant)
}
}
......@@ -26,6 +26,19 @@ import (
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// toFloat64 converts a numeric value (int or float64) to float64.
// Returns 0 if the value is neither int nor float64.
func toFloat64(val interface{}) float64 {
switch v := val.(type) {
case float64:
return v
case int:
return float64(v)
default:
return 0
}
}
// DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
// This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentRequestValidator struct {
......@@ -48,6 +61,11 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
var warnings admission.Warnings
var err error
// Warn about deprecated enableGpuDiscovery field
if v.request.Spec.EnableGPUDiscovery != nil {
warnings = append(warnings, "spec.enableGpuDiscovery is deprecated and will be removed in v1beta1. GPU discovery is now always attempted automatically. This field has no effect.")
}
// Validate profiler image is specified
if v.request.Spec.ProfilingConfig.ProfilerImage == "" {
err = errors.Join(err, errors.New("spec.profilingConfig.profilerImage is required"))
......@@ -58,10 +76,8 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
err = errors.Join(err, errors.New("spec.profilingConfig.config is required and must not be empty"))
}
// Validate enableGpuDiscovery is only true for cluster-wide operators
if v.request.Spec.EnableGpuDiscovery && !v.isClusterWideOperator {
err = errors.Join(err, errors.New("spec.enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in spec.profilingConfig.config"))
}
// Note: GPU discovery is now automatic for cluster-wide operators
// Namespace-restricted operators automatically skip GPU discovery and require manual hardware config
// Parse config to validate structure (only if config is present)
if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 {
......@@ -83,9 +99,101 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
}
}
// Validate GPU hardware information is available (last, so other errors are collected first)
if gpuErr := v.validateGPUHardwareInfo(); gpuErr != nil {
err = errors.Join(err, gpuErr)
}
return warnings, err
}
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// This validation happens at admission time to fail fast before the DGDR is persisted to etcd.
func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
// Parse profiling config
var config map[string]interface{}
if v.request.Spec.ProfilingConfig.Config != nil {
if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught by other validators
return nil
}
} else {
config = make(map[string]interface{})
}
// Check if manual hardware config is provided
hardwareVal, hasHardware := config["hardware"]
var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil {
if hardwareConfig, ok := hardwareVal.(map[string]interface{}); ok {
// Check if essential hardware fields are provided
_, hasGPUModel := hardwareConfig["gpuModel"]
_, hasGPUVram := hardwareConfig["gpuVramMib"]
_, hasNumGPUs := hardwareConfig["numGpusPerNode"]
hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
}
}
// Check if explicit GPU ranges are provided
var hasExplicitGPURanges bool
if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]interface{}); ok {
minGPUs, hasMin := engineConfig["minNumGpusPerEngine"]
maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"]
// Validate explicit GPU ranges
if hasMin && hasMax {
minVal := toFloat64(minGPUs)
maxVal := toFloat64(maxGPUs)
// Validate that min <= max
if minVal > maxVal {
return fmt.Errorf("invalid GPU range: minNumGpusPerEngine (%v) cannot be greater than maxNumGpusPerEngine (%v)",
minVal, maxVal)
}
hasExplicitGPURanges = minVal > 0 && maxVal > 0
}
}
}
// If manual config or explicit ranges provided, validation passes
if hasManualHardwareConfig || hasExplicitGPURanges {
return nil
}
// Neither manual config nor explicit ranges provided
// GPU discovery will be attempted at reconcile time, but if it's unavailable
// (e.g., namespace-scoped operator), the DGDR will fail
//
// Fail at admission time to give users immediate feedback
if v.isClusterWideOperator {
// Cluster-wide operator should have GPU discovery available
// Allow DGDR to be created - GPU discovery will provide hardware info
return nil
}
// Namespace-scoped operator likely doesn't have node read permissions
// Require manual hardware config or explicit GPU ranges
return errors.New(`GPU hardware configuration required for namespace-scoped operators.
Namespace-scoped operators typically lack node read permissions for GPU auto-discovery.
Provide hardware configuration in one of these ways:
1. Add hardware config in spec.profilingConfig.config:
hardware:
numGpusPerNode: 8
gpuModel: "H100-SXM5-80GB"
gpuVramMib: 81920
2. Or specify explicit GPU search ranges:
engine:
minNumGpusPerEngine: 2
maxNumGpusPerEngine: 8
See: https://github.com/ai-dynamo/dynamo/issues/6257`)
}
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
// Returns warnings and error.
func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment