feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with...

feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with automatic injection (#6224) Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with...
feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with automatic injection (#6224) Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
d56439ec · hhzhang16 · GitHub · 233a1e9a · 7c08d2f2 · d56439ec
Unverified Commit d56439ec authored Feb 13, 2026 by hhzhang16 Committed by GitHub Feb 14, 2026
20 changed files
--- a/aiconfigurator @ 7c08d2f2
+++ b/aiconfigurator @ 7c08d2f2
+Subproject commit 7c08d2f2c4e289afe49f48e56d392a7d7221155d
--- a/components/src/dynamo/profiler/deploy/profile_sla_aic_dgdr.yaml
+++ b/components/src/dynamo/profiler/deploy/profile_sla_aic_dgdr.yaml
@@ -17,9 +17,7 @@ spec:
    # NOTE: any image built before January 10 and any release prior to 0.8.1
    # will need to use snake_case within profilingConfig.config
    config:
-      sweep:
+      searchStrategy: rapid
-        useAiConfigurator: true
-        aicSystem: h200_sxm
      sla:
        isl: 3000
        osl: 150

--- a/components/src/dynamo/profiler/deploy/profile_sla_dgdr.yaml
+++ b/components/src/dynamo/profiler/deploy/profile_sla_dgdr.yaml
@@ -17,8 +17,7 @@ spec:
    # NOTE: any image built before January 10 and any release prior to 0.8.1
    # will need to use snake_case within profilingConfig.config
    config:
-      sweep:
+      searchStrategy: thorough
-        useAiConfigurator: false
      sla:
        isl: 3000
        osl: 150

--- a/components/src/dynamo/profiler/deploy/profile_sla_moe_dgdr.yaml
+++ b/components/src/dynamo/profiler/deploy/profile_sla_moe_dgdr.yaml
@@ -24,14 +24,12 @@ spec:
          pvcName: "model-cache"                      # Name of PVC containing model weights
          pvcPath: "deepseek-r1"                      # Subpath within PVC where model is stored
-      sweep:
+      searchStrategy: rapid
-        useAiConfigurator: false
      hardware:
        # for h200, sweep over 8-16 GPUs per engine
        minNumGpusPerEngine: 8
        maxNumGpusPerEngine: 16
-        numGpusPerNode: 8
+        numGpusPerNode: 8  # Override auto-discovered value if different
      sla:
        isl: 3000

--- a/components/src/dynamo/profiler/profile_sla.py
+++ b/components/src/dynamo/profiler/profile_sla.py
@@ -38,7 +38,7 @@ from dynamo.profiler.utils.config_modifiers.parallelization_mapping import (
    apply_parallel_mapping_to_config,
    get_candidate_parallel_mappings,
 )
-from dynamo.profiler.utils.defaults import EngineType
+from dynamo.profiler.utils.defaults import EngineType, SearchStrategy
 from dynamo.profiler.utils.dgd_generation import generate_dgd_config_with_planner
 from dynamo.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
 from dynamo.profiler.utils.plot import (
@@ -140,10 +140,6 @@ async def run_profile(args):
    # Clear any errors from previous profiling runs
    clear_profiling_errors()
-    # Inherit aic_backend from backend if not explicitly set
-    if not args.aic_backend:
-        args.aic_backend = args.backend
    # Write initial status for external jobs to monitor
    os.makedirs(args.output_dir, exist_ok=True)
    write_profiler_status(
@@ -197,36 +193,28 @@ async def run_profile(args):
                f"Using minimum of user-provided and model's maximum context length: {sweep_max_context_length}"
            )
-        if args.use_ai_configurator:
+        # Initialize AI Configurator estimator (only used when search_strategy == SearchStrategy.RAPID)
-            if not args.aic_system:
+        ai_configurator_perf_estimator: AIConfiguratorPerfEstimator | None = None
+        if args.search_strategy == SearchStrategy.RAPID:
+            # Use AI Configurator for rapid estimation
+            if not args.system:
                raise ValueError(
-                    "Must provide --aic-system when using --use-ai-configurator."
+                    "Must provide --system (hardware system, e.g. h100_sxm) when using rapid search strategy."
                )
-            # Fallback to args.model if aic_hf_id is not provided
+            if not args.model:
-            if not args.aic_hf_id:
-                if args.model:
-                    logger.info(
-                        f"--aic-hf-id not provided, using --model ({args.model}) as HuggingFace ID for AI configurator"
-                    )
-                    args.aic_hf_id = args.model
-                else:
                raise ValueError(
-                        "Must provide --aic-hf-id or --model when using --use-ai-configurator."
+                    "Must provide --model (HuggingFace ID) when using rapid search strategy."
                )
-            logger.info("Using aiconfigurator to estimate performance...")
+            logger.info(
-            ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
+                "Using AI Configurator to estimate performance (rapid strategy)..."
-                args.aic_hf_id,
-                args.aic_system.lower(),
-                args.aic_backend,
-                args.aic_backend_version,
            )
-        else:
+            ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
-            if args.aic_system or args.aic_hf_id or args.aic_backend_version:
+                hf_id=args.model,
-                logger.warning(
+                system=args.system.lower(),
-                    "Ignoring --aic-system, --aic-hf-id, and/or --backend-version "
+                backend=args.backend,
-                    "when not using --use-ai-configurator."
            )
        # first profile prefill
@@ -272,7 +260,10 @@ async def run_profile(args):
                ttft = None
                if args.dry_run:
                    logger.info("Skipping deployment creation in dry run mode")
-                elif args.use_ai_configurator:
+                elif (
+                    args.search_strategy == SearchStrategy.RAPID
+                    and ai_configurator_perf_estimator
+                ):
                    logger.info("Using ai-configurator to estimate prefill latency")
                    perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf(
                        args.isl,
@@ -395,7 +386,10 @@ async def run_profile(args):
                if args.dry_run:
                    logger.info("Skipping deployment creation in dry run mode")
-                elif args.use_ai_configurator:
+                elif (
+                    args.search_strategy == SearchStrategy.RAPID
+                    and ai_configurator_perf_estimator
+                ):
                    # Compute max_concurrency and max_kv_tokens to know which
                    # num_request to sweep over.
                    max_concurrency = ai_configurator_perf_estimator.get_max_batch_size(
@@ -467,7 +461,10 @@ async def run_profile(args):
                    for num_request in sweep_num_request:
                        itl = thpt_per_gpu = None
-                        if args.use_ai_configurator:
+                        if (
+                            args.search_strategy == SearchStrategy.RAPID
+                            and ai_configurator_perf_estimator
+                        ):
                            logger.info(
                                "Using ai-configurator to estimate decode latency."
                            )
@@ -511,7 +508,10 @@ async def run_profile(args):
                                parallel_mapping=mapping,
                            )
-                if not args.dry_run and not args.use_ai_configurator:
+                if (
+                    not args.dry_run
+                    and not args.search_strategy == SearchStrategy.RAPID
+                ):
                    logger.info("Cleaning up deployment...")
                    await client.delete_deployment()
                    deployment_clients.remove(client)
@@ -642,7 +642,10 @@ async def run_profile(args):
        if args.dry_run:
            logger.info("Skipping deployment creation in dry run mode")
-        elif args.use_ai_configurator:
+        elif (
+            args.search_strategy == SearchStrategy.RAPID
+            and ai_configurator_perf_estimator
+        ):
            profile_prefill_aiconfigurator(
                work_dir,
                best_prefill_gpus,  # num_gpus
@@ -728,7 +731,10 @@ async def run_profile(args):
        if args.dry_run:
            logger.info("Skipping deployment creation in dry run mode")
-        elif args.use_ai_configurator:
+        elif (
+            args.search_strategy == SearchStrategy.RAPID
+            and ai_configurator_perf_estimator
+        ):
            attention_dp_size = best_decode_mapping.get_attn_dp_size()
            max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens(
                args.isl, args.osl, tp_size=best_decode_mapping.get_tp_size()

--- a/components/src/dynamo/profiler/utils/config_modifiers/protocol.py
+++ b/components/src/dynamo/profiler/utils/config_modifiers/protocol.py
@@ -154,7 +154,7 @@ class BaseConfigModifier:
        Raises:
            ValueError: If neither --served-model-name nor model path arg is found
        """
-        model_name = None
+        model_name = ""
        # Check for --served-model-name first (API model name)
        for i, arg in enumerate(args):
            if arg == cls.WORKER_SERVED_MODEL_NAME_ARG and i + 1 < len(args):
@@ -162,14 +162,14 @@ class BaseConfigModifier:
                break
        # Check for backend-specific path argument
-        model_path = None
+        model_path = ""
        for i, arg in enumerate(args):
            if arg == cls.WORKER_MODEL_PATH_ARG and i + 1 < len(args):
                model_path = args[i + 1]
                break
        # Require at least one to be specified
-        if model_name is None and model_path is None:
+        if not model_name and not model_path:
            raise ValueError(
                f"Cannot determine model: neither {cls.WORKER_MODEL_PATH_ARG} nor "
                f"{cls.WORKER_SERVED_MODEL_NAME_ARG} found in worker configuration. "
@@ -177,9 +177,9 @@ class BaseConfigModifier:
            )
        # If only one is specified, use it for both
-        if model_path is None:
+        if not model_path:
            model_path = model_name
-        elif model_name is None:
+        elif not model_name:
            model_name = model_path
        return model_name, model_path

--- a/components/src/dynamo/profiler/utils/defaults.py
+++ b/components/src/dynamo/profiler/utils/defaults.py
@@ -49,3 +49,8 @@ DEFAULT_GPU_COST_PER_HOUR = 3.0  # Cost per GPU per hour in dollars
 class EngineType(str, Enum):
    PREFILL = "prefill"
    DECODE = "decode"
+class SearchStrategy(str, Enum):
+    RAPID = "rapid"
+    THOROUGH = "thorough"
--- a/components/src/dynamo/profiler/utils/estimate_perf.py
+++ b/components/src/dynamo/profiler/utils/estimate_perf.py
@@ -39,12 +39,10 @@ class AIConfiguratorPerfEstimator:
        hf_id: str,  # e.g. "Qwen/Qwen3-32B"
        system: str,  # e.g. "h200_sxm"
        backend: str,  # e.g. "trtllm"
-        version: str,  # e.g. "0.20.0"
    ):
        aiconfigurator = _try_import_aiconfigurator()
        logger.info("Loading aiconfigurator database. This might take a few seconds...")
-        if not version:
        version = aiconfigurator.sdk.perf_database.get_latest_database_version(
            system,
            backend,

--- a/components/src/dynamo/profiler/utils/profiler_argparse.py
+++ b/components/src/dynamo/profiler/utils/profiler_argparse.py
@@ -8,6 +8,7 @@ from typing import Any, Dict
 import yaml
+from dynamo.profiler.utils.defaults import SearchStrategy
 from dynamo.profiler.utils.planner_utils import add_planner_arguments_to_parser
 from dynamo.profiler.utils.search_space_autogen import auto_generate_search_space
@@ -100,16 +101,14 @@ def create_profiler_parser() -> argparse.Namespace:
        hardware:
            minNumGpusPerEngine: Int (minimum number of GPUs per engine, default: 0)
            maxNumGpusPerEngine: Int (maximum number of GPUs per engine, default: 0)
-            numGpusPerNode: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 0)
+            numGpusPerNode: Int (number of GPUs per node, default: 0)
-            enableGpuDiscovery: Boolean (enable automatic GPU discovery from Kubernetes cluster nodes, when enabled overrides any manually specified hardware configuration, requires cluster-wide node access permissions, default: False)
+            gpuModel: String (GPU model, used for auto-calculating search space, default: "")
+            gpuVramMib: Int (GPU VRAM in MiB, used for auto-calculating search space, default: 0)
+            system: String (target hardware system, e.g. h100_sxm, h200_sxm, default: None)
+        searchStrategy: String (search strategy for profiling: 'rapid' uses AI Configurator for quick estimation, 'thorough' runs actual deployments for comprehensive results, enum: [rapid, thorough], default: rapid)
        sweep:
            prefillInterpolationGranularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
            decodeInterpolationGranularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
-            useAiConfigurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
-            aicSystem: String (target system for use with aiconfigurator, default: None)
-            aicHfId: String (aiconfigurator huggingface id of the target model, default: None)
-            aicBackend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
-            aicBackendVersion: String (specify backend version when using aiconfigurator to estimate perf, default: None)
            dryRun: Boolean (dry run the profile job, default: False)
            pickWithWebui: Boolean (pick the best parallelization mapping using webUI, default: False)
            webuiPort: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
@@ -226,7 +225,25 @@ def create_profiler_parser() -> argparse.Namespace:
        "--num-gpus-per-node",
        type=int,
        default=_get(hardware_cfg, "numGpusPerNode", "num_gpus_per_node", 0),
-        help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
+        help="Number of GPUs per node",
+    )
+    parser.add_argument(
+        "--gpu-model",
+        type=str,
+        default=_get(hardware_cfg, "gpuModel", "gpu_model", ""),
+        help="GPU model name (used for auto-calculating search space)",
+    )
+    parser.add_argument(
+        "--gpu-vram-mib",
+        type=int,
+        default=_get(hardware_cfg, "gpuVramMib", "gpu_vram_mib", 0),
+        help="GPU VRAM in MiB (used for auto-calculating search space)",
+    )
+    parser.add_argument(
+        "--system",
+        type=str,
+        default=_get(hardware_cfg, "system", "system", None),
+        help="Target hardware system, e.g. h100_sxm, h200_sxm",
    )
    parser.add_argument(
        "--isl",
@@ -253,6 +270,17 @@ def create_profiler_parser() -> argparse.Namespace:
        help="target Inter Token Latency (float, in milliseconds)",
    )
+    # High-level profiling strategy argument
+    parser.add_argument(
+        "--search-strategy",
+        type=SearchStrategy,
+        default=SearchStrategy(
+            _get(config, "searchStrategy", "search_strategy", "rapid")
+        ),
+        choices=list(SearchStrategy),
+        help="Search strategy for profiling: 'rapid' uses AI Configurator for quick estimation, 'thorough' runs actual deployments for comprehensive results",
+    )
    # arguments used for interpolating TTFT and ITL under different ISL/OSL
    engine_cfg = config.get("engine", {})
    parser.add_argument(
@@ -296,12 +324,6 @@ def create_profiler_parser() -> argparse.Namespace:
        default=_get(sweep_cfg, "dryRun", "dry_run", False),
        help="Dry run the profile job",
    )
-    parser.add_argument(
-        "--enable-gpu-discovery",
-        action="store_true",
-        default=_get(hardware_cfg, "enableGpuDiscovery", "enable_gpu_discovery", False),
-        help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
-    )
    parser.add_argument(
        "--pick-with-webui",
        action="store_true",
@@ -332,38 +354,6 @@ def create_profiler_parser() -> argparse.Namespace:
        }
        parser.set_defaults(**normalized_planner_config)
-    # arguments if using aiconfigurator
-    parser.add_argument(
-        "--use-ai-configurator",
-        action="store_true",
-        default=_get(sweep_cfg, "useAiConfigurator", "use_ai_configurator", False),
-        help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
-    )
-    parser.add_argument(
-        "--aic-system",
-        type=str,
-        default=_get(sweep_cfg, "aicSystem", "aic_system", None),
-        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
-    )
-    parser.add_argument(
-        "--aic-hf-id",
-        type=str,
-        default=_get(sweep_cfg, "aicHfId", "aic_hf_id", None),
-        help="aiconfigurator name of the target model (e.g. Qwen/Qwen3-32B, meta-llama/Llama-3.1-405B)",
-    )
-    parser.add_argument(
-        "--aic-backend",
-        type=str,
-        default=_get(sweep_cfg, "aicBackend", "aic_backend", ""),
-        help="aiconfigurator backend of the target model, if not provided, will use args.backend",
-    )
-    parser.add_argument(
-        "--aic-backend-version",
-        type=str,
-        default=_get(sweep_cfg, "aicBackendVersion", "aic_backend_version", None),
-        help="Specify backend version when using aiconfigurator to estimate perf.",
-    )
    # Parse arguments
    args = parser.parse_args()

--- a/components/src/dynamo/profiler/utils/search_space_autogen.py
+++ b/components/src/dynamo/profiler/utils/search_space_autogen.py
@@ -8,7 +8,6 @@ import os
 import yaml
-from deploy.utils.gpu_inventory import get_gpu_summary
 from dynamo.profiler.utils.config_modifiers import CONFIG_MODIFIERS
 from dynamo.profiler.utils.model_info import ModelInfo, get_model_info
@@ -103,66 +102,79 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:
    )
    args.model_info = model_info
-    # now determine the search space
+    # Determine the search space for profiling
-    if args.enable_gpu_discovery:
+    # User-provided min/max values take precedence; auto-calculate missing bounds
-        if (
+    # based on GPU hardware info and model size
-            args.min_num_gpus_per_engine == 0
+    user_specified_ranges = (
-            or args.max_num_gpus_per_engine == 0
+        args.min_num_gpus_per_engine != 0 and args.max_num_gpus_per_engine != 0
-            or args.num_gpus_per_node == 0
+    )
-        ):
-            if not args.model:
-                # TODO: get model info provided DGD config
-                error_msg = "No model provided, cannot auto-generate GPU search space. Please provide `--model` or GPU info"
-                logger.error(error_msg)
-                raise RuntimeError(error_msg)
-            logger.info("Getting GPU info from k8s cluster...")
+    if user_specified_ranges:
-            gpu_info = get_gpu_summary()
        logger.info(
-                f"Cluster has {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node with {gpu_info['vram']} VRAM"
+            f"Using user-specified GPU search space: {args.min_num_gpus_per_engine} to {args.max_num_gpus_per_engine}"
        )
+        # Ensure num_gpus_per_node is set (needed for multi-node configs)
+        if args.num_gpus_per_node == 0:
+            logger.warning("num_gpus_per_node not specified, setting to 8")
+            args.num_gpus_per_node = 8
+    else:
+        # Auto-calculate search space (honor partial user overrides)
+        # NOTE: will be handled in AIC
+        if args.num_gpus_per_node != 0 and args.gpu_vram_mib != 0:
+            # Have GPU hardware info - calculate based on model size
+            if not args.model:
+                error_msg = "No model provided, cannot auto-generate GPU search space. Please provide --model"
+                logger.error(error_msg)
+                raise RuntimeError(error_msg)
-            # model_info should be set by now (checked above), but mypy needs explicit verification
            assert (
                model_info is not None
            ), "model_info must be set when model is provided"
-            vram_mib = int(gpu_info["vram"])  # type: ignore[call-overload]
+            logger.info(
-            gpus_per_node = int(gpu_info["gpus_per_node"])  # type: ignore[call-overload]
+                f"Auto-generating search space: {args.num_gpus_per_node}x {args.gpu_model} GPUs with {args.gpu_vram_mib} MiB VRAM per GPU"
+            )
+            if args.system:
+                logger.info(f"Hardware system: {args.system}")
+            # Calculate minimum GPUs needed for model
            min_gpu = math.ceil(
-                model_info.model_size / MODEL_GPU_MEM_FRAC_MAX / vram_mib
+                model_info.model_size / MODEL_GPU_MEM_FRAC_MAX / args.gpu_vram_mib
            )
+            # Calculate maximum GPUs to profile
            if not model_info.is_moe:
-                max_gpu = gpus_per_node
+                max_gpu = args.num_gpus_per_node
            else:
-                max_gpu = max(min_gpu * MOE_MODEL_MAX_NUM_GPU_FACTOR, gpus_per_node)
+                # MoE models can benefit from more GPUs
-            if min_gpu > max_gpu:
+                max_gpu = max(
-                error_msg = f"No valid GPU configuration found for model {args.model} on the cluster with {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node"
+                    min_gpu * MOE_MODEL_MAX_NUM_GPU_FACTOR, args.num_gpus_per_node
+                )
+            # Honor partial user overrides
+            final_min = args.min_num_gpus_per_engine or min_gpu
+            final_max = args.max_num_gpus_per_engine or max_gpu
+            # Validate final_min <= final_max
+            if final_min > final_max:
+                error_msg = f"Invalid GPU range: min_num_gpus_per_engine ({final_min}) > max_num_gpus_per_engine ({final_max})"
                logger.error(error_msg)
                raise RuntimeError(error_msg)
-            logger.info(
+            # Clamp to valid range [1, args.num_gpus_per_node]
-                f"Auto-generated search space for model {args.model} on the cluster with {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node: {min_gpu} to {max_gpu}"
+            final_min = max(1, min(final_min, args.num_gpus_per_node))
-            )
+            final_max = max(1, min(final_max, args.num_gpus_per_node))
-            args.min_num_gpus_per_engine = min_gpu
-            args.max_num_gpus_per_engine = max_gpu
+            logger.info(f"Auto-generated search space: {final_min} to {final_max} GPUs")
-            args.num_gpus_per_node = gpus_per_node  # type: ignore[assignment]
+            args.min_num_gpus_per_engine = final_min
+            args.max_num_gpus_per_engine = final_max
        else:
-        # use default values for GPUs
+            # No GPU info available - use defaults
-        if args.min_num_gpus_per_engine == 0:
+            logger.warning("GPU hardware info not available, using default values")
-            logger.warning(
+            args.min_num_gpus_per_engine = args.min_num_gpus_per_engine or 1
-                "GPU discover is disabled and min_num_gpus_per_engine is not specified, setting to 1"
+            args.max_num_gpus_per_engine = args.max_num_gpus_per_engine or 4
-            )
+            args.num_gpus_per_node = args.num_gpus_per_node or 8
-            args.min_num_gpus_per_engine = 1
+            logger.info(
-        if args.max_num_gpus_per_engine == 0:
+                f"Default search space: {args.min_num_gpus_per_engine} to {args.max_num_gpus_per_engine} GPUs, {args.num_gpus_per_node} GPUs per node"
-            logger.warning(
-                "GPU discover is disabled and max_num_gpus_per_engine is not specified, setting to 4"
-            )
-            args.max_num_gpus_per_engine = 4
-        if args.num_gpus_per_node == 0:
-            logger.warning(
-                "GPU discover is disabled and num_gpus_per_node is not specified, setting to 8"
            )
-            args.num_gpus_per_node = 8
    return
--- a/deploy/helm/charts/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/helm/charts/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml
@@ -140,13 +140,13 @@ spec:
                      type: string
                  type: object
                enableGpuDiscovery:
-                  default: false
+                  default: true
                  description: |-
-                    EnableGpuDiscovery controls whether the profiler should automatically discover GPU
+                    EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.
-                    resources from the Kubernetes cluster nodes. When enabled, the profiler will override
+                    DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always
-                    any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
+                    attempted automatically. Setting this field has no effect - the operator will always try to discover
-                    numGpusPerNode) with values detected from the cluster.
+                    GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped
-                    Requires cluster-wide node access permissions - only available with cluster-scoped operators.
+                    operator without permissions), manual hardware configuration is required regardless of this setting.
                  type: boolean
                model:
                  description: |-
@@ -157,6 +157,12 @@ spec:
                profilingConfig:
                  description: |-
                    ProfilingConfig provides the complete configuration for the profiling job.
+                    Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes
+                    cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),
+                    discovered GPU configuration is used as defaults when hardware configuration is not manually
+                    specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values
+                    always take precedence over auto-discovered values. If GPU discovery fails (e.g.,
+                    namespace-restricted operator without node permissions), manual hardware config is required.
                    This configuration is passed directly to the profiler.
                    The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
                    Note: deployment.model and engine.backend are automatically set from the high-level

--- a/deploy/operator/api/v1alpha1/dynamographdeploymentrequest_types.go
+++ b/deploy/operator/api/v1alpha1/dynamographdeploymentrequest_types.go
@@ -151,16 +151,13 @@ type DynamoGraphDeploymentRequestSpec struct {
 	// +kubebuilder:default=false
 	UseMocker bool `json:"useMocker,omitempty"`
-	// EnableGpuDiscovery controls whether the profiler should automatically discover GPU
-	// resources from the Kubernetes cluster nodes. When enabled, the profiler will override
-	// any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
-	// numGpusPerNode) with values detected from the cluster.
-	// Requires cluster-wide node access permissions - only available with cluster-scoped operators.
-	// +kubebuilder:default=false
-	// +kubebuilder:validation:Optional
-	EnableGpuDiscovery bool `json:"enableGpuDiscovery,omitempty"`
 	// ProfilingConfig provides the complete configuration for the profiling job.
+	// Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes
+	// cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),
+	// discovered GPU configuration is used as defaults when hardware configuration is not manually
+	// specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values
+	// always take precedence over auto-discovered values. If GPU discovery fails (e.g.,
+	// namespace-restricted operator without node permissions), manual hardware config is required.
 	// This configuration is passed directly to the profiler.
 	// The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
 	// Note: deployment.model and engine.backend are automatically set from the high-level
@@ -168,6 +165,15 @@ type DynamoGraphDeploymentRequestSpec struct {
 	// +kubebuilder:validation:Required
 	ProfilingConfig ProfilingConfigSpec `json:"profilingConfig"`
+	// EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.
+	// DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always
+	// attempted automatically. Setting this field has no effect - the operator will always try to discover
+	// GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped
+	// operator without permissions), manual hardware configuration is required regardless of this setting.
+	// +optional
+	// +kubebuilder:default=true
+	EnableGPUDiscovery *bool `json:"enableGpuDiscovery,omitempty"`
 	// AutoApply indicates whether to automatically create a DynamoGraphDeployment
 	// after profiling completes. If false, only the spec is generated and stored in status.
 	// Users can then manually create a DGD using the generated spec.

--- a/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go
+++ b/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go
@@ -726,6 +726,11 @@ func (in *DynamoGraphDeploymentRequestList) DeepCopyObject() runtime.Object {
 func (in *DynamoGraphDeploymentRequestSpec) DeepCopyInto(out *DynamoGraphDeploymentRequestSpec) {
 	*out = *in
 	in.ProfilingConfig.DeepCopyInto(&out.ProfilingConfig)
+	if in.EnableGPUDiscovery != nil {
+		in, out := &in.EnableGPUDiscovery, &out.EnableGPUDiscovery
+		*out = new(bool)
+		**out = **in
+	}
 	if in.DeploymentOverrides != nil {
 		in, out := &in.DeploymentOverrides, &out.DeploymentOverrides
 		*out = new(DeploymentOverridesSpec)

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
@@ -140,13 +140,13 @@ spec:
                      type: string
                  type: object
                enableGpuDiscovery:
-                  default: false
+                  default: true
                  description: |-
-                    EnableGpuDiscovery controls whether the profiler should automatically discover GPU
+                    EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.
-                    resources from the Kubernetes cluster nodes. When enabled, the profiler will override
+                    DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always
-                    any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
+                    attempted automatically. Setting this field has no effect - the operator will always try to discover
-                    numGpusPerNode) with values detected from the cluster.
+                    GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped
-                    Requires cluster-wide node access permissions - only available with cluster-scoped operators.
+                    operator without permissions), manual hardware configuration is required regardless of this setting.
                  type: boolean
                model:
                  description: |-
@@ -157,6 +157,12 @@ spec:
                profilingConfig:
                  description: |-
                    ProfilingConfig provides the complete configuration for the profiling job.
+                    Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes
+                    cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),
+                    discovered GPU configuration is used as defaults when hardware configuration is not manually
+                    specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values
+                    always take precedence over auto-discovered values. If GPU discovery fails (e.g.,
+                    namespace-restricted operator without node permissions), manual hardware config is required.
                    This configuration is passed directly to the profiler.
                    The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
                    Note: deployment.model and engine.backend are automatically set from the high-level

--- a/deploy/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
+++ b/deploy/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
@@ -39,23 +39,24 @@ spec:
      engine:
        maxContextLength: 16384  # will override max context length of the model if provided
+      # Search strategy: 'rapid' for AI Configurator estimation (20-30s), 'thorough' for actual deployments (2-4h)
+      searchStrategy: thorough
      # Hardware configuration
+      # Note: Operator auto-discovers GPU info from cluster nodes when available
      hardware:
        minNumGpusPerEngine: 1  # Minimum GPUs to test
-        maxNumGpusPerEngine: 4  # Maximum GPUs to test (limited by model's num_heads/4)
+        maxNumGpusPerEngine: 4  # Maximum GPUs to test
-        numGpusPerNode: 8  # GPUs per node (for MoE models)
+        numGpusPerNode: 8       # GPUs per node (optional - auto-discovered if not specified)
+        system: h200_sxm        # Hardware system (optional - auto-detected if not specified)
+        # gpuModel: "H200-SXM"  # GPU model (optional - auto-discovered)
+        # gpuVramMib: 141557    # GPU VRAM in MiB (optional - auto-discovered)
      # Sweep/profiling configuration
      sweep:
        prefillInterpolationGranularity: 16  # Samples for TTFT interpolation
        decodeInterpolationGranularity: 6    # Samples for ITL interpolation
-        # AI Configurator mode (fast simulation-based profiling, 20-30 seconds)
-        useAiConfigurator: false  # Set to false for online profiling (2-4 hours)
-        aicSystem: h200_sxm  # Target GPU system for AI Configurator
-        aicHfId: Qwen/Qwen3-0.6B  # HuggingFace model ID for AI Configurator
-        aicBackendVersion: "0.20.0"  # Backend version for AI Configurator
      # SLA targets for profiling
      sla:
        isl: 3000  # Input sequence length

--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
@@ -47,6 +47,7 @@ import (
 	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
 	commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
+	"github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
 	webhookvalidation "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook/validation"
 )
@@ -169,12 +170,26 @@ const (
 	BackendSGLang = "sglang"
 	BackendTRTLLM = "trtllm"
-	// Profiling config field names
+	// Profiling config field names for v1alpha1; note: will be removed in v1beta1
 	ConfigKeyDeployment       = "deployment"
 	ConfigKeyModelCache       = "modelCache"
 	ConfigKeyPVCName          = "pvcName"
 	ConfigKeyPVCPath          = "pvcPath"
 	ConfigKeyMountPath        = "mountPath"
+	ConfigKeyHardware         = "hardware"
+	ConfigKeyEngine           = "engine"
+	ConfigKeyOutputDir        = "output_dir"
+	ConfigKeyNumGpusPerNode   = "numGpusPerNode"
+	ConfigKeyGPUModel         = "gpuModel"
+	ConfigKeyGPUVramMib       = "gpuVramMib"
+	ConfigKeySystem           = "system"
+	ConfigKeyMinNumGpusPerEng = "minNumGpusPerEngine"
+	ConfigKeyMaxNumGpusPerEng = "maxNumGpusPerEngine"
+	ConfigKeyBackend          = "backend"
+	ConfigKeyConfig           = "config"
+	ConfigKeyNamespace        = "namespace"
+	ConfigKeyModel            = "model"
+	ConfigKeyDGDImage         = "dgd_image"
 )
 // shell script template for the output copier sidecar
@@ -946,10 +961,105 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
 		}
 	}
+	if err := r.validateGPUHardwareInfo(ctx, dgdr); err != nil {
+		return err
+	}
 	// The profiler will validate the rest of the configuration
 	return nil
 }
+// toFloat64 converts a numeric value (int or float64) to float64.
+// Returns 0 if the value is neither int nor float64.
+func toFloat64(val interface{}) float64 {
+	switch v := val.(type) {
+	case float64:
+		return v
+	case int:
+		return float64(v)
+	default:
+		return 0
+	}
+}
+// validateGPUHardwareInfo ensures GPU hardware information is available when required for profiling
+func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
+	logger := log.FromContext(ctx)
+	// Check for hardware info and GPU ranges
+	// TODO: will be cleaner once we swap to new DGDR schema (#6130)
+	var config map[string]interface{}
+	if dgdr.Spec.ProfilingConfig.Config != nil {
+		if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
+			// Config parse errors will be caught later, skip validation here
+			return nil
+		}
+	} else {
+		config = make(map[string]interface{})
+	}
+	hardwareVal, hasHardware := config[ConfigKeyHardware]
+	var hasManualHardwareConfig bool
+	if hasHardware && hardwareVal != nil {
+		if hardwareConfig, ok := hardwareVal.(map[string]interface{}); ok {
+			_, hasGPUModel := hardwareConfig[ConfigKeyGPUModel]
+			_, hasGPUVram := hardwareConfig[ConfigKeyGPUVramMib]
+			_, hasNumGPUs := hardwareConfig[ConfigKeyNumGpusPerNode]
+			hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
+		}
+	}
+	var hasExplicitGPURanges bool
+	if engineVal, hasEngine := config[ConfigKeyEngine]; hasEngine && engineVal != nil {
+		if engineConfig, ok := engineVal.(map[string]interface{}); ok {
+			minGPUs, hasMin := engineConfig[ConfigKeyMinNumGpusPerEng]
+			maxGPUs, hasMax := engineConfig[ConfigKeyMaxNumGpusPerEng]
+			if hasMin && hasMax {
+				minVal := toFloat64(minGPUs)
+				maxVal := toFloat64(maxGPUs)
+				// Validate that min <= max
+				if minVal > maxVal {
+					return fmt.Errorf("invalid GPU range: %s (%v) cannot be greater than %s (%v)",
+						ConfigKeyMinNumGpusPerEng, minVal, ConfigKeyMaxNumGpusPerEng, maxVal)
+				}
+				hasExplicitGPURanges = minVal > 0 && maxVal > 0
+			}
+		}
+	}
+	// If manual config or explicit ranges are provided, validation passes
+	if hasManualHardwareConfig || hasExplicitGPURanges {
+		return nil
+	}
+	_, err := gpu.DiscoverGPUs(ctx, r.Client)
+	if err == nil {
+		// GPU discovery is available, validation passes
+		return nil
+	}
+	logger.Info("GPU discovery not available", "reason", err.Error())
+	isNamespaceScoped := r.Config.RestrictedNamespace != ""
+	if isNamespaceScoped {
+		return fmt.Errorf(`GPU hardware info required but cannot be auto-discovered (namespace-scoped operator lacks node read permissions).
+Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s.
+See: https://github.com/ai-dynamo/dynamo/issues/6257`,
+			ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib,
+			ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng)
+	}
+	return fmt.Errorf(`GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s.
+See profiling documentation for configuration details.`,
+		ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib,
+		ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng)
+}
 // createProfilingJob creates a Kubernetes Job for profiling using SyncResource
 func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
 	logger := log.FromContext(ctx)
@@ -989,13 +1099,29 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
 		}
 	}
+	// Run GPU discovery before creating job (cluster-wide and namespace-restricted operators if they have node read permissions)
+	var gpuInfo *gpu.GPUInfo
+	logger.Info("Attempting GPU discovery for profiling job")
+	discoveredInfo, err := gpu.DiscoverGPUs(ctx, r.Client)
+	if err != nil {
+		// This path is expected for namespace-restricted operators without node read permissions
+		logger.Info("GPU discovery not available, using manual hardware configuration from profiling config",
+			"reason", err.Error())
+	} else {
+		gpuInfo = discoveredInfo
+		logger.Info("GPU discovery completed successfully",
+			"gpusPerNode", gpuInfo.GPUsPerNode,
+			"model", gpuInfo.Model,
+			"vramMiB", gpuInfo.VRAMPerGPU,
+			"system", gpuInfo.System)
+	}
 	// Use SyncResource to create/update the job
 	modified, job, err := commonController.SyncResource(ctx, r, dgdr, func(ctx context.Context) (*batchv1.Job, bool, error) {
 		jobName := getProfilingJobName(dgdr)
 		outputConfigMapName := getOutputConfigMapName(dgdr)
-		// Parse and prepare profiling config
+		configYAML, err := r.prepareProfilingConfig(dgdr, gpuInfo)
-		configYAML, err := r.prepareProfilingConfig(dgdr)
 		if err != nil {
 			return nil, false, err
 		}
@@ -1069,12 +1195,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
 			"--profile-config", string(configYAML),
 		}
-		// Add --enable-gpu-discovery flag based on DGDR spec
-		// GPU discovery requires cluster-wide node access
-		if dgdr.Spec.EnableGpuDiscovery {
-			profilerArgs = append(profilerArgs, "--enable-gpu-discovery")
-		}
 		// Use profiler image from profilingConfig
 		imageName := dgdr.Spec.ProfilingConfig.ProfilerImage
 		logger.Info("Using profiler image", "image", imageName)
@@ -1250,7 +1370,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
 }
 // prepareProfilingConfig parses and modifies the profiling config
-func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) ([]byte, error) {
+func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, gpuInfo *gpu.GPUInfo) ([]byte, error) {
 	// Parse the profiling config from JSON
 	var config map[string]interface{}
 	if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
@@ -1258,53 +1378,84 @@ func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nv
 	}
 	// Set deployment.namespace if not already set
-	deploymentVal, hasDeployment := config["deployment"]
+	deploymentVal, hasDeployment := config[ConfigKeyDeployment]
 	var deploymentConfig map[string]interface{}
 	if !hasDeployment || deploymentVal == nil {
 		deploymentConfig = make(map[string]interface{})
-		config["deployment"] = deploymentConfig
+		config[ConfigKeyDeployment] = deploymentConfig
 	} else {
 		var ok bool
 		deploymentConfig, ok = deploymentVal.(map[string]interface{})
 		if !ok {
-			return nil, fmt.Errorf("profilingConfig.config.deployment must be an object, got %T", deploymentVal)
+			return nil, fmt.Errorf("profilingConfig.config.%s must be an object, got %T", ConfigKeyDeployment, deploymentVal)
 		}
 	}
-	if _, hasNamespace := deploymentConfig["namespace"]; !hasNamespace {
+	if _, hasNamespace := deploymentConfig[ConfigKeyNamespace]; !hasNamespace {
-		deploymentConfig["namespace"] = dgdr.Namespace
+		deploymentConfig[ConfigKeyNamespace] = dgdr.Namespace
 	}
 	// Set deployment.model from spec.model
-	deploymentConfig["model"] = dgdr.Spec.Model
+	deploymentConfig[ConfigKeyModel] = dgdr.Spec.Model
 	// Set deployment.dgd_image from deploymentOverrides.workersImage if provided
 	if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.WorkersImage != "" {
-		deploymentConfig["dgd_image"] = dgdr.Spec.DeploymentOverrides.WorkersImage
+		deploymentConfig[ConfigKeyDGDImage] = dgdr.Spec.DeploymentOverrides.WorkersImage
 	}
 	// Set output_dir if not already set
-	if _, hasOutputDir := config["output_dir"]; !hasOutputDir {
+	if _, hasOutputDir := config[ConfigKeyOutputDir]; !hasOutputDir {
-		config["output_dir"] = ProfilingOutputPath
+		config[ConfigKeyOutputDir] = ProfilingOutputPath
 	}
 	// Set engine.backend from spec.backend
-	engineVal, hasEngine := config["engine"]
+	engineVal, hasEngine := config[ConfigKeyEngine]
 	var engineConfig map[string]interface{}
 	if !hasEngine || engineVal == nil {
 		engineConfig = make(map[string]interface{})
-		config["engine"] = engineConfig
+		config[ConfigKeyEngine] = engineConfig
 	} else {
 		var ok bool
 		engineConfig, ok = engineVal.(map[string]interface{})
 		if !ok {
-			return nil, fmt.Errorf("profilingConfig.config.engine must be an object, got %T", engineVal)
+			return nil, fmt.Errorf("profilingConfig.config.%s must be an object, got %T", ConfigKeyEngine, engineVal)
 		}
 	}
-	engineConfig["backend"] = dgdr.Spec.Backend
+	engineConfig[ConfigKeyBackend] = dgdr.Spec.Backend
 	// If ConfigMapRef is provided, set engine.config path
 	if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
-		engineConfig["config"] = fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile)
+		engineConfig[ConfigKeyConfig] = fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile)
+	}
+	// User-specified values take precedence over auto-discovered values
+	if gpuInfo != nil {
+		hardwareVal, hasHardware := config["hardware"]
+		var hardwareConfig map[string]interface{}
+		if !hasHardware || hardwareVal == nil {
+			hardwareConfig = make(map[string]interface{})
+			config["hardware"] = hardwareConfig
+		} else {
+			var ok bool
+			hardwareConfig, ok = hardwareVal.(map[string]interface{})
+			if !ok {
+				return nil, fmt.Errorf("profilingConfig.config.hardware must be an object, got %T", hardwareVal)
+			}
+		}
+		if _, hasNumGpus := hardwareConfig[ConfigKeyNumGpusPerNode]; !hasNumGpus {
+			hardwareConfig[ConfigKeyNumGpusPerNode] = gpuInfo.GPUsPerNode
+		}
+		if _, hasGpuModel := hardwareConfig[ConfigKeyGPUModel]; !hasGpuModel {
+			hardwareConfig[ConfigKeyGPUModel] = gpuInfo.Model
+		}
+		if _, hasGpuVram := hardwareConfig[ConfigKeyGPUVramMib]; !hasGpuVram {
+			hardwareConfig[ConfigKeyGPUVramMib] = gpuInfo.VRAMPerGPU
+		}
+		if gpuInfo.System != "" {
+			if _, hasSystem := hardwareConfig[ConfigKeySystem]; !hasSystem {
+				hardwareConfig[ConfigKeySystem] = gpuInfo.System
+			}
+		}
 	}
 	// Serialize config to YAML for passing to profiler

--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
@@ -55,6 +55,14 @@ func (m *MockRBACManager) EnsureServiceAccountWithRBAC(ctx context.Context, targ
 // Helper function to create JSON config for tests
 func createTestConfig(config map[string]interface{}) *apiextensionsv1.JSON {
+	// Add default hardware config if not present to satisfy validation
+	if _, hasHardware := config["hardware"]; !hasHardware {
+		config["hardware"] = map[string]interface{}{
+			"numGpusPerNode": 8,
+			"gpuModel":       "H100-SXM5-80GB",
+			"gpuVramMib":     81920,
+		}
+	}
 	jsonBytes, err := json.Marshal(config)
 	if err != nil {
 		panic(err)
@@ -114,10 +122,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
 								"isl":  3000,
 								"osl":  5,
 							},
-							"hardware": map[string]interface{}{
-								"min_num_gpus_per_engine": 1,
-								"max_num_gpus_per_engine": 8,
-							},
 						}),
 					},
 				},
@@ -243,10 +247,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
 								"isl":  3000,
 								"osl":  5,
 							},
-							"hardware": map[string]interface{}{
-								"min_num_gpus_per_engine": 1,
-								"max_num_gpus_per_engine": 8,
-							},
 						}),
 						ConfigMapRef: &nvidiacomv1alpha1.ConfigMapKeySelector{
 							Name: "test-config",
@@ -341,10 +341,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
 								"isl":  3000,
 								"osl":  5,
 							},
-							"hardware": map[string]interface{}{
-								"min_num_gpus_per_engine": 1,
-								"max_num_gpus_per_engine": 8,
-							},
 							"sweep": map[string]interface{}{
 								"use_ai_configurator": true,
 								"aic_system":          "h200_sxm",
@@ -1236,10 +1232,6 @@ var _ = Describe("DGDR Error Handling", func() {
 								"isl":  3000,
 								"osl":  5,
 							},
-							"hardware": map[string]interface{}{
-								"min_num_gpus_per_engine": 1,
-								"max_num_gpus_per_engine": 8,
-							},
 						}),
 					},
 				},
@@ -1511,4 +1503,307 @@ spec:
 			Expect(additionalResources[1].GetName()).Should(Equal("config2"))
 		})
 	})
+	Context("GPU Discovery Integration Tests", func() {
+		It("Should use GPU discovery when nodes have GPU labels", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-gpu-discovery"
+			namespace := defaultNamespace
+			// Create a node with GPU labels (simulating GFD labels)
+			gpuNode := &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "gpu-worker-1",
+					Labels: map[string]string{
+						"nvidia.com/gpu.count":   "8",
+						"nvidia.com/gpu.product": "H100-SXM5-80GB",
+						"nvidia.com/gpu.memory":  "81920",
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, gpuNode)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, gpuNode) }()
+			// Create DGDR WITHOUT hardware config (should use GPU discovery)
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
+						ProfilerImage: "test-profiler:latest",
+						Config: &apiextensionsv1.JSON{
+							Raw: []byte(`{
+								"sla": {"ttft": 100.0, "itl": 1500.0},
+								"engine": {"minNumGpusPerEngine": 1, "maxNumGpusPerEngine": 8}
+							}`),
+						},
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+			// Reconcile - should succeed with GPU discovery
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: types.NamespacedName{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+			})
+			Expect(err).NotTo(HaveOccurred())
+			// Should transition to Pending (validation passed)
+			var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+			_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
+			Expect(updated.Status.State).Should(Equal(DGDRStatePending))
+		})
+		It("Should respect manual hardware config over GPU discovery", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-manual-override"
+			namespace := defaultNamespace
+			// Create a node with H100 GPUs
+			gpuNode := &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "gpu-worker-h100",
+					Labels: map[string]string{
+						"nvidia.com/gpu.count":   "8",
+						"nvidia.com/gpu.product": "H100-SXM5-80GB",
+						"nvidia.com/gpu.memory":  "81920",
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, gpuNode)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, gpuNode) }()
+			// Create DGDR WITH manual hardware config (A100, not H100)
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
+						ProfilerImage: "test-profiler:latest",
+						Config: &apiextensionsv1.JSON{
+							Raw: []byte(`{
+								"sla": {"ttft": 100.0, "itl": 1500.0},
+								"hardware": {
+									"numGpusPerNode": 4,
+									"gpuModel": "A100-SXM4-40GB",
+									"gpuVramMib": 40960,
+									"system": "a100_sxm"
+								}
+							}`),
+						},
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+			// Reconcile - should succeed and use manual config
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: types.NamespacedName{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+			})
+			Expect(err).NotTo(HaveOccurred())
+			// Should transition to Pending (validation passed with manual config)
+			var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+			_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
+			Expect(updated.Status.State).Should(Equal(DGDRStatePending))
+		})
+		It("Should succeed with GPU discovery when cluster has GPU nodes", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-with-autodiscovery"
+			namespace := defaultNamespace
+			// Create a GPU node so GPU discovery can succeed
+			node := &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "gpu-worker-autodiscovery",
+					Labels: map[string]string{
+						"nvidia.com/gpu.count":   "8",
+						"nvidia.com/gpu.product": "H100-SXM5-80GB",
+						"nvidia.com/gpu.memory":  "81920",
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, node)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, node) }()
+			// Create DGDR WITHOUT hardware config - should use GPU discovery
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
+						ProfilerImage: "test-profiler:latest",
+						Config: &apiextensionsv1.JSON{
+							Raw: []byte(`{
+								"sla": {"ttft": 100.0, "itl": 1500.0}
+							}`),
+						},
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+			// Reconcile - should succeed with GPU discovery
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: types.NamespacedName{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+			})
+			Expect(err).NotTo(HaveOccurred())
+			// Should transition to Pending
+			var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+			_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
+			Expect(updated.Status.State).Should(Equal(DGDRStatePending))
+		})
+		It("Should pass validation with explicit GPU ranges without GPU discovery", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-explicit-ranges"
+			namespace := defaultNamespace
+			// Intentionally don't create GPU nodes to test that explicit ranges work without GPU discovery
+			// Create DGDR with explicit minNumGpusPerEngine/maxNumGpusPerEngine
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
+						ProfilerImage: "test-profiler:latest",
+						Config: &apiextensionsv1.JSON{
+							Raw: []byte(`{
+								"sla": {"ttft": 100.0, "itl": 1500.0},
+								"engine": {
+									"minNumGpusPerEngine": 2,
+									"maxNumGpusPerEngine": 4
+								},
+								"hardware": {
+									"numGpusPerNode": 8
+								}
+							}`),
+						},
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+			// Reconcile - should succeed (explicit ranges + minimal hardware bypass GPU discovery requirement)
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: types.NamespacedName{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+			})
+			Expect(err).NotTo(HaveOccurred())
+			// Should transition to Pending
+			var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+			_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
+			Expect(updated.Status.State).Should(Equal(DGDRStatePending))
+		})
+		It("Should use GPU discovery with heterogeneous nodes (picks best)", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-heterogeneous"
+			namespace := defaultNamespace
+			// Create nodes with different GPU configs
+			nodeA100 := &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "gpu-worker-a100",
+					Labels: map[string]string{
+						"nvidia.com/gpu.count":   "4",
+						"nvidia.com/gpu.product": "A100-SXM4-40GB",
+						"nvidia.com/gpu.memory":  "40960",
+					},
+				},
+			}
+			nodeH100 := &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "gpu-worker-h100",
+					Labels: map[string]string{
+						"nvidia.com/gpu.count":   "8",
+						"nvidia.com/gpu.product": "H100-SXM5-80GB",
+						"nvidia.com/gpu.memory":  "81920",
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, nodeA100)).Should(Succeed())
+			Expect(k8sClient.Create(ctx, nodeH100)).Should(Succeed())
+			defer func() {
+				_ = k8sClient.Delete(ctx, nodeA100)
+				_ = k8sClient.Delete(ctx, nodeH100)
+			}()
+			// Create DGDR without hardware config
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
+						ProfilerImage: "test-profiler:latest",
+						Config: &apiextensionsv1.JSON{
+							Raw: []byte(`{
+								"sla": {"ttft": 100.0, "itl": 1500.0},
+								"engine": {"minNumGpusPerEngine": 1, "maxNumGpusPerEngine": 8}
+							}`),
+						},
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+			// Reconcile - should pick H100 (8 GPUs > 4 GPUs)
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: types.NamespacedName{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+			})
+			Expect(err).NotTo(HaveOccurred())
+			// Should transition to Pending (using H100 config)
+			var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+			_ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
+			Expect(updated.Status.State).Should(Equal(DGDRStatePending))
+		})
+	})
 })
--- a/deploy/operator/internal/gpu/discovery.go
+++ b/deploy/operator/internal/gpu/discovery.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package gpu
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"strings"
+	corev1 "k8s.io/api/core/v1"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+)
+const (
+	// NVIDIA GPU Feature Discovery (GFD) label keys
+	LabelGPUCount   = "nvidia.com/gpu.count"
+	LabelGPUProduct = "nvidia.com/gpu.product"
+	LabelGPUMemory  = "nvidia.com/gpu.memory"
+)
+// GPUInfo contains discovered GPU configuration from cluster nodes
+type GPUInfo struct {
+	GPUsPerNode int    // Maximum GPUs per node found in the cluster
+	Model       string // GPU product name (e.g., "H100-SXM5-80GB")
+	VRAMPerGPU  int    // VRAM in MiB per GPU
+	System      string // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
+}
+// DiscoverGPUs queries Kubernetes nodes to determine GPU configuration.
+// It extracts GPU information from NVIDIA GPU Feature Discovery (GFD) labels
+// and returns aggregated GPU info, preferring nodes with higher GPU count,
+// then higher VRAM if counts are equal.
+//
+// This function requires cluster-wide node read permissions and expects nodes
+// to have GFD labels. If no nodes with GPU labels are found, it returns an error.
+func DiscoverGPUs(ctx context.Context, k8sClient client.Client) (*GPUInfo, error) {
+	logger := log.FromContext(ctx)
+	logger.Info("Starting GPU discovery from cluster nodes")
+	// List all nodes in the cluster
+	nodeList := &corev1.NodeList{}
+	if err := k8sClient.List(ctx, nodeList); err != nil {
+		return nil, fmt.Errorf("failed to list cluster nodes: %w", err)
+	}
+	if len(nodeList.Items) == 0 {
+		return nil, fmt.Errorf("no nodes found in cluster")
+	}
+	logger.Info("Found cluster nodes", "count", len(nodeList.Items))
+	// Track the best GPU configuration found
+	var bestGPUInfo *GPUInfo
+	nodesWithGPUs := 0
+	for i := range nodeList.Items {
+		node := &nodeList.Items[i]
+		gpuInfo, err := extractGPUInfoFromNode(node)
+		if err != nil {
+			// Node doesn't have GPU labels or has invalid labels, skip it
+			logger.V(1).Info("Skipping node without valid GPU info",
+				"node", node.Name,
+				"reason", err.Error())
+			continue
+		}
+		nodesWithGPUs++
+		logger.Info("Found GPU node",
+			"node", node.Name,
+			"gpus", gpuInfo.GPUsPerNode,
+			"model", gpuInfo.Model,
+			"vram", gpuInfo.VRAMPerGPU)
+		// Select best configuration: prefer higher GPU count, then higher VRAM
+		if bestGPUInfo == nil ||
+			gpuInfo.GPUsPerNode > bestGPUInfo.GPUsPerNode ||
+			(gpuInfo.GPUsPerNode == bestGPUInfo.GPUsPerNode && gpuInfo.VRAMPerGPU > bestGPUInfo.VRAMPerGPU) {
+			bestGPUInfo = gpuInfo
+		}
+	}
+	if bestGPUInfo == nil {
+		return nil, fmt.Errorf("no nodes with NVIDIA GPU Feature Discovery labels found (checked %d nodes). "+
+			"Ensure GPU nodes have labels: %s, %s, %s",
+			len(nodeList.Items), LabelGPUCount, LabelGPUProduct, LabelGPUMemory)
+	}
+	// Infer hardware system from GPU model
+	bestGPUInfo.System = InferHardwareSystem(bestGPUInfo.Model)
+	logger.Info("GPU discovery completed",
+		"gpusPerNode", bestGPUInfo.GPUsPerNode,
+		"model", bestGPUInfo.Model,
+		"vram", bestGPUInfo.VRAMPerGPU,
+		"system", bestGPUInfo.System,
+		"nodesWithGPUs", nodesWithGPUs)
+	return bestGPUInfo, nil
+}
+// extractGPUInfoFromNode extracts GPU information from a single node's labels.
+// Returns error if required labels are missing or invalid.
+func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
+	labels := node.Labels
+	if labels == nil {
+		return nil, fmt.Errorf("node has no labels")
+	}
+	gpuCountStr, ok := labels[LabelGPUCount]
+	if !ok {
+		return nil, fmt.Errorf("missing label %s", LabelGPUCount)
+	}
+	gpuCount, err := strconv.Atoi(gpuCountStr)
+	if err != nil || gpuCount <= 0 {
+		return nil, fmt.Errorf("invalid GPU count: %s", gpuCountStr)
+	}
+	gpuModel, ok := labels[LabelGPUProduct]
+	if !ok || gpuModel == "" {
+		return nil, fmt.Errorf("missing or empty label %s", LabelGPUProduct)
+	}
+	// Extract VRAM (memory in MiB)
+	gpuMemoryStr, ok := labels[LabelGPUMemory]
+	if !ok {
+		return nil, fmt.Errorf("missing label %s", LabelGPUMemory)
+	}
+	gpuMemory, err := strconv.Atoi(gpuMemoryStr)
+	if err != nil || gpuMemory <= 0 {
+		return nil, fmt.Errorf("invalid GPU memory: %s", gpuMemoryStr)
+	}
+	return &GPUInfo{
+		GPUsPerNode: gpuCount,
+		Model:       gpuModel,
+		VRAMPerGPU:  gpuMemory,
+	}, nil
+}
+// InferHardwareSystem maps GPU product name to hardware system identifier.
+// Returns empty string if the GPU model cannot be confidently mapped.
+//
+// This is a best-effort mapping based on common NVIDIA datacenter GPU naming patterns.
+// The system identifier is used by the profiler for performance estimation and configuration.
+//
+// Limitations:
+//   - Cannot distinguish SXM vs. PCIe variants from labels alone (assumes SXM for datacenter GPUs)
+//   - New GPU models require code updates (gracefully returns empty string)
+//   - Non-standard SKU names may not match
+//
+// Users can manually override the system in their profiling config (hardware.system)
+// if auto-detection is incorrect or unavailable.
+func InferHardwareSystem(gpuProduct string) string {
+	if gpuProduct == "" {
+		return ""
+	}
+	// Normalize: uppercase, remove spaces/dashes for pattern matching
+	normalized := strings.ToUpper(strings.ReplaceAll(gpuProduct, "-", ""))
+	normalized = strings.ReplaceAll(normalized, " ", "")
+	// Map common NVIDIA datacenter GPU products to hardware system identifiers
+	patterns := []struct {
+		pattern string
+		system  string
+	}{
+		{"GB200", "gb200_sxm"},
+		{"H200", "h200_sxm"},
+		{"H100", "h100_sxm"},
+		{"B200", "b200_sxm"},
+		{"A100", "a100_sxm"},
+		{"L40S", "l40s"},
+	}
+	for _, p := range patterns {
+		if strings.Contains(normalized, p.pattern) {
+			return p.system
+		}
+	}
+	// Unknown GPU type, return empty string
+	// User must specify system manually in profiling config (hardware.system)
+	return ""
+}
--- a/deploy/operator/internal/gpu/discovery_test.go
+++ b/deploy/operator/internal/gpu/discovery_test.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package gpu
+import (
+	"context"
+	"testing"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+// newFakeClient creates a fake Kubernetes client with the given objects
+func newFakeClient(objs ...client.Object) client.Client {
+	scheme := runtime.NewScheme()
+	_ = corev1.AddToScheme(scheme)
+	return fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithObjects(objs...).
+		Build()
+}
+func TestDiscoverGPUs_SingleNode(t *testing.T) {
+	ctx := context.Background()
+	node := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "gpu-node-1",
+			Labels: map[string]string{
+				LabelGPUCount:   "8",
+				LabelGPUProduct: "H100-SXM5-80GB",
+				LabelGPUMemory:  "81920",
+			},
+		},
+	}
+	k8sClient := newFakeClient(node)
+	gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
+	require.NoError(t, err)
+	require.NotNil(t, gpuInfo)
+	assert.Equal(t, 8, gpuInfo.GPUsPerNode)
+	assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
+	assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
+	assert.Equal(t, "h100_sxm", gpuInfo.System)
+}
+func TestDiscoverGPUs_MultipleNodesHomogeneous(t *testing.T) {
+	ctx := context.Background()
+	// Multiple nodes with same GPU configuration
+	node1 := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "gpu-node-1",
+			Labels: map[string]string{
+				LabelGPUCount:   "8",
+				LabelGPUProduct: "H100-SXM5-80GB",
+				LabelGPUMemory:  "81920",
+			},
+		},
+	}
+	node2 := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "gpu-node-2",
+			Labels: map[string]string{
+				LabelGPUCount:   "8",
+				LabelGPUProduct: "H100-SXM5-80GB",
+				LabelGPUMemory:  "81920",
+			},
+		},
+	}
+	k8sClient := newFakeClient(node1, node2)
+	gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
+	require.NoError(t, err)
+	require.NotNil(t, gpuInfo)
+	assert.Equal(t, 8, gpuInfo.GPUsPerNode)
+	assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
+	assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
+}
+func TestDiscoverGPUs_MultipleNodesHeterogeneous_HigherGPUCountWins(t *testing.T) {
+	ctx := context.Background()
+	// Node with fewer GPUs
+	node1 := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "gpu-node-1",
+			Labels: map[string]string{
+				LabelGPUCount:   "4",
+				LabelGPUProduct: "A100-SXM4-40GB",
+				LabelGPUMemory:  "40960",
+			},
+		},
+	}
+	// Node with more GPUs (should win)
+	node2 := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "gpu-node-2",
+			Labels: map[string]string{
+				LabelGPUCount:   "8",
+				LabelGPUProduct: "H100-SXM5-80GB",
+				LabelGPUMemory:  "81920",
+			},
+		},
+	}
+	k8sClient := newFakeClient(node1, node2)
+	gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
+	require.NoError(t, err)
+	require.NotNil(t, gpuInfo)
+	// Should prefer node with 8 GPUs over node with 4 GPUs
+	assert.Equal(t, 8, gpuInfo.GPUsPerNode)
+	assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
+	assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
+}
+func TestDiscoverGPUs_MultipleNodesHeterogeneous_HigherVRAMWins(t *testing.T) {
+	ctx := context.Background()
+	// Node with same GPU count but less VRAM
+	node1 := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "gpu-node-1",
+			Labels: map[string]string{
+				LabelGPUCount:   "8",
+				LabelGPUProduct: "A100-SXM4-40GB",
+				LabelGPUMemory:  "40960",
+			},
+		},
+	}
+	// Node with same GPU count but more VRAM (should win)
+	node2 := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "gpu-node-2",
+			Labels: map[string]string{
+				LabelGPUCount:   "8",
+				LabelGPUProduct: "H100-SXM5-80GB",
+				LabelGPUMemory:  "81920",
+			},
+		},
+	}
+	k8sClient := newFakeClient(node1, node2)
+	gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
+	require.NoError(t, err)
+	require.NotNil(t, gpuInfo)
+	// Should prefer node with higher VRAM when GPU count is equal
+	assert.Equal(t, 8, gpuInfo.GPUsPerNode)
+	assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
+	assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
+}
+func TestDiscoverGPUs_MixedNodesWithAndWithoutGPUs(t *testing.T) {
+	ctx := context.Background()
+	// CPU-only node (no GPU labels)
+	cpuNode := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "cpu-node-1",
+			Labels: map[string]string{},
+		},
+	}
+	// GPU node
+	gpuNode := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "gpu-node-1",
+			Labels: map[string]string{
+				LabelGPUCount:   "8",
+				LabelGPUProduct: "H100-SXM5-80GB",
+				LabelGPUMemory:  "81920",
+			},
+		},
+	}
+	k8sClient := newFakeClient(cpuNode, gpuNode)
+	gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
+	require.NoError(t, err)
+	require.NotNil(t, gpuInfo)
+	// Should find the GPU node and ignore CPU-only node
+	assert.Equal(t, 8, gpuInfo.GPUsPerNode)
+	assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
+}
+func TestDiscoverGPUs_NoNodes(t *testing.T) {
+	ctx := context.Background()
+	k8sClient := newFakeClient() // Empty cluster
+	gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
+	assert.Error(t, err)
+	assert.Nil(t, gpuInfo)
+	assert.Contains(t, err.Error(), "no nodes found")
+}
+func TestDiscoverGPUs_NoGPUNodes(t *testing.T) {
+	ctx := context.Background()
+	// Only CPU nodes
+	cpuNode1 := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "cpu-node-1",
+			Labels: map[string]string{},
+		},
+	}
+	cpuNode2 := &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "cpu-node-2",
+			Labels: map[string]string{
+				"node-type": "cpu-only",
+			},
+		},
+	}
+	k8sClient := newFakeClient(cpuNode1, cpuNode2)
+	gpuInfo, err := DiscoverGPUs(ctx, k8sClient)
+	assert.Error(t, err)
+	assert.Nil(t, gpuInfo)
+	assert.Contains(t, err.Error(), "no nodes with NVIDIA GPU Feature Discovery labels found")
+}
+func TestExtractGPUInfoFromNode_MissingLabels(t *testing.T) {
+	tests := []struct {
+		name        string
+		labels      map[string]string
+		expectError bool
+		errorMsg    string
+	}{
+		{
+			name:        "missing GPU count",
+			labels:      map[string]string{LabelGPUProduct: "H100", LabelGPUMemory: "80000"},
+			expectError: true,
+			errorMsg:    LabelGPUCount,
+		},
+		{
+			name:        "missing GPU product",
+			labels:      map[string]string{LabelGPUCount: "8", LabelGPUMemory: "80000"},
+			expectError: true,
+			errorMsg:    LabelGPUProduct,
+		},
+		{
+			name:        "missing GPU memory",
+			labels:      map[string]string{LabelGPUCount: "8", LabelGPUProduct: "H100"},
+			expectError: true,
+			errorMsg:    LabelGPUMemory,
+		},
+		{
+			name:        "invalid GPU count",
+			labels:      map[string]string{LabelGPUCount: "invalid", LabelGPUProduct: "H100", LabelGPUMemory: "80000"},
+			expectError: true,
+			errorMsg:    "invalid GPU count",
+		},
+		{
+			name:        "invalid GPU memory",
+			labels:      map[string]string{LabelGPUCount: "8", LabelGPUProduct: "H100", LabelGPUMemory: "invalid"},
+			expectError: true,
+			errorMsg:    "invalid GPU memory",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			node := &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:   "test-node",
+					Labels: tt.labels,
+				},
+			}
+			gpuInfo, err := extractGPUInfoFromNode(node)
+			if tt.expectError {
+				assert.Error(t, err)
+				assert.Nil(t, gpuInfo)
+				if tt.errorMsg != "" {
+					assert.Contains(t, err.Error(), tt.errorMsg)
+				}
+			} else {
+				assert.NoError(t, err)
+				assert.NotNil(t, gpuInfo)
+			}
+		})
+	}
+}
+func TestInferHardwareSystem(t *testing.T) {
+	tests := []struct {
+		gpuProduct     string
+		expectedSystem string
+		description    string
+	}{
+		{"H100-SXM5-80GB", "h100_sxm", "H100 SXM variant"},
+		{"H100-PCIE-80GB", "h100_sxm", "H100 PCIe variant (mapped to SXM)"},
+		{"H200-SXM5-141GB", "h200_sxm", "H200 SXM variant"},
+		{"A100-SXM4-40GB", "a100_sxm", "A100 SXM variant"},
+		{"A100-PCIE-80GB", "a100_sxm", "A100 PCIe variant (mapped to SXM)"},
+		{"L40S", "l40s", "L40S"},
+		{"NVIDIA L40S", "l40s", "L40S with prefix"},
+		{"B200-SXM", "b200_sxm", "B200 SXM"},
+		{"GB200", "gb200_sxm", "GB200"},
+		{"Tesla V100-SXM2-16GB", "", "V100 (not in mapping)"},
+		{"RTX 4090", "", "Consumer GPU (not in mapping)"},
+		{"Unknown-GPU", "", "Unknown GPU"},
+		{"", "", "Empty string"},
+	}
+	for _, tt := range tests {
+		t.Run(tt.description, func(t *testing.T) {
+			result := InferHardwareSystem(tt.gpuProduct)
+			assert.Equal(t, tt.expectedSystem, result, "Failed for GPU: %s", tt.gpuProduct)
+		})
+	}
+}
+func TestInferHardwareSystem_CaseInsensitive(t *testing.T) {
+	// Test that inference is case-insensitive
+	variants := []string{
+		"h100-sxm5-80gb",
+		"H100-SXM5-80GB",
+		"H100-sxm5-80GB",
+		"h100-SXM5-80gb",
+	}
+	for _, variant := range variants {
+		result := InferHardwareSystem(variant)
+		assert.Equal(t, "h100_sxm", result, "Should handle case variations: %s", variant)
+	}
+}
+func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {
+	// Test that spaces and dashes are normalized
+	variants := []string{
+		"H100-SXM5-80GB",
+		"H100 SXM5 80GB",
+		"H100SXM580GB",
+		"H100-SXM5 80GB",
+	}
+	for _, variant := range variants {
+		result := InferHardwareSystem(variant)
+		assert.Equal(t, "h100_sxm", result, "Should normalize spaces/dashes: %s", variant)
+	}
+}
--- a/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest.go
+++ b/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest.go
@@ -26,6 +26,19 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 )
+// toFloat64 converts a numeric value (int or float64) to float64.
+// Returns 0 if the value is neither int nor float64.
+func toFloat64(val interface{}) float64 {
+	switch v := val.(type) {
+	case float64:
+		return v
+	case int:
+		return float64(v)
+	default:
+		return 0
+	}
+}
 // DynamoGraphDeploymentRequestValidator validates DynamoGraphDeploymentRequest resources.
 // This validator can be used by both webhooks and controllers for consistent validation.
 type DynamoGraphDeploymentRequestValidator struct {
@@ -48,6 +61,11 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
 	var warnings admission.Warnings
 	var err error
+	// Warn about deprecated enableGpuDiscovery field
+	if v.request.Spec.EnableGPUDiscovery != nil {
+		warnings = append(warnings, "spec.enableGpuDiscovery is deprecated and will be removed in v1beta1. GPU discovery is now always attempted automatically. This field has no effect.")
+	}
 	// Validate profiler image is specified
 	if v.request.Spec.ProfilingConfig.ProfilerImage == "" {
 		err = errors.Join(err, errors.New("spec.profilingConfig.profilerImage is required"))
@@ -58,10 +76,8 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
 		err = errors.Join(err, errors.New("spec.profilingConfig.config is required and must not be empty"))
 	}
-	// Validate enableGpuDiscovery is only true for cluster-wide operators
+	// Note: GPU discovery is now automatic for cluster-wide operators
-	if v.request.Spec.EnableGpuDiscovery && !v.isClusterWideOperator {
+	// Namespace-restricted operators automatically skip GPU discovery and require manual hardware config
-		err = errors.Join(err, errors.New("spec.enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in spec.profilingConfig.config"))
-	}
 	// Parse config to validate structure (only if config is present)
 	if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 {
@@ -83,9 +99,101 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
 		}
 	}
+	// Validate GPU hardware information is available (last, so other errors are collected first)
+	if gpuErr := v.validateGPUHardwareInfo(); gpuErr != nil {
+		err = errors.Join(err, gpuErr)
+	}
 	return warnings, err
 }
+// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
+// This validation happens at admission time to fail fast before the DGDR is persisted to etcd.
+func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
+	// Parse profiling config
+	var config map[string]interface{}
+	if v.request.Spec.ProfilingConfig.Config != nil {
+		if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
+			// Config parse errors will be caught by other validators
+			return nil
+		}
+	} else {
+		config = make(map[string]interface{})
+	}
+	// Check if manual hardware config is provided
+	hardwareVal, hasHardware := config["hardware"]
+	var hasManualHardwareConfig bool
+	if hasHardware && hardwareVal != nil {
+		if hardwareConfig, ok := hardwareVal.(map[string]interface{}); ok {
+			// Check if essential hardware fields are provided
+			_, hasGPUModel := hardwareConfig["gpuModel"]
+			_, hasGPUVram := hardwareConfig["gpuVramMib"]
+			_, hasNumGPUs := hardwareConfig["numGpusPerNode"]
+			hasManualHardwareConfig = hasGPUModel || hasGPUVram || hasNumGPUs
+		}
+	}
+	// Check if explicit GPU ranges are provided
+	var hasExplicitGPURanges bool
+	if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil {
+		if engineConfig, ok := engineVal.(map[string]interface{}); ok {
+			minGPUs, hasMin := engineConfig["minNumGpusPerEngine"]
+			maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"]
+			// Validate explicit GPU ranges
+			if hasMin && hasMax {
+				minVal := toFloat64(minGPUs)
+				maxVal := toFloat64(maxGPUs)
+				// Validate that min <= max
+				if minVal > maxVal {
+					return fmt.Errorf("invalid GPU range: minNumGpusPerEngine (%v) cannot be greater than maxNumGpusPerEngine (%v)",
+						minVal, maxVal)
+				}
+				hasExplicitGPURanges = minVal > 0 && maxVal > 0
+			}
+		}
+	}
+	// If manual config or explicit ranges provided, validation passes
+	if hasManualHardwareConfig || hasExplicitGPURanges {
+		return nil
+	}
+	// Neither manual config nor explicit ranges provided
+	// GPU discovery will be attempted at reconcile time, but if it's unavailable
+	// (e.g., namespace-scoped operator), the DGDR will fail
+	//
+	// Fail at admission time to give users immediate feedback
+	if v.isClusterWideOperator {
+		// Cluster-wide operator should have GPU discovery available
+		// Allow DGDR to be created - GPU discovery will provide hardware info
+		return nil
+	}
+	// Namespace-scoped operator likely doesn't have node read permissions
+	// Require manual hardware config or explicit GPU ranges
+	return errors.New(`GPU hardware configuration required for namespace-scoped operators.
+Namespace-scoped operators typically lack node read permissions for GPU auto-discovery.
+Provide hardware configuration in one of these ways:
+1. Add hardware config in spec.profilingConfig.config:
+   hardware:
+     numGpusPerNode: 8
+     gpuModel: "H100-SXM5-80GB"
+     gpuVramMib: 81920
+2. Or specify explicit GPU search ranges:
+   engine:
+     minNumGpusPerEngine: 2
+     maxNumGpusPerEngine: 8
+See: https://github.com/ai-dynamo/dynamo/issues/6257`)
+}
 // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
 // Returns warnings and error.
 func (v *DynamoGraphDeploymentRequestValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (admission.Warnings, error) {