feat: mount model path to Profiler if specified (#5212)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

feat: mount model path to Profiler if specified (#5212)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
ec5630ea · hhzhang16 · GitHub · c29f78c1 · ec5630ea · ec5630ea
Unverified Commit ec5630ea authored Jan 09, 2026 by hhzhang16 Committed by GitHub Jan 09, 2026
13 changed files
--- a/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
@@ -17,10 +17,10 @@ spec:
      # Sweep/profiling configuration
      sweep:
        # AI Configurator mode (fast simulation-based profiling)
-        use_ai_configurator: true
-        aic_system: h200_sxm
-        aic_hf_id: Qwen/Qwen3-32B
-        aic_backend_version: "0.20.0"
+        useAiConfigurator: true
+        aicSystem: h200_sxm
+        aicHfId: Qwen/Qwen3-32B
+        aicBackendVersion: "0.20.0"

      # SLA targets for profiling
      sla:

--- a/benchmarks/profiler/deploy/profile_sla_dgdr.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_dgdr.yaml
@@ -17,7 +17,7 @@ spec:
      # Sweep/profiling configuration
      sweep:
        # Online profiling mode (real deployment testing)
-        use_ai_configurator: false
+        useAiConfigurator: false

      # SLA targets for profiling
      sla:

--- a/benchmarks/profiler/deploy/profile_sla_moe_dgdr.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_moe_dgdr.yaml
@@ -15,16 +15,22 @@ spec:
  profilingConfig:
    profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
    config:
+      # Model cache PVC configuration; profiler will mount this PVC to access model weights
+      deployment:
+        modelCache:
+          pvcName: "model-cache"                      # Name of PVC containing model weights
+          pvcPath: "deepseek-r1"                      # Subpath within PVC where model is stored
+
      # Sweep/profiling configuration
      sweep:
        # Standard online profiling (not using AI Configurator)
-        use_ai_configurator: false
+        useAiConfigurator: false

      hardware:
        # for h200, sweep over 8-16 GPUs per engine
-        min_num_gpus_per_engine: 8
-        max_num_gpus_per_engine: 16
-        num_gpus_per_node: 8
+        minNumGpusPerEngine: 8
+        maxNumGpusPerEngine: 16
+        numGpusPerNode: 8

      # SLA targets for profiling
      sla:

--- a/benchmarks/profiler/utils/profiler_argparse.py
+++ b/benchmarks/profiler/utils/profiler_argparse.py
@@ -12,6 +12,22 @@ from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_par
 from benchmarks.profiler.utils.search_space_autogen import auto_generate_search_space


+def _get(cfg: Dict[str, Any], camel: str, snake: str, default: Any = None) -> Any:
+    """Return cfg[camel] when that key is present (even if its value is falsy), else cfg.get(snake, default)."""
+    if camel in cfg:
+        return cfg[camel]
+    return cfg.get(snake, default)
+
+
+def _camel_to_snake(name: str) -> str:
+    """Convert a camelCase name to snake_case (e.g. plannerMinEndpoint -> planner_min_endpoint)."""
+    import re
+
+    # Pass 1 splits before each capitalized word (Xxx); pass 2 splits a lowercase letter/digit from a following capital
+    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
+    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
+
+
 def parse_config_string(config_str: str) -> Dict[str, Any]:
    """Parse configuration string as Python dict literal, YAML, or JSON.

@@ -61,45 +77,48 @@ def create_profiler_parser() -> argparse.Namespace:
    """
    Create argument parser with support for YAML config string.

-    Config structure:
-        output_dir: String (path to the output results directory, default: profiling_results)
+    Config structure (camelCase preferred, snake_case supported for backwards compat):
+        outputDir: String (path to the output results directory, default: profiling_results)
        deployment:
            namespace: String (kubernetes namespace, default: dynamo-sla-profiler)
-            service_name: String (service name, default: "")
+            serviceName: String (service name, default: "")
            model: String (served model name)
-            model_cache_pvc_name: String (name of the PVC to mount the model cache,
+            dgdImage: String (container image to use for DGD components (frontend, planner, workers), overrides images in config file)
+            modelCache:
+                pvcName: String (name of the PVC to mount the model cache,
                    if not provided, model must be HF name and will download from HF, default: "")
-            model_cache_pvc_path: String (path to the model cache in the PVC, default: "")
-            model_cache_pvc_mount_path: String (path to the model cache in the container,
+                pvcPath: String (path to the model cache in the PVC, default: "")
+                mountPath: String (path to the model cache in the container,
                    note that the PVC must be mounted to the same path for the profiling job,
                    default: "/opt/model-cache")
        engine:
            backend: String (backend type, currently support [vllm, sglang, trtllm], default: vllm)
            config: String (path to the DynamoGraphDeployment config file, default: "")
-            max_context_length: Int (maximum context length supported by the served model, default: 0)
-            is_moe_model: Boolean (enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode, default: False)
+            maxContextLength: Int (maximum context length supported by the served model, default: 0)
+            isMoeModel: Boolean (enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode, default: False)
        hardware:
-            min_num_gpus_per_engine: Int (minimum number of GPUs per engine, default: 0)
-            max_num_gpus_per_engine: Int (maximum number of GPUs per engine, default: 0)
-            num_gpus_per_node: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 0)
+            minNumGpusPerEngine: Int (minimum number of GPUs per engine, default: 0)
+            maxNumGpusPerEngine: Int (maximum number of GPUs per engine, default: 0)
+            numGpusPerNode: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 0)
+            enableGpuDiscovery: Boolean (enable automatic GPU discovery from Kubernetes cluster nodes, when enabled overrides any manually specified hardware configuration, requires cluster-wide node access permissions, default: False)
        sweep:
-            prefill_interpolation_granularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
-            decode_interpolation_granularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
-            use_ai_configurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
-            aic_system: String (target system for use with aiconfigurator, default: None)
-            aic_hf_id: String (aiconfigurator huggingface id of the target model, default: None)
-            aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
-            aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
-            dry_run: Boolean (dry run the profile job, default: False)
-            pick_with_webui: Boolean (pick the best parallelization mapping using webUI, default: False)
-            webui_port: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
+            prefillInterpolationGranularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
+            decodeInterpolationGranularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
+            useAiConfigurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
+            aicSystem: String (target system for use with aiconfigurator, default: None)
+            aicHfId: String (aiconfigurator huggingface id of the target model, default: None)
+            aicBackend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
+            aicBackendVersion: String (specify backend version when using aiconfigurator to estimate perf, default: None)
+            dryRun: Boolean (dry run the profile job, default: False)
+            pickWithWebui: Boolean (pick the best parallelization mapping using webUI, default: False)
+            webuiPort: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
        sla:
            isl: Int (target input sequence length, default: 3000)
            osl: Int (target output sequence length, default: 500)
            ttft: Float (target Time To First Token in milliseconds, default: 50)
            itl: Float (target Inter Token Latency in milliseconds, default: 10)
-        planner: (planner-bypass arguments, use hyphens or underscores)
-            i.e., planner-min-endpoint: 2  # or planner_min_endpoint: 2 (both work)
+        planner: (planner arguments)
+            e.g., plannerMinEndpoint: 2
    """
    # Step 1: Pre-parse to check if --profile-config is provided
    pre_parser = argparse.ArgumentParser(add_help=False)
@@ -130,37 +149,37 @@ def create_profiler_parser() -> argparse.Namespace:
        default=config.get("deployment", {}).get("model", ""),
        help="Served model name",
    )
+    model_cache_config = config.get("deployment", {}).get("modelCache", {})
    parser.add_argument(
        "--model-cache-pvc-name",
        type=str,
-        default=config.get("deployment", {}).get("model_cache_pvc_name", ""),
+        default=model_cache_config.get("pvcName", ""),
        help="Name of the PVC that contains the model weights. If not provided, args.model must be a HF model name and will download from HF",
    )
    parser.add_argument(
        "--model-cache-pvc-path",
        type=str,
-        default=config.get("deployment", {}).get("model_cache_pvc_path", ""),
+        default=model_cache_config.get("pvcPath", ""),
        help="Path to the model cache in the PVC",
    )
    parser.add_argument(
        "--model-cache-pvc-mount-path",
        type=str,
-        default=config.get("deployment", {}).get(
-            "model_cache_pvc_mount_path", "/opt/model-cache"
-        ),
+        default=model_cache_config.get("mountPath", "/opt/model-cache"),
        help="Path to the model cache in the container, note that the PVC must be mounted to the same path for the profiling job",
    )
+    deployment_cfg = config.get("deployment", {})
    parser.add_argument(
        "--dgd-image",
        type=str,
-        default=config.get("deployment", {}).get("dgd_image", ""),
+        default=_get(deployment_cfg, "dgdImage", "dgd_image", ""),
        help="Container image to use for DGD components (frontend, planner, workers). Overrides images in config file.",
    )

    parser.add_argument(
        "--namespace",
        type=str,
-        default=config.get("deployment", {}).get("namespace", "dynamo-sla-profiler"),
+        default=deployment_cfg.get("namespace", "dynamo-sla-profiler"),
        help="Kubernetes namespace to deploy the DynamoGraphDeployment",
    )
    parser.add_argument(
@@ -180,25 +199,26 @@ def create_profiler_parser() -> argparse.Namespace:
    parser.add_argument(
        "--output-dir",
        type=str,
-        default=config.get("output_dir", "profiling_results"),
+        default=_get(config, "outputDir", "output_dir", "profiling_results"),
        help="Path to the output results directory",
    )
+    hardware_cfg = config.get("hardware", {})
    parser.add_argument(
        "--min-num-gpus-per-engine",
        type=int,
-        default=config.get("hardware", {}).get("min_num_gpus_per_engine", 0),
+        default=_get(hardware_cfg, "minNumGpusPerEngine", "min_num_gpus_per_engine", 0),
        help="minimum number of GPUs per engine",
    )
    parser.add_argument(
        "--max-num-gpus-per-engine",
        type=int,
-        default=config.get("hardware", {}).get("max_num_gpus_per_engine", 0),
+        default=_get(hardware_cfg, "maxNumGpusPerEngine", "max_num_gpus_per_engine", 0),
        help="maximum number of GPUs per engine",
    )
    parser.add_argument(
        "--num-gpus-per-node",
        type=int,
-        default=config.get("hardware", {}).get("num_gpus_per_node", 0),
+        default=_get(hardware_cfg, "numGpusPerNode", "num_gpus_per_node", 0),
        help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
    )
    parser.add_argument(
@@ -227,46 +247,58 @@ def create_profiler_parser() -> argparse.Namespace:
    )

    # arguments used for interpolating TTFT and ITL under different ISL/OSL
+    engine_cfg = config.get("engine", {})
    parser.add_argument(
        "--max-context-length",
        type=int,
-        default=config.get("engine", {}).get("max_context_length", 0),
+        default=_get(engine_cfg, "maxContextLength", "max_context_length", 0),
        help="maximum context length supported by the served model",
    )
+    sweep_cfg = config.get("sweep", {})
    parser.add_argument(
        "--prefill-interpolation-granularity",
        type=int,
-        default=config.get("sweep", {}).get("prefill_interpolation_granularity", 16),
+        default=_get(
+            sweep_cfg,
+            "prefillInterpolationGranularity",
+            "prefill_interpolation_granularity",
+            16,
+        ),
        help="how many samples to benchmark to interpolate TTFT under different ISL",
    )
    parser.add_argument(
        "--decode-interpolation-granularity",
        type=int,
-        default=config.get("sweep", {}).get("decode_interpolation_granularity", 6),
+        default=_get(
+            sweep_cfg,
+            "decodeInterpolationGranularity",
+            "decode_interpolation_granularity",
+            6,
+        ),
        help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
    )
    parser.add_argument(
        "--service-name",
        type=str,
-        default=config.get("deployment", {}).get("service_name", ""),
+        default=_get(deployment_cfg, "serviceName", "service_name", ""),
        help="Service name for port forwarding (default: {deployment_name}-frontend)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
-        default=config.get("sweep", {}).get("dry_run", False),
+        default=_get(sweep_cfg, "dryRun", "dry_run", False),
        help="Dry run the profile job",
    )
    parser.add_argument(
        "--enable-gpu-discovery",
        action="store_true",
-        default=config.get("hardware", {}).get("enable_gpu_discovery", False),
+        default=_get(hardware_cfg, "enableGpuDiscovery", "enable_gpu_discovery", False),
        help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
    )
    parser.add_argument(
        "--pick-with-webui",
        action="store_true",
-        default=config.get("sweep", {}).get("pick_with_webui", False),
+        default=_get(sweep_cfg, "pickWithWebui", "pick_with_webui", False),
        help="Pick the best parallelization mapping using webUI",
    )

@@ -277,19 +309,19 @@ def create_profiler_parser() -> argparse.Namespace:
    parser.add_argument(
        "--webui-port",
        type=int,
-        default=config.get("sweep", {}).get("webui_port", default_webui_port),
+        default=_get(sweep_cfg, "webuiPort", "webui_port", default_webui_port),
        help="WebUI port",
    )

    # Dynamically add all planner arguments from planner_argparse.py
    add_planner_arguments_to_parser(parser, prefix="planner-")
    # Set defaults for any planner arguments found in config.planner
-    # Note: argparse converts hyphens to underscores, so we need to normalize keys
+    # Normalize keys: camelCase -> snake_case, hyphens -> underscores
    planner_config = config.get("planner", {})
    if planner_config:
-        # Convert hyphens to underscores to match argparse's internal naming
        normalized_planner_config = {
-            key.replace("-", "_"): value for key, value in planner_config.items()
+            _camel_to_snake(key).replace("-", "_"): value
+            for key, value in planner_config.items()
        }
        parser.set_defaults(**normalized_planner_config)

@@ -297,31 +329,31 @@ def create_profiler_parser() -> argparse.Namespace:
    parser.add_argument(
        "--use-ai-configurator",
        action="store_true",
-        default=config.get("sweep", {}).get("use_ai_configurator", False),
+        default=_get(sweep_cfg, "useAiConfigurator", "use_ai_configurator", False),
        help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
    )
    parser.add_argument(
        "--aic-system",
        type=str,
-        default=config.get("sweep", {}).get("aic_system"),
+        default=_get(sweep_cfg, "aicSystem", "aic_system", None),
        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
    )
    parser.add_argument(
        "--aic-hf-id",
        type=str,
-        default=config.get("sweep", {}).get("aic_hf_id"),
+        default=_get(sweep_cfg, "aicHfId", "aic_hf_id", None),
        help="aiconfigurator name of the target model (e.g. Qwen/Qwen3-32B, meta-llama/Llama-3.1-405B)",
    )
    parser.add_argument(
        "--aic-backend",
        type=str,
-        default=config.get("sweep", {}).get("aic_backend", ""),
+        default=_get(sweep_cfg, "aicBackend", "aic_backend", ""),
        help="aiconfigurator backend of the target model, if not provided, will use args.backend",
    )
    parser.add_argument(
        "--aic-backend-version",
        type=str,
-        default=config.get("sweep", {}).get("aic_backend_version"),
+        default=_get(sweep_cfg, "aicBackendVersion", "aic_backend_version", None),
        help="Specify backend version when using aiconfigurator to estimate perf.",
    )


--- a/deploy/helm/charts/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/helm/charts/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml
@@ -144,8 +144,8 @@ spec:
                  description: |-
                    EnableGpuDiscovery controls whether the profiler should automatically discover GPU
                    resources from the Kubernetes cluster nodes. When enabled, the profiler will override
-                    any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
-                    num_gpus_per_node) with values detected from the cluster.
+                    any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
+                    numGpusPerNode) with values detected from the cluster.
                    Requires cluster-wide node access permissions - only available with cluster-scoped operators.
                  type: boolean
                model:

--- a/deploy/operator/api/v1alpha1/dynamographdeploymentrequest_types.go
+++ b/deploy/operator/api/v1alpha1/dynamographdeploymentrequest_types.go
@@ -146,8 +146,8 @@ type DynamoGraphDeploymentRequestSpec struct {

 	// EnableGpuDiscovery controls whether the profiler should automatically discover GPU
 	// resources from the Kubernetes cluster nodes. When enabled, the profiler will override
-	// any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
-	// num_gpus_per_node) with values detected from the cluster.
+	// any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
+	// numGpusPerNode) with values detected from the cluster.
 	// Requires cluster-wide node access permissions - only available with cluster-scoped operators.
 	// +kubebuilder:default=false
 	// +kubebuilder:validation:Optional

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
@@ -144,8 +144,8 @@ spec:
                  description: |-
                    EnableGpuDiscovery controls whether the profiler should automatically discover GPU
                    resources from the Kubernetes cluster nodes. When enabled, the profiler will override
-                    any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
-                    num_gpus_per_node) with values detected from the cluster.
+                    any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,
+                    numGpusPerNode) with values detected from the cluster.
                    Requires cluster-wide node access permissions - only available with cluster-scoped operators.
                  type: boolean
                model:

--- a/deploy/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
+++ b/deploy/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
@@ -37,24 +37,24 @@ spec:

      # Engine configuration
      engine:
-        max_context_length: 16384  # will override max context length of the model if provided
+        maxContextLength: 16384  # will override max context length of the model if provided

      # Hardware configuration
      hardware:
-        min_num_gpus_per_engine: 1  # Minimum GPUs to test
-        max_num_gpus_per_engine: 4  # Maximum GPUs to test (limited by model's num_heads/4)
-        num_gpus_per_node: 8  # GPUs per node (for MoE models)
+        minNumGpusPerEngine: 1  # Minimum GPUs to test
+        maxNumGpusPerEngine: 4  # Maximum GPUs to test (limited by model's num_heads/4)
+        numGpusPerNode: 8  # GPUs per node (for MoE models)

      # Sweep/profiling configuration
      sweep:
-        prefill_interpolation_granularity: 16  # Samples for TTFT interpolation
-        decode_interpolation_granularity: 6  # Samples for ITL interpolation
+        prefillInterpolationGranularity: 16  # Samples for TTFT interpolation
+        decodeInterpolationGranularity: 6  # Samples for ITL interpolation

        # AI Configurator mode (fast simulation-based profiling, 20-30 seconds)
-        use_ai_configurator: false  # Set to false for online profiling (2-4 hours)
-        aic_system: h200_sxm  # Target GPU system for AI Configurator
-        aic_hf_id: Qwen/Qwen3-0.6B  # HuggingFace model ID for AI Configurator
-        aic_backend_version: "0.20.0"  # Backend version for AI Configurator
+        useAiConfigurator: false  # Set to false for online profiling (2-4 hours)
+        aicSystem: h200_sxm  # Target GPU system for AI Configurator
+        aicHfId: Qwen/Qwen3-0.6B  # HuggingFace model ID for AI Configurator
+        aicBackendVersion: "0.20.0"  # Backend version for AI Configurator

      # SLA targets for profiling
      sla:
@@ -65,8 +65,8 @@ spec:

      # Optional: Planner-specific arguments
      # planner:
-      #   planner_min_endpoint: 2
-      #   # Add any other planner args here (use hyphens or underscores)
+      #   plannerMinEndpoint: 2
+      #   # Add any other planner args here

    # Reference to ConfigMap containing the DGD base config (disagg.yaml)
    # The path to this file will be automatically set as engine.config

--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
@@ -116,6 +116,7 @@ const (
 	// Volume names
 	VolumeNameProfilingConfig = "profiling-config"
 	VolumeNameProfilingOutput = "profiling-output"
+	VolumeNameModelCache      = "model-cache"

 	// Volume paths
 	ProfilingOutputPath        = "/data"
@@ -123,6 +124,7 @@ const (
 	ProfilingOutputFileMocker  = "mocker_config_with_planner.yaml"
 	ProfilingConfigPath        = "/config"
 	ProfilingConfigFile        = "disagg.yaml"
+	DefaultModelCacheMountPath = "/opt/model-cache"

 	// Command line arguments
 	ArgModel   = "--model"
@@ -152,6 +154,7 @@ const (
 	MessageProfilingCheckFailed      = "ProfilingCheckFailed"
 	MessageConfigMapNotFound         = "ConfigMap %s not found in namespace %s"
 	MessageConfigMapKeyNotFound      = "key %s not found in ConfigMap %s"
+	MessageModelCachePVCNotFound     = "model cache PVC %s not found in namespace %s"

 	// Validation messages
 	ValidationErrorModelRequired  = "model is required"
@@ -163,6 +166,13 @@ const (
 	BackendVLLM   = "vllm"
 	BackendSGLang = "sglang"
 	BackendTRTLLM = "trtllm"
+
+	// Profiling config field names
+	ConfigKeyDeployment = "deployment"
+	ConfigKeyModelCache = "modelCache"
+	ConfigKeyPVCName    = "pvcName"
+	ConfigKeyPVCPath    = "pvcPath"
+	ConfigKeyMountPath  = "mountPath"
 )

 // shell script template for the output copier sidecar
@@ -796,6 +806,10 @@ func isOnlineProfiling(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) boo
 	}

 	if sweep, ok := config["sweep"].(map[string]interface{}); ok {
+		// Check camelCase first (preferred), then snake_case (backwards compat)
+		if useAIC, exists := sweep["useAiConfigurator"].(bool); exists {
+			return !useAIC
+		}
 		if useAIC, exists := sweep["use_ai_configurator"].(bool); exists {
 			return !useAIC
 		}
@@ -852,6 +866,23 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
 		}
 	}

+	// Validate model cache PVC if provided
+	modelCachePVC, _ := extractModelCachePVCConfig(dgdr)
+	if modelCachePVC != "" {
+		pvc := &corev1.PersistentVolumeClaim{}
+		err := r.Get(ctx, types.NamespacedName{
+			Name:      modelCachePVC,
+			Namespace: dgdr.Namespace,
+		}, pvc)
+
+		if err != nil {
+			if apierrors.IsNotFound(err) {
+				return fmt.Errorf(MessageModelCachePVCNotFound, modelCachePVC, dgdr.Namespace)
+			}
+			return err
+		}
+	}
+
 	// The profiler will validate the rest of the configuration
 	return nil
 }
@@ -959,6 +990,17 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
 			})
 		}

+		// Add model cache PVC mount if configured in profilingConfig.config.deployment
+		modelCachePVC, modelCacheMountPath := extractModelCachePVCConfig(dgdr)
+		if modelCachePVC != "" {
+			logger.Info("Mounting model cache PVC to profiler pod", "pvc", modelCachePVC, "mountPath", modelCacheMountPath)
+			volumeMounts = append(volumeMounts, corev1.VolumeMount{
+				Name:      VolumeNameModelCache,
+				MountPath: modelCacheMountPath,
+				ReadOnly:  true,
+			})
+		}
+
 		// Profiler args: pass the config as an inline YAML string via --profile-config
 		profilerArgs := []string{
 			"--profile-config", string(configYAML),
@@ -1064,6 +1106,19 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
 			})
 		}

+		// Add model cache PVC volume if configured
+		if modelCachePVC != "" {
+			volumes = append(volumes, corev1.Volume{
+				Name: VolumeNameModelCache,
+				VolumeSource: corev1.VolumeSource{
+					PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
+						ClaimName: modelCachePVC,
+						ReadOnly:  true,
+					},
+				},
+			})
+		}
+
 		// Limit retries to prevent infinite loop
 		backoffLimit := int32(3)

@@ -1193,6 +1248,41 @@ func (r *DynamoGraphDeploymentRequestReconciler) prepareProfilingConfig(dgdr *nv
 	return configYAML, nil
 }

+// extractModelCachePVCConfig reads deployment.modelCache.pvcName and .mountPath from the profiling config.
+// Returns ("", "") if the config is nil, fails to unmarshal, or has no pvcName; an empty mountPath becomes DefaultModelCacheMountPath.
+func extractModelCachePVCConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (string, string) {
+	if dgdr.Spec.ProfilingConfig.Config == nil {
+		return "", ""
+	}
+
+	var config map[string]interface{}
+	if err := yaml.Unmarshal(dgdr.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
+		return "", ""
+	}
+
+	deployment, ok := config[ConfigKeyDeployment].(map[string]interface{})
+	if !ok {
+		return "", ""
+	}
+
+	modelCache, ok := deployment[ConfigKeyModelCache].(map[string]interface{})
+	if !ok {
+		return "", ""
+	}
+
+	pvcName, _ := modelCache[ConfigKeyPVCName].(string)
+	if pvcName == "" {
+		return "", ""
+	}
+
+	mountPath, _ := modelCache[ConfigKeyMountPath].(string)
+	if mountPath == "" {
+		mountPath = DefaultModelCacheMountPath
+	}
+
+	return pvcName, mountPath
+}
+
 // checkProfilingJobStatus checks if the profiling job has completed
 func (r *DynamoGraphDeploymentRequestReconciler) checkProfilingJobStatus(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (bool, error) {
 	logger := log.FromContext(ctx)

--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
@@ -858,6 +858,36 @@ var _ = Describe("DGDR Helper Functions", func() {
 			}
 			Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
 		})
+
+		It("Should return false for AI Configurator profiling (useAiConfigurator=true camelCase)", func() {
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
+						Config: createTestConfig(map[string]interface{}{
+							"sweep": map[string]interface{}{
+								"useAiConfigurator": true,
+							},
+						}),
+					},
+				},
+			}
+			Expect(isOnlineProfiling(dgdr)).Should(BeFalse())
+		})
+
+		It("Should return true for online profiling (useAiConfigurator=false camelCase)", func() {
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
+						Config: createTestConfig(map[string]interface{}{
+							"sweep": map[string]interface{}{
+								"useAiConfigurator": false,
+							},
+						}),
+					},
+				},
+			}
+			Expect(isOnlineProfiling(dgdr)).Should(BeTrue())
+		})
 	})
 })


--- a/docs/benchmarks/sla_driven_profiling.md
+++ b/docs/benchmarks/sla_driven_profiling.md
@@ -63,13 +63,13 @@ profilingConfig:
  config:
    # Override hardware defaults if needed
    hardware:
-      min_num_gpus_per_engine: 1
-      max_num_gpus_per_engine: 8
-      num_gpus_per_node: 8
+      minNumGpusPerEngine: 1
+      maxNumGpusPerEngine: 8
+      numGpusPerNode: 8

-    # Only needed when using AI Configurator (sweep.use_ai_configurator: true)
+    # Only needed when using AI Configurator (sweep.useAiConfigurator: true)
    sweep:
-      aic_system: h200_sxm  # GPU type for AI Configurator (h100_sxm, h200_sxm, etc.)
+      aicSystem: h200_sxm  # GPU type for AI Configurator (h100_sxm, h200_sxm, etc.)
 ```

 ### Automatic GPU Discovery (Optional Feature)
@@ -120,7 +120,7 @@ Profiles your model by creating real test deployments in Kubernetes and measurin
 profilingConfig:
  config:
    sweep:
-      use_ai_configurator: false  # Default
+      useAiConfigurator: false  # Default
 ```

 ### AI Configurator Simulation
@@ -138,11 +138,10 @@ Uses performance simulation to rapidly estimate optimal configurations without r
 profilingConfig:
  config:
    sweep:
-      use_ai_configurator: true
-    aic:
-      system: h200_sxm          # GPU system type
-      model_name: QWEN3_32B     # AIC model identifier
-      backend_version: "0.20.0"
+      useAiConfigurator: true
+      aicSystem: h200_sxm          # GPU system type
+      aicHfId: Qwen/Qwen3-32B      # HuggingFace model ID
+      aicBackendVersion: "0.20.0"
 ```

 **Supported Configurations:**
@@ -290,8 +289,7 @@ spec:
    config:                        # Profiler configuration
      sla: { ... }
      hardware: { ... }
-      sweep: { ... }
-      aic: { ... }
+      sweep: { ... }               # AIC settings go here (aicSystem, aicHfId, etc.)
      planner: { ... }

  deploymentOverrides:             # Optional
@@ -326,16 +324,16 @@ Control GPU search space and constraints:
 profilingConfig:
  config:
    hardware:
-      min_num_gpus_per_engine: 2      # if not provided, will automatically determine based on model and VRAM size
-      max_num_gpus_per_engine: 8      # Maximum GPUs to test
-      num_gpus_per_node: 8            # GPUs per node (for multi-node MoE)
-      gpu_type: h200_sxm              # GPU type hint
+      minNumGpusPerEngine: 2      # if not provided, will automatically determine based on model and VRAM size
+      maxNumGpusPerEngine: 8      # Maximum GPUs to test
+      numGpusPerNode: 8            # GPUs per node (for multi-node MoE)
+      gpuType: h200_sxm              # GPU type hint
 ```

 **When to use:**
- **min_num_gpus_per_engine**: Skip small TP sizes if your model is large
- **max_num_gpus_per_engine**: Limit search space or work around constraints (e.g., [AIC attention heads](#ai-configurator-attention-head-constraint-error))
- **num_gpus_per_node**: Determine the upper bound of number of GPUs per node for dense models and configure Grove for multi-node MoE engines.
+- **minNumGpusPerEngine**: Skip small TP sizes if your model is large
+- **maxNumGpusPerEngine**: Limit search space or work around constraints (e.g., [AIC attention heads](#ai-configurator-attention-head-constraint-error))
+- **numGpusPerNode**: Determine the upper bound on the number of GPUs per node for dense models and configure Grove for multi-node MoE engines.
-- **gpu_type**: Informational, auto-detected by controller
+- **gpuType**: Informational, auto-detected by controller

 > [!TIP]
@@ -349,17 +347,17 @@ Control profiling behavior:
 profilingConfig:
  config:
    sweep:
-      use_ai_configurator: false              # Use offline profiling (default: false)
-      prefill_interpolation_granularity: 16   # Samples for prefill TTFT curve
-      decode_interpolation_granularity: 6     # Samples for decode ITL curve
+      useAiConfigurator: false              # Use offline profiling (default: false)
+      prefillInterpolationGranularity: 16   # Samples for prefill TTFT curve
+      decodeInterpolationGranularity: 6     # Samples for decode ITL curve
 ```

 **Use cases:**
- **use_ai_configurator**: Set to `true` for 20-30 second profiling (TensorRT-LLM only)
- **prefill_interpolation_granularity**: How many samples to benchmark for prefill TTFT curve (lower = faster but may be less accurate)
- **decode_interpolation_granularity**: How many samples to benchmark for decode ITL curve (lower = faster but may be less accurate). Since ITL interpolation is a 3d plot and takes longer to run, we default to a smaller number of samples. Increasing this value might quadratically increase the profiling time.
+- **useAiConfigurator**: Set to `true` for 20-30 second profiling (TensorRT-LLM only)
+- **prefillInterpolationGranularity**: How many samples to benchmark for prefill TTFT curve (lower = faster but may be less accurate)
+- **decodeInterpolationGranularity**: How many samples to benchmark for decode ITL curve (lower = faster but may be less accurate). Since ITL interpolation is a 3D plot and takes longer to run, we default to a smaller number of samples. Increasing this value might quadratically increase the profiling time.

-### AI Configurator Configuration (Required if `use_ai_configurator: true`)
+### AI Configurator Configuration (Required if `useAiConfigurator: true`)

 Configure AI Configurator profiling mode:

@@ -367,10 +365,10 @@ Configure AI Configurator profiling mode:
 profilingConfig:
  config:
    sweep:
-      use_ai_configurator: true
-      aic_system: h200_sxm              # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
-      aic_hf_id: Qwen/Qwen3-32B         # Huggingface model id
-      aic_backend_version: "0.20.0"     # TensorRT-LLM version: 0.20.0, 1.0.0rc3
+      useAiConfigurator: true
+      aicSystem: h200_sxm              # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
+      aicHfId: Qwen/Qwen3-32B         # HuggingFace model ID
+      aicBackendVersion: "0.20.0"     # TensorRT-LLM version: 0.20.0, 1.0.0rc3
 ```

 **Supported configurations:** See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features)
@@ -391,6 +389,27 @@ profilingConfig:
 > [!NOTE]
 > Planner arguments use `planner_` prefix. See planner documentation for full list.

+### Model Cache PVC (Advanced)
+
+For large models, you can use a pre-populated PVC containing model weights instead of downloading from HuggingFace. This is useful when:
+- The model is not publicly available on HuggingFace
+- You want to avoid repeated downloads during profiling
+- You have a shared model cache across your cluster
+
+```yaml
+profilingConfig:
+  config:
+    deployment:
+      modelCache:
+        pvcName: "model-cache"                        # Name of PVC containing model weights (required)
+        pvcPath: "hub/models--deepseek-ai--DeepSeek-R1"  # Subpath within PVC (optional)
+        mountPath: "/opt/model-cache"                 # Mount path in container (optional, default: /opt/model-cache)
+```
+
+**Requirements:**
+- The PVC must exist in the same namespace as the DGDR
+- The model weights must be accessible at `{mountPath}/{pvcPath}`
+
 ### Engine Configuration (Auto-configured)

 The controller automatically sets these from high-level fields:
@@ -434,11 +453,11 @@ spec:
        itl: 20.0

      hardware:
-        min_num_gpus_per_engine: 1
-        max_num_gpus_per_engine: 8
+        minNumGpusPerEngine: 1
+        maxNumGpusPerEngine: 8

      sweep:
-        use_ai_configurator: false
+        useAiConfigurator: false

  deploymentOverrides:
    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
@@ -467,12 +486,10 @@ spec:
        itl: 10.0

      sweep:
-        use_ai_configurator: true
-
-      aic:
-        system: h200_sxm
-        model_name: QWEN3_32B
-        backend_version: "0.20.0"
+        useAiConfigurator: true
+        aicSystem: h200_sxm
+        aicHfId: Qwen/Qwen3-32B
+        aicBackendVersion: "0.20.0"

  deploymentOverrides:
    workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1"
@@ -501,11 +518,11 @@ spec:
        itl: 25.0

      hardware:
-        num_gpus_per_node: 8
-        max_num_gpus_per_engine: 32
+        numGpusPerNode: 8
+        maxNumGpusPerEngine: 32

      engine:
-        is_moe_model: true       # Enable MoE profiling mode
+        isMoeModel: true       # Enable MoE profiling mode

  deploymentOverrides:
    workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
@@ -520,15 +537,15 @@ spec:
 **Solution 1**: Use AI Configurator for rapid profiling (TensorRT-LLM only):
 ```yaml
 sweep:
-  use_ai_configurator: true
+  useAiConfigurator: true
 ```

 **Solution 2**: Reduce search space:
 ```yaml
 config:
  sweep:
-    min_num_gpus: 4  # Skip TP1, TP2
-    max_num_gpus: 8  # Don't test beyond TP8
+    minNumGpus: 4  # Skip TP1, TP2
+    maxNumGpus: 8  # Don't test beyond TP8
 ```

 ### SLA Cannot Be Met
@@ -555,19 +572,18 @@ AssertionError: num_heads <N> should be divisible by tp_size <M> and the divisio
 - **GPT-2** (12 heads): Max TP = 3
 - Most models **<1B parameters**: May hit this constraint

-**Solution**: Limit `max_num_gpus_per_engine` in your DGDR:
+**Solution**: Limit `maxNumGpusPerEngine` in your DGDR:

 ```yaml
 profilingConfig:
  profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.6.1"
  config:
    hardware:
-      max_num_gpus_per_engine: 4  # For Qwen3-0.6B (16 heads / 4 = max TP of 4)
+      maxNumGpusPerEngine: 4  # For Qwen3-0.6B (16 heads / 4 = max TP of 4)
    sweep:
-      use_ai_configurator: true
-    aic:
-      system: h200_sxm
-      model_name: QWEN3_0_6B
+      useAiConfigurator: true
+      aicSystem: h200_sxm
+      aicHfId: Qwen/Qwen3-0.6B
 ```

 **Calculate Max TP**: `max_tp = num_attention_heads / 4`

--- a/docs/kubernetes/api_reference.md
+++ b/docs/kubernetes/api_reference.md
@@ -311,7 +311,7 @@ _Appears in:_
 | `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. |  | Required: \{\} <br /> |
 | `backend` _string_ | Backend specifies the inference backend for profiling.<br />The controller automatically sets this value in profilingConfig.config.engine.backend.<br />Profiling runs on real GPUs or via AIC simulation to collect performance data. |  | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
 | `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of<br />a real backend deployment. When true, the deployment uses simulated engines that<br />don't require GPUs, using the profiling data to simulate realistic timing behavior.<br />Mocker is available in all backend images and useful for large-scale experiments.<br />Profiling still runs against the real backend (specified above) to collect performance data. | false |  |
-| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU<br />resources from the Kubernetes cluster nodes. When enabled, the profiler will override<br />any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,<br />num_gpus_per_node) with values detected from the cluster.<br />Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\} <br /> |
+| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU<br />resources from the Kubernetes cluster nodes. When enabled, the profiler will override<br />any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,<br />numGpusPerNode) with values detected from the cluster.<br />Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\} <br /> |
 | `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />modelName and backend fields and should not be specified in this config. |  | Required: \{\} <br /> |
 | `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false |  |
 | `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. |  | Optional: \{\} <br /> |

--- a/docs/planner/sla_planner_quickstart.md
+++ b/docs/planner/sla_planner_quickstart.md
@@ -92,38 +92,10 @@ Dynamo provides sample DGDR configurations in `benchmarks/profiler/deploy/`. You

 **Available Sample DGDRs:**
 - **`profile_sla_dgdr.yaml`**: Standard online profiling for dense models
- **`profile_sla_aic_dgdr.yaml`**: Fast offline profiling using AI Configurator (TensorRT-LLM)
+- **`profile_sla_aic_dgdr.yaml`**: Fast offline profiling using AI Configurator
 - **`profile_sla_moe_dgdr.yaml`**: Online profiling for MoE models (SGLang)

-Or, you can create your own DGDR for your own needs:
-
-```yaml
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeploymentRequest
-metadata:
-  name: my-model-deployment  # Change the name
-  namespace: default         # Change the namespace
-spec:
-  model: "Qwen/Qwen3-0.6B"     # Update to your model
-  backend: vllm                # Backend: vllm, sglang, or trtllm
-
-  profilingConfig:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"  # Required
-    config:
-      sla:
-        isl: 3000    # Adjust to your workload
-        osl: 150     # Adjust to your workload
-        ttft: 200    # Your target (ms)
-        itl: 20      # Your target (ms)
-
-      sweep:
-        use_ai_configurator: false  # Set to true for fast profiling (TensorRT-LLM only)
-
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"  # Optional
-
-  autoApply: true  # Auto-deploy after profiling
-```
+Alternatively, you can create your own DGDR tailored to your needs.

 > [!TIP]
 > For detailed explanations of all configuration options (SLA, hardware, sweep, AIC, planner), see the [DGDR Configuration Reference](/docs/benchmarks/sla_driven_profiling.md#dgdr-configuration-reference).
@@ -242,14 +214,14 @@ Choose between **online profiling** (real measurements, 2-4 hours) or **offline
 ```yaml
 # Online Profiling (Default)
 sweep:
-  use_ai_configurator: false
+  useAiConfigurator: false

-# Offline Profiling (AI Configurator - TensorRT-LLM only)
+# Offline Profiling (AI Configurator)
 sweep:
-  use_ai_configurator: true
-  aic_system: h200_sxm
-  aic_hf_id: Qwen/Qwen3-32B
-  aic_backend_version: "0.20.0"
+  useAiConfigurator: true
+  aicSystem: h200_sxm
+  aicHfId: Qwen/Qwen3-32B
+  aicBackendVersion: "0.20.0"
 ```

 > [!NOTE]
@@ -297,11 +269,10 @@ spec:
        ttft: 300
        itl: 10
      sweep:
-        use_ai_configurator: true
-      aic:
-        system: h200_sxm
-        model_name: DEEPSEEK_V3
-        backend_version: "0.20.0"
+        useAiConfigurator: true
+        aicSystem: h200_sxm
+        aicHfId: deepseek-ai/DeepSeek-V3
+        aicBackendVersion: "0.20.0"

  deploymentOverrides:
    workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
@@ -327,26 +298,26 @@ profilingConfig:

    # Hardware constraints (optional)
    hardware:
-      min_num_gpus_per_engine: 2
-      max_num_gpus_per_engine: 8
-      gpu_type: h200_sxm
+      minNumGpusPerEngine: 2
+      maxNumGpusPerEngine: 8
+      gpuType: h200_sxm

    # Profiling sweep settings (optional)
    sweep:
-      prefill_interpolation_granularity: 16  # Number of samples for prefill ISL sweep
-      decode_interpolation_granularity: 6    # Number of samples for decode sweep
+      prefillInterpolationGranularity: 16  # Number of samples for prefill ISL sweep
+      decodeInterpolationGranularity: 6    # Number of samples for decode sweep
 ```

 > **Note**: `engine.config` is a **file path** to a DGD YAML file, not inline configuration. Use ConfigMapRef (recommended) or leave it unset to auto-generate.

 #### Planner Configuration Passthrough
-Add planner-specific settings. Planner arguments use a `planner_` prefix:
+Add planner-specific settings:

 ```yaml
 profilingConfig:
  config:
    planner:
-      planner_min_endpoint: 2
+      plannerMinEndpoint: 2
 ```

 ## Understanding Profiling Results
@@ -378,6 +349,10 @@ spec:

 Profiling still runs against the real backend (via GPUs or AIC) to collect performance data. The mocker deployment then uses this data to simulate realistic timing behavior.

+### Using a Model Cache PVC
+
+For large models, you can use a pre-populated PVC containing model weights instead of downloading from HuggingFace. See [Model Cache PVC](/docs/benchmarks/sla_driven_profiling.md#model-cache-pvc-advanced) for configuration details.
+
 ### DGDR Immutability

 DGDRs are **immutable** - if you need to update SLAs or configuration: