fix: make it clear what features are only available 0.8.1 and on (#5492)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

fix: make it clear what features are only available 0.8.1 and on (#5492)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
a2c7d0f9 · hhzhang16 · GitHub · 06939770 · a2c7d0f9 · a2c7d0f9
Unverified Commit a2c7d0f9 authored Jan 16, 2026 by hhzhang16 Committed by GitHub Jan 16, 2026
4 changed files
--- a/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
@@ -12,26 +12,19 @@ spec:
  # ProfilingConfig maps directly to the profile_sla.py config format
  profilingConfig:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
+    profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
+    # NOTE: any image built before January 10 and any release prior to 0.8.1
+    # will need to use snake_case within profilingConfig.config
    config:
-      # Sweep/profiling configuration
      sweep:
-        # AI Configurator mode (fast simulation-based profiling)
        useAiConfigurator: true
        aicSystem: h200_sxm
-        aicHfId: Qwen/Qwen3-32B
-        aicBackendVersion: "0.20.0"
-      # SLA targets for profiling
      sla:
-        isl: 3000   # Input sequence length
+        isl: 3000
-        osl: 150    # Output sequence length
+        osl: 150
-        ttft: 500.0 # Time To First Token target (milliseconds)
+        ttft: 500.0
-        itl: 30.0   # Inter-Token Latency target (milliseconds)
+        itl: 30.0
-  # Deployment overrides for the auto-created DGD
  deploymentOverrides:
    workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
-  # Automatically create DynamoGraphDeployment after profiling
  autoApply: true
--- a/benchmarks/profiler/deploy/profile_sla_dgdr.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_dgdr.yaml
@@ -13,22 +13,17 @@ spec:
  # ProfilingConfig maps directly to the profile_sla.py config format
  profilingConfig:
    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
+    # NOTE: any image built before January 10 and any release prior to 0.8.1
+    # will need to use snake_case within profilingConfig.config
    config:
-      # Sweep/profiling configuration
      sweep:
-        # Online profiling mode (real deployment testing)
        useAiConfigurator: false
-      # SLA targets for profiling
      sla:
-        isl: 3000   # Input sequence length
+        isl: 3000
-        osl: 150    # Output sequence length
+        osl: 150
-        ttft: 200.0 # Time To First Token target (milliseconds)
+        ttft: 200.0
-        itl: 20.0   # Inter-Token Latency target (milliseconds)
+        itl: 20.0
-  # Deployment overrides for the auto-created DGD
  deploymentOverrides:
    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
-  # Automatically create DynamoGraphDeployment after profiling
  autoApply: true
--- a/benchmarks/profiler/deploy/profile_sla_moe_dgdr.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_moe_dgdr.yaml
@@ -13,17 +13,18 @@ spec:
  # ProfilingConfig maps directly to the profile_sla.py config format
  profilingConfig:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
+    profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
+    # NOTE: any image built before January 10 and any release prior to 0.8.1
+    # will need to use snake_case within profilingConfig.config
    config:
-      # Model cache PVC configuration; profiler will mount this PVC to access model weights
+      # 0.8.1 and later: model cache PVC configuration; the profiler mounts this PVC to access model weights
      deployment:
        modelCache:
          pvcName: "model-cache"                      # Name of PVC containing model weights
          pvcPath: "deepseek-r1"                      # Subpath within PVC where model is stored
-      # Sweep/profiling configuration
      sweep:
-        # Standard online profiling (not using AI Configurator)
        useAiConfigurator: false
      hardware:
@@ -32,12 +33,11 @@ spec:
        maxNumGpusPerEngine: 16
        numGpusPerNode: 8
-      # SLA targets for profiling
      sla:
-        isl: 3000   # Input sequence length
+        isl: 3000
-        osl: 150    # Output sequence length
+        osl: 150
-        ttft: 200.0 # Time To First Token target (milliseconds)
+        ttft: 200.0
-        itl: 20.0   # Inter-Token Latency target (milliseconds)
+        itl: 20.0
    # Reference to ConfigMap containing the DGD base config
    # For MoE models, this should point to the appropriate disagg config
@@ -46,10 +46,7 @@ spec:
      name: deepseek-r1-config
      key: tep16p-dep16d-disagg.yaml
-  # Deployment overrides for the auto-created DGD
  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
+    workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
-  # Automatically create DynamoGraphDeployment after profiling
  autoApply: true
--- a/docs/planner/sla_planner_quickstart.md
+++ b/docs/planner/sla_planner_quickstart.md
@@ -97,6 +97,8 @@ Dynamo provides sample DGDR configurations in `benchmarks/profiler/deploy/`. You
 Or, you can create your own DGDR for your own needs.
+> **Important - Profiling Config Cases**: Prior to 0.8.1, fields under `profilingConfig.config` are written in snake_case. Starting with 0.8.1, fields under `profilingConfig.config` are written in camelCase for uniformity. Releases 0.8.1 and later still accept snake_case for backward compatibility, but all example DGDRs now use camelCase, so anyone using a release prior to 0.8.1 must manually convert the config fields in the examples to snake_case.
 > [!TIP]
 > For detailed explanations of all configuration options (SLA, hardware, sweep, AIC, planner), see the [DGDR Configuration Reference](/docs/benchmarks/sla_driven_profiling.md#dgdr-configuration-reference).
@@ -349,9 +351,9 @@ spec:
 Profiling still runs against the real backend (via GPUs or AIC) to collect performance data. The mocker deployment then uses this data to simulate realistic timing behavior.
-### Using a Model Cache PVC
+### Using a Model Cache PVC (0.8.1 or later)
-For large models, you can use a pre-populated PVC containing model weights instead of downloading from HuggingFace. See [Model Cache PVC](/docs/benchmarks/sla_driven_profiling.md#model-cache-pvc-advanced) for configuration details.
+Starting in Dynamo 0.8.1, for large models, you can use a pre-populated PVC containing model weights instead of downloading from HuggingFace. See [Model Cache PVC](/docs/benchmarks/sla_driven_profiling.md#model-cache-pvc-advanced) for configuration details.
 ### DGDR Immutability