Unverified commit a2c7d0f9, authored by hhzhang16, committed by GitHub
Browse files

fix: make it clear what features are only available 0.8.1 and on (#5492)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 06939770
...@@ -12,26 +12,19 @@ spec: ...@@ -12,26 +12,19 @@ spec:
# ProfilingConfig maps directly to the profile_sla.py config format # ProfilingConfig maps directly to the profile_sla.py config format
profilingConfig: profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag" profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
# NOTE: any image built before January 10 and any release prior to 0.8.1
# will need to use snake_case within profilingConfig.config
config: config:
# Sweep/profiling configuration
sweep: sweep:
# AI Configurator mode (fast simulation-based profiling)
useAiConfigurator: true useAiConfigurator: true
aicSystem: h200_sxm aicSystem: h200_sxm
aicHfId: Qwen/Qwen3-32B
aicBackendVersion: "0.20.0"
# SLA targets for profiling
sla: sla:
isl: 3000 # Input sequence length isl: 3000
osl: 150 # Output sequence length osl: 150
ttft: 500.0 # Time To First Token target (milliseconds) ttft: 500.0
itl: 30.0 # Inter-Token Latency target (milliseconds) itl: 30.0
# Deployment overrides for the auto-created DGD
deploymentOverrides: deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag" workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
# Automatically create DynamoGraphDeployment after profiling
autoApply: true autoApply: true
...@@ -13,22 +13,17 @@ spec: ...@@ -13,22 +13,17 @@ spec:
# ProfilingConfig maps directly to the profile_sla.py config format # ProfilingConfig maps directly to the profile_sla.py config format
profilingConfig: profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag" profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
# NOTE: any image built before January 10 and any release prior to 0.8.1
# will need to use snake_case within profilingConfig.config
config: config:
# Sweep/profiling configuration
sweep: sweep:
# Online profiling mode (real deployment testing)
useAiConfigurator: false useAiConfigurator: false
# SLA targets for profiling
sla: sla:
isl: 3000 # Input sequence length isl: 3000
osl: 150 # Output sequence length osl: 150
ttft: 200.0 # Time To First Token target (milliseconds) ttft: 200.0
itl: 20.0 # Inter-Token Latency target (milliseconds) itl: 20.0
# Deployment overrides for the auto-created DGD
deploymentOverrides: deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag" workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
# Automatically create DynamoGraphDeployment after profiling
autoApply: true autoApply: true
...@@ -13,17 +13,18 @@ spec: ...@@ -13,17 +13,18 @@ spec:
# ProfilingConfig maps directly to the profile_sla.py config format # ProfilingConfig maps directly to the profile_sla.py config format
profilingConfig: profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1" profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
# NOTE: any image built before January 10 and any release prior to 0.8.1
# will need to use snake_case within profilingConfig.config
config: config:
# Model cache PVC configuration; profiler will mount this PVC to access model weights # 0.8.1 and later: Model cache PVC to access model weights
deployment: deployment:
modelCache: modelCache:
pvcName: "model-cache" # Name of PVC containing model weights pvcName: "model-cache" # Name of PVC containing model weights
pvcPath: "deepseek-r1" # Subpath within PVC where model is stored pvcPath: "deepseek-r1" # Subpath within PVC where model is stored
# Sweep/profiling configuration
sweep: sweep:
# Standard online profiling (not using AI Configurator)
useAiConfigurator: false useAiConfigurator: false
hardware: hardware:
...@@ -32,12 +33,11 @@ spec: ...@@ -32,12 +33,11 @@ spec:
maxNumGpusPerEngine: 16 maxNumGpusPerEngine: 16
numGpusPerNode: 8 numGpusPerNode: 8
# SLA targets for profiling
sla: sla:
isl: 3000 # Input sequence length isl: 3000
osl: 150 # Output sequence length osl: 150
ttft: 200.0 # Time To First Token target (milliseconds) ttft: 200.0
itl: 20.0 # Inter-Token Latency target (milliseconds) itl: 20.0
# Reference to ConfigMap containing the DGD base config # Reference to ConfigMap containing the DGD base config
# For MoE models, this should point to the appropriate disagg config # For MoE models, this should point to the appropriate disagg config
...@@ -46,10 +46,7 @@ spec: ...@@ -46,10 +46,7 @@ spec:
name: deepseek-r1-config name: deepseek-r1-config
key: tep16p-dep16d-disagg.yaml key: tep16p-dep16d-disagg.yaml
# Deployment overrides for the auto-created DGD
deploymentOverrides: deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1" workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
# Automatically create DynamoGraphDeployment after profiling
autoApply: true autoApply: true
...@@ -97,6 +97,8 @@ Dynamo provides sample DGDR configurations in `benchmarks/profiler/deploy/`. You ...@@ -97,6 +97,8 @@ Dynamo provides sample DGDR configurations in `benchmarks/profiler/deploy/`. You
Or, you can create your own DGDR for your own needs. Or, you can create your own DGDR for your own needs.
> **Important - Profiling Config Cases**: Prior to 0.8.1, all fields under `profilingConfig.config` are written in snake_case. Starting with 0.8.1, fields under `profilingConfig.config` are written in camelCase for uniformity. Releases 0.8.1 and later still accept snake_case for backward compatibility. However, all example DGDRs use camelCase, so anyone using a release prior to 0.8.1 must manually convert the config fields in the examples to snake_case.
> [!TIP] > [!TIP]
> For detailed explanations of all configuration options (SLA, hardware, sweep, AIC, planner), see the [DGDR Configuration Reference](/docs/benchmarks/sla_driven_profiling.md#dgdr-configuration-reference). > For detailed explanations of all configuration options (SLA, hardware, sweep, AIC, planner), see the [DGDR Configuration Reference](/docs/benchmarks/sla_driven_profiling.md#dgdr-configuration-reference).
...@@ -349,9 +351,9 @@ spec: ...@@ -349,9 +351,9 @@ spec:
Profiling still runs against the real backend (via GPUs or AIC) to collect performance data. The mocker deployment then uses this data to simulate realistic timing behavior. Profiling still runs against the real backend (via GPUs or AIC) to collect performance data. The mocker deployment then uses this data to simulate realistic timing behavior.
### Using a Model Cache PVC ### Using a Model Cache PVC (0.8.1 or later)
For large models, you can use a pre-populated PVC containing model weights instead of downloading from HuggingFace. See [Model Cache PVC](/docs/benchmarks/sla_driven_profiling.md#model-cache-pvc-advanced) for configuration details. Starting in Dynamo 0.8.1, for large models, you can use a pre-populated PVC containing model weights instead of downloading from HuggingFace. See [Model Cache PVC](/docs/benchmarks/sla_driven_profiling.md#model-cache-pvc-advanced) for configuration details.
### DGDR Immutability ### DGDR Immutability
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment