Unverified Commit a2c7d0f9 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

fix: make it clear what features are only available 0.8.1 and on (#5492)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 06939770
......@@ -12,26 +12,19 @@ spec:
# ProfilingConfig maps directly to the profile_sla.py config format
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
# NOTE: any image built before January 10 and any release prior to 0.8.1
# will need to use snake_case within profilingConfig.config
config:
# Sweep/profiling configuration
sweep:
# AI Configurator mode (fast simulation-based profiling)
useAiConfigurator: true
aicSystem: h200_sxm
aicHfId: Qwen/Qwen3-32B
aicBackendVersion: "0.20.0"
# SLA targets for profiling
sla:
isl: 3000 # Input sequence length
osl: 150 # Output sequence length
ttft: 500.0 # Time To First Token target (milliseconds)
itl: 30.0 # Inter-Token Latency target (milliseconds)
# Deployment overrides for the auto-created DGD
isl: 3000
osl: 150
ttft: 500.0
itl: 30.0
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
# Automatically create DynamoGraphDeployment after profiling
autoApply: true
......@@ -13,22 +13,17 @@ spec:
# ProfilingConfig maps directly to the profile_sla.py config format
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
# NOTE: any image built before January 10 and any release prior to 0.8.1
# will need to use snake_case within profilingConfig.config
config:
# Sweep/profiling configuration
sweep:
# Online profiling mode (real deployment testing)
useAiConfigurator: false
# SLA targets for profiling
sla:
isl: 3000 # Input sequence length
osl: 150 # Output sequence length
ttft: 200.0 # Time To First Token target (milliseconds)
itl: 20.0 # Inter-Token Latency target (milliseconds)
# Deployment overrides for the auto-created DGD
isl: 3000
osl: 150
ttft: 200.0
itl: 20.0
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
# Automatically create DynamoGraphDeployment after profiling
autoApply: true
......@@ -13,17 +13,18 @@ spec:
# ProfilingConfig maps directly to the profile_sla.py config format
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
# NOTE: any image built before January 10 and any release prior to 0.8.1
# will need to use snake_case within profilingConfig.config
config:
# Model cache PVC configuration; profiler will mount this PVC to access model weights
# 0.8.1 and later: Model cache PVC to access model weights
deployment:
modelCache:
pvcName: "model-cache" # Name of PVC containing model weights
pvcPath: "deepseek-r1" # Subpath within PVC where model is stored
# Sweep/profiling configuration
sweep:
# Standard online profiling (not using AI Configurator)
useAiConfigurator: false
hardware:
......@@ -32,12 +33,11 @@ spec:
maxNumGpusPerEngine: 16
numGpusPerNode: 8
# SLA targets for profiling
sla:
isl: 3000 # Input sequence length
osl: 150 # Output sequence length
ttft: 200.0 # Time To First Token target (milliseconds)
itl: 20.0 # Inter-Token Latency target (milliseconds)
isl: 3000
osl: 150
ttft: 200.0
itl: 20.0
# Reference to ConfigMap containing the DGD base config
# For MoE models, this should point to the appropriate disagg config
......@@ -46,10 +46,7 @@ spec:
name: deepseek-r1-config
key: tep16p-dep16d-disagg.yaml
# Deployment overrides for the auto-created DGD
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
# Automatically create DynamoGraphDeployment after profiling
workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
autoApply: true
......@@ -97,6 +97,8 @@ Dynamo provides sample DGDR configurations in `benchmarks/profiler/deploy/`. You
Or, you can create your own DGDR for your own needs.
> **Important - Profiling Config Casing**: Prior to 0.8.1, all fields under `profilingConfig.config` are written in snake_case. Starting with 0.8.1, these fields are written in camelCase for uniformity. snake_case is still accepted for backwards compatibility, but because all example DGDRs use camelCase, anyone using a release prior to 0.8.1 must manually convert the config fields in the examples to snake_case.
> [!TIP]
> For detailed explanations of all configuration options (SLA, hardware, sweep, AIC, planner), see the [DGDR Configuration Reference](/docs/benchmarks/sla_driven_profiling.md#dgdr-configuration-reference).
......@@ -349,9 +351,9 @@ spec:
Profiling still runs against the real backend (via GPUs or AIC) to collect performance data. The mocker deployment then uses this data to simulate realistic timing behavior.
### Using a Model Cache PVC
### Using a Model Cache PVC (0.8.1 or later)
For large models, you can use a pre-populated PVC containing model weights instead of downloading from HuggingFace. See [Model Cache PVC](/docs/benchmarks/sla_driven_profiling.md#model-cache-pvc-advanced) for configuration details.
For large models, Dynamo 0.8.1 and later let you use a pre-populated PVC containing the model weights instead of downloading them from Hugging Face. See [Model Cache PVC](/docs/benchmarks/sla_driven_profiling.md#model-cache-pvc-advanced) for configuration details.
### DGDR Immutability
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment