"lib/llm/src/vscode:/vscode.git/clone" did not exist on "e42746a18c5762f34680235477b1977192ee8d35"
Unverified Commit 6a84ffd3 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: turn profiling k8s jobs into sample DGDR requests (#3864)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>
parent 0d07e2c3
...@@ -135,15 +135,6 @@ dynamo-operator: ...@@ -135,15 +135,6 @@ dynamo-operator:
# -- Whether to enable SSH key generation for MPI Run # -- Whether to enable SSH key generation for MPI Run
enabled: true enabled: true
# DynamoGraphDeploymentRequest (DGDR) configuration
dgdr:
# -- Container image to use for profiling jobs (both online and offline/AIC)
# REQUIRED: Must be set to create DynamoGraphDeploymentRequests
# For development: Build and push the profiler image from the ai-dynamo repository
# Public image will be available in release 0.6.1
# Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
profilerImage: ""
# Grove component - distributed inference orchestration # Grove component - distributed inference orchestration
grove: grove:
......
...@@ -267,7 +267,7 @@ helm: manifests kustomize helmify ...@@ -267,7 +267,7 @@ helm: manifests kustomize helmify
$(KUSTOMIZE) build config/default | $(HELMIFY) -image-pull-secrets charts/dynamo-kubernetes-operator $(KUSTOMIZE) build config/default | $(HELMIFY) -image-pull-secrets charts/dynamo-kubernetes-operator
######################### CRD Reference Docs ######################### CRD Reference Docs
CRD_REF_DOCS_VERSION ?= v0.0.12 CRD_REF_DOCS_VERSION ?= latest
CRD_REF_DOCS ?= $(LOCALBIN)/crd-ref-docs CRD_REF_DOCS ?= $(LOCALBIN)/crd-ref-docs
.PHONY: crd-ref-docs .PHONY: crd-ref-docs
......
...@@ -60,6 +60,12 @@ type ProfilingConfigSpec struct { ...@@ -60,6 +60,12 @@ type ProfilingConfigSpec struct {
// The path to this config will be set as engine.config in the profiling config. // The path to this config will be set as engine.config in the profiling config.
// +kubebuilder:validation:Optional // +kubebuilder:validation:Optional
ConfigMapRef *ConfigMapKeySelector `json:"configMapRef,omitempty"` ConfigMapRef *ConfigMapKeySelector `json:"configMapRef,omitempty"`
// ProfilerImage specifies the container image to use for profiling jobs.
// This image contains the profiler code and dependencies needed for SLA-based profiling.
// Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
// +kubebuilder:validation:Required
ProfilerImage string `json:"profilerImage"`
} }
// DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments. // DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments.
...@@ -83,21 +89,36 @@ type DeploymentOverridesSpec struct { ...@@ -83,21 +89,36 @@ type DeploymentOverridesSpec struct {
// Annotations are additional annotations to add to the DynamoGraphDeployment metadata. // Annotations are additional annotations to add to the DynamoGraphDeployment metadata.
// +kubebuilder:validation:Optional // +kubebuilder:validation:Optional
Annotations map[string]string `json:"annotations,omitempty"` Annotations map[string]string `json:"annotations,omitempty"`
// WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.
// This image is used for both temporary DGDs created during online profiling and the final DGD.
// If omitted, the image from the base config file (e.g., disagg.yaml) is used.
// Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
// +kubebuilder:validation:Optional
WorkersImage string `json:"workersImage,omitempty"`
} }
// DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest. // DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.
// This CRD serves as the primary interface for users to request model deployments with // This CRD serves as the primary interface for users to request model deployments with
// specific performance constraints and resource requirements, enabling SLA-driven deployments. // specific performance constraints and resource requirements, enabling SLA-driven deployments.
type DynamoGraphDeploymentRequestSpec struct { type DynamoGraphDeploymentRequestSpec struct {
// ModelName specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b"). // Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
// This is a high-level identifier for easy reference in kubectl output and logs. // This is a high-level identifier for easy reference in kubectl output and logs.
// The controller automatically sets this value in profilingConfig.config.deployment.model.
// +kubebuilder:validation:Required // +kubebuilder:validation:Required
ModelName string `json:"modelName"` Model string `json:"model"`
// Backend specifies the inference backend to use.
// The controller automatically sets this value in profilingConfig.config.engine.backend.
// +kubebuilder:validation:Required
// +kubebuilder:validation:Enum=vllm;sglang;trtllm
Backend string `json:"backend"`
// ProfilingConfig provides the complete configuration for the profiling job. // ProfilingConfig provides the complete configuration for the profiling job.
// This configuration is passed directly to the profiler. // This configuration is passed directly to the profiler.
// The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema). // The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
// The profiler will validate the configuration and report any errors. // Note: deployment.model and engine.backend are automatically set from the high-level
// model and backend fields and should not be specified in this config.
// +kubebuilder:validation:Required // +kubebuilder:validation:Required
ProfilingConfig ProfilingConfigSpec `json:"profilingConfig"` ProfilingConfig ProfilingConfigSpec `json:"profilingConfig"`
...@@ -191,7 +212,7 @@ type DynamoGraphDeploymentRequestStatus struct { ...@@ -191,7 +212,7 @@ type DynamoGraphDeploymentRequestStatus struct {
// +kubebuilder:object:root=true // +kubebuilder:object:root=true
// +kubebuilder:subresource:status // +kubebuilder:subresource:status
// +kubebuilder:resource:shortName=dgdr // +kubebuilder:resource:shortName=dgdr
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.modelName` // +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`
// +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.status.backend` // +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.status.backend`
// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.status.state` // +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.status.state`
// +kubebuilder:printcolumn:name="DGD-State",type=string,JSONPath=`.status.deployment.state` // +kubebuilder:printcolumn:name="DGD-State",type=string,JSONPath=`.status.deployment.state`
......
...@@ -140,7 +140,6 @@ func main() { ...@@ -140,7 +140,6 @@ func main() {
var mpiRunSecretName string var mpiRunSecretName string
var mpiRunSecretNamespace string var mpiRunSecretNamespace string
var plannerClusterRoleName string var plannerClusterRoleName string
var profilerImage string
var dgdrProfilingClusterRoleName string var dgdrProfilingClusterRoleName string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
...@@ -182,8 +181,6 @@ func main() { ...@@ -182,8 +181,6 @@ func main() {
"Namespace where the MPI SSH secret is located (required)") "Namespace where the MPI SSH secret is located (required)")
flag.StringVar(&plannerClusterRoleName, "planner-cluster-role-name", "", flag.StringVar(&plannerClusterRoleName, "planner-cluster-role-name", "",
"Name of the ClusterRole for planner (cluster-wide mode only)") "Name of the ClusterRole for planner (cluster-wide mode only)")
flag.StringVar(&profilerImage, "profiler-image", "",
"Container image to use for profiling jobs (both online and offline/AIC) (for DynamoGraphDeploymentRequest)")
flag.StringVar(&dgdrProfilingClusterRoleName, "dgdr-profiling-cluster-role-name", "", flag.StringVar(&dgdrProfilingClusterRoleName, "dgdr-profiling-cluster-role-name", "",
"Name of the ClusterRole for DGDR profiling jobs (cluster-wide mode only)") "Name of the ClusterRole for DGDR profiling jobs (cluster-wide mode only)")
opts := zap.Options{ opts := zap.Options{
...@@ -458,11 +455,10 @@ func main() { ...@@ -458,11 +455,10 @@ func main() {
} }
if err = (&controller.DynamoGraphDeploymentRequestReconciler{ if err = (&controller.DynamoGraphDeploymentRequestReconciler{
Client: mgr.GetClient(), Client: mgr.GetClient(),
Recorder: mgr.GetEventRecorderFor("dynamographdeploymentrequest"), Recorder: mgr.GetEventRecorderFor("dynamographdeploymentrequest"),
ProfilerImage: profilerImage, Config: ctrlConfig,
Config: ctrlConfig, RBACManager: rbacManager,
RBACManager: rbacManager,
}).SetupWithManager(mgr); err != nil { }).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DynamoGraphDeploymentRequest") setupLog.Error(err, "unable to create controller", "controller", "DynamoGraphDeploymentRequest")
os.Exit(1) os.Exit(1)
......
...@@ -33,7 +33,7 @@ spec: ...@@ -33,7 +33,7 @@ spec:
scope: Namespaced scope: Namespaced
versions: versions:
- additionalPrinterColumns: - additionalPrinterColumns:
- jsonPath: .spec.modelName - jsonPath: .spec.model
name: Model name: Model
type: string type: string
- jsonPath: .status.backend - jsonPath: .status.backend
...@@ -94,6 +94,15 @@ spec: ...@@ -94,6 +94,15 @@ spec:
after profiling completes. If false, only the spec is generated and stored in status. after profiling completes. If false, only the spec is generated and stored in status.
Users can then manually create a DGD using the generated spec. Users can then manually create a DGD using the generated spec.
type: boolean type: boolean
backend:
description: |-
Backend specifies the inference backend to use.
The controller automatically sets this value in profilingConfig.config.engine.backend.
enum:
- vllm
- sglang
- trtllm
type: string
deploymentOverrides: deploymentOverrides:
description: |- description: |-
DeploymentOverrides allows customizing metadata for the auto-created DGD. DeploymentOverrides allows customizing metadata for the auto-created DGD.
...@@ -121,18 +130,27 @@ spec: ...@@ -121,18 +130,27 @@ spec:
Namespace is the desired namespace for the created DynamoGraphDeployment. Namespace is the desired namespace for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR namespace. If not specified, defaults to the DGDR namespace.
type: string type: string
workersImage:
description: |-
WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.
This image is used for both temporary DGDs created during online profiling and the final DGD.
If omitted, the image from the base config file (e.g., disagg.yaml) is used.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
type: string
type: object type: object
modelName: model:
description: |- description: |-
ModelName specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b"). Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
This is a high-level identifier for easy reference in kubectl output and logs. This is a high-level identifier for easy reference in kubectl output and logs.
The controller automatically sets this value in profilingConfig.config.deployment.model.
type: string type: string
profilingConfig: profilingConfig:
description: |- description: |-
ProfilingConfig provides the complete configuration for the profiling job. ProfilingConfig provides the complete configuration for the profiling job.
This configuration is passed directly to the profiler. This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema). The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
The profiler will validate the configuration and report any errors. Note: deployment.model and engine.backend are automatically set from the high-level
model and backend fields and should not be specified in this config.
properties: properties:
config: config:
description: |- description: |-
...@@ -156,9 +174,18 @@ spec: ...@@ -156,9 +174,18 @@ spec:
required: required:
- name - name
type: object type: object
profilerImage:
description: |-
ProfilerImage specifies the container image to use for profiling jobs.
This image contains the profiler code and dependencies needed for SLA-based profiling.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
type: string
required:
- profilerImage
type: object type: object
required: required:
- modelName - backend
- model
- profilingConfig - profilingConfig
type: object type: object
status: status:
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole kind: ClusterRole
metadata: metadata:
......
...@@ -18,11 +18,18 @@ kind: DynamoGraphDeploymentRequest ...@@ -18,11 +18,18 @@ kind: DynamoGraphDeploymentRequest
metadata: metadata:
name: example-llm-sla name: example-llm-sla
spec: spec:
# ModelName is a high-level identifier for the model being deployed # Model is a high-level identifier for the model being deployed (required - injected into profilingConfig.config.deployment.model)
modelName: Qwen/Qwen3-0.6B model: Qwen/Qwen3-0.6B
# Backend to use for profiling (required - injected into profilingConfig.config.engine.backend)
backend: trtllm
# ProfilerImage is the container image to use for profiling jobs (required)
profilerImage: "nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.1"
# ProfilingConfig maps directly to the profile_sla.py config format # ProfilingConfig maps directly to the profile_sla.py config format
# See benchmarks/profiler/utils/profiler_argparse.py for complete schema # See benchmarks/profiler/utils/profiler_argparse.py for complete schema
# Note: deployment.model and engine.backend are automatically set from model and backend above
profilingConfig: profilingConfig:
config: config:
# Optional: Output directory for profiling results (defaults to /data in the Job) # Optional: Output directory for profiling results (defaults to /data in the Job)
...@@ -30,7 +37,6 @@ spec: ...@@ -30,7 +37,6 @@ spec:
# Engine configuration # Engine configuration
engine: engine:
backend: trtllm # Inference backend: vllm, sglang, or trtllm
max_context_length: 16384 # Maximum context length supported by the model max_context_length: 16384 # Maximum context length supported by the model
is_moe_model: false # Enable MoE model support (uses TEP/DEP instead of TP) is_moe_model: false # Enable MoE model support (uses TEP/DEP instead of TP)
......
...@@ -74,14 +74,14 @@ require ( ...@@ -74,14 +74,14 @@ require (
go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/automaxprocs v1.6.0 // indirect
go.uber.org/multierr v1.11.0 // indirect go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect go.uber.org/zap v1.27.0 // indirect
golang.org/x/net v0.40.0 // indirect golang.org/x/net v0.46.0 // indirect
golang.org/x/oauth2 v0.30.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect
golang.org/x/sync v0.14.0 // indirect golang.org/x/sync v0.17.0 // indirect
golang.org/x/sys v0.33.0 // indirect golang.org/x/sys v0.37.0 // indirect
golang.org/x/term v0.32.0 // indirect golang.org/x/term v0.36.0 // indirect
golang.org/x/text v0.25.0 // indirect golang.org/x/text v0.30.0 // indirect
golang.org/x/time v0.9.0 // indirect golang.org/x/time v0.9.0 // indirect
golang.org/x/tools v0.33.0 // indirect golang.org/x/tools v0.38.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250519155744-55703ea1f237 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20250519155744-55703ea1f237 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250519155744-55703ea1f237 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250519155744-55703ea1f237 // indirect
......
...@@ -158,34 +158,34 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn ...@@ -158,34 +158,34 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY= golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
golang.org/x/net v0.40.0/go.mod h1:y0hY0exeL2Pku80/zKK7tpntoX23cqL3Oa6njdgRtds= golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI=
golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ= golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q=
golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY=
golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ=
golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
......
...@@ -144,10 +144,10 @@ const ( ...@@ -144,10 +144,10 @@ const (
MessageConfigMapKeyNotFound = "key %s not found in ConfigMap %s" MessageConfigMapKeyNotFound = "key %s not found in ConfigMap %s"
// Validation messages // Validation messages
ValidationErrorModelNameRequired = "modelName is required" ValidationErrorModelRequired = "model is required"
ValidationErrorITLPositive = "sla.itl must be positive" ValidationErrorITLPositive = "sla.itl must be positive"
ValidationErrorTTFTPositive = "sla.ttft must be positive" ValidationErrorTTFTPositive = "sla.ttft must be positive"
ValidationErrorInvalidBackend = "invalid backend: %s (must be vllm, sglang, or trtllm)" ValidationErrorInvalidBackend = "invalid backend: %s (must be vllm, sglang, or trtllm)"
// Valid backend values // Valid backend values
BackendVLLM = "vllm" BackendVLLM = "vllm"
...@@ -198,8 +198,6 @@ type DynamoGraphDeploymentRequestReconciler struct { ...@@ -198,8 +198,6 @@ type DynamoGraphDeploymentRequestReconciler struct {
Recorder record.EventRecorder Recorder record.EventRecorder
Config commonController.Config Config commonController.Config
// ProfilerImage is the container image to use for profiling jobs (both online and offline/AIC)
ProfilerImage string
// RBACManager handles RBAC setup for profiling jobs // RBACManager handles RBAC setup for profiling jobs
RBACManager RBACManager RBACManager RBACManager
} }
...@@ -217,13 +215,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecor ...@@ -217,13 +215,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecor
// FinalizeResource implements commonController.Finalizer interface // FinalizeResource implements commonController.Finalizer interface
func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error { func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("Finalizing DGDR", "name", dgdr.Name)
// Cleanup profiling resources
if err := r.cleanupProfilingResources(ctx, dgdr); err != nil {
logger.Error(err, "Failed to cleanup profiling resources")
return err
}
logger.Info("DGDR finalized successfully", "name", dgdr.Name) logger.Info("DGDR finalized successfully", "name", dgdr.Name)
return nil return nil
...@@ -320,8 +311,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context. ...@@ -320,8 +311,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context.
// Set observedGeneration to track the spec we're processing // Set observedGeneration to track the spec we're processing
dgdr.Status.ObservedGeneration = dgdr.Generation dgdr.Status.ObservedGeneration = dgdr.Generation
// Extract and populate backend from config for display in kubectl output // Populate backend in status from spec for display in kubectl output
dgdr.Status.Backend = getBackendFromConfig(dgdr) dgdr.Status.Backend = dgdr.Spec.Backend
// Initialize status // Initialize status
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonInitialized, MessageInitialized) r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonInitialized, MessageInitialized)
...@@ -664,11 +655,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleFailedState(ctx context.C ...@@ -664,11 +655,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleFailedState(ctx context.C
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("DGDR is in failed state", "name", dgdr.Name) logger.Info("DGDR is in failed state", "name", dgdr.Name)
// Cleanup profiling resources if any
if err := r.cleanupProfilingResources(ctx, dgdr); err != nil {
logger.Error(err, "Failed to cleanup profiling resources")
}
// Could implement retry logic here if desired // Could implement retry logic here if desired
return ctrl.Result{}, nil return ctrl.Result{}, nil
} }
...@@ -705,27 +691,13 @@ func isOnlineProfiling(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) boo ...@@ -705,27 +691,13 @@ func isOnlineProfiling(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) boo
return true return true
} }
// getBackendFromConfig reads the raw profilingConfig.config document of the
// given DGDR and returns the string found at engine.backend.
//
// It returns the empty string when the config is nil, when the raw bytes
// cannot be unmarshalled, when there is no "engine" mapping, or when
// "backend" is missing or is not a string.
func getBackendFromConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) string {
	rawConfig := dgdr.Spec.ProfilingConfig.Config
	if rawConfig == nil {
		return ""
	}

	var parsed map[string]interface{}
	// Parse failures are deliberately not surfaced; the caller just gets "".
	if yaml.Unmarshal(rawConfig.Raw, &parsed) != nil {
		return ""
	}

	engineSection, isMap := parsed["engine"].(map[string]interface{})
	if !isMap {
		return ""
	}

	// A failed type assertion leaves backendName as the zero value "".
	backendName, _ := engineSection["backend"].(string)
	return backendName
}
// validateSpec validates the DGDR spec // validateSpec validates the DGDR spec
func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error { func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
// Validate that the profiler image is set in profilingConfig.profilerImage
if dgdr.Spec.ProfilingConfig.ProfilerImage == "" {
return errors.New("profilingConfig.profilerImage is required")
}
// Basic validation - check that profilingConfig.config is provided // Basic validation - check that profilingConfig.config is provided
if dgdr.Spec.ProfilingConfig.Config == nil || len(dgdr.Spec.ProfilingConfig.Config.Raw) == 0 { if dgdr.Spec.ProfilingConfig.Config == nil || len(dgdr.Spec.ProfilingConfig.Config.Raw) == 0 {
return errors.New("profilingConfig.config is required and must not be empty") return errors.New("profilingConfig.config is required and must not be empty")
...@@ -764,15 +736,20 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex ...@@ -764,15 +736,20 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
return fmt.Errorf("failed to parse profilingConfig.config: %w", err) return fmt.Errorf("failed to parse profilingConfig.config: %w", err)
} }
// Additional validation: Ensure engine.config is set (either as path or will be set from ConfigMapRef) // Warn if deployment.model or engine.backend are specified in config (they will be overwritten by spec fields)
engineConfig, hasEngine := config["engine"].(map[string]interface{}) if engineConfig, ok := config["engine"].(map[string]interface{}); ok {
if hasEngine { if backend, ok := engineConfig["backend"].(string); ok && backend != "" && backend != dgdr.Spec.Backend {
_, hasConfig := engineConfig["config"] logger := log.FromContext(ctx)
if !hasConfig && dgdr.Spec.ProfilingConfig.ConfigMapRef == nil { logger.Info("Warning: profilingConfig.config.engine.backend will be overwritten by spec.backend",
return errors.New("either profilingConfig.config.engine.config must be set, or profilingConfig.configMapRef must be provided") "configBackend", backend, "specBackend", dgdr.Spec.Backend)
}
}
if deployment, ok := config["deployment"].(map[string]interface{}); ok {
if model, ok := deployment["model"].(string); ok && model != "" && model != dgdr.Spec.Model {
logger := log.FromContext(ctx)
logger.Info("Warning: profilingConfig.config.deployment.model will be overwritten by spec.model",
"configModel", model, "specModel", dgdr.Spec.Model)
} }
} else if dgdr.Spec.ProfilingConfig.ConfigMapRef == nil {
return errors.New("profilingConfig.config must contain 'engine' section, or profilingConfig.configMapRef must be provided")
} }
// The profiler will validate the rest of the configuration // The profiler will validate the rest of the configuration
...@@ -783,7 +760,29 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex ...@@ -783,7 +760,29 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error { func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
// Ensure profiling job RBAC exists in cluster-wide mode // Delete any existing output ConfigMap to ensure fresh profiling results
// This prevents using stale data from previous profiling runs
outputConfigMapName := getOutputConfigMapName(dgdr)
existingCM := &corev1.ConfigMap{}
err := r.Get(ctx, types.NamespacedName{
Name: outputConfigMapName,
Namespace: dgdr.Namespace,
}, existingCM)
if err == nil {
// ConfigMap exists, delete it
logger.Info("Deleting existing output ConfigMap to ensure fresh profiling results", "configMap", outputConfigMapName)
if err := r.Delete(ctx, existingCM); err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "Failed to delete existing output ConfigMap", "configMap", outputConfigMapName)
return fmt.Errorf("failed to delete existing output ConfigMap: %w", err)
}
logger.Info("Successfully deleted old output ConfigMap", "configMap", outputConfigMapName)
} else if !apierrors.IsNotFound(err) {
// Unexpected error checking for ConfigMap
logger.Error(err, "Failed to check for existing output ConfigMap", "configMap", outputConfigMapName)
return fmt.Errorf("failed to check for existing output ConfigMap: %w", err)
}
// Ensure profiling job RBAC exists (only for cluster-wide installation)
if r.Config.RestrictedNamespace == "" { if r.Config.RestrictedNamespace == "" {
if err := r.RBACManager.EnsureServiceAccountWithRBAC( if err := r.RBACManager.EnsureServiceAccountWithRBAC(
ctx, ctx,
...@@ -808,25 +807,52 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -808,25 +807,52 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
} }
// Set deployment.namespace if not already set // Set deployment.namespace if not already set
if _, hasDeployment := config["deployment"]; !hasDeployment { deploymentVal, hasDeployment := config["deployment"]
config["deployment"] = make(map[string]interface{}) var deploymentConfig map[string]interface{}
if !hasDeployment || deploymentVal == nil {
deploymentConfig = make(map[string]interface{})
config["deployment"] = deploymentConfig
} else {
var ok bool
deploymentConfig, ok = deploymentVal.(map[string]interface{})
if !ok {
return nil, false, fmt.Errorf("profilingConfig.config.deployment must be an object, got %T", deploymentVal)
}
} }
deploymentConfig := config["deployment"].(map[string]interface{})
if _, hasNamespace := deploymentConfig["namespace"]; !hasNamespace { if _, hasNamespace := deploymentConfig["namespace"]; !hasNamespace {
deploymentConfig["namespace"] = dgdr.Namespace deploymentConfig["namespace"] = dgdr.Namespace
} }
// Set deployment.model from spec.model
deploymentConfig["model"] = dgdr.Spec.Model
// Set deployment.dgd_image from deploymentOverrides.workersImage if provided
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.WorkersImage != "" {
deploymentConfig["dgd_image"] = dgdr.Spec.DeploymentOverrides.WorkersImage
}
// Set output_dir if not already set // Set output_dir if not already set
if _, hasOutputDir := config["output_dir"]; !hasOutputDir { if _, hasOutputDir := config["output_dir"]; !hasOutputDir {
config["output_dir"] = ProfilingOutputPath config["output_dir"] = ProfilingOutputPath
} }
// Set engine.backend from spec.backend
engineVal, hasEngine := config["engine"]
var engineConfig map[string]interface{}
if !hasEngine || engineVal == nil {
engineConfig = make(map[string]interface{})
config["engine"] = engineConfig
} else {
var ok bool
engineConfig, ok = engineVal.(map[string]interface{})
if !ok {
return nil, false, fmt.Errorf("profilingConfig.config.engine must be an object, got %T", engineVal)
}
}
engineConfig["backend"] = dgdr.Spec.Backend
// If ConfigMapRef is provided, set engine.config path // If ConfigMapRef is provided, set engine.config path
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil { if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
if _, hasEngine := config["engine"]; !hasEngine {
config["engine"] = make(map[string]interface{})
}
engineConfig := config["engine"].(map[string]interface{})
engineConfig["config"] = fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile) engineConfig["config"] = fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile)
} }
...@@ -857,6 +883,19 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -857,6 +883,19 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
Name: "ETCD_ENDPOINTS", Name: "ETCD_ENDPOINTS",
Value: fmt.Sprintf("%s-etcd:2379", dgdr.Namespace), Value: fmt.Sprintf("%s-etcd:2379", dgdr.Namespace),
}, },
// DGDR metadata for setting ownerReferences
{
Name: "DGDR_NAME",
Value: dgdr.Name,
},
{
Name: "DGDR_NAMESPACE",
Value: dgdr.Namespace,
},
{
Name: "DGDR_UID",
Value: string(dgdr.UID),
},
} }
// Build volume mounts // Build volume mounts
...@@ -881,11 +920,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -881,11 +920,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
"--profile-config", string(configYAML), "--profile-config", string(configYAML),
} }
// Determine profiler image // Use profiler image from profilingConfig
imageName := r.ProfilerImage imageName := dgdr.Spec.ProfilingConfig.ProfilerImage
if imageName == "" {
return nil, false, fmt.Errorf("profiler image not configured: configure dynamo-operator.dynamo.dgdr.profilerImage in Helm values")
}
logger.Info("Using profiler image", "image", imageName) logger.Info("Using profiler image", "image", imageName)
profilerContainer := corev1.Container{ profilerContainer := corev1.Container{
...@@ -1144,25 +1180,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con ...@@ -1144,25 +1180,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
return r.Status().Update(ctx, dgdr) return r.Status().Update(ctx, dgdr)
} }
// cleanupProfilingResources logs the cleanup policy for a DGDR's profiling
// resources. It performs no deletions itself and always returns nil.
//
// What happens to related resources when a DGDR is deleted:
//   - Profiling Job: removed automatically through its ownerReference (set by SyncResource)
//   - Output ConfigMap: kept (no ownerReference) because it holds valuable profiling data
//   - Auto-created DGD: kept (no ownerReference) because it may be serving traffic
//
// Relationships are tracked with labels (LabelDGDRName) instead of cascade
// deletion, so leftover ConfigMaps and DGDs can be removed manually with
// label selectors:
//
//	kubectl delete configmap -l dgdr.nvidia.com/name=<dgdr-name>
//	kubectl delete dynamographdeployment -l dgdr.nvidia.com/name=<dgdr-name>
func (r *DynamoGraphDeploymentRequestReconciler) cleanupProfilingResources(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
	l := log.FromContext(ctx)

	l.Info("Cleaning up profiling resources", "name", dgdr.Name)
	l.Info("Profiling job will be automatically deleted via ownerReference")

	return nil
}
// updateStateAndRequeue updates the DGDR state and requeues // updateStateAndRequeue updates the DGDR state and requeues
func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, state, _ string) (ctrl.Result, error) { func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, state, _ string) (ctrl.Result, error) {
dgdr.Status.State = state dgdr.Status.State = state
......
...@@ -72,9 +72,8 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -72,9 +72,8 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
BeforeEach(func() { BeforeEach(func() {
recorder = record.NewFakeRecorder(100) recorder = record.NewFakeRecorder(100)
reconciler = &DynamoGraphDeploymentRequestReconciler{ reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient, Client: k8sClient,
Recorder: recorder, Recorder: recorder,
ProfilerImage: "test-profiler:latest",
Config: commonController.Config{ Config: commonController.Config{
RestrictedNamespace: "", RestrictedNamespace: "",
RBAC: commonController.RBACConfig{ RBAC: commonController.RBACConfig{
...@@ -97,11 +96,13 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -97,11 +96,13 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{ "engine": map[string]interface{}{
"backend": "vllm", "config": "/tmp/test-config.yaml",
"config": "/tmp/test-config.yaml",
}, },
"sla": map[string]interface{}{ "sla": map[string]interface{}{
"ttft": 100.0, "ttft": 100.0,
...@@ -143,9 +144,9 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -143,9 +144,9 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(updated.Status.ObservedGeneration).Should(Equal(updated.Generation)) Expect(updated.Status.ObservedGeneration).Should(Equal(updated.Generation))
}) })
It("Should fail validation with missing config", func() { It("Should pass validation with minimal config", func() {
ctx := context.Background() ctx := context.Background()
dgdrName := "test-dgdr-invalid" dgdrName := "test-dgdr-minimal"
namespace := "default" namespace := "default"
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
...@@ -154,8 +155,16 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -154,8 +155,16 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{}), Config: createTestConfig(map[string]interface{}{
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
},
}),
}, },
}, },
} }
...@@ -163,7 +172,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -163,7 +172,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer k8sClient.Delete(ctx, dgdr) defer k8sClient.Delete(ctx, dgdr)
// Reconcile // Reconcile - should succeed with minimal config
_, err := reconciler.Reconcile(ctx, reconcile.Request{ _, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{ NamespacedName: types.NamespacedName{
Name: dgdrName, Name: dgdrName,
...@@ -172,12 +181,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -172,12 +181,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
}) })
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Check status transitions to Failed // Check status transitions to Pending (not Failed)
Eventually(func() string { Eventually(func() string {
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
return updated.Status.State return updated.Status.State
}, timeout, interval).Should(Equal(StateFailed)) }, timeout, interval).Should(Equal(StatePending))
}) })
}) })
...@@ -216,10 +225,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -216,10 +225,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{ "engine": map[string]interface{}{
"backend": "vllm",
"profiler_image": "test-profiler:latest", "profiler_image": "test-profiler:latest",
}, },
"sla": map[string]interface{}{ "sla": map[string]interface{}{
...@@ -313,10 +324,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -313,10 +324,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{ "engine": map[string]interface{}{
"backend": "trtllm",
"config": "/tmp/test-config.yaml", "config": "/tmp/test-config.yaml",
"profiler_image": "test-profiler:latest", "profiler_image": "test-profiler:latest",
}, },
...@@ -386,11 +399,13 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -386,11 +399,13 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{ "engine": map[string]interface{}{
"backend": "vllm", "config": "/tmp/test-config.yaml",
"config": "/tmp/test-config.yaml",
}, },
"sla": map[string]interface{}{ "sla": map[string]interface{}{
"ttft": 100.0, "ttft": 100.0,
...@@ -498,11 +513,13 @@ spec: ...@@ -498,11 +513,13 @@ spec:
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{ "engine": map[string]interface{}{
"backend": "vllm", "config": "/tmp/test-config.yaml",
"config": "/tmp/test-config.yaml",
}, },
"sla": map[string]interface{}{ "sla": map[string]interface{}{
"ttft": 100.0, "ttft": 100.0,
...@@ -626,11 +643,13 @@ spec: ...@@ -626,11 +643,13 @@ spec:
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{ "engine": map[string]interface{}{
"backend": "vllm", "config": "/tmp/test-config.yaml",
"config": "/tmp/test-config.yaml",
}, },
"sla": map[string]interface{}{ "sla": map[string]interface{}{
"ttft": 100.0, "ttft": 100.0,
...@@ -707,11 +726,13 @@ spec: ...@@ -707,11 +726,13 @@ spec:
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{ "engine": map[string]interface{}{
"backend": "vllm", "config": "/tmp/test-config.yaml",
"config": "/tmp/test-config.yaml",
}, },
"sla": map[string]interface{}{ "sla": map[string]interface{}{
"ttft": 100.0, "ttft": 100.0,
...@@ -852,11 +873,13 @@ var _ = Describe("DGDR Validation", func() { ...@@ -852,11 +873,13 @@ var _ = Describe("DGDR Validation", func() {
ctx := context.Background() ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{ "engine": map[string]interface{}{
"backend": "vllm", "config": "/tmp/test-config.yaml",
"config": "/tmp/test-config.yaml",
}, },
"sla": map[string]interface{}{ "sla": map[string]interface{}{
"ttft": 100.0, "ttft": 100.0,
...@@ -873,25 +896,13 @@ var _ = Describe("DGDR Validation", func() { ...@@ -873,25 +896,13 @@ var _ = Describe("DGDR Validation", func() {
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
}) })
It("Should fail validation when config is empty", func() { It("Should pass validation with minimal config", func() {
ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{}),
},
},
}
err := reconciler.validateSpec(ctx, dgdr)
Expect(err).To(HaveOccurred())
Expect(err.Error()).Should(ContainSubstring("config"))
})
It("Should fail validation when engine section is missing", func() {
ctx := context.Background() ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"sla": map[string]interface{}{ "sla": map[string]interface{}{
...@@ -903,32 +914,9 @@ var _ = Describe("DGDR Validation", func() { ...@@ -903,32 +914,9 @@ var _ = Describe("DGDR Validation", func() {
}, },
} }
// Validation should pass - profiler will auto-generate missing config
err := reconciler.validateSpec(ctx, dgdr) err := reconciler.validateSpec(ctx, dgdr)
Expect(err).To(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
Expect(err.Error()).Should(ContainSubstring("engine"))
})
It("Should fail validation when engine.config and configMapRef are both missing", func() {
ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
},
}),
},
},
}
err := reconciler.validateSpec(ctx, dgdr)
Expect(err).To(HaveOccurred())
Expect(err.Error()).Should(ContainSubstring("engine.config"))
}) })
}) })
}) })
...@@ -938,9 +926,8 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -938,9 +926,8 @@ var _ = Describe("DGDR Profiler Arguments", func() {
BeforeEach(func() { BeforeEach(func() {
reconciler = &DynamoGraphDeploymentRequestReconciler{ reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient, Client: k8sClient,
Recorder: record.NewFakeRecorder(100), Recorder: record.NewFakeRecorder(100),
ProfilerImage: "test-profiler:latest",
Config: commonController.Config{ Config: commonController.Config{
RestrictedNamespace: "", RestrictedNamespace: "",
}, },
...@@ -970,10 +957,12 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -970,10 +957,12 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{ "engine": map[string]interface{}{
"backend": "trtllm",
"config": "/tmp/test-config.yaml", "config": "/tmp/test-config.yaml",
"profiler_image": "test-profiler:latest", "profiler_image": "test-profiler:latest",
}, },
...@@ -1044,10 +1033,12 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1044,10 +1033,12 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{ "engine": map[string]interface{}{
"backend": "trtllm",
"config": "/tmp/test-config.yaml", "config": "/tmp/test-config.yaml",
"profiler_image": "test-profiler:latest", "profiler_image": "test-profiler:latest",
}, },
...@@ -1109,9 +1100,8 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1109,9 +1100,8 @@ var _ = Describe("DGDR Error Handling", func() {
BeforeEach(func() { BeforeEach(func() {
recorder = record.NewFakeRecorder(100) recorder = record.NewFakeRecorder(100)
reconciler = &DynamoGraphDeploymentRequestReconciler{ reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient, Client: k8sClient,
Recorder: recorder, Recorder: recorder,
ProfilerImage: "test-profiler:latest",
Config: commonController.Config{ Config: commonController.Config{
RestrictedNamespace: "", RestrictedNamespace: "",
}, },
...@@ -1131,11 +1121,13 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1131,11 +1121,13 @@ var _ = Describe("DGDR Error Handling", func() {
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{ ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{ Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{ "engine": map[string]interface{}{
"backend": "vllm", "config": "/tmp/test-config.yaml",
"config": "/tmp/test-config.yaml",
}, },
"sla": map[string]interface{}{ "sla": map[string]interface{}{
"ttft": 100.0, "ttft": 100.0,
......
...@@ -119,7 +119,7 @@ python3 -m deploy.utils.download_pvc_results \ ...@@ -119,7 +119,7 @@ python3 -m deploy.utils.download_pvc_results \
For complete benchmarking and profiling workflows: For complete benchmarking and profiling workflows:
- **Benchmarking Guide**: See [docs/benchmarks/benchmarking.md](../../docs/benchmarks/benchmarking.md) for comparing DynamoGraphDeployments and external endpoints - **Benchmarking Guide**: See [docs/benchmarks/benchmarking.md](../../docs/benchmarks/benchmarking.md) for comparing DynamoGraphDeployments and external endpoints
- **Pre-Deployment Profiling**: See [docs/benchmarks/pre_deployment_profiling.md](../../docs/benchmarks/pre_deployment_profiling.md) for optimizing configurations before deployment - **Pre-Deployment Profiling**: See [docs/benchmarks/sla_driven_profiling.md](../../docs/benchmarks/sla_driven_profiling.md) for optimizing configurations before deployment
## Notes ## Notes
......
...@@ -248,6 +248,26 @@ class DynamoDeploymentClient: ...@@ -248,6 +248,26 @@ class DynamoDeploymentClient:
self.deployment_spec["metadata"]["name"] = self.deployment_name self.deployment_spec["metadata"]["name"] = self.deployment_name
self.deployment_spec["metadata"]["namespace"] = self.namespace self.deployment_spec["metadata"]["namespace"] = self.namespace
# Add ownerReference if env vars are set (for temporary DGDs during profiling)
# This makes the DGD auto-delete when the DGDR is deleted
dgdr_name = os.environ.get("DGDR_NAME")
dgdr_namespace = os.environ.get("DGDR_NAMESPACE")
dgdr_uid = os.environ.get("DGDR_UID")
if dgdr_name and dgdr_namespace and dgdr_uid:
if self.namespace == dgdr_namespace:
self.deployment_spec["metadata"]["ownerReferences"] = [
{
"apiVersion": "nvidia.com/v1alpha1",
"kind": "DynamoGraphDeploymentRequest",
"name": dgdr_name,
"uid": dgdr_uid,
"controller": False,
"blockOwnerDeletion": True,
}
]
print(f"Added ownerReference to DGDR {dgdr_name} for auto-cleanup")
try: try:
await self.custom_api.create_namespaced_custom_object( await self.custom_api.create_namespaced_custom_object(
group="nvidia.com", group="nvidia.com",
......
# Pre-Deployment Profiling
> [!TIP]
> **New to SLA Planner?** For a complete workflow including profiling and deployment, see the [SLA Planner Quick Start Guide](/docs/planner/sla_planner_quickstart.md).
## Profiling Script
To ensure Dynamo deployments comply with the SLA, we provide a pre-deployment script to profile the model performance with different parallelization mappings and recommend the parallelization mapping for prefill and decode workers and planner configurations. To use this script, the user needs to provide the target ISL, OSL, TTFT SLA, and ITL SLA.
> [!NOTE]
> **Time Investment**: This profiling process is comprehensive and typically takes **a few hours** to complete. The script systematically tests multiple tensor parallelism configurations and load conditions to find optimal performance settings. This upfront investment ensures your deployment meets SLA requirements and operates efficiently.
Support matrix:
| Backends | Model Types | Supported |
| --- | --- | --- |
| vLLM | Dense | ✅ |
| vLLM | MoE | 🚧 |
| SGLang | Dense | ✅ |
| SGLang | MoE | ✅ |
| TensorRT-LLM | Dense | ✅ |
| TensorRT-LLM | MoE | 🚧 |
> [!NOTE]
> The script considers a fixed ISL/OSL without KV cache reuse. If the real ISL/OSL has a large variance or a significant amount of KV cache can be reused, the result might be inaccurate.
We assume there are no piggy-backed prefill requests in the decode engine. Even if there are some short piggy-backed prefill requests in the decode engine, they should not affect the ITL much under most conditions. However, if there are too many piggy-backed prefill requests, the ITL might be inaccurate.
The script first detects the number of available GPUs on the current nodes (multi-node engines are not supported yet). Then, it profiles the prefill and decode performance with different TP sizes. For prefill, since there is no in-flight batching (assuming the ISL is long enough to saturate the GPU), the script directly measures the TTFT for a request with the given ISL without KV reuse. For decode, since the ITL (or iteration time) depends on how many requests are in flight, the script measures the ITL under different numbers of in-flight requests. The number of in-flight requests ranges from 1 to the maximum number of requests that the KV cache of the engine can hold. To measure the ITL without being affected by piggy-backed prefill requests, the script enables KV reuse and warms up the engine by issuing the same prompts before measuring the ITL. Since the KV cache is sufficient for all the requests, it can hold the KV cache of the pre-computed prompts and skip the prefill phase when measuring the ITL.
### GPU Resource Usage
**Important**: Profiling tests different tensor parallelism (TP) configurations **sequentially**, not in parallel. This means:
- **One TP configuration at a time**: Each tensor parallelism size (TP1, TP2, TP4, TP8, etc.) is tested individually
- **Full GPU access**: Each TP configuration gets exclusive access to all available GPUs during its profiling run
- **Resource isolation**: No interference between different TP configurations during testing
- **Accurate measurements**: Each configuration is profiled under identical resource conditions
This sequential approach ensures:
- **Precise performance profiling** without resource conflicts
- **Consistent GPU allocation** for fair comparison across TP sizes
- **Reliable cleanup** between different TP configuration tests
- **Accurate SLA compliance verification** for each configuration
After the profiling finishes, two plots will be generated in the `output-dir`. For example, here are the profiling results for `components/backends/vllm/deploy/disagg.yaml`:
![Prefill Performance](../../docs/images/h100_prefill_performance.png)
![Decode Performance](../../docs/images/h100_decode_performance.png)
For the prefill performance, the script will plot the TTFT for different TP sizes and select the best TP size that meets the target TTFT SLA and delivers the best throughput per GPU. Based on how close the TTFT of the selected TP size is to the SLA, the script will also recommend the upper and lower bounds of the prefill queue size to be used in the planner.
For the decode performance, the script will plot the ITL for different TP sizes and different in-flight requests. Similarly, it will select the best point that satisfies the ITL SLA and delivers the best throughput per GPU and recommend the upper and lower bounds of the kv cache utilization rate to be used in planner.
The script will recommend the best TP size for prefill and decode, as well as the upper and lower bounds of the prefill queue size and decode kv cache utilization rate if using load-based planner. The following information will be printed out in the terminal:
```
2025-05-16 15:20:24 - __main__ - INFO - Analyzing results and generate recommendations...
2025-05-16 15:20:24 - __main__ - INFO - Suggested prefill TP:4 (TTFT 48.37 ms, throughput 15505.23 tokens/s/GPU)
2025-05-16 15:20:24 - __main__ - INFO - Suggested planner upper/lower bound for prefill queue size: 0.24/0.10
2025-05-16 15:20:24 - __main__ - INFO - Suggested decode TP:4 (ITL 4.83 ms, throughput 51.22 tokens/s/GPU)
2025-05-16 15:20:24 - __main__ - INFO - Suggested planner upper/lower bound for decode kv cache utilization: 0.20/0.10
```
After finding the best TP size for prefill and decode, the script will then interpolate the TTFT with ISL and ITL with active KV cache and decode context length. This is to provide a more accurate estimation of the performance when ISL and OSL change and will be used in the sla-planner. The results will be saved to `<output_dir>/<decode/prefill>_tp<best_tp>_interpolation`. Please change the prefill and decode TP size in the config file to match the best TP sizes obtained from the profiling script.
### Prefill Interpolation Data
In the prefill engine, prefills are usually done with batch size=1 and only the ISL (excluding prefix cache hit) affects the iteration time. The script profiles the selected prefill TP configuration across different ISLs and records the TTFT and prefill throughput per GPU under those ISLs.
For dense models, the script profiles different TP sizes.
For MoE models, the script only profiles different TEP sizes, since DEP is generally not the optimal prefill configuration.
### Decode Interpolation Data
In the decode engine, decode requests are added in flight, and the iteration time (or ITL) depends on both the context length and the real-time load of the engine. We capture the real-time load of the engine with the active KV usage and the average context length. The active KV usage determines the complexity of the memory-bound attention kernel, while the active KV usage divided by the average context length determines the complexity of the compute-bound MLP kernel. For example, the figure below shows the ITL of the DS-Distilled Llama 8B model on H100 TP4. The ITL grows near-linearly with active KV usage under a fixed context length, and the slope increases as the context length decreases.
For dense models, the script profiles different TP sizes.
For MoE models, the script profiles different DEP sizes. TEP decode engines for low latency will be supported in the future.
![images](../../docs/images/itl_interpolation.png)
The script profiles the selected decode TP configuration across different active kv blocks and average context length.
### Output Format of Interpolation Data
After suggesting the optimal TP configuration, two `.npz` files that describe the performance characteristics of the prefill and decode engines in their suggested parallel configurations will be generated. The two `.npz` files are:
* `${benchmark_result_dir}/selected_prefill_interpolation/raw_data.npz`
* `prefill_isl`: a 1D Numpy array to store the ISLs used to profile the prefill engine.
* `prefill_ttft`: a 1D Numpy array to store the TTFTs under the corresponding ISLs when the prefill engine is exclusively running each prefill request (i.e., with batch size of 1). The unit is in milliseconds.
* `prefill_thpt_per_gpu`: a 1D Numpy array to store the prefill throughput per GPU under the corresponding ISLs. The unit is in tokens per second per GPU.
* `${benchmark_result_dir}/selected_decode_interpolation/raw_data.npz`
* `max_kv_tokens`: a 1D Numpy array with only one element to store the total number of KV tokens in the decode engine.
* `x_kv_usage`: a 1D Numpy array to store the percentage of the active KV blocks (in the range of [0, 1]) used to profile the decode engine. The active KV blocks can be controlled by varying `(ISL + OSL / 2) * concurrency`.
* `y_context_length`: a 1D Numpy array to store the average context length (ISL + OSL / 2) used to profile the decode engine.
* `z_itl`: a 1D Numpy array to store the ITLs under the corresponding active KV usage and context length. To skip the prefill stage while maintaining the context length, the benchmark can be run by turning on KV reuse and warming up the engine with the prompts before running the actual profiling. The unit is in milliseconds.
* `z_thpt_per_gpu`: a 1D Numpy array to store the decode throughput per GPU under the corresponding active KV usage and context length. The unit is in tokens per second per GPU.
SLA planner can work with any interpolation data that follows the above format. For best results, use fine-grained and high coverage interpolation data for the prefill and decode engines.
## Detailed Kubernetes Profiling Instructions
> [!TIP]
> For a complete step-by-step workflow, see the [SLA Planner Quick Start Guide](/docs/planner/sla_planner_quickstart.md).
This section provides detailed technical information for advanced users who need to customize the profiling process.
### Configuration Options
**For dense models**, configure `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml`:
```yaml
spec:
template:
spec:
containers:
- name: profile-sla
args:
- --isl
- "3000" # average ISL is 3000 tokens
- --osl
- "150" # average OSL is 150 tokens
- --ttft
- "200" # target TTFT is 200ms (float, in milliseconds)
- --itl
- "20" # target ITL is 20ms (float, in milliseconds)
- --backend
- <vllm/sglang>
```
**For MoE models**, use `profile_sla_moe_job.yaml` with TEP/DEP configuration instead.
### Auto-Configuration
To automatically configure the profiling job based on the hardware and model information, supply the `--model` argument to the profiling script. The following arguments will be automatically set:
- `--config`: will use the default config file (`components/backends/<backend>/deploy/disagg.yaml`) with model updated to the provided model name
- `--min-num-gpus-per-engine`: will be set to the minimum number of GPUs per engine based on the model size and hardware information
- `--max-num-gpus-per-engine`: will be set to the maximum number of GPUs per engine based on the model size and hardware information
- `--num-gpus-per-node`: will be set to the number of GPUs per node based on the hardware information
- `--is-moe-model`: will be set based on the HF config file
- `--max-context-length`: will be set to the maximum context length supported by the model based on the HF config file
### Advanced Configuration
- **Model caching**: For large models, create a multi-attach PVC to cache the model. See [recipes](../../recipes/README.md) for details.
- **Custom disaggregated configurations**: Use the manifest injector to place custom DGD configurations in the PVC.
- **Planner Config Passthrough**: To specify custom planner configurations (e.g., `adjustment-interval` or `load-predictor`) in the generated or deployed DGD config, add a `planner-` prefix to the argument. For example, to specify `--adjustment-interval=60` in SLA planner, add `--planner-adjustment-interval=60` arg to the profiling job.
- **Resource allocation**: Modify the job YAML to adjust GPU and memory requirements.
### Viewing Profiling Results
After the profiling job completes successfully, the results are stored in the persistent volume claim (PVC) created during Step 2.
To download the results:
```bash
# Download to directory
python3 -m deploy.utils.download_pvc_results --namespace $NAMESPACE --output-dir ./results --folder /data/profiling_results
# Download without any of the auto-created config.yaml files used in profiling
python3 -m deploy.utils.download_pvc_results --namespace $NAMESPACE --output-dir ./results --folder /data/profiling_results --no-config
```
The script will:
* Deploy a temporary access pod
* Download all files maintaining directory structure
* Clean the pod up automatically
#### File Structure
The profiling results directory contains the following structure:
```
/workspace/data/profiling_results/
├── prefill_performance.png # Main prefill performance plot
├── decode_performance.png # Main decode performance plot
├── prefill_tp1/ # Individual TP profiling directories
...
├── decode_tp1/
...
├── selected_prefill_interpolation/
│ ├── raw_data.npz # Prefill interpolation data
│ ├── prefill_ttft_interpolation.png # TTFT vs ISL plot
│ └── prefill_throughput_interpolation.png # Throughput vs ISL plot
├── selected_decode_interpolation/
│ ├── raw_data.npz # Decode interpolation data
│ └── decode_tp{best_tp}.png # 3D ITL surface plot
└── config_with_planner.yaml # Generated DGD config with planner
```
#### Viewing Performance Plots
The profiling generates several performance visualization files:
**Main Performance Plots:**
- **`prefill_performance.png`**: Shows TTFT (Time To First Token) performance across different tensor parallelism (TP) sizes
- **`decode_performance.png`**: Shows ITL (Inter-Token Latency) performance across different TP sizes and in-flight request counts
**Interpolation Plots:**
- **`selected_prefill_interpolation/prefill_ttft_interpolation.png`**: TTFT vs Input Sequence Length with quadratic fit
- **`selected_prefill_interpolation/prefill_throughput_interpolation.png`**: Prefill throughput vs Input Sequence Length
- **`selected_decode_interpolation/decode_tp{best_tp}.png`**: 3D surface plot showing ITL vs KV usage and context length
#### Understanding the Data Files
The `.npz` files contain raw profiling data that can be loaded and analyzed using Python:
```python
import numpy as np
# Load prefill data
prefill_data = np.load('selected_prefill_interpolation/raw_data.npz')
print("Prefill data keys:", list(prefill_data.keys()))
# Load decode data
decode_data = np.load('selected_decode_interpolation/raw_data.npz')
print("Decode data keys:", list(decode_data.keys()))
```
### Troubleshooting
#### Image Pull Authentication Errors
If you see `ErrImagePull` or `ImagePullBackOff` errors with 401 unauthorized messages:
1. Ensure the `nvcr-imagepullsecret` exists in your namespace:
```bash
kubectl get secret nvcr-imagepullsecret -n $NAMESPACE
```
2. Verify the service account was created with the image pull secret:
```bash
kubectl get serviceaccount dgdr-profiling-job -n $NAMESPACE -o yaml
```
3. The service account should show `imagePullSecrets` containing `nvcr-imagepullsecret`.
If it doesn't, create the secret:
```bash
export NGC_API_KEY=<your-ngc-api-key-here>
kubectl create secret docker-registry nvcr-imagepullsecret --docker-server=nvcr.io --docker-username='$oauthtoken' --docker-password=$NGC_API_KEY
```
## Running the Profiling Script with AI Configurator
> [!NOTE]
> **TensorRT-LLM Only**: AI Configurator currently supports TensorRT-LLM only. Support for vLLM and SGLang is coming soon.
The profiling script can be run much faster using AI Configurator to estimate performance numbers instead of running real Dynamo deployments. This completes profiling in 20-30 seconds using performance simulation.
**Advantages** of `--use-ai-configurator`:
* Script completes in seconds rather than hours
* No Kubernetes or GPU access required
* Ideal for rapid prototyping and testing
**Disadvantages**:
* Estimated performance may contain errors, especially for out-of-distribution input dimensions
* Limited list of supported models, systems, and backends
* Less accurate than real deployment profiling
### Prerequisites
Install AI Configurator:
```bash
pip install aiconfigurator
```
If using local environment, also install:
```bash
pip install -r deploy/utils/requirements.txt
```
### Check Support Matrix
View supported models, systems, and backends:
```bash
aiconfigurator cli --help
```
**Supported configurations:**
```
Models: GPT_7B, GPT_13B, GPT_30B, GPT_66B, GPT_175B, LLAMA2_7B, LLAMA2_13B, LLAMA2_70B, LLAMA3.1_8B, LLAMA3.1_70B, LLAMA3.1_405B, MOE_Mixtral8x7B, MOE_Mixtral8x22B, DEEPSEEK_V3, KIMI_K2, QWEN2.5_1.5B, QWEN2.5_7B, QWEN2.5_32B, QWEN2.5_72B, QWEN3_32B, QWEN3_235B, QWEN3_480B, Nemotron_super_v1.1
Systems: h100_sxm, h200_sxm
Backends: trtllm (vllm and sglang support coming soon)
```
### Running Fast Profiling
Example command for TensorRT-LLM:
```bash
# --aic-backend is optional; --backend is used if it is not provided.
# --ttft and --itl are the target TTFT and ITL in milliseconds (float).
python3 -m benchmarks.profiler.profile_sla \
--config ./components/backends/trtllm/deploy/disagg.yaml \
--backend trtllm \
--use-ai-configurator \
--aic-system h200_sxm \
--aic-model-name QWEN3_32B \
--aic-backend trtllm \
--aic-backend-version 0.20.0 \
--isl 3000 \
--osl 150 \
--ttft 200 \
--itl 20
```
The output will be written to `./profiling_results/` and can be used directly with SLA planner deployment.
# SLA-Driven Profiling with DynamoGraphDeploymentRequest
> [!TIP]
> **New to DGDR and SLA-Driven Profiling?** Start with the [SLA-Driven Profiling and Planner Deployment Quick Start Guide](/docs/planner/sla_planner_quickstart.md) for step-by-step instructions. This document provides deeper technical details about the profiling process.
## Overview
Dynamo provides automated SLA-driven profiling through **DynamoGraphDeploymentRequests (DGDR)**. Instead of manually running profiling scripts, you declare your performance requirements and let the Dynamo Operator handle profiling and deployment automatically.
**Key Benefits:**
- **Declarative**: Specify SLAs, not implementation details
- **Automated**: No manual job setup or result processing
- **Integrated**: Seamlessly works with Dynamo Operator
- **Production-Ready**: Generates optimized configurations with SLA planner
This document covers:
- Technical details of online vs offline profiling
- Profiling process internals (GPU usage, measurements, interpolation)
- Direct script usage for advanced scenarios
- Comprehensive troubleshooting
## Support Matrix
| Backend | Dense Models (P:TP, D:TP) | MoE Models (P:TEP, D:DEP) |
|---------|-------------|------------|
| vLLM | ✅ | 🚧 |
| SGLang | ✅ | ✅ |
| TensorRT-LLM | ✅ | 🚧 |
> [!NOTE]
> - We only support multi-node engines for MoE models.
> - For MoE models, we currently only support deepseek-style MLA+MoE models. For other MoE models like GQA+MoE, please use the dense mode (sweep over TP sizes) instead.
> - Exact model x parallelization mapping support is dependent on the backend. The profiler does not guarantee that the recommended P/D engine configuration is supported and bug-free by the backend.
## Using DGDR for Profiling (Recommended)
The recommended way to profile models is through DGDRs. Sample configurations are provided in `deploy/`:
**Available Samples:**
- **`profile_sla_dgdr.yaml`**: Standard profiling with AIPerf on real engines
- **`profile_sla_aic_dgdr.yaml`**: Fast profiling with AI Configurator simulation
- **`profile_sla_moe_dgdr.yaml`**: MoE model profiling
The Dynamo Operator automatically:
1. Discovers GPU resources
2. Runs profiling (AIPerf on real engines or AI Configurator simulation)
3. Generates optimal DGD configuration with SLA planner
4. Deploys the DGD to your cluster
See the [Quick Start Guide](/docs/planner/sla_planner_quickstart.md) for prerequisites and detailed instructions.
## Profiling Method
1. **GPU Discovery**: Detects available GPUs and their specifications
2. **Identify Sweep Ranges**: Automatically determine minimum and maximum number of GPUs per engine. Minimum is determined by the model size and GPU VRAM. Maximum is set to one node for dense models and 4 nodes for MoE models.
3. **Parallelization Mapping Sweep**: Using the input ISL and OSL, test the performance of the engines with different parallelization mappings. For dense models, we test different TP sizes for both prefill and decode. For MoE models, we test different TEP sizes for prefill and DEP sizes for decode.
- **Prefill**: For prefill, since there is no in-flight batching (assume isl is long enough to saturate the GPU), we directly measure the TTFT for a request with given isl without kv-reusing. For example, the below plot shows the prefill parallelization mapping sweep results for H100 for deepseek-ai/DeepSeek-R1-Distill-Llama-8B.
![Prefill Performance](/docs/images/h100_prefill_performance.png)
- **Decode**: Since the ITL (or iteration time) depends on how many requests are in-flight, we measure the ITL under different numbers of in-flight requests. The range of the number of in-flight requests is from 1 to the maximum number of requests that the kv cache of the engine can hold. To measure the ITL without being affected by piggy-backed prefill requests, the script will enable kv-reuse and warm up the engine by issuing the same prompts before measuring the ITL. Since the kv cache is sufficient for all the requests, it can hold the kv cache of the pre-computed prompts and skip the prefill phase when measuring the ITL. However, for MoE models, this is not guaranteed because the kv cache in different attention DP ranks is different. We are working on a framework-side change to fix this issue. For example, the below plot shows the decode parallelization mapping sweep results for H100 for deepseek-ai/DeepSeek-R1-Distill-Llama-8B.
![Decode Performance](/docs/images/h100_decode_performance.png)
4. **Recommendation**: Selects the optimal parallelization mapping for prefill and decode that achieves the highest per-GPU throughput while adhering to the SLA on TTFT and ITL. Specifically, the profiler will choose the point (or a point on the curve for decode) that lies to the left of the vertical red dashed line representing the SLA and has the highest y coordinate (throughput per GPU).
5. **In-Depth Profiling on the Recommended P/D Engine**: After finding the best TP size for prefill and decode, the script will then interpolate the TTFT with ISL and ITL with active KV cache and decode context length. This is to provide a more accurate estimation of the performance when ISL and OSL change and will be used in the sla-planner.
![ITL Interpolation](/docs/images/pd_interpolation.png)
- **Prefill**: Measures TTFT and throughput per GPU across different input lengths with batch size=1.
- **Decode**: Measures ITL and throughput per GPU under various KV cache loads and decode context lengths. The active KV usage determines the complexity of the memory-bound attention kernel, while the active KV usage divided by the average context length determines the complexity of the compute-bound MLP kernel. For example, the figure below shows the ITL of the DS-Distilled Llama 8B model on H100 TP4. The ITL grows near-linearly with active KV usage under a fixed context length, and the slope increases as the context length decreases.
To run the parallelization mapping sweep and the in-depth profiling on the recommended P/D engine, the profiler needs to know the engine's forward pass time with different loads. There are two ways to achieve this: run AIPerf on real engines or use AI Configurator to run simulations.
### AIPerf on Real Engines
Profiles your model by creating real test deployments in Kubernetes and measuring their performance.
**Characteristics:**
- **Duration**: 2-4 hours
- **Accuracy**: Highest (real measurements)
- **GPU Requirements**: Full access to test different parallelization mappings
- **Backends**: vLLM, SGLang, TensorRT-LLM
**DGDR Configuration:**
```yaml
profilingConfig:
config:
sweep:
use_ai_configurator: false # Default
```
### AI Configurator Simulation
Uses performance simulation to rapidly estimate optimal configurations without running real deployments.
**Characteristics:**
- **Duration**: 20-30 seconds
- **Accuracy**: Estimated (may have errors for unusual configurations)
- **GPU Requirements**: None
- **Backends**: TensorRT-LLM only (vLLM/SGLang coming soon)
**DGDR Configuration:**
```yaml
profilingConfig:
config:
sweep:
use_ai_configurator: true
aic:
system: h200_sxm # GPU system type
model_name: QWEN3_32B # AIC model identifier
backend_version: "0.20.0"
```
**Supported Configurations:**
For the current list of supported models, systems, and backend versions, see the [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features).
To check from the command line: `aiconfigurator cli --help`
**Currently supports:**
- **Backends**: TensorRT-LLM (versions 0.20.0, 1.0.0rc3, 1.0.0rc6)
- **Systems**: H100 SXM, H200 SXM, B200 SXM, GB200 SXM, A100 SXM
- **Models**: Wide range including GPT, Llama, Mixtral, DeepSeek, Qwen, and more
### Output Format
After profiling, the DGDR status contains:
1. **Recommended Configuration**: Optimal TP for prefill and decode
2. **Performance Data**: Interpolation models for SLA planner
3. **Generated DGD**: Complete deployment manifest
**Example Recommendations:**
```
Suggested prefill TP:4 (TTFT 48.37 ms, throughput 15505.23 tokens/s/GPU)
Suggested decode TP:4 (ITL 4.83 ms, throughput 51.22 tokens/s/GPU)
```
#### Output Performance Plots
The profiler will generate the following plots to better visualize the performance data:
**Parallelization Mapping Sweep Plots:**
- `prefill_performance.png`: TTFT vs Parallelization Mapping size
- `decode_performance.png`: ITL vs Parallelization Mapping size and in-flight requests
Note these two plots are based on the input ISL and OSL.
**In-Depth Profiling for the Recommended P/D Engine Plots:**
- `selected_prefill_interpolation/prefill_ttft_interpolation.png`: TTFT vs ISL for the recommended prefill engine
- `selected_prefill_interpolation/prefill_throughput_interpolation.png`: Throughput vs ISL for the recommended prefill engine
- `selected_decode_interpolation/decode_itl_interplation.png`: ITL vs KV usage and context length for the recommended decode engine
- `selected_decode_interpolation/decode_throughput_interpolation.png`: Throughput vs KV usage and context length for the recommended decode engine
### Output Interpolation Data
The profiler generates `.npz` files to store the performance data for the recommended P/D engine:
**Prefill Interpolation** (`selected_prefill_interpolation/raw_data.npz`):
- `prefill_isl`: 1D array of input sequence lengths tested
- `prefill_ttft`: 1D array of TTFTs (ms) at each ISL
- `prefill_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each ISL
**Decode Interpolation** (`selected_decode_interpolation/raw_data.npz`):
- `max_kv_tokens`: Total KV tokens capacity in decode engine
- `x_kv_usage`: 1D array of active KV usage percentages [0, 1]
- `y_context_length`: 1D array of average context lengths tested
- `z_itl`: 1D array of ITLs (ms) at each (KV usage, context length) point
- `z_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each point
## DGDR Configuration Reference
This section provides detailed explanations of all DGDR `profilingConfig` options. The DGDR controller passes this configuration to the profiler script, which is defined in `benchmarks/profiler/utils/profiler_argparse.py`.
### Configuration Structure
All profiler configuration goes under `spec.profilingConfig.config`:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
name: my-deployment
spec:
model: "Qwen/Qwen3-0.6B" # High-level: model to deploy
backend: vllm # High-level: inference backend
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" # Required
configMapRef: # Optional: base DGD config
name: my-config
key: disagg.yaml
config: # Profiler configuration
sla: { ... }
hardware: { ... }
sweep: { ... }
aic: { ... }
planner: { ... }
deploymentOverrides: # Optional
workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
```
### SLA Configuration (Required)
Define your performance requirements and workload characteristics:
```yaml
profilingConfig:
config:
sla:
isl: 3000 # Average input sequence length (tokens)
osl: 150 # Average output sequence length (tokens)
ttft: 200.0 # Target Time To First Token (milliseconds)
itl: 20.0 # Target Inter-Token Latency (milliseconds)
```
**What these control:**
- **ISL/OSL**: Based on your expected traffic patterns
- **TTFT**: First token latency target (lower = more GPUs needed, affects prefill engine)
- **ITL**: Token generation latency target (lower = more GPUs needed, affects decode engine)
- **Trade-offs**: Tighter SLAs require more GPU resources
### Hardware Configuration (Optional)
Control GPU search space and constraints:
```yaml
profilingConfig:
config:
hardware:
min_num_gpus_per_engine: 2 # if not provided, will automatically determine based on model and VRAM size
max_num_gpus_per_engine: 8 # Maximum GPUs to test
num_gpus_per_node: 8 # GPUs per node (for multi-node MoE)
gpu_type: h200_sxm # GPU type hint
```
**When to use:**
- **min_num_gpus_per_engine**: Skip small TP sizes if your model is large
- **max_num_gpus_per_engine**: Limit search space or work around constraints (e.g., [AIC attention heads](#ai-configurator-attention-head-constraint-error))
- **num_gpus_per_node**: Required for MoE models with TEP/DEP sizing
- **gpu_type**: Informational, auto-detected by controller
> [!TIP]
> If you don't specify hardware constraints, the controller auto-detects based on your model size and available cluster resources.
### Sweep Configuration (Optional)
Control profiling behavior:
```yaml
profilingConfig:
config:
sweep:
use_ai_configurator: false # Use offline profiling (default: false)
prefill_interpolation_granularity: 16 # Samples for prefill TTFT curve
decode_interpolation_granularity: 6 # Samples for decode ITL curve
```
**Use cases:**
- **use_ai_configurator**: Set to `true` for 20-30 second profiling (TensorRT-LLM only)
- **prefill_interpolation_granularity**: How many samples to benchmark for prefill TTFT curve (lower = faster but may be less accurate)
- **decode_interpolation_granularity**: How many samples to benchmark for decode ITL curve (lower = faster but may be less accurate). Since ITL interpolation is a 3d plot and takes longer to run, we default to a smaller number of samples. Increasing this value might quadratically increase the profiling time.
### AI Configurator Configuration (Required if `use_ai_configurator: true`)
Configure AI Configurator profiling mode:
```yaml
profilingConfig:
config:
sweep:
use_ai_configurator: true
aic_system: h200_sxm # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
aic_model_name: QWEN3_32B # AIC model identifier (see supported list)
aic_backend_version: "0.20.0" # TensorRT-LLM version: 0.20.0, 1.0.0rc3, 1.0.0rc6
```
**Supported configurations:** See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features)
**Model name mapping examples:**
- `Qwen/Qwen3-32B` → `QWEN3_32B`
- `meta-llama/Llama-3.1-70B` → `LLAMA3.1_70B`
- `deepseek-ai/DeepSeek-V3` → `DEEPSEEK_V3`
### Planner Configuration (Optional)
Pass arguments to the SLA planner:
```yaml
profilingConfig:
config:
planner:
planner_min_endpoint: 2 # Minimum endpoints to maintain
planner_adjustment_interval: 60 # Adjustment interval (seconds)
planner_load_predictor: linear # Load prediction method
```
> [!NOTE]
> Planner arguments use `planner_` prefix. See planner documentation for full list.
### Engine Configuration (Auto-configured)
The controller automatically sets these from high-level fields:
```yaml
# You specify:
spec:
model: "Qwen/Qwen3-0.6B"
backend: vllm
# Controller auto-injects into config:
profilingConfig:
config:
deployment:
model: "Qwen/Qwen3-0.6B" # From spec.model
engine:
backend: vllm # From spec.backend
config: /path/to/configmap # From spec.profilingConfig.configMapRef (if provided)
```
**You should not manually set** `deployment.model` or `engine.backend` in `profilingConfig.config` - they are automatically injected from the high-level fields.
### Complete Example: AIPerf on Real Engines
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
name: vllm-dense-online
spec:
model: "Qwen/Qwen3-0.6B"
backend: vllm
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
config:
sla:
isl: 3000
osl: 150
ttft: 200.0
itl: 20.0
hardware:
min_num_gpus_per_engine: 1
max_num_gpus_per_engine: 8
sweep:
use_ai_configurator: false
skip_existing_results: false
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
autoApply: true
```
### Complete Example: AI Configurator Simulation
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
name: trtllm-aic-offline
spec:
model: "Qwen/Qwen3-32B"
backend: trtllm
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.1"
config:
sla:
isl: 4000
osl: 500
ttft: 300.0
itl: 10.0
sweep:
use_ai_configurator: true
aic:
system: h200_sxm
model_name: QWEN3_32B
backend_version: "0.20.0"
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.1"
autoApply: true
```
### Complete Example: MoE Model
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
name: sglang-moe
spec:
model: "deepseek-ai/DeepSeek-R1"
backend: sglang
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
config:
sla:
isl: 2048
osl: 512
ttft: 300.0
itl: 25.0
hardware:
num_gpus_per_node: 8
max_num_gpus_per_engine: 32
engine:
is_moe_model: true # Enable MoE profiling mode
deploymentOverrides:
workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
autoApply: true
```
## Troubleshooting
### Profiling Takes Too Long
**Solution 1**: Use AI Configurator for rapid profiling (TensorRT-LLM only):
```yaml
sweep:
use_ai_configurator: true
```
**Solution 2**: Reduce search space:
```yaml
config:
hardware:
min_num_gpus_per_engine: 4 # Skip TP1, TP2
max_num_gpus_per_engine: 8 # Don't test beyond TP8
```
### SLA Cannot Be Met
**Symptoms**: Profiler reports no configuration meets targets
**Solutions:**
1. Relax SLA targets (increase TTFT/ITL)
2. Add more GPU resources
3. Try a different backend
4. Use a smaller model
### AI Configurator: Attention Head Constraint Error
**Symptoms**: Profiling fails with error:
```
AssertionError: num_heads <N> should be divisible by tp_size <M> and the division result should be >= 4
```
**Cause**: AI Configurator requires **≥4 attention heads per GPU**. Small models with few heads cannot use high TP sizes.
**Affected Models:**
- **Qwen3-0.6B** (16 heads): Max TP = 4 ❌ Fails at TP=8
- **GPT-2** (12 heads): Max TP = 3
- Most models **<1B parameters**: May hit this constraint
**Solution**: Limit `max_num_gpus_per_engine` in your DGDR:
```yaml
profilingConfig:
profilerImage: "nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.1"
config:
hardware:
max_num_gpus_per_engine: 4 # For Qwen3-0.6B (16 heads / 4 = max TP of 4)
sweep:
use_ai_configurator: true
aic:
system: h200_sxm
model_name: QWEN3_0_6B
```
**Calculate Max TP**: `max_tp = num_attention_heads / 4`
> **Note**: This is an AI Configurator limitation. Online profiling doesn't have this constraint.
### Image Pull Errors
**Symptoms**: `ErrImagePull` or `ImagePullBackOff`
**Solution**: Ensure image pull secrets are configured:
```bash
kubectl create secret docker-registry nvcr-imagepullsecret \
--docker-server=nvcr.io \
--docker-username='$oauthtoken' \
--docker-password=<NGC_API_KEY> \
--namespace <your-namespace>
```
### Out of Memory During Profiling
**Symptoms**: OOM errors in profiling jobs
**Solutions:**
1. Reduce `gpu_memory_utilization` in engine config
2. Reduce `--max-context-length`
3. Skip larger TP configurations
4. Use fewer GPUs per test
### Unsupported Parallelization Mapping in Backend
**Symptoms**: Startup or runtime errors in the backend. For example, a prime number of attention heads restricts the TP size to 1 (e.g., falcon-7b with 71 attention heads), or a backend may not support different TP sizes for prefill and decode.
**Solutions:**
1. Contact the backend maintainers to add support for the use case and bump the backend version in Dynamo.
2. Restrict the max and min number of GPUs per engine to the supported range.
## Next Steps
- **Deploy with DGDR**: See [Quick Start Guide](/docs/planner/sla_planner_quickstart.md)
- **Understand SLA Planner**: Read [SLA Planner Deep Dive](/docs/planner/sla_planner.md)
- **Monitor Deployments**: Set up [Observability](/docs/kubernetes/observability/metrics.md)
- **Optimize Performance**: See [Performance Tuning](/docs/performance/tuning.md)
## Related Documentation
- [DGDR API Reference](/docs/kubernetes/api_reference.md)
- [SLA Planner Quick Start](/docs/planner/sla_planner_quickstart.md)
- [SLA Planner Architecture](/docs/planner/sla_planner.md)
- [Profiler Arguments Reference](/benchmarks/profiler/utils/profiler_argparse.py)
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
...@@ -77,7 +77,7 @@ _Appears in:_ ...@@ -77,7 +77,7 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: {} <br /> | | `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: \{\} <br /> |
| `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | | | `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | |
...@@ -95,10 +95,11 @@ _Appears in:_ ...@@ -95,10 +95,11 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR name. | | Optional: {} <br /> | | `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR name. | | Optional: \{\} <br /> |
| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR namespace. | | Optional: {} <br /> | | `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR namespace. | | Optional: \{\} <br /> |
| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.<br />These are merged with auto-generated labels from the profiling process. | | Optional: {} <br /> | | `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.<br />These are merged with auto-generated labels from the profiling process. | | Optional: \{\} <br /> |
| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: {} <br /> | | `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: \{\} <br /> |
| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.<br />This image is used for both temporary DGDs created during online profiling and the final DGD.<br />If omitted, the image from the base config file (e.g., disagg.yaml) is used.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: \{\} <br /> |
#### DeploymentStatus #### DeploymentStatus
...@@ -236,7 +237,6 @@ DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests ...@@ -236,7 +237,6 @@ DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests
It serves as the primary interface for users to request model deployments with It serves as the primary interface for users to request model deployments with
specific performance and resource constraints, enabling SLA-driven deployments. specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle: Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling 1. Initial → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC) 2. Pending → Profiling: Creates and runs profiling job (online or AIC)
...@@ -245,7 +245,6 @@ Lifecycle: ...@@ -245,7 +245,6 @@ Lifecycle:
5. Ready: Terminal state when DGD is operational or spec is available 5. Ready: Terminal state when DGD is operational or spec is available
6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted 6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
The spec becomes immutable once profiling starts. Users must delete and recreate The spec becomes immutable once profiling starts. Users must delete and recreate
the DGDR to modify configuration after this point. the DGDR to modify configuration after this point.
...@@ -277,10 +276,11 @@ _Appears in:_ ...@@ -277,10 +276,11 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `modelName` _string_ | ModelName specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs. | | Required: {} <br /> | | `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\} <br /> |
| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />The profiler will validate the configuration and report any errors. | | Required: {} <br /> | | `backend` _string_ | Backend specifies the inference backend to use.<br />The controller automatically sets this value in profilingConfig.config.engine.backend. | | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />model and backend fields and should not be specified in this config. | | Required: \{\} <br /> |
| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false | | | `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false | |
| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. | | Optional: {} <br /> | | `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. | | Optional: \{\} <br /> |
#### DynamoGraphDeploymentRequestStatus #### DynamoGraphDeploymentRequestStatus
...@@ -298,12 +298,12 @@ _Appears in:_ ...@@ -298,12 +298,12 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `state` _string_ | State is a high-level textual status of the deployment request lifecycle.<br />Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"<br />Empty string ("") represents the initial state before initialization. | | | | `state` _string_ | State is a high-level textual status of the deployment request lifecycle.<br />Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"<br />Empty string ("") represents the initial state before initialization. | | |
| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. | | Optional: {} <br /> | | `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. | | Optional: \{\} <br /> |
| `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.<br />Used to detect spec changes and enforce immutability after profiling starts. | | | | `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.<br />Used to detect spec changes and enforce immutability after profiling starts. | | |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.<br />Conditions are merged by type on patch updates. | | | | `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.<br />Conditions are merged by type on patch updates. | | |
| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.<br />Format: "configmap/<name>" | | Optional: {} <br /> | | `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.<br />Format: "configmap/<name>" | | Optional: \{\} <br /> |
| `generatedDeployment` _[RawExtension](#rawextension)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification<br />including metadata, based on profiling results. Users can extract this to create<br />a DGD manually, or it's used automatically when autoApply is true.<br />Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: {} <br />Optional: {} <br /> | | `generatedDeployment` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#rawextension-runtime-pkg)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification<br />including metadata, based on profiling results. Users can extract this to create<br />a DGD manually, or it's used automatically when autoApply is true.<br />Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: \{\} <br />Optional: \{\} <br /> |
| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. | | Optional: {} <br /> | | `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. | | Optional: \{\} <br /> |
#### DynamoGraphDeploymentSpec #### DynamoGraphDeploymentSpec
...@@ -319,9 +319,9 @@ _Appears in:_ ...@@ -319,9 +319,9 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.<br />Each PVC must have a unique name that can be referenced in component specifications. | | Optional: {} <br /> | | `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.<br />Each PVC must have a unique name that can be referenced in component specifications. | | Optional: \{\} <br /> |
| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | Optional: {} <br /> | | `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | Optional: \{\} <br /> |
| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless<br />overridden by service-specific configuration. | | Optional: {} <br /> | | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless<br />overridden by service-specific configuration. | | Optional: \{\} <br /> |
| `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). | | Enum: [sglang vllm trtllm] <br /> | | `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). | | Enum: [sglang vllm trtllm] <br /> |
...@@ -415,9 +415,9 @@ _Appears in:_ ...@@ -415,9 +415,9 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `create` _boolean_ | Create indicates to create a new PVC | | | | `create` _boolean_ | Create indicates to create a new PVC | | |
| `name` _string_ | Name is the name of the PVC | | Required: {} <br /> | | `name` _string_ | Name is the name of the PVC | | Required: \{\} <br /> |
| `storageClass` _string_ | StorageClass to be used for PVC creation. Required when create is true. | | | | `storageClass` _string_ | StorageClass to be used for PVC creation. Required when create is true. | | |
| `size` _[Quantity](#quantity)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | | | `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | |
| `volumeAccessMode` _[PersistentVolumeAccessMode](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#persistentvolumeaccessmode-v1-core)_ | VolumeAccessMode is the volume access mode of the PVC. Required when create is true. | | | | `volumeAccessMode` _[PersistentVolumeAccessMode](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#persistentvolumeaccessmode-v1-core)_ | VolumeAccessMode is the volume access mode of the PVC. Required when create is true. | | |
...@@ -436,8 +436,9 @@ _Appears in:_ ...@@ -436,8 +436,9 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `config` _[JSON](#json)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.<br />The profiler will validate the configuration and report any errors. | | Optional: {} <br />Type: object <br /> | | `config` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.<br />The profiler will validate the configuration and report any errors. | | Optional: \{\} <br />Type: object <br /> |
| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment<br />base config file (disagg.yaml). This is separate from the profiling config above.<br />The path to this config will be set as engine.config in the profiling config. | | Optional: {} <br /> | | `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment<br />base config file (disagg.yaml). This is separate from the profiling config above.<br />The path to this config will be set as engine.config in the profiling config. | | Optional: \{\} <br /> |
| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.<br />This image contains the profiler code and dependencies needed for SLA-based profiling.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: \{\} <br /> |
#### SharedMemorySpec #### SharedMemorySpec
...@@ -455,7 +456,7 @@ _Appears in:_ ...@@ -455,7 +456,7 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `disabled` _boolean_ | | | | | `disabled` _boolean_ | | | |
| `size` _[Quantity](#quantity)_ | | | | | `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | | | |
#### VolumeMount #### VolumeMount
...@@ -472,7 +473,7 @@ _Appears in:_ ...@@ -472,7 +473,7 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: {} <br /> | | `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: \{\} <br /> |
| `mountPoint` _string_ | MountPoint specifies where to mount the volume.<br />If useAsCompilationCache is true and mountPoint is not specified,<br />a backend-specific default will be used. | | | | `mountPoint` _string_ | MountPoint specifies where to mount the volume.<br />If useAsCompilationCache is true and mountPoint is not specified,<br />a backend-specific default will be used. | | |
| `useAsCompilationCache` _boolean_ | UseAsCompilationCache indicates this volume should be used as a compilation cache.<br />When true, backend-specific environment variables will be set and default mount points may be used. | false | | | `useAsCompilationCache` _boolean_ | UseAsCompilationCache indicates this volume should be used as a compilation cache.<br />When true, backend-specific environment variables will be set and default mount points may be used. | false | |
......
...@@ -24,7 +24,7 @@ There are two additional rules set by planner to prevent over-compensation: ...@@ -24,7 +24,7 @@ There are two additional rules set by planner to prevent over-compensation:
## SLA-based Scaling Up/Down Prefill/Decode Workers ## SLA-based Scaling Up/Down Prefill/Decode Workers
See [Pre-Deployment Profiling](../benchmarks/pre_deployment_profiling.md) for more details. See [SLA-Driven Profiling](../benchmarks/sla_driven_profiling.md) for more details.
## Usage ## Usage
......
...@@ -78,5 +78,5 @@ Key features include: ...@@ -78,5 +78,5 @@ Key features include:
Overview <self> Overview <self>
SLA Planner Quick Start <sla_planner_quickstart> SLA Planner Quick Start <sla_planner_quickstart>
Pre-Deployment Profiling <../benchmarks/pre_deployment_profiling.md> SLA-Driven Profiling <../benchmarks/sla_driven_profiling.md>
SLA-based Planner <sla_planner.md> SLA-based Planner <sla_planner.md>
# SLA-based Planner # SLA-based Planner
> [!TIP] > [!TIP]
> **New to SLA Planner?** For a complete workflow including profiling and deployment, see the [SLA Planner Quick Start Guide](/docs/planner/sla_planner_quickstart.md). > **New to SLA Planner?** For a complete workflow including profiling and deployment, see the [SLA Profiling + Planner Quick Start Guide](/docs/planner/sla_planner_quickstart.md).
This document covers information regarding the SLA-based planner in `examples/common/utils/planner_core.py`. This document covers information regarding the SLA-based planner in `examples/common/utils/planner_core.py`.
...@@ -47,11 +47,11 @@ The SLA planner consists of several key components: ...@@ -47,11 +47,11 @@ The SLA planner consists of several key components:
3. **Correction Factors**: Adjust predictions based on observed vs. expected performance 3. **Correction Factors**: Adjust predictions based on observed vs. expected performance
4. **Scaling Logic**: Calculate optimal number of prefill/decode replicas to meet SLA targets 4. **Scaling Logic**: Calculate optimal number of prefill/decode replicas to meet SLA targets
## Pre-Deployment Profiling ## SLA-Driven Pre-Deployment Profiling
**Prerequisite**: SLA-based planner requires pre-deployment profiling to be completed before deployment. The profiling process analyzes your model's performance characteristics to determine optimal tensor parallelism configurations and scaling parameters that the planner will use during operation. **Prerequisite**: SLA-based planner requires pre-deployment profiling to be completed before deployment. The profiling process analyzes your model's performance characteristics to determine optimal tensor parallelism configurations and scaling parameters that the planner will use during operation.
See [Pre-Deployment Profiling](../benchmarks/pre_deployment_profiling.md) for detailed instructions on running the profiling process. See [SLA-Driven Profiling](../benchmarks/sla_driven_profiling.md) for detailed instructions on running the profiling process.
## Load Prediction ## Load Prediction
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment