Unverified Commit 6a84ffd3 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: turn profiling k8s jobs into sample DGDR requests (#3864)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>
parent 0d07e2c3
......@@ -135,15 +135,6 @@ dynamo-operator:
# -- Whether to enable SSH key generation for MPI Run
enabled: true
# DynamoGraphDeploymentRequest (DGDR) configuration
dgdr:
# -- Container image to use for profiling jobs (both online and offline/AIC)
# REQUIRED: Must be set to create DynamoGraphDeploymentRequests
# For development: Build and push the profiler image from the ai-dynamo repository
# Public image will be available in release 0.6.1
# Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
profilerImage: ""
# Grove component - distributed inference orchestration
grove:
......
......@@ -267,7 +267,7 @@ helm: manifests kustomize helmify
$(KUSTOMIZE) build config/default | $(HELMIFY) -image-pull-secrets charts/dynamo-kubernetes-operator
######################### CRD Reference Docs
CRD_REF_DOCS_VERSION ?= v0.0.12
CRD_REF_DOCS_VERSION ?= latest
CRD_REF_DOCS ?= $(LOCALBIN)/crd-ref-docs
.PHONY: crd-ref-docs
......
......@@ -60,6 +60,12 @@ type ProfilingConfigSpec struct {
// The path to this config will be set as engine.config in the profiling config.
// +kubebuilder:validation:Optional
ConfigMapRef *ConfigMapKeySelector `json:"configMapRef,omitempty"`
// ProfilerImage specifies the container image to use for profiling jobs.
// This image contains the profiler code and dependencies needed for SLA-based profiling.
// Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
// +kubebuilder:validation:Required
ProfilerImage string `json:"profilerImage"`
}
// DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments.
......@@ -83,21 +89,36 @@ type DeploymentOverridesSpec struct {
// Annotations are additional annotations to add to the DynamoGraphDeployment metadata.
// +kubebuilder:validation:Optional
Annotations map[string]string `json:"annotations,omitempty"`
// WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.
// This image is used for both temporary DGDs created during online profiling and the final DGD.
// If omitted, the image from the base config file (e.g., disagg.yaml) is used.
// Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
// +kubebuilder:validation:Optional
WorkersImage string `json:"workersImage,omitempty"`
}
// DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.
// This CRD serves as the primary interface for users to request model deployments with
// specific performance constraints and resource requirements, enabling SLA-driven deployments.
type DynamoGraphDeploymentRequestSpec struct {
// ModelName specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
// Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
// This is a high-level identifier for easy reference in kubectl output and logs.
// The controller automatically sets this value in profilingConfig.config.deployment.model.
// +kubebuilder:validation:Required
ModelName string `json:"modelName"`
Model string `json:"model"`
// Backend specifies the inference backend to use.
// The controller automatically sets this value in profilingConfig.config.engine.backend.
// +kubebuilder:validation:Required
// +kubebuilder:validation:Enum=vllm;sglang;trtllm
Backend string `json:"backend"`
// ProfilingConfig provides the complete configuration for the profiling job.
// This configuration is passed directly to the profiler.
// The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
// The profiler will validate the configuration and report any errors.
// Note: deployment.model and engine.backend are automatically set from the high-level
// model and backend fields and should not be specified in this config.
// +kubebuilder:validation:Required
ProfilingConfig ProfilingConfigSpec `json:"profilingConfig"`
......@@ -191,7 +212,7 @@ type DynamoGraphDeploymentRequestStatus struct {
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:shortName=dgdr
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.modelName`
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.model`
// +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.status.backend`
// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.status.state`
// +kubebuilder:printcolumn:name="DGD-State",type=string,JSONPath=`.status.deployment.state`
......
......@@ -140,7 +140,6 @@ func main() {
var mpiRunSecretName string
var mpiRunSecretNamespace string
var plannerClusterRoleName string
var profilerImage string
var dgdrProfilingClusterRoleName string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
......@@ -182,8 +181,6 @@ func main() {
"Namespace where the MPI SSH secret is located (required)")
flag.StringVar(&plannerClusterRoleName, "planner-cluster-role-name", "",
"Name of the ClusterRole for planner (cluster-wide mode only)")
flag.StringVar(&profilerImage, "profiler-image", "",
"Container image to use for profiling jobs (both online and offline/AIC) (for DynamoGraphDeploymentRequest)")
flag.StringVar(&dgdrProfilingClusterRoleName, "dgdr-profiling-cluster-role-name", "",
"Name of the ClusterRole for DGDR profiling jobs (cluster-wide mode only)")
opts := zap.Options{
......@@ -460,7 +457,6 @@ func main() {
if err = (&controller.DynamoGraphDeploymentRequestReconciler{
Client: mgr.GetClient(),
Recorder: mgr.GetEventRecorderFor("dynamographdeploymentrequest"),
ProfilerImage: profilerImage,
Config: ctrlConfig,
RBACManager: rbacManager,
}).SetupWithManager(mgr); err != nil {
......
......@@ -33,7 +33,7 @@ spec:
scope: Namespaced
versions:
- additionalPrinterColumns:
- jsonPath: .spec.modelName
- jsonPath: .spec.model
name: Model
type: string
- jsonPath: .status.backend
......@@ -94,6 +94,15 @@ spec:
after profiling completes. If false, only the spec is generated and stored in status.
Users can then manually create a DGD using the generated spec.
type: boolean
backend:
description: |-
Backend specifies the inference backend to use.
The controller automatically sets this value in profilingConfig.config.engine.backend.
enum:
- vllm
- sglang
- trtllm
type: string
deploymentOverrides:
description: |-
DeploymentOverrides allows customizing metadata for the auto-created DGD.
......@@ -121,18 +130,27 @@ spec:
Namespace is the desired namespace for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR namespace.
type: string
workersImage:
description: |-
WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.
This image is used for both temporary DGDs created during online profiling and the final DGD.
If omitted, the image from the base config file (e.g., disagg.yaml) is used.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
type: string
type: object
modelName:
model:
description: |-
ModelName specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
This is a high-level identifier for easy reference in kubectl output and logs.
The controller automatically sets this value in profilingConfig.config.deployment.model.
type: string
profilingConfig:
description: |-
ProfilingConfig provides the complete configuration for the profiling job.
This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
The profiler will validate the configuration and report any errors.
Note: deployment.model and engine.backend are automatically set from the high-level
model and backend fields and should not be specified in this config.
properties:
config:
description: |-
......@@ -156,9 +174,18 @@ spec:
required:
- name
type: object
profilerImage:
description: |-
ProfilerImage specifies the container image to use for profiling jobs.
This image contains the profiler code and dependencies needed for SLA-based profiling.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
type: string
required:
- profilerImage
type: object
required:
- modelName
- backend
- model
- profilingConfig
type: object
status:
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
......
......@@ -18,11 +18,18 @@ kind: DynamoGraphDeploymentRequest
metadata:
name: example-llm-sla
spec:
# ModelName is a high-level identifier for the model being deployed
modelName: Qwen/Qwen3-0.6B
# Model is a high-level identifier for the model being deployed (required - injected into profilingConfig.config.deployment.model)
model: Qwen/Qwen3-0.6B
# Backend to use for profiling (required - injected into profilingConfig.config.engine.backend)
backend: trtllm
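# ProfilerImage is the container image to use for profiling jobs (required)
# NOTE(review): the CRD schema and the controller's validateSpec expect this field at
# spec.profilingConfig.profilerImage; confirm whether it should be nested under profilingConfig below.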
profilerImage: "nvcr.io/nvidia/ai-dynamo/trtllm-runtime:0.6.1"
# ProfilingConfig maps directly to the profile_sla.py config format
# See benchmarks/profiler/utils/profiler_argparse.py for complete schema
# Note: deployment.model and engine.backend are automatically set from model and backend above
profilingConfig:
config:
# Optional: Output directory for profiling results (defaults to /data in the Job)
......@@ -30,7 +37,6 @@ spec:
# Engine configuration
engine:
backend: trtllm # Inference backend: vllm, sglang, or trtllm
max_context_length: 16384 # Maximum context length supported by the model
is_moe_model: false # Enable MoE model support (uses TEP/DEP instead of TP)
......
......@@ -74,14 +74,14 @@ require (
go.uber.org/automaxprocs v1.6.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/net v0.40.0 // indirect
golang.org/x/net v0.46.0 // indirect
golang.org/x/oauth2 v0.30.0 // indirect
golang.org/x/sync v0.14.0 // indirect
golang.org/x/sys v0.33.0 // indirect
golang.org/x/term v0.32.0 // indirect
golang.org/x/text v0.25.0 // indirect
golang.org/x/sync v0.17.0 // indirect
golang.org/x/sys v0.37.0 // indirect
golang.org/x/term v0.36.0 // indirect
golang.org/x/text v0.30.0 // indirect
golang.org/x/time v0.9.0 // indirect
golang.org/x/tools v0.33.0 // indirect
golang.org/x/tools v0.38.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250519155744-55703ea1f237 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250519155744-55703ea1f237 // indirect
......
......@@ -158,34 +158,34 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY=
golang.org/x/net v0.40.0/go.mod h1:y0hY0exeL2Pku80/zKK7tpntoX23cqL3Oa6njdgRtds=
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI=
golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.14.0 h1:woo0S4Yywslg6hp4eUFjTVOyKt0RookbpAHG4c1HmhQ=
golang.org/x/sync v0.14.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg=
golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ=
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.36.0 h1:zMPR+aF8gfksFprF/Nc/rd1wRS1EI6nDBGyWAvDzx2Q=
golang.org/x/term v0.36.0/go.mod h1:Qu394IJq6V6dCBRgwqshf3mPF85AqzYEzofzRdZkWss=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4=
golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA=
golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY=
golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc=
golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI=
golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ=
golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
......
......@@ -144,7 +144,7 @@ const (
MessageConfigMapKeyNotFound = "key %s not found in ConfigMap %s"
// Validation messages
ValidationErrorModelNameRequired = "modelName is required"
ValidationErrorModelRequired = "model is required"
ValidationErrorITLPositive = "sla.itl must be positive"
ValidationErrorTTFTPositive = "sla.ttft must be positive"
ValidationErrorInvalidBackend = "invalid backend: %s (must be vllm, sglang, or trtllm)"
......@@ -198,8 +198,6 @@ type DynamoGraphDeploymentRequestReconciler struct {
Recorder record.EventRecorder
Config commonController.Config
// ProfilerImage is the container image to use for profiling jobs (both online and offline/AIC)
ProfilerImage string
// RBACManager handles RBAC setup for profiling jobs
RBACManager RBACManager
}
......@@ -217,13 +215,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecor
// FinalizeResource implements commonController.Finalizer interface
func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx)
logger.Info("Finalizing DGDR", "name", dgdr.Name)
// Cleanup profiling resources
if err := r.cleanupProfilingResources(ctx, dgdr); err != nil {
logger.Error(err, "Failed to cleanup profiling resources")
return err
}
logger.Info("DGDR finalized successfully", "name", dgdr.Name)
return nil
......@@ -320,8 +311,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context.
// Set observedGeneration to track the spec we're processing
dgdr.Status.ObservedGeneration = dgdr.Generation
// Extract and populate backend from config for display in kubectl output
dgdr.Status.Backend = getBackendFromConfig(dgdr)
// Populate backend in status from spec for display in kubectl output
dgdr.Status.Backend = dgdr.Spec.Backend
// Initialize status
r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonInitialized, MessageInitialized)
......@@ -664,11 +655,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleFailedState(ctx context.C
logger := log.FromContext(ctx)
logger.Info("DGDR is in failed state", "name", dgdr.Name)
// Cleanup profiling resources if any
if err := r.cleanupProfilingResources(ctx, dgdr); err != nil {
logger.Error(err, "Failed to cleanup profiling resources")
}
// Could implement retry logic here if desired
return ctrl.Result{}, nil
}
......@@ -705,27 +691,13 @@ func isOnlineProfiling(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) boo
return true
}
// getBackendFromConfig reads the engine.backend string out of the raw
// profilingConfig.config document carried by the DGDR spec.
//
// It returns the empty string when the config is absent, when the raw bytes
// cannot be unmarshalled, when there is no "engine" mapping, or when
// engine.backend is missing or not a string.
func getBackendFromConfig(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) string {
	rawConfig := dgdr.Spec.ProfilingConfig.Config
	if rawConfig == nil {
		return ""
	}

	var parsed map[string]interface{}
	if err := yaml.Unmarshal(rawConfig.Raw, &parsed); err != nil {
		// Parse errors are deliberately reported as "no backend".
		return ""
	}

	engineSection, isMap := parsed["engine"].(map[string]interface{})
	if !isMap {
		return ""
	}

	// A failed assertion leaves backend at its zero value (""), which is
	// exactly the fallback result.
	backend, _ := engineSection["backend"].(string)
	return backend
}
// validateSpec validates the DGDR spec
func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
// Validate that the profiler image is specified in profilingConfig.profilerImage
if dgdr.Spec.ProfilingConfig.ProfilerImage == "" {
return errors.New("profilingConfig.profilerImage is required")
}
// Basic validation - check that profilingConfig.config is provided
if dgdr.Spec.ProfilingConfig.Config == nil || len(dgdr.Spec.ProfilingConfig.Config.Raw) == 0 {
return errors.New("profilingConfig.config is required and must not be empty")
......@@ -764,15 +736,20 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
return fmt.Errorf("failed to parse profilingConfig.config: %w", err)
}
// Additional validation: Ensure engine.config is set (either as path or will be set from ConfigMapRef)
engineConfig, hasEngine := config["engine"].(map[string]interface{})
if hasEngine {
_, hasConfig := engineConfig["config"]
if !hasConfig && dgdr.Spec.ProfilingConfig.ConfigMapRef == nil {
return errors.New("either profilingConfig.config.engine.config must be set, or profilingConfig.configMapRef must be provided")
// Warn if deployment.model or engine.backend are specified in config (they will be overwritten by spec fields)
if engineConfig, ok := config["engine"].(map[string]interface{}); ok {
if backend, ok := engineConfig["backend"].(string); ok && backend != "" && backend != dgdr.Spec.Backend {
logger := log.FromContext(ctx)
logger.Info("Warning: profilingConfig.config.engine.backend will be overwritten by spec.backend",
"configBackend", backend, "specBackend", dgdr.Spec.Backend)
}
}
if deployment, ok := config["deployment"].(map[string]interface{}); ok {
if model, ok := deployment["model"].(string); ok && model != "" && model != dgdr.Spec.Model {
logger := log.FromContext(ctx)
logger.Info("Warning: profilingConfig.config.deployment.model will be overwritten by spec.model",
"configModel", model, "specModel", dgdr.Spec.Model)
}
} else if dgdr.Spec.ProfilingConfig.ConfigMapRef == nil {
return errors.New("profilingConfig.config must contain 'engine' section, or profilingConfig.configMapRef must be provided")
}
// The profiler will validate the rest of the configuration
......@@ -783,7 +760,29 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
logger := log.FromContext(ctx)
// Ensure profiling job RBAC exists in cluster-wide mode
// Delete any existing output ConfigMap to ensure fresh profiling results
// This prevents using stale data from previous profiling runs
outputConfigMapName := getOutputConfigMapName(dgdr)
existingCM := &corev1.ConfigMap{}
err := r.Get(ctx, types.NamespacedName{
Name: outputConfigMapName,
Namespace: dgdr.Namespace,
}, existingCM)
if err == nil {
// ConfigMap exists, delete it
logger.Info("Deleting existing output ConfigMap to ensure fresh profiling results", "configMap", outputConfigMapName)
if err := r.Delete(ctx, existingCM); err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "Failed to delete existing output ConfigMap", "configMap", outputConfigMapName)
return fmt.Errorf("failed to delete existing output ConfigMap: %w", err)
}
logger.Info("Successfully deleted old output ConfigMap", "configMap", outputConfigMapName)
} else if !apierrors.IsNotFound(err) {
// Unexpected error checking for ConfigMap
logger.Error(err, "Failed to check for existing output ConfigMap", "configMap", outputConfigMapName)
return fmt.Errorf("failed to check for existing output ConfigMap: %w", err)
}
// Ensure profiling job RBAC exists (only for cluster-wide installation)
if r.Config.RestrictedNamespace == "" {
if err := r.RBACManager.EnsureServiceAccountWithRBAC(
ctx,
......@@ -808,25 +807,52 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}
// Set deployment.namespace if not already set
if _, hasDeployment := config["deployment"]; !hasDeployment {
config["deployment"] = make(map[string]interface{})
deploymentVal, hasDeployment := config["deployment"]
var deploymentConfig map[string]interface{}
if !hasDeployment || deploymentVal == nil {
deploymentConfig = make(map[string]interface{})
config["deployment"] = deploymentConfig
} else {
var ok bool
deploymentConfig, ok = deploymentVal.(map[string]interface{})
if !ok {
return nil, false, fmt.Errorf("profilingConfig.config.deployment must be an object, got %T", deploymentVal)
}
}
deploymentConfig := config["deployment"].(map[string]interface{})
if _, hasNamespace := deploymentConfig["namespace"]; !hasNamespace {
deploymentConfig["namespace"] = dgdr.Namespace
}
// Set deployment.model from spec.model
deploymentConfig["model"] = dgdr.Spec.Model
// Set deployment.dgd_image from deploymentOverrides.workersImage if provided
if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.WorkersImage != "" {
deploymentConfig["dgd_image"] = dgdr.Spec.DeploymentOverrides.WorkersImage
}
// Set output_dir if not already set
if _, hasOutputDir := config["output_dir"]; !hasOutputDir {
config["output_dir"] = ProfilingOutputPath
}
// Set engine.backend from spec.backend
engineVal, hasEngine := config["engine"]
var engineConfig map[string]interface{}
if !hasEngine || engineVal == nil {
engineConfig = make(map[string]interface{})
config["engine"] = engineConfig
} else {
var ok bool
engineConfig, ok = engineVal.(map[string]interface{})
if !ok {
return nil, false, fmt.Errorf("profilingConfig.config.engine must be an object, got %T", engineVal)
}
}
engineConfig["backend"] = dgdr.Spec.Backend
// If ConfigMapRef is provided, set engine.config path
if dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
if _, hasEngine := config["engine"]; !hasEngine {
config["engine"] = make(map[string]interface{})
}
engineConfig := config["engine"].(map[string]interface{})
engineConfig["config"] = fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile)
}
......@@ -857,6 +883,19 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
Name: "ETCD_ENDPOINTS",
Value: fmt.Sprintf("%s-etcd:2379", dgdr.Namespace),
},
// DGDR metadata for setting ownerReferences
{
Name: "DGDR_NAME",
Value: dgdr.Name,
},
{
Name: "DGDR_NAMESPACE",
Value: dgdr.Namespace,
},
{
Name: "DGDR_UID",
Value: string(dgdr.UID),
},
}
// Build volume mounts
......@@ -881,11 +920,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
"--profile-config", string(configYAML),
}
// Determine profiler image
imageName := r.ProfilerImage
if imageName == "" {
return nil, false, fmt.Errorf("profiler image not configured: configure dynamo-operator.dynamo.dgdr.profilerImage in Helm values")
}
// Use profiler image from profilingConfig
imageName := dgdr.Spec.ProfilingConfig.ProfilerImage
logger.Info("Using profiler image", "image", imageName)
profilerContainer := corev1.Container{
......@@ -1144,25 +1180,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
return r.Status().Update(ctx, dgdr)
}
// cleanupProfilingResources is the cleanup hook for the profiling resources
// of a DGDR. It performs no deletions itself: it only logs what will happen
// and always returns nil.
//
// What happens to related objects when the DGDR is deleted:
//   - Profiling Job: removed automatically through its ownerReference
//     (set by SyncResource).
//   - Output ConfigMap: kept (no ownerReference), since it holds valuable
//     profiling data.
//   - Auto-created DGD: kept (no ownerReference), since it may be serving
//     traffic.
//
// Relationships are tracked with labels (LabelDGDRName) rather than cascade
// delete. Leftover ConfigMaps and DGDs can be removed manually with label
// selectors:
//
//	kubectl delete configmap -l dgdr.nvidia.com/name=<dgdr-name>
//	kubectl delete dynamographdeployment -l dgdr.nvidia.com/name=<dgdr-name>
func (r *DynamoGraphDeploymentRequestReconciler) cleanupProfilingResources(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
	l := log.FromContext(ctx)

	l.Info("Cleaning up profiling resources", "name", dgdr.Name)
	l.Info("Profiling job will be automatically deleted via ownerReference")

	return nil
}
// updateStateAndRequeue updates the DGDR state and requeues
func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, state, _ string) (ctrl.Result, error) {
dgdr.Status.State = state
......
......@@ -74,7 +74,6 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient,
Recorder: recorder,
ProfilerImage: "test-profiler:latest",
Config: commonController.Config{
RestrictedNamespace: "",
RBAC: commonController.RBACConfig{
......@@ -97,10 +96,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
"config": "/tmp/test-config.yaml",
},
"sla": map[string]interface{}{
......@@ -143,9 +144,9 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(updated.Status.ObservedGeneration).Should(Equal(updated.Generation))
})
It("Should fail validation with missing config", func() {
It("Should pass validation with minimal config", func() {
ctx := context.Background()
dgdrName := "test-dgdr-invalid"
dgdrName := "test-dgdr-minimal"
namespace := "default"
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
......@@ -154,8 +155,16 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{}),
Config: createTestConfig(map[string]interface{}{
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
},
}),
},
},
}
......@@ -163,7 +172,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer k8sClient.Delete(ctx, dgdr)
// Reconcile
// Reconcile - should succeed with minimal config
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: dgdrName,
......@@ -172,12 +181,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
})
Expect(err).NotTo(HaveOccurred())
// Check status transitions to Failed
// Check status transitions to Pending (not Failed)
Eventually(func() string {
var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)
return updated.Status.State
}, timeout, interval).Should(Equal(StateFailed))
}, timeout, interval).Should(Equal(StatePending))
})
})
......@@ -216,10 +225,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
"profiler_image": "test-profiler:latest",
},
"sla": map[string]interface{}{
......@@ -313,10 +324,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "trtllm",
"config": "/tmp/test-config.yaml",
"profiler_image": "test-profiler:latest",
},
......@@ -386,10 +399,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
"config": "/tmp/test-config.yaml",
},
"sla": map[string]interface{}{
......@@ -498,10 +513,12 @@ spec:
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
"config": "/tmp/test-config.yaml",
},
"sla": map[string]interface{}{
......@@ -626,10 +643,12 @@ spec:
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
"config": "/tmp/test-config.yaml",
},
"sla": map[string]interface{}{
......@@ -707,10 +726,12 @@ spec:
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
"config": "/tmp/test-config.yaml",
},
"sla": map[string]interface{}{
......@@ -852,10 +873,12 @@ var _ = Describe("DGDR Validation", func() {
ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
"config": "/tmp/test-config.yaml",
},
"sla": map[string]interface{}{
......@@ -873,50 +896,15 @@ var _ = Describe("DGDR Validation", func() {
Expect(err).NotTo(HaveOccurred())
})
It("Should fail validation when config is empty", func() {
ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{}),
},
},
}
err := reconciler.validateSpec(ctx, dgdr)
Expect(err).To(HaveOccurred())
Expect(err.Error()).Should(ContainSubstring("config"))
})
It("Should fail validation when engine section is missing", func() {
ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
},
}),
},
},
}
err := reconciler.validateSpec(ctx, dgdr)
Expect(err).To(HaveOccurred())
Expect(err.Error()).Should(ContainSubstring("engine"))
})
It("Should fail validation when engine.config and configMapRef are both missing", func() {
It("Should pass validation with minimal config", func() {
ctx := context.Background()
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
},
"sla": map[string]interface{}{
"ttft": 100.0,
"itl": 1500.0,
......@@ -926,9 +914,9 @@ var _ = Describe("DGDR Validation", func() {
},
}
// Validation should pass - profiler will auto-generate missing config
err := reconciler.validateSpec(ctx, dgdr)
Expect(err).To(HaveOccurred())
Expect(err.Error()).Should(ContainSubstring("engine.config"))
Expect(err).NotTo(HaveOccurred())
})
})
})
......@@ -940,7 +928,6 @@ var _ = Describe("DGDR Profiler Arguments", func() {
reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient,
Recorder: record.NewFakeRecorder(100),
ProfilerImage: "test-profiler:latest",
Config: commonController.Config{
RestrictedNamespace: "",
},
......@@ -970,10 +957,12 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "trtllm",
"config": "/tmp/test-config.yaml",
"profiler_image": "test-profiler:latest",
},
......@@ -1044,10 +1033,12 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "trtllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "trtllm",
"config": "/tmp/test-config.yaml",
"profiler_image": "test-profiler:latest",
},
......@@ -1111,7 +1102,6 @@ var _ = Describe("DGDR Error Handling", func() {
reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient,
Recorder: recorder,
ProfilerImage: "test-profiler:latest",
Config: commonController.Config{
RestrictedNamespace: "",
},
......@@ -1131,10 +1121,12 @@ var _ = Describe("DGDR Error Handling", func() {
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
ProfilerImage: "test-profiler:latest",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
Config: createTestConfig(map[string]interface{}{
"engine": map[string]interface{}{
"backend": "vllm",
"config": "/tmp/test-config.yaml",
},
"sla": map[string]interface{}{
......
......@@ -119,7 +119,7 @@ python3 -m deploy.utils.download_pvc_results \
For complete benchmarking and profiling workflows:
- **Benchmarking Guide**: See [docs/benchmarks/benchmarking.md](../../docs/benchmarks/benchmarking.md) for comparing DynamoGraphDeployments and external endpoints
- **Pre-Deployment Profiling**: See [docs/benchmarks/pre_deployment_profiling.md](../../docs/benchmarks/pre_deployment_profiling.md) for optimizing configurations before deployment
- **Pre-Deployment Profiling**: See [docs/benchmarks/sla_driven_profiling.md](../../docs/benchmarks/sla_driven_profiling.md) for optimizing configurations before deployment
## Notes
......
......@@ -248,6 +248,26 @@ class DynamoDeploymentClient:
self.deployment_spec["metadata"]["name"] = self.deployment_name
self.deployment_spec["metadata"]["namespace"] = self.namespace
# Add ownerReference if env vars are set (for temporary DGDs during profiling)
# This makes the DGD auto-delete when the DGDR is deleted
dgdr_name = os.environ.get("DGDR_NAME")
dgdr_namespace = os.environ.get("DGDR_NAMESPACE")
dgdr_uid = os.environ.get("DGDR_UID")
if dgdr_name and dgdr_namespace and dgdr_uid:
if self.namespace == dgdr_namespace:
self.deployment_spec["metadata"]["ownerReferences"] = [
{
"apiVersion": "nvidia.com/v1alpha1",
"kind": "DynamoGraphDeploymentRequest",
"name": dgdr_name,
"uid": dgdr_uid,
"controller": False,
"blockOwnerDeletion": True,
}
]
print(f"Added ownerReference to DGDR {dgdr_name} for auto-cleanup")
try:
await self.custom_api.create_namespaced_custom_object(
group="nvidia.com",
......
# Pre-Deployment Profiling
> [!TIP]
> **New to SLA Planner?** For a complete workflow including profiling and deployment, see the [SLA Planner Quick Start Guide](/docs/planner/sla_planner_quickstart.md).
## Profiling Script
To ensure Dynamo deployments comply with the SLA, we provide a pre-deployment script to profile the model performance with different parallelization mappings and recommend the parallelization mapping for prefill and decode workers and planner configurations. To use this script, the user needs to provide the target ISL, OSL, TTFT SLA, and ITL SLA.
> [!NOTE]
> **Time Investment**: This profiling process is comprehensive and typically takes **a few hours** to complete. The script systematically tests multiple tensor parallelism configurations and load conditions to find optimal performance settings. This upfront investment ensures your deployment meets SLA requirements and operates efficiently.
Support matrix:
| Backends | Model Types | Supported |
| --- | --- | --- |
| vLLM | Dense | ✅ |
| vLLM | MoE | 🚧 |
| SGLang | Dense | ✅ |
| SGLang | MoE | ✅ |
| TensorRT-LLM | Dense | ✅ |
| TensorRT-LLM | MoE | 🚧 |
> [!NOTE]
> The script considers a fixed ISL/OSL without KV cache reuse. If the real ISL/OSL has a large variance or a significant amount of KV cache can be reused, the result might be inaccurate.
We assume there are no piggy-backed prefill requests in the decode engine. Even if there are some short piggy-backed prefill requests in the decode engine, they should not affect the ITL too much in most conditions. However, if there are too many piggy-backed prefill requests, the ITL might be inaccurate.
The script will first detect the number of available GPUs on the current nodes (multi-node engine not supported yet). Then, it will profile the prefill and decode performance with different TP sizes. For prefill, since there is no in-flight batching (assuming the ISL is long enough to saturate the GPU), the script directly measures the TTFT for a request with the given ISL without KV reuse. For decode, since the ITL (or iteration time) depends on how many requests are in flight, the script will measure the ITL under different numbers of in-flight requests. The number of in-flight requests ranges from 1 to the maximum number of requests that the KV cache of the engine can hold. To measure the ITL without being affected by piggy-backed prefill requests, the script will enable KV reuse and warm up the engine by issuing the same prompts before measuring the ITL. Since the KV cache is sufficient for all the requests, it can hold the KV cache of the pre-computed prompts and skip the prefill phase when measuring the ITL.
### GPU Resource Usage
**Important**: Profiling tests different tensor parallelism (TP) configurations **sequentially**, not in parallel. This means:
- **One TP configuration at a time**: Each tensor parallelism size (TP1, TP2, TP4, TP8, etc.) is tested individually
- **Full GPU access**: Each TP configuration gets exclusive access to all available GPUs during its profiling run
- **Resource isolation**: No interference between different TP configurations during testing
- **Accurate measurements**: Each configuration is profiled under identical resource conditions
This sequential approach ensures:
- **Precise performance profiling** without resource conflicts
- **Consistent GPU allocation** for fair comparison across TP sizes
- **Reliable cleanup** between different TP configuration tests
- **Accurate SLA compliance verification** for each configuration
After the profiling finishes, two plots will be generated in the `output-dir`. For example, here are the profiling results for `components/backends/vllm/deploy/disagg.yaml`:
![Prefill Performance](../../docs/images/h100_prefill_performance.png)
![Decode Performance](../../docs/images/h100_decode_performance.png)
For the prefill performance, the script will plot the TTFT for different TP sizes and select the best TP size that meets the target TTFT SLA and delivers the best throughput per GPU. Based on how close the TTFT of the selected TP size is to the SLA, the script will also recommend the upper and lower bounds of the prefill queue size to be used in the planner.
For the decode performance, the script will plot the ITL for different TP sizes and different in-flight requests. Similarly, it will select the best point that satisfies the ITL SLA and delivers the best throughput per GPU and recommend the upper and lower bounds of the kv cache utilization rate to be used in planner.
The script will recommend the best TP size for prefill and decode, as well as the upper and lower bounds of the prefill queue size and decode kv cache utilization rate if using load-based planner. The following information will be printed out in the terminal:
```
2025-05-16 15:20:24 - __main__ - INFO - Analyzing results and generate recommendations...
2025-05-16 15:20:24 - __main__ - INFO - Suggested prefill TP:4 (TTFT 48.37 ms, throughput 15505.23 tokens/s/GPU)
2025-05-16 15:20:24 - __main__ - INFO - Suggested planner upper/lower bound for prefill queue size: 0.24/0.10
2025-05-16 15:20:24 - __main__ - INFO - Suggested decode TP:4 (ITL 4.83 ms, throughput 51.22 tokens/s/GPU)
2025-05-16 15:20:24 - __main__ - INFO - Suggested planner upper/lower bound for decode kv cache utilization: 0.20/0.10
```
After finding the best TP size for prefill and decode, the script will then interpolate the TTFT with ISL and ITL with active KV cache and decode context length. This is to provide a more accurate estimation of the performance when ISL and OSL change and will be used in the sla-planner. The results will be saved to `<output_dir>/<decode/prefill>_tp<best_tp>_interpolation`. Please change the prefill and decode TP size in the config file to match the best TP sizes obtained from the profiling script.
### Prefill Interpolation Data
In the prefill engine, prefills are usually done with batch size=1 and only the ISL (excluding prefix cache hit) affects the iteration time. The script profiles the selected prefill TP configuration across different ISLs and records the TTFT and prefill throughput per GPU under those ISLs.
For dense models, the script profiles different TP sizes.
For MoE models, the script only profiles different TEP sizes, since DEP is generally not the optimal prefill configuration.
### Decode Interpolation Data
In the decode engine, decode requests are added in flight and the iteration time (or ITL) depends on both the context length and the real-time load of the engine. We capture the real-time load of the engine with active kv usage and average context length. The active kv usage determines the complexity of the memory-bound attention kernel while the active kv usage divided by the average context length determines the complexity of the compute-bound MLP kernel. For example, the figure below shows the ITL of DS-Distilled Llama 8b model on H100 TP4. The ITL grows near-linearly with active kv usage under a fixed context length. And the slope increases as the context length decreases.
For dense models, the script profiles different TP sizes.
For MoE models, the script profiles different DEP sizes. TEP decode engines for low latency will be supported in the future.
![images](../../docs/images/itl_interpolation.png)
The script profiles the selected decode TP configuration across different active kv blocks and average context length.
### Output Format of Interpolation Data
After suggesting the optimal TP configuration, two `.npz` files that describe the performance characteristics of the prefill and decode engines in their suggested parallel configurations will be generated. The two `.npz` files are:
* `${benchmark_result_dir}/selected_prefill_interpolation/raw_data.npz`
* `prefill_isl`: a 1D Numpy array to store the ISLs used to profile the prefill engine.
* `prefill_ttft`: a 1D Numpy array to store the TTFTs under the corresponding ISLs when the prefill engine is exclusively running each prefill request (i.e., with batch size of 1). The unit is in milliseconds.
* `prefill_thpt_per_gpu`: a 1D Numpy array to store the prefill throughput per GPU under the corresponding ISLs. The unit is in tokens per second per GPU.
* `${benchmark_result_dir}/selected_decode_interpolation/raw_data.npz`
* `max_kv_tokens`: a 1D Numpy array with only one element to store the total number of KV tokens in the decode engine.
* `x_kv_usage`: a 1D Numpy array to store the percentage of the active KV blocks (in the range of [0, 1]) used to profile the decode engine. The active KV blocks can be controlled by varying `(ISL + OSL / 2) * concurrency`.
* `y_context_length`: a 1D Numpy array to store the average context length (ISL + OSL / 2) used to profile the decode engine.
* `z_itl`: a 1D Numpy array to store the ITLs under the corresponding active KV usage and context length. To skip the prefill stage while maintaining the context length, benchmarking can be done by turning on kv reuse and warming up the engine with the prompts before running the actual profiling. The unit is in milliseconds.
* `z_thpt_per_gpu`: a 1D Numpy array to store the decode throughput per GPU under the corresponding active KV usage and context length. The unit is in tokens per second per GPU.
SLA planner can work with any interpolation data that follows the above format. For best results, use fine-grained and high coverage interpolation data for the prefill and decode engines.
## Detailed Kubernetes Profiling Instructions
> [!TIP]
> For a complete step-by-step workflow, see the [SLA Planner Quick Start Guide](/docs/planner/sla_planner_quickstart.md).
This section provides detailed technical information for advanced users who need to customize the profiling process.
### Configuration Options
**For dense models**, configure `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml`:
```yaml
spec:
template:
spec:
containers:
- name: profile-sla
args:
- --isl
- "3000" # average ISL is 3000 tokens
- --osl
- "150" # average OSL is 150 tokens
- --ttft
- "200" # target TTFT is 200ms (float, in milliseconds)
- --itl
- "20" # target ITL is 20ms (float, in milliseconds)
- --backend
- <vllm/sglang>
```
**For MoE models**, use `profile_sla_moe_job.yaml` with TEP/DEP configuration instead.
### Auto-Configuration
To automatically configure the profiling job based on the hardware and model information, supply the `--model` argument to the profiling script. The following arguments will be automatically set:
- `--config`: will use the default config file (`components/backends/<backend>/deploy/disagg.yaml`) with model updated to the provided model name
- `--min-num-gpus-per-engine`: will be set to the minimum number of GPUs per engine based on the model size and hardware information
- `--max-num-gpus-per-engine`: will be set to the maximum number of GPUs per engine based on the model size and hardware information
- `--num-gpus-per-node`: will be set to the number of GPUs per node based on the hardware information
- `--is-moe-model`: will be set based on the HF config file
- `--max-context-length`: will be set to the maximum context length supported by the model based on the HF config file
### Advanced Configuration
- **Model caching**: For large models, create a multi-attach PVC to cache the model. See [recipes](../../recipes/README.md) for details.
- **Custom disaggregated configurations**: Use the manifest injector to place custom DGD configurations in the PVC.
- **Planner Config Passthrough**: To specify custom planner configurations (e.g., `adjustment-interval` or `load-predictor`) in the generated or deployed DGD config, add a `planner-` prefix to the argument. For example, to specify `--adjustment-interval=60` in SLA planner, add `--planner-adjustment-interval=60` arg to the profiling job.
- **Resource allocation**: Modify the job YAML to adjust GPU and memory requirements.
### Viewing Profiling Results
After the profiling job completes successfully, the results are stored in the persistent volume claim (PVC) created during Step 2.
To download the results:
```bash
# Download to directory
python3 -m deploy.utils.download_pvc_results --namespace $NAMESPACE --output-dir ./results --folder /data/profiling_results
# Download without any of the auto-created config.yaml files used in profiling
python3 -m deploy.utils.download_pvc_results --namespace $NAMESPACE --output-dir ./results --folder /data/profiling_results --no-config
```
The script will:
* Deploy a temporary access pod
* Download all files maintaining directory structure
* Clean the pod up automatically
#### File Structure
The profiling results directory contains the following structure:
```
/workspace/data/profiling_results/
├── prefill_performance.png # Main prefill performance plot
├── decode_performance.png # Main decode performance plot
├── prefill_tp1/ # Individual TP profiling directories
...
├── decode_tp1/
...
├── selected_prefill_interpolation/
│ ├── raw_data.npz # Prefill interpolation data
│ ├── prefill_ttft_interpolation.png # TTFT vs ISL plot
│ └── prefill_throughput_interpolation.png # Throughput vs ISL plot
├── selected_decode_interpolation/
│ ├── raw_data.npz # Decode interpolation data
│ └── decode_tp{best_tp}.png # 3D ITL surface plot
└── config_with_planner.yaml # Generated DGD config with planner
```
#### Viewing Performance Plots
The profiling generates several performance visualization files:
**Main Performance Plots:**
- **`prefill_performance.png`**: Shows TTFT (Time To First Token) performance across different tensor parallelism (TP) sizes
- **`decode_performance.png`**: Shows ITL (Inter-Token Latency) performance across different TP sizes and in-flight request counts
**Interpolation Plots:**
- **`selected_prefill_interpolation/prefill_ttft_interpolation.png`**: TTFT vs Input Sequence Length with quadratic fit
- **`selected_prefill_interpolation/prefill_throughput_interpolation.png`**: Prefill throughput vs Input Sequence Length
- **`selected_decode_interpolation/decode_tp{best_tp}.png`**: 3D surface plot showing ITL vs KV usage and context length
#### Understanding the Data Files
The `.npz` files contain raw profiling data that can be loaded and analyzed using Python:
```python
import numpy as np
# Load prefill data
prefill_data = np.load('selected_prefill_interpolation/raw_data.npz')
print("Prefill data keys:", list(prefill_data.keys()))
# Load decode data
decode_data = np.load('selected_decode_interpolation/raw_data.npz')
print("Decode data keys:", list(decode_data.keys()))
```
### Troubleshooting
#### Image Pull Authentication Errors
If you see `ErrImagePull` or `ImagePullBackOff` errors with 401 unauthorized messages:
1. Ensure the `nvcr-imagepullsecret` exists in your namespace:
```bash
kubectl get secret nvcr-imagepullsecret -n $NAMESPACE
```
2. Verify the service account was created with the image pull secret:
```bash
kubectl get serviceaccount dgdr-profiling-job -n $NAMESPACE -o yaml
```
3. The service account should show `imagePullSecrets` containing `nvcr-imagepullsecret`.
If it doesn't, create the secret:
```bash
export NGC_API_KEY=<your-ngc-api-key-here>
kubectl create secret docker-registry nvcr-imagepullsecret --docker-server=nvcr.io --docker-username='$oauthtoken' --docker-password=$NGC_API_KEY
```
## Running the Profiling Script with AI Configurator
> [!NOTE]
> **TensorRT-LLM Only**: AI Configurator currently supports TensorRT-LLM only. Support for vLLM and SGLang is coming soon.
The profiling script can be run much faster using AI Configurator to estimate performance numbers instead of running real Dynamo deployments. This completes profiling in 20-30 seconds using performance simulation.
**Advantages** of `--use-ai-configurator`:
* Script completes in seconds rather than hours
* No Kubernetes or GPU access required
* Ideal for rapid prototyping and testing
**Disadvantages**:
* Estimated performance may contain errors, especially for out-of-distribution input dimensions
* Limited list of supported models, systems, and backends
* Less accurate than real deployment profiling
### Prerequisites
Install AI Configurator:
```bash
pip install aiconfigurator
```
If using a local environment, also install:
```bash
pip install -r deploy/utils/requirements.txt
```
### Check Support Matrix
View supported models, systems, and backends:
```bash
aiconfigurator cli --help
```
**Supported configurations:**
```
Models: GPT_7B, GPT_13B, GPT_30B, GPT_66B, GPT_175B, LLAMA2_7B, LLAMA2_13B, LLAMA2_70B, LLAMA3.1_8B, LLAMA3.1_70B, LLAMA3.1_405B, MOE_Mixtral8x7B, MOE_Mixtral8x22B, DEEPSEEK_V3, KIMI_K2, QWEN2.5_1.5B, QWEN2.5_7B, QWEN2.5_32B, QWEN2.5_72B, QWEN3_32B, QWEN3_235B, QWEN3_480B, Nemotron_super_v1.1
Systems: h100_sxm, h200_sxm
Backends: trtllm (vllm and sglang support coming soon)
```
### Running Fast Profiling
Example command for TensorRT-LLM:
```bash
# Notes:
#   --aic-backend is optional; --backend is used if it is not provided
#   --ttft and --itl are the target TTFT and ITL in milliseconds (float)
python3 -m benchmarks.profiler.profile_sla \
--config ./components/backends/trtllm/deploy/disagg.yaml \
--backend trtllm \
--use-ai-configurator \
--aic-system h200_sxm \
--aic-model-name QWEN3_32B \
--aic-backend trtllm \
--aic-backend-version 0.20.0 \
--isl 3000 \
--osl 150 \
--ttft 200 \
--itl 20
```
The output will be written to `./profiling_results/` and can be used directly with SLA planner deployment.
This diff is collapsed.
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
......@@ -77,7 +77,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: {} <br /> |
| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: \{\} <br /> |
| `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | |
......@@ -95,10 +95,11 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR name. | | Optional: {} <br /> |
| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR namespace. | | Optional: {} <br /> |
| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.<br />These are merged with auto-generated labels from the profiling process. | | Optional: {} <br /> |
| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: {} <br /> |
| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR name. | | Optional: \{\} <br /> |
| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR namespace. | | Optional: \{\} <br /> |
| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.<br />These are merged with auto-generated labels from the profiling process. | | Optional: \{\} <br /> |
| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: \{\} <br /> |
| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.<br />This image is used for both temporary DGDs created during online profiling and the final DGD.<br />If omitted, the image from the base config file (e.g., disagg.yaml) is used.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: \{\} <br /> |
#### DeploymentStatus
......@@ -236,7 +237,6 @@ DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests
It serves as the primary interface for users to request model deployments with
specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC)
......@@ -245,7 +245,6 @@ Lifecycle:
5. Ready: Terminal state when DGD is operational or spec is available
6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
The spec becomes immutable once profiling starts. Users must delete and recreate
the DGDR to modify configuration after this point.
......@@ -277,10 +276,11 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `modelName` _string_ | ModelName specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs. | | Required: {} <br /> |
| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />The profiler will validate the configuration and report any errors. | | Required: {} <br /> |
| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\} <br /> |
| `backend` _string_ | Backend specifies the inference backend to use.<br />The controller automatically sets this value in profilingConfig.config.engine.backend. | | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />model and backend fields and should not be specified in this config. | | Required: \{\} <br /> |
| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false | |
| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. | | Optional: {} <br /> |
| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. | | Optional: \{\} <br /> |
#### DynamoGraphDeploymentRequestStatus
......@@ -298,12 +298,12 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `state` _string_ | State is a high-level textual status of the deployment request lifecycle.<br />Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"<br />Empty string ("") represents the initial state before initialization. | | |
| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. | | Optional: {} <br /> |
| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. | | Optional: \{\} <br /> |
| `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.<br />Used to detect spec changes and enforce immutability after profiling starts. | | |
| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.<br />Conditions are merged by type on patch updates. | | |
| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.<br />Format: "configmap/<name>" | | Optional: {} <br /> |
| `generatedDeployment` _[RawExtension](#rawextension)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification<br />including metadata, based on profiling results. Users can extract this to create<br />a DGD manually, or it's used automatically when autoApply is true.<br />Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: {} <br />Optional: {} <br /> |
| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. | | Optional: {} <br /> |
| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.<br />Format: "configmap/<name>" | | Optional: \{\} <br /> |
| `generatedDeployment` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#rawextension-runtime-pkg)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification<br />including metadata, based on profiling results. Users can extract this to create<br />a DGD manually, or it's used automatically when autoApply is true.<br />Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: \{\} <br />Optional: \{\} <br /> |
| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. | | Optional: \{\} <br /> |
#### DynamoGraphDeploymentSpec
......@@ -319,9 +319,9 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.<br />Each PVC must have a unique name that can be referenced in component specifications. | | Optional: {} <br /> |
| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | Optional: {} <br /> |
| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless<br />overridden by service-specific configuration. | | Optional: {} <br /> |
| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.<br />Each PVC must have a unique name that can be referenced in component specifications. | | Optional: \{\} <br /> |
| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | Optional: \{\} <br /> |
| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless<br />overridden by service-specific configuration. | | Optional: \{\} <br /> |
| `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). | | Enum: [sglang vllm trtllm] <br /> |
......@@ -415,9 +415,9 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `create` _boolean_ | Create indicates to create a new PVC | | |
| `name` _string_ | Name is the name of the PVC | | Required: {} <br /> |
| `name` _string_ | Name is the name of the PVC | | Required: \{\} <br /> |
| `storageClass` _string_ | StorageClass to be used for PVC creation. Required when create is true. | | |
| `size` _[Quantity](#quantity)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | |
| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | |
| `volumeAccessMode` _[PersistentVolumeAccessMode](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#persistentvolumeaccessmode-v1-core)_ | VolumeAccessMode is the volume access mode of the PVC. Required when create is true. | | |
......@@ -436,8 +436,9 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `config` _[JSON](#json)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.<br />The profiler will validate the configuration and report any errors. | | Optional: {} <br />Type: object <br /> |
| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment<br />base config file (disagg.yaml). This is separate from the profiling config above.<br />The path to this config will be set as engine.config in the profiling config. | | Optional: {} <br /> |
| `config` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.<br />The profiler will validate the configuration and report any errors. | | Optional: \{\} <br />Type: object <br /> |
| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment<br />base config file (disagg.yaml). This is separate from the profiling config above.<br />The path to this config will be set as engine.config in the profiling config. | | Optional: \{\} <br /> |
| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.<br />This image contains the profiler code and dependencies needed for SLA-based profiling.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: \{\} <br /> |
#### SharedMemorySpec
......@@ -455,7 +456,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `disabled` _boolean_ | | | |
| `size` _[Quantity](#quantity)_ | | | |
| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | | | |
#### VolumeMount
......@@ -472,7 +473,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: {} <br /> |
| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: \{\} <br /> |
| `mountPoint` _string_ | MountPoint specifies where to mount the volume.<br />If useAsCompilationCache is true and mountPoint is not specified,<br />a backend-specific default will be used. | | |
| `useAsCompilationCache` _boolean_ | UseAsCompilationCache indicates this volume should be used as a compilation cache.<br />When true, backend-specific environment variables will be set and default mount points may be used. | false | |
......
......@@ -24,7 +24,7 @@ There are two additional rules set by planner to prevent over-compensation:
## SLA-based Scaling Up/Down Prefill/Decode Workers
See [Pre-Deployment Profiling](../benchmarks/pre_deployment_profiling.md) for more details.
See [SLA-Driven Profiling](../benchmarks/sla_driven_profiling.md) for more details.
## Usage
......
......@@ -78,5 +78,5 @@ Key features include:
Overview <self>
SLA Planner Quick Start <sla_planner_quickstart>
Pre-Deployment Profiling <../benchmarks/pre_deployment_profiling.md>
SLA-Driven Profiling <../benchmarks/sla_driven_profiling.md>
SLA-based Planner <sla_planner.md>
# SLA-based Planner
> [!TIP]
> **New to SLA Planner?** For a complete workflow including profiling and deployment, see the [SLA Planner Quick Start Guide](/docs/planner/sla_planner_quickstart.md).
> **New to SLA Planner?** For a complete workflow including profiling and deployment, see the [SLA Profiling + Planner Quick Start Guide](/docs/planner/sla_planner_quickstart.md).
This document covers information regarding the SLA-based planner in `examples/common/utils/planner_core.py`.
......@@ -47,11 +47,11 @@ The SLA planner consists of several key components:
3. **Correction Factors**: Adjust predictions based on observed vs. expected performance
4. **Scaling Logic**: Calculate optimal number of prefill/decode replicas to meet SLA targets
## Pre-Deployment Profiling
## SLA-Driven Pre-Deployment Profiling
**Prerequisite**: SLA-based planner requires pre-deployment profiling to be completed before deployment. The profiling process analyzes your model's performance characteristics to determine optimal tensor parallelism configurations and scaling parameters that the planner will use during operation.
See [Pre-Deployment Profiling](../benchmarks/pre_deployment_profiling.md) for detailed instructions on running the profiling process.
See [SLA-Driven Profiling](../benchmarks/sla_driven_profiling.md) for detailed instructions on running the profiling process.
## Load Prediction
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment