Unverified Commit 57cdb9a1 authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files
parent 66fd6f84
......@@ -596,10 +596,11 @@ async def run_profile(args):
try:
await client.wait_for_deployment_ready()
logger.info("Deployment is ready")
skip_profile = False
except TimeoutError:
logger.error(
"Deployment failed to become ready within timeout, skipping profiling"
"Deployment or model failed to become ready within timeout, skipping profiling"
)
skip_profile = True
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.4
helm.sh/resource-policy: keep
name: dynamographdeploymentrequests.nvidia.com
spec:
group: nvidia.com
names:
kind: DynamoGraphDeploymentRequest
listKind: DynamoGraphDeploymentRequestList
plural: dynamographdeploymentrequests
shortNames:
- dgdr
singular: dynamographdeploymentrequest
scope: Namespaced
versions:
- additionalPrinterColumns:
- jsonPath: .spec.modelName
name: Model
type: string
- jsonPath: .spec.backend
name: Backend
type: string
- jsonPath: .status.state
name: State
type: string
- jsonPath: .status.deployment.state
name: DGD-State
type: string
- jsonPath: .metadata.creationTimestamp
name: Age
type: date
name: v1alpha1
schema:
openAPIV3Schema:
description: |-
DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
It serves as the primary interface for users to request model deployments with
specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC)
3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
5. Ready: Terminal state when DGD is operational or spec is available
6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
The spec becomes immutable once profiling starts. Users must delete and recreate
the DGDR to modify configuration after this point.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Spec defines the desired state for this deployment request.
properties:
autoApply:
default: false
description: |-
AutoApply indicates whether to automatically create a DynamoGraphDeployment
after profiling completes. If false, only the spec is generated and stored in status.
Users can then manually create a DGD using the generated spec.
type: boolean
backend:
default: trtllm
description: |-
Backend specifies the inference backend framework to use.
Supported values are: "vllm", "sglang", "trtllm".
enum:
- vllm
- sglang
- trtllm
type: string
deploymentOverrides:
description: |-
DeploymentOverrides allows customizing metadata for the auto-created DGD.
Only applicable when AutoApply is true.
properties:
annotations:
additionalProperties:
type: string
description: Annotations are additional annotations to add to the DynamoGraphDeployment metadata.
type: object
labels:
additionalProperties:
type: string
description: |-
Labels are additional labels to add to the DynamoGraphDeployment metadata.
These are merged with auto-generated labels from the profiling process.
type: object
name:
description: |-
Name is the desired name for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR name.
type: string
namespace:
description: |-
Namespace is the desired namespace for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR namespace.
type: string
type: object
gpu:
description: |-
GPU defines optional GPU type and resource specifications.
These constraints guide the profiler to find configurations within specified bounds.
properties:
maxNumGPUsPerEngine:
default: 8
description: |-
MaxNumGPUsPerEngine specifies the maximum number of GPUs per engine for profiling.
The profiler will not consider configurations with more GPUs than this value.
minimum: 1
type: integer
minNumGPUsPerEngine:
default: 1
description: |-
MinNumGPUsPerEngine specifies the minimum number of GPUs per engine for profiling.
The profiler will not consider configurations with fewer GPUs than this value.
minimum: 1
type: integer
type:
description: |-
Type specifies the GPU type to target (e.g., "h200", "h100", "a100").
If specified, profiling will focus on configurations optimized for this GPU type.
type: string
type: object
modelName:
description: |-
ModelName specifies the model to deploy (e.g., "meta/llama3-70b").
This should be a valid model identifier that the profiler can resolve.
type: string
online:
default: false
description: |-
Online indicates whether to use online profiler (true) or AI Configurator (false).
Online profiling uses real deployments for accurate measurements (2-4 hours).
Offline profiling uses AI Configurator for fast simulation-based profiling (20-30 seconds).
type: boolean
profilingConfig:
description: |-
ProfilingConfig provides custom configuration for the profiling job.
Applicable to both online and offline (AIC) profiling modes.
properties:
configMapRef:
description: |-
ConfigMapRef is a reference to a ConfigMap containing profiling configuration.
The ConfigMap should contain a key (default: "disagg.yaml") with the configuration file.
This configuration is used by both online and offline (AIC) profiling modes.
properties:
key:
default: disagg.yaml
description: Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml".
type: string
name:
description: Name of the ConfigMap containing the desired data.
type: string
required:
- name
type: object
type: object
sla:
description: |-
SLA defines the Service Level Agreement profiling targets.
The profiler uses these targets to find an optimal deployment configuration.
properties:
isl:
default: 3000
description: |-
ISL is the Input Sequence Length for profiling.
Defines the length of input sequences to use during profiling tests.
minimum: 1
type: integer
itl:
default: 10
description: |-
ITL is the target Inter-Token Latency in milliseconds.
This represents the maximum time allowed between consecutive tokens in the output.
type: integer
osl:
default: 500
description: |-
OSL is the Output Sequence Length for profiling.
Defines the expected length of output sequences to generate during profiling tests.
minimum: 1
type: integer
ttft:
default: 50
description: |-
TTFT is the target Time To First Token in milliseconds.
This represents the maximum time allowed from request submission to receiving the first token.
type: integer
type: object
required:
- modelName
- sla
type: object
status:
description: Status reflects the current observed state of this deployment request.
properties:
conditions:
description: |-
Conditions contains the latest observed conditions of the deployment request.
Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.
Conditions are merged by type on patch updates.
items:
description: Condition contains details for one aspect of the current state of this API Resource.
properties:
lastTransitionTime:
description: |-
lastTransitionTime is the last time the condition transitioned from one status to another.
This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
format: date-time
type: string
message:
description: |-
message is a human readable message indicating details about the transition.
This may be an empty string.
maxLength: 32768
type: string
observedGeneration:
description: |-
observedGeneration represents the .metadata.generation that the condition was set based upon.
For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
with respect to the current state of the instance.
format: int64
minimum: 0
type: integer
reason:
description: |-
reason contains a programmatic identifier indicating the reason for the condition's last transition.
Producers of specific condition types may define expected values and meanings for this field,
and whether the values are considered a guaranteed API.
The value should be a CamelCase string.
This field may not be empty.
maxLength: 1024
minLength: 1
pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
type: string
status:
description: status of the condition, one of True, False, Unknown.
enum:
- "True"
- "False"
- Unknown
type: string
type:
description: type of condition in CamelCase or in foo.example.com/CamelCase.
maxLength: 316
pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
type: string
required:
- lastTransitionTime
- message
- reason
- status
- type
type: object
type: array
deployment:
description: |-
Deployment tracks the auto-created DGD when AutoApply is true.
Contains name, namespace, state, and creation status of the managed DGD.
properties:
created:
description: |-
Created indicates whether the DGD has been successfully created.
Used to prevent recreation if the DGD is manually deleted by users.
type: boolean
name:
description: Name is the name of the created DynamoGraphDeployment.
type: string
namespace:
description: Namespace is the namespace of the created DynamoGraphDeployment.
type: string
state:
description: |-
State is the current state of the DynamoGraphDeployment.
This value is mirrored from the DGD's status.state field.
type: string
type: object
generatedDeployment:
description: |-
GeneratedDeployment contains the full generated DynamoGraphDeployment specification
including metadata, based on profiling results. Users can extract this to create
a DGD manually, or it's used automatically when autoApply is true.
Stored as RawExtension to preserve all fields including metadata.
type: object
x-kubernetes-embedded-resource: true
x-kubernetes-preserve-unknown-fields: true
observedGeneration:
description: |-
ObservedGeneration reflects the generation of the most recently observed spec.
Used to detect spec changes and enforce immutability after profiling starts.
format: int64
type: integer
profilingResults:
description: |-
ProfilingResults contains a reference to the ConfigMap holding profiling data.
Format: "configmap/<name>"
type: string
state:
description: |-
State is a high-level textual status of the deployment request lifecycle.
Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
Empty string ("") represents the initial state before initialization.
type: string
type: object
type: object
served: true
storage: true
subresources:
status: {}
......@@ -132,6 +132,7 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.dynamo.metrics.prometheusEndpoint | string | `""` | Endpoint that services can use to retrieve metrics. If set, dynamo operator will automatically inject the PROMETHEUS_ENDPOINT environment variable into services it manages. Users can override the value of the PROMETHEUS_ENDPOINT environment variable by modifying the corresponding deployment's environment variables |
| dynamo-operator.dynamo.mpiRun.secretName | string | `"mpi-run-ssh-secret"` | Name of the secret containing the SSH key for MPI Run |
| dynamo-operator.dynamo.mpiRun.sshKeygen.enabled | bool | `true` | Whether to enable SSH key generation for MPI Run |
| dynamo-operator.dynamo.dgdr.profilerImage | string | `""` | Container image to use for profiling jobs (both online and offline/AIC) |
| grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide |
| kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide |
| etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd , all etcd settings should be prefixed with "etcd." |
......
......@@ -124,7 +124,11 @@ spec:
- --mpi-run-ssh-secret-name={{ .Values.dynamo.mpiRun.secretName }}
- --mpi-run-ssh-secret-namespace={{ .Release.Namespace }}
{{- end }}
{{- if .Values.dynamo.dgdr.profilerImage }}
- --profiler-image={{ .Values.dynamo.dgdr.profilerImage }}
{{- end }}
{{- if not .Values.namespaceRestriction.enabled }}
- --dgdr-profiling-cluster-role-name={{ include "dynamo-operator.fullname" . }}-dgdr-profiling
- --planner-cluster-role-name={{ include "dynamo-operator.fullname" . }}-planner
{{- end }}
command:
......
......@@ -359,6 +359,7 @@ rules:
- nvidia.com
resources:
- dynamocomponentdeployments
- dynamographdeploymentrequests
- dynamographdeployments
verbs:
- create
......@@ -372,6 +373,7 @@ rules:
- nvidia.com
resources:
- dynamocomponentdeployments/finalizers
- dynamographdeploymentrequests/finalizers
- dynamographdeployments/finalizers
verbs:
- update
......@@ -379,6 +381,7 @@ rules:
- nvidia.com
resources:
- dynamocomponentdeployments/status
- dynamographdeploymentrequests/status
- dynamographdeployments/status
verbs:
- get
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- /*
RBAC for the jobs that profile DynamoGraphDeploymentRequests (DGDR).

Namespace-restricted mode (.Values.namespaceRestriction.enabled is true) renders a
ServiceAccount, a Role and a RoleBinding, all named "dgdr-profiling-job", in the
release namespace. Otherwise only a ClusterRole named "<fullname>-dgdr-profiling" is
rendered; this template creates no ServiceAccount or binding for it.

Both modes grant the same rules; keep the two rule lists identical when editing.
*/}}
{{- if .Values.namespaceRestriction.enabled }}
# Namespace-restricted mode: Role + ServiceAccount + RoleBinding
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: dgdr-profiling-job
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: dgdr-profiling
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: dgdr-profiling-job
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: dgdr-profiling
rules:
  # ConfigMaps - needed for saving profiling results
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["create", "get", "update", "patch", "delete"]
  # DynamoGraphDeploymentRequests - needed to get DGDR info
  - apiGroups: ["nvidia.com"]
    resources: ["dynamographdeploymentrequests"]
    verbs: ["get"]
  # DynamoGraphDeployments - needed for online profiling to create test deployments
  # The operator will handle creating the actual pods, services, and deployments
  - apiGroups: ["nvidia.com"]
    resources: ["dynamographdeployments"]
    verbs: ["get", "create", "delete", "list", "watch"]
  # Pods - needed for listing pods by label selector and getting logs from test deployments
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["list", "get"]
  - apiGroups: [""]
    resources: ["pods/log"]
    verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: dgdr-profiling-job
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: dgdr-profiling
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: dgdr-profiling-job
subjects:
  - kind: ServiceAccount
    name: dgdr-profiling-job
    namespace: {{ .Release.Namespace }}
{{- else }}
# Cluster-wide mode: ClusterRole for DGDR profiling jobs
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-dgdr-profiling
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
    app.kubernetes.io/component: dgdr-profiling
rules:
  # ConfigMaps - needed for saving profiling results
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["create", "get", "update", "patch", "delete"]
  # DynamoGraphDeploymentRequests - needed to get DGDR info
  - apiGroups: ["nvidia.com"]
    resources: ["dynamographdeploymentrequests"]
    verbs: ["get"]
  # DynamoGraphDeployments - needed for online profiling to create test deployments
  # The operator will handle creating the actual pods, services, and deployments
  - apiGroups: ["nvidia.com"]
    resources: ["dynamographdeployments"]
    verbs: ["get", "create", "delete", "list", "watch"]
  # Pods - needed for listing pods by label selector and getting logs from test deployments
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["list", "get"]
  - apiGroups: [""]
    resources: ["pods/log"]
    verbs: ["get"]
{{- end }}
......@@ -117,6 +117,15 @@ dynamo:
sshKeygen:
enabled: true
# DynamoGraphDeploymentRequest (DGDR) configuration
dgdr:
# Container image to use for profiling jobs (both online and offline/AIC)
# REQUIRED: Must be set to create DynamoGraphDeploymentRequests
# For development: Build and push the profiler image from the ai-dynamo repository
# Public image will be available in release 0.6.1
# Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
profilerImage: ""
#imagePullSecrets: []
kubernetesClusterDomain: cluster.local
......
......@@ -135,6 +135,15 @@ dynamo-operator:
# -- Whether to enable SSH key generation for MPI Run
enabled: true
# DynamoGraphDeploymentRequest (DGDR) configuration
dgdr:
# -- Container image to use for profiling jobs (both online and offline/AIC)
# REQUIRED: Must be set to create DynamoGraphDeploymentRequests
# For development: Build and push the profiler image from the ai-dynamo repository
# Public image will be available in release 0.6.1
# Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
profilerImage: ""
# Grove component - distributed inference orchestration
grove:
......
......@@ -5,10 +5,19 @@ A Kubernetes Operator to manage all Dynamo pipelines using custom resources.
## Overview
This operator automates the deployment and lifecycle management of `DynamoGraphDeployment` resources in Kubernetes clusters.
This operator automates the deployment and lifecycle management of Dynamo resources in Kubernetes clusters:
- **DynamoGraphDeploymentRequest (DGDR)** - Simplified SLA-driven deployment interface
- **DynamoGraphDeployment (DGD)** - Direct deployment configuration
Built with [Kubebuilder](https://book.kubebuilder.io/), it follows Kubernetes best practices and supports declarative configuration through CustomResourceDefinitions (CRDs).
### Custom Resources
- **DynamoGraphDeploymentRequest**: High-level interface for SLA-driven configuration generation. Automatically handles profiling and generates an optimized DGD spec based on your performance requirements.
- **DynamoGraphDeployment**: Lower-level interface for direct deployment configuration with full control over all parameters.
## Developer guide
### Pre-requisites
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group.
This package defines the DynamoGraphDeploymentRequest (DGDR) custom resource, which provides
a high-level, SLA-driven interface for deploying machine learning models on Dynamo.
*/
package v1alpha1
import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
)
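// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
// Field doc comments and +kubebuilder markers in this file are inputs to CRD generation (controller-gen);
// the comment text is reproduced as schema descriptions, so regenerate the CRD manifest after editing them.
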
// SLASpec captures the performance envelope a deployment request is profiled against:
// two latency targets (ITL, TTFT) and the two sequence lengths (ISL, OSL) used by the
// profiling runs. Every field is optional; zero values are dropped from the JSON form
// and the defaults are declared through the +kubebuilder:default markers below.
type SLASpec struct {
	// ITL is the target Inter-Token Latency in milliseconds.
	// This represents the maximum time allowed between consecutive tokens in the output.
	// +kubebuilder:default=10
	// +optional
	ITL int `json:"itl,omitempty"`

	// TTFT is the target Time To First Token in milliseconds.
	// This represents the maximum time allowed from request submission to receiving the first token.
	// +kubebuilder:default=50
	// +optional
	TTFT int `json:"ttft,omitempty"`

	// ISL is the Input Sequence Length for profiling.
	// Defines the length of input sequences to use during profiling tests.
	// +kubebuilder:default=3000
	// +kubebuilder:validation:Minimum=1
	// +optional
	ISL int `json:"isl,omitempty"`

	// OSL is the Output Sequence Length for profiling.
	// Defines the expected length of output sequences to generate during profiling tests.
	// +kubebuilder:default=500
	// +kubebuilder:validation:Minimum=1
	// +optional
	OSL int `json:"osl,omitempty"`
}
// GPUSpec narrows the hardware search space used while profiling: an optional GPU type
// plus a lower and an upper bound on the number of GPUs per engine.
// The markers below only require each bound to be at least 1; nothing in this type
// checks that MinNumGPUsPerEngine does not exceed MaxNumGPUsPerEngine.
type GPUSpec struct {
	// Type specifies the GPU type to target (e.g., "h200", "h100", "a100").
	// If specified, profiling will focus on configurations optimized for this GPU type.
	// +kubebuilder:validation:Optional
	Type string `json:"type,omitempty"`

	// MinNumGPUsPerEngine specifies the minimum number of GPUs per engine for profiling.
	// The profiler will not consider configurations with fewer GPUs than this value.
	// +kubebuilder:validation:Optional
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:default=1
	MinNumGPUsPerEngine int `json:"minNumGPUsPerEngine,omitempty"`

	// MaxNumGPUsPerEngine specifies the maximum number of GPUs per engine for profiling.
	// The profiler will not consider configurations with more GPUs than this value.
	// +kubebuilder:validation:Optional
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:default=8
	MaxNumGPUsPerEngine int `json:"maxNumGPUsPerEngine,omitempty"`
}
// ConfigMapKeySelector identifies a single entry of a ConfigMap by the ConfigMap's name
// and the key inside it. Name is mandatory and is always serialized; Key may be left
// empty, in which case the schema default "disagg.yaml" is declared for it.
type ConfigMapKeySelector struct {
	// Name of the ConfigMap containing the desired data.
	// +kubebuilder:validation:Required
	Name string `json:"name"`

	// Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml".
	// +kubebuilder:default=disagg.yaml
	Key string `json:"key,omitempty"`
}
// ProfilingConfigSpec carries user-supplied settings for the profiling job.
// Its only field is an optional reference to a ConfigMap entry that holds the
// profiling configuration file.
type ProfilingConfigSpec struct {
	// ConfigMapRef is a reference to a ConfigMap containing profiling configuration.
	// The ConfigMap should contain a key (default: "disagg.yaml") with the configuration file.
	// This configuration is used by both online and offline (AIC) profiling modes.
	// +kubebuilder:validation:Optional
	ConfigMapRef *ConfigMapKeySelector `json:"configMapRef,omitempty"`
}
// DeploymentOverridesSpec lets a request customize the metadata (name, namespace,
// labels, annotations) of the DynamoGraphDeployment that is created on its behalf
// when autoApply is enabled. All fields are optional and omitted from JSON when empty.
type DeploymentOverridesSpec struct {
	// Name is the desired name for the created DynamoGraphDeployment.
	// If not specified, defaults to the DGDR name.
	// +kubebuilder:validation:Optional
	Name string `json:"name,omitempty"`

	// Namespace is the desired namespace for the created DynamoGraphDeployment.
	// If not specified, defaults to the DGDR namespace.
	// +kubebuilder:validation:Optional
	Namespace string `json:"namespace,omitempty"`

	// Labels are additional labels to add to the DynamoGraphDeployment metadata.
	// These are merged with auto-generated labels from the profiling process.
	// +kubebuilder:validation:Optional
	Labels map[string]string `json:"labels,omitempty"`

	// Annotations are additional annotations to add to the DynamoGraphDeployment metadata.
	// +kubebuilder:validation:Optional
	Annotations map[string]string `json:"annotations,omitempty"`
}
// DynamoGraphDeploymentRequestSpec is the user-authored half of a DGDR. It names the
// model and backend, states the SLA to profile against, and controls how profiling runs
// and whether the resulting DynamoGraphDeployment is applied automatically.
// ModelName and SLA are required; every other field is optional.
type DynamoGraphDeploymentRequestSpec struct {
	// ModelName specifies the model to deploy (e.g., "meta/llama3-70b").
	// This should be a valid model identifier that the profiler can resolve.
	// +kubebuilder:validation:Required
	ModelName string `json:"modelName"`

	// Backend specifies the inference backend framework to use.
	// Supported values are: "vllm", "sglang", "trtllm".
	// +kubebuilder:validation:Enum=vllm;sglang;trtllm
	// +kubebuilder:default=trtllm
	Backend string `json:"backend,omitempty"`

	// SLA defines the Service Level Agreement profiling targets.
	// The profiler uses these targets to find an optimal deployment configuration.
	// +kubebuilder:validation:Required
	SLA SLASpec `json:"sla"`

	// GPU defines optional GPU type and resource specifications.
	// These constraints guide the profiler to find configurations within specified bounds.
	// +kubebuilder:validation:Optional
	GPU *GPUSpec `json:"gpu,omitempty"`

	// Online indicates whether to use online profiler (true) or AI Configurator (false).
	// Online profiling uses real deployments for accurate measurements (2-4 hours).
	// Offline profiling uses AI Configurator for fast simulation-based profiling (20-30 seconds).
	// +kubebuilder:default=false
	Online bool `json:"online,omitempty"`

	// AutoApply indicates whether to automatically create a DynamoGraphDeployment
	// after profiling completes. If false, only the spec is generated and stored in status.
	// Users can then manually create a DGD using the generated spec.
	// +kubebuilder:default=false
	AutoApply bool `json:"autoApply,omitempty"`

	// DeploymentOverrides allows customizing metadata for the auto-created DGD.
	// Only applicable when AutoApply is true.
	// +kubebuilder:validation:Optional
	DeploymentOverrides *DeploymentOverridesSpec `json:"deploymentOverrides,omitempty"`

	// ProfilingConfig provides custom configuration for the profiling job.
	// Applicable to both online and offline (AIC) profiling modes.
	// +kubebuilder:validation:Optional
	ProfilingConfig *ProfilingConfigSpec `json:"profilingConfig,omitempty"`
}
// DeploymentStatus records the DynamoGraphDeployment that was created for a request
// with autoApply enabled: where it lives, the state last mirrored from it, and whether
// it was ever created successfully. All fields are omitted from JSON when zero.
type DeploymentStatus struct {
	// Name is the name of the created DynamoGraphDeployment.
	Name string `json:"name,omitempty"`

	// Namespace is the namespace of the created DynamoGraphDeployment.
	Namespace string `json:"namespace,omitempty"`

	// State is the current state of the DynamoGraphDeployment.
	// This value is mirrored from the DGD's status.state field.
	State string `json:"state,omitempty"`

	// Created indicates whether the DGD has been successfully created.
	// Used to prevent recreation if the DGD is manually deleted by users.
	Created bool `json:"created,omitempty"`
}
// DynamoGraphDeploymentRequestStatus is the controller-reported half of a DGDR. It holds
// the lifecycle state, the spec generation last observed, the condition list, a pointer
// to the profiling output, the generated DynamoGraphDeployment, and tracking data for
// the deployment created when autoApply is enabled.
type DynamoGraphDeploymentRequestStatus struct {
	// State is a high-level textual status of the deployment request lifecycle.
	// Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
	// Empty string ("") represents the initial state before initialization.
	State string `json:"state,omitempty"`

	// ObservedGeneration reflects the generation of the most recently observed spec.
	// Used to detect spec changes and enforce immutability after profiling starts.
	ObservedGeneration int64 `json:"observedGeneration,omitempty"`

	// Conditions contains the latest observed conditions of the deployment request.
	// Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.
	// Conditions are merged by type on patch updates.
	Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`

	// ProfilingResults contains a reference to the ConfigMap holding profiling data.
	// Format: "configmap/<name>"
	// +kubebuilder:validation:Optional
	ProfilingResults string `json:"profilingResults,omitempty"`

	// GeneratedDeployment contains the full generated DynamoGraphDeployment specification
	// including metadata, based on profiling results. Users can extract this to create
	// a DGD manually, or it's used automatically when autoApply is true.
	// Stored as RawExtension to preserve all fields including metadata.
	// +kubebuilder:validation:Optional
	// +kubebuilder:pruning:PreserveUnknownFields
	// +kubebuilder:validation:EmbeddedResource
	GeneratedDeployment *runtime.RawExtension `json:"generatedDeployment,omitempty"`

	// Deployment tracks the auto-created DGD when AutoApply is true.
	// Contains name, namespace, state, and creation status of the managed DGD.
	// +kubebuilder:validation:Optional
	Deployment *DeploymentStatus `json:"deployment,omitempty"`
}
// NOTE(maintainers): the doc comment attached to the type below is reproduced verbatim
// as the top-level schema description in the CRD manifest, and the printcolumn/resource
// markers produce the additionalPrinterColumns and short name found there. Treat both
// as generator input and regenerate the CRD after any change.

// DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
// It serves as the primary interface for users to request model deployments with
// specific performance and resource constraints, enabling SLA-driven deployments.
//
// Lifecycle:
// 1. Initial → Pending: Validates spec and prepares for profiling
// 2. Pending → Profiling: Creates and runs profiling job (online or AIC)
// 3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
// 4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
// 5. Ready: Terminal state when DGD is operational or spec is available
// 6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
//
// The spec becomes immutable once profiling starts. Users must delete and recreate
// the DGDR to modify configuration after this point.
//
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:shortName=dgdr
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.modelName`
// +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend`
// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.status.state`
// +kubebuilder:printcolumn:name="DGD-State",type=string,JSONPath=`.status.deployment.state`
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
type DynamoGraphDeploymentRequest struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
// Spec defines the desired state for this deployment request.
Spec DynamoGraphDeploymentRequestSpec `json:"spec,omitempty"`
// Status reflects the current observed state of this deployment request.
Status DynamoGraphDeploymentRequestStatus `json:"status,omitempty"`
}
// SetState updates the State field in the DGDR status.
// The value is stored as given; no validation is performed here.
// It only mutates the in-memory object.
func (s *DynamoGraphDeploymentRequest) SetState(state string) {
s.Status.State = state
}
// GetSpec returns the spec of this DGDR as a generic interface.
// Implements a common interface used by controller utilities.
// The returned value holds a DynamoGraphDeploymentRequestSpec by value (not a
// pointer), so it is the type that SetSpec accepts.
func (s *DynamoGraphDeploymentRequest) GetSpec() any {
return s.Spec
}
// SetSpec replaces the spec of this DGDR with the supplied generic value.
// It implements a common interface used by controller utilities.
//
// spec must hold a DynamoGraphDeploymentRequestSpec value; the type assertion
// is unchecked, so any other dynamic type (including a pointer to the spec
// type, or nil) causes a panic and leaves the existing spec untouched.
func (s *DynamoGraphDeploymentRequest) SetSpec(spec any) {
	typed := spec.(DynamoGraphDeploymentRequestSpec)
	s.Spec = typed
}
// AddStatusCondition upserts a condition into the status.
// When an entry with the same Type is already present it is overwritten in
// place, keeping its position in the list; otherwise the condition is appended
// to the end. The condition is stored exactly as passed in.
func (s *DynamoGraphDeploymentRequest) AddStatusCondition(condition metav1.Condition) {
	conditions := s.Status.Conditions
	for idx := range conditions {
		if conditions[idx].Type != condition.Type {
			continue
		}
		// Same type found: overwrite the existing entry and stop.
		conditions[idx] = condition
		return
	}
	// No entry of this type yet; append also covers the nil-slice case.
	s.Status.Conditions = append(conditions, condition)
}
// DynamoGraphDeploymentRequestList contains a list of DynamoGraphDeploymentRequest resources.
//
// +kubebuilder:object:root=true
type DynamoGraphDeploymentRequestList struct {
// TypeMeta is embedded and inlined into the top level of the serialized list.
metav1.TypeMeta `json:",inline"`
// ListMeta is the embedded list metadata, serialized under the "metadata" key.
metav1.ListMeta `json:"metadata,omitempty"`
// Items holds the DynamoGraphDeploymentRequest objects in this list.
// The tag has no omitempty, so the "items" key is always emitted.
Items []DynamoGraphDeploymentRequest `json:"items"`
}
// init registers the DynamoGraphDeploymentRequest type and its list type with
// the package-level SchemeBuilder.
func init() {
	SchemeBuilder.Register(
		&DynamoGraphDeploymentRequest{},
		&DynamoGraphDeploymentRequestList{},
	)
}
......@@ -42,7 +42,7 @@ import (
"k8s.io/api/autoscaling/v2"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime"
)
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
......@@ -114,6 +114,65 @@ func (in *BaseStatus) DeepCopy() *BaseStatus {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The whole value is copied with a single struct assignment; no field receives
// separate handling.
func (in *ConfigMapKeySelector) DeepCopyInto(out *ConfigMapKeySelector) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function that returns a newly
// allocated ConfigMapKeySelector populated via DeepCopyInto.
// A nil receiver yields nil.
func (in *ConfigMapKeySelector) DeepCopy() *ConfigMapKeySelector {
	if in == nil {
		return nil
	}
	var clone ConfigMapKeySelector
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver
// into out. in must be non-nil.
// After the initial struct assignment, the Labels and Annotations maps are
// re-created entry by entry so that out does not share map storage with in.
// A nil map stays nil in the copy.
func (in *DeploymentOverridesSpec) DeepCopyInto(out *DeploymentOverridesSpec) {
	*out = *in
	if in.Labels != nil {
		out.Labels = make(map[string]string, len(in.Labels))
		for k, v := range in.Labels {
			out.Labels[k] = v
		}
	}
	if in.Annotations != nil {
		out.Annotations = make(map[string]string, len(in.Annotations))
		for k, v := range in.Annotations {
			out.Annotations[k] = v
		}
	}
}
// DeepCopy is an autogenerated deepcopy function that returns a newly
// allocated DeploymentOverridesSpec populated via DeepCopyInto.
// A nil receiver yields nil.
func (in *DeploymentOverridesSpec) DeepCopy() *DeploymentOverridesSpec {
	if in == nil {
		return nil
	}
	var clone DeploymentOverridesSpec
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The whole value is copied with a single struct assignment; no field receives
// separate handling.
func (in *DeploymentStatus) DeepCopyInto(out *DeploymentStatus) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function that returns a newly
// allocated DeploymentStatus populated via DeepCopyInto.
// A nil receiver yields nil.
func (in *DeploymentStatus) DeepCopy() *DeploymentStatus {
	if in == nil {
		return nil
	}
	var clone DeploymentStatus
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *DynamoComponentDeployment) DeepCopyInto(out *DynamoComponentDeployment) {
*out = *in
......@@ -378,6 +437,128 @@ func (in *DynamoGraphDeploymentList) DeepCopyObject() runtime.Object {
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The struct is first assigned wholesale, then ObjectMeta, Spec and Status are
// each re-copied through their own DeepCopyInto methods.
func (in *DynamoGraphDeploymentRequest) DeepCopyInto(out *DynamoGraphDeploymentRequest) {
*out = *in
out.TypeMeta = in.TypeMeta
in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
in.Spec.DeepCopyInto(&out.Spec)
in.Status.DeepCopyInto(&out.Status)
}
// DeepCopy is an autogenerated deepcopy function that returns a newly
// allocated DynamoGraphDeploymentRequest populated via DeepCopyInto.
// A nil receiver yields nil.
func (in *DynamoGraphDeploymentRequest) DeepCopy() *DynamoGraphDeploymentRequest {
	if in == nil {
		return nil
	}
	var clone DynamoGraphDeploymentRequest
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyObject is an autogenerated deepcopy function that returns the result
// of DeepCopy as a runtime.Object.
func (in *DynamoGraphDeploymentRequest) DeepCopyObject() runtime.Object {
	clone := in.DeepCopy()
	if clone == nil {
		// Return an untyped nil so the resulting interface compares equal to nil.
		return nil
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver
// into out. in must be non-nil.
// ListMeta is copied through its own DeepCopyInto, and a non-nil Items slice is
// rebuilt element by element so out does not share the backing array with in.
// A nil Items slice stays nil in the copy.
func (in *DynamoGraphDeploymentRequestList) DeepCopyInto(out *DynamoGraphDeploymentRequestList) {
	*out = *in
	out.TypeMeta = in.TypeMeta
	in.ListMeta.DeepCopyInto(&out.ListMeta)
	if in.Items == nil {
		return
	}
	out.Items = make([]DynamoGraphDeploymentRequest, len(in.Items))
	for idx := range in.Items {
		in.Items[idx].DeepCopyInto(&out.Items[idx])
	}
}
// DeepCopy is an autogenerated deepcopy function that returns a newly
// allocated DynamoGraphDeploymentRequestList populated via DeepCopyInto.
// A nil receiver yields nil.
func (in *DynamoGraphDeploymentRequestList) DeepCopy() *DynamoGraphDeploymentRequestList {
	if in == nil {
		return nil
	}
	var clone DynamoGraphDeploymentRequestList
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyObject is an autogenerated deepcopy function that returns the result
// of DeepCopy as a runtime.Object.
func (in *DynamoGraphDeploymentRequestList) DeepCopyObject() runtime.Object {
	clone := in.DeepCopy()
	if clone == nil {
		// Return an untyped nil so the resulting interface compares equal to nil.
		return nil
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver
// into out. in must be non-nil.
// SLA is copied by plain assignment. Each non-nil pointer field (GPU,
// DeploymentOverrides, ProfilingConfig) gets a freshly allocated target so out
// does not point at in's values; nil pointers stay nil.
func (in *DynamoGraphDeploymentRequestSpec) DeepCopyInto(out *DynamoGraphDeploymentRequestSpec) {
	*out = *in
	out.SLA = in.SLA
	if in.GPU != nil {
		out.GPU = new(GPUSpec)
		*out.GPU = *in.GPU
	}
	if in.DeploymentOverrides != nil {
		out.DeploymentOverrides = new(DeploymentOverridesSpec)
		in.DeploymentOverrides.DeepCopyInto(out.DeploymentOverrides)
	}
	if in.ProfilingConfig != nil {
		out.ProfilingConfig = new(ProfilingConfigSpec)
		in.ProfilingConfig.DeepCopyInto(out.ProfilingConfig)
	}
}
// DeepCopy is an autogenerated deepcopy function that returns a newly
// allocated DynamoGraphDeploymentRequestSpec populated via DeepCopyInto.
// A nil receiver yields nil.
func (in *DynamoGraphDeploymentRequestSpec) DeepCopy() *DynamoGraphDeploymentRequestSpec {
	if in == nil {
		return nil
	}
	var clone DynamoGraphDeploymentRequestSpec
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver
// into out. in must be non-nil.
// A non-nil Conditions slice is rebuilt element by element, GeneratedDeployment
// is copied through RawExtension's own DeepCopyInto, and Deployment is copied
// by value into a fresh allocation. Nil fields stay nil in the copy.
func (in *DynamoGraphDeploymentRequestStatus) DeepCopyInto(out *DynamoGraphDeploymentRequestStatus) {
	*out = *in
	if in.Conditions != nil {
		out.Conditions = make([]metav1.Condition, len(in.Conditions))
		for idx := range in.Conditions {
			in.Conditions[idx].DeepCopyInto(&out.Conditions[idx])
		}
	}
	if in.GeneratedDeployment != nil {
		out.GeneratedDeployment = new(runtime.RawExtension)
		in.GeneratedDeployment.DeepCopyInto(out.GeneratedDeployment)
	}
	if in.Deployment != nil {
		out.Deployment = new(DeploymentStatus)
		*out.Deployment = *in.Deployment
	}
}
// DeepCopy is an autogenerated deepcopy function that returns a newly
// allocated DynamoGraphDeploymentRequestStatus populated via DeepCopyInto.
// A nil receiver yields nil.
func (in *DynamoGraphDeploymentRequestStatus) DeepCopy() *DynamoGraphDeploymentRequestStatus {
	if in == nil {
		return nil
	}
	var clone DynamoGraphDeploymentRequestStatus
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *DynamoGraphDeploymentSpec) DeepCopyInto(out *DynamoGraphDeploymentSpec) {
*out = *in
......@@ -445,6 +626,21 @@ func (in *DynamoGraphDeploymentStatus) DeepCopy() *DynamoGraphDeploymentStatus {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The whole value is copied with a single struct assignment; no field receives
// separate handling.
func (in *GPUSpec) DeepCopyInto(out *GPUSpec) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function that returns a newly
// allocated GPUSpec populated via DeepCopyInto.
// A nil receiver yields nil.
func (in *GPUSpec) DeepCopy() *GPUSpec {
	if in == nil {
		return nil
	}
	var clone GPUSpec
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *IngressSpec) DeepCopyInto(out *IngressSpec) {
*out = *in
......@@ -555,6 +751,41 @@ func (in *PVC) DeepCopy() *PVC {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver
// into out. in must be non-nil.
// A non-nil ConfigMapRef is copied by value into a fresh allocation so out does
// not point at in's selector; a nil ConfigMapRef stays nil.
func (in *ProfilingConfigSpec) DeepCopyInto(out *ProfilingConfigSpec) {
	*out = *in
	if in.ConfigMapRef == nil {
		return
	}
	out.ConfigMapRef = new(ConfigMapKeySelector)
	*out.ConfigMapRef = *in.ConfigMapRef
}
// DeepCopy is an autogenerated deepcopy function that returns a newly
// allocated ProfilingConfigSpec populated via DeepCopyInto.
// A nil receiver yields nil.
func (in *ProfilingConfigSpec) DeepCopy() *ProfilingConfigSpec {
	if in == nil {
		return nil
	}
	var clone ProfilingConfigSpec
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
// The whole value is copied with a single struct assignment; no field receives
// separate handling.
func (in *SLASpec) DeepCopyInto(out *SLASpec) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function that returns a newly
// allocated SLASpec populated via DeepCopyInto.
// A nil receiver yields nil.
func (in *SLASpec) DeepCopy() *SLASpec {
	if in == nil {
		return nil
	}
	var clone SLASpec
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *SharedMemorySpec) DeepCopyInto(out *SharedMemorySpec) {
*out = *in
......
......@@ -140,6 +140,8 @@ func main() {
var mpiRunSecretName string
var mpiRunSecretNamespace string
var plannerClusterRoleName string
var profilerImage string
var dgdrProfilingClusterRoleName string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
......@@ -180,6 +182,10 @@ func main() {
"Namespace where the MPI SSH secret is located (required)")
flag.StringVar(&plannerClusterRoleName, "planner-cluster-role-name", "",
"Name of the ClusterRole for planner (cluster-wide mode only)")
flag.StringVar(&profilerImage, "profiler-image", "",
"Container image to use for profiling jobs (both online and offline/AIC) (for DynamoGraphDeploymentRequest)")
flag.StringVar(&dgdrProfilingClusterRoleName, "dgdr-profiling-cluster-role-name", "",
"Name of the ClusterRole for DGDR profiling jobs (cluster-wide mode only)")
opts := zap.Options{
Development: true,
}
......@@ -237,6 +243,7 @@ func main() {
},
RBAC: commonController.RBACConfig{
PlannerClusterRoleName: plannerClusterRoleName,
DGDRProfilingClusterRoleName: dgdrProfilingClusterRoleName,
},
}
......@@ -449,6 +456,17 @@ func main() {
setupLog.Error(err, "unable to create controller", "controller", "DynamoGraphDeployment")
os.Exit(1)
}
if err = (&controller.DynamoGraphDeploymentRequestReconciler{
Client: mgr.GetClient(),
Recorder: mgr.GetEventRecorderFor("dynamographdeploymentrequest"),
ProfilerImage: profilerImage,
Config: ctrlConfig,
RBACManager: rbacManager,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DynamoGraphDeploymentRequest")
os.Exit(1)
}
//+kubebuilder:scaffold:builder
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.4
helm.sh/resource-policy: keep
name: dynamographdeploymentrequests.nvidia.com
spec:
group: nvidia.com
names:
kind: DynamoGraphDeploymentRequest
listKind: DynamoGraphDeploymentRequestList
plural: dynamographdeploymentrequests
shortNames:
- dgdr
singular: dynamographdeploymentrequest
scope: Namespaced
versions:
- additionalPrinterColumns:
- jsonPath: .spec.modelName
name: Model
type: string
- jsonPath: .spec.backend
name: Backend
type: string
- jsonPath: .status.state
name: State
type: string
- jsonPath: .status.deployment.state
name: DGD-State
type: string
- jsonPath: .metadata.creationTimestamp
name: Age
type: date
name: v1alpha1
schema:
openAPIV3Schema:
description: |-
DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
It serves as the primary interface for users to request model deployments with
specific performance and resource constraints, enabling SLA-driven deployments.
Lifecycle:
1. Initial → Pending: Validates spec and prepares for profiling
2. Pending → Profiling: Creates and runs profiling job (online or AIC)
3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
5. Ready: Terminal state when DGD is operational or spec is available
6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
The spec becomes immutable once profiling starts. Users must delete and recreate
the DGDR to modify configuration after this point.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Spec defines the desired state for this deployment request.
properties:
autoApply:
default: false
description: |-
AutoApply indicates whether to automatically create a DynamoGraphDeployment
after profiling completes. If false, only the spec is generated and stored in status.
Users can then manually create a DGD using the generated spec.
type: boolean
backend:
default: trtllm
description: |-
Backend specifies the inference backend framework to use.
Supported values are: "vllm", "sglang", "trtllm".
enum:
- vllm
- sglang
- trtllm
type: string
deploymentOverrides:
description: |-
DeploymentOverrides allows customizing metadata for the auto-created DGD.
Only applicable when AutoApply is true.
properties:
annotations:
additionalProperties:
type: string
description: Annotations are additional annotations to add to the DynamoGraphDeployment metadata.
type: object
labels:
additionalProperties:
type: string
description: |-
Labels are additional labels to add to the DynamoGraphDeployment metadata.
These are merged with auto-generated labels from the profiling process.
type: object
name:
description: |-
Name is the desired name for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR name.
type: string
namespace:
description: |-
Namespace is the desired namespace for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR namespace.
type: string
type: object
gpu:
description: |-
GPU defines optional GPU type and resource specifications.
These constraints guide the profiler to find configurations within specified bounds.
properties:
maxNumGPUsPerEngine:
default: 8
description: |-
MaxNumGPUsPerEngine specifies the maximum number of GPUs per engine for profiling.
The profiler will not consider configurations with more GPUs than this value.
minimum: 1
type: integer
minNumGPUsPerEngine:
default: 1
description: |-
MinNumGPUsPerEngine specifies the minimum number of GPUs per engine for profiling.
The profiler will not consider configurations with fewer GPUs than this value.
minimum: 1
type: integer
type:
description: |-
Type specifies the GPU type to target (e.g., "h200", "h100", "a100").
If specified, profiling will focus on configurations optimized for this GPU type.
type: string
type: object
modelName:
description: |-
ModelName specifies the model to deploy (e.g., "meta/llama3-70b").
This should be a valid model identifier that the profiler can resolve.
type: string
online:
default: false
description: |-
Online indicates whether to use online profiler (true) or AI Configurator (false).
Online profiling uses real deployments for accurate measurements (2-4 hours).
Offline profiling uses AI Configurator for fast simulation-based profiling (20-30 seconds).
type: boolean
profilingConfig:
description: |-
ProfilingConfig provides custom configuration for the profiling job.
Applicable to both online and offline (AIC) profiling modes.
properties:
configMapRef:
description: |-
ConfigMapRef is a reference to a ConfigMap containing profiling configuration.
The ConfigMap should contain a key (default: "disagg.yaml") with the configuration file.
This configuration is used by both online and offline (AIC) profiling modes.
properties:
key:
default: disagg.yaml
description: Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml".
type: string
name:
description: Name of the ConfigMap containing the desired data.
type: string
required:
- name
type: object
type: object
sla:
description: |-
SLA defines the Service Level Agreement profiling targets.
The profiler uses these targets to find an optimal deployment configuration.
properties:
isl:
default: 3000
description: |-
ISL is the Input Sequence Length for profiling.
Defines the length of input sequences to use during profiling tests.
minimum: 1
type: integer
itl:
default: 10
description: |-
ITL is the target Inter-Token Latency in milliseconds.
This represents the maximum time allowed between consecutive tokens in the output.
type: integer
osl:
default: 500
description: |-
OSL is the Output Sequence Length for profiling.
Defines the expected length of output sequences to generate during profiling tests.
minimum: 1
type: integer
ttft:
default: 50
description: |-
TTFT is the target Time To First Token in milliseconds.
This represents the maximum time allowed from request submission to receiving the first token.
type: integer
type: object
required:
- modelName
- sla
type: object
status:
description: Status reflects the current observed state of this deployment request.
properties:
conditions:
description: |-
Conditions contains the latest observed conditions of the deployment request.
Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.
Conditions are merged by type on patch updates.
items:
description: Condition contains details for one aspect of the current state of this API Resource.
properties:
lastTransitionTime:
description: |-
lastTransitionTime is the last time the condition transitioned from one status to another.
This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
format: date-time
type: string
message:
description: |-
message is a human readable message indicating details about the transition.
This may be an empty string.
maxLength: 32768
type: string
observedGeneration:
description: |-
observedGeneration represents the .metadata.generation that the condition was set based upon.
For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
with respect to the current state of the instance.
format: int64
minimum: 0
type: integer
reason:
description: |-
reason contains a programmatic identifier indicating the reason for the condition's last transition.
Producers of specific condition types may define expected values and meanings for this field,
and whether the values are considered a guaranteed API.
The value should be a CamelCase string.
This field may not be empty.
maxLength: 1024
minLength: 1
pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
type: string
status:
description: status of the condition, one of True, False, Unknown.
enum:
- "True"
- "False"
- Unknown
type: string
type:
description: type of condition in CamelCase or in foo.example.com/CamelCase.
maxLength: 316
pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
type: string
required:
- lastTransitionTime
- message
- reason
- status
- type
type: object
type: array
deployment:
description: |-
Deployment tracks the auto-created DGD when AutoApply is true.
Contains name, namespace, state, and creation status of the managed DGD.
properties:
created:
description: |-
Created indicates whether the DGD has been successfully created.
Used to prevent recreation if the DGD is manually deleted by users.
type: boolean
name:
description: Name is the name of the created DynamoGraphDeployment.
type: string
namespace:
description: Namespace is the namespace of the created DynamoGraphDeployment.
type: string
state:
description: |-
State is the current state of the DynamoGraphDeployment.
This value is mirrored from the DGD's status.state field.
type: string
type: object
generatedDeployment:
description: |-
GeneratedDeployment contains the full generated DynamoGraphDeployment specification
including metadata, based on profiling results. Users can extract this to create
a DGD manually, or it's used automatically when autoApply is true.
Stored as RawExtension to preserve all fields including metadata.
type: object
x-kubernetes-embedded-resource: true
x-kubernetes-preserve-unknown-fields: true
observedGeneration:
description: |-
ObservedGeneration reflects the generation of the most recently observed spec.
Used to detect spec changes and enforce immutability after profiling starts.
format: int64
type: integer
profilingResults:
description: |-
ProfilingResults contains a reference to the ConfigMap holding profiling data.
Format: "configmap/<name>"
type: string
state:
description: |-
State is a high-level textual status of the deployment request lifecycle.
Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
Empty string ("") represents the initial state before initialization.
type: string
type: object
type: object
served: true
storage: true
subresources:
status: {}
......@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
......@@ -74,6 +73,18 @@ rules:
- patch
- update
- watch
- apiGroups:
- batch
resources:
- jobs
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- coordination.k8s.io
resources:
......@@ -160,6 +171,7 @@ rules:
- nvidia.com
resources:
- dynamocomponentdeployments
- dynamographdeploymentrequests
- dynamographdeployments
verbs:
- create
......@@ -173,6 +185,7 @@ rules:
- nvidia.com
resources:
- dynamocomponentdeployments/finalizers
- dynamographdeploymentrequests/finalizers
- dynamographdeployments/finalizers
verbs:
- update
......@@ -180,6 +193,7 @@ rules:
- nvidia.com
resources:
- dynamocomponentdeployments/status
- dynamographdeploymentrequests/status
- dynamographdeployments/status
verbs:
- get
......
......@@ -18,4 +18,5 @@ resources:
- nvidia.com_v1alpha1_dynamocomponentdeployment.yaml
- nvidia.com_v1alpha1_dynamocomponent.yaml
- nvidia.com_v1alpha1_dynamographdeployment.yaml
- nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
#+kubebuilder:scaffold:manifestskustomizesamples
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentRequest
metadata:
name: example-llm-sla
spec:
modelName: "meta/llama3-70b"
backend: trtllm # enum: [vllm, sglang, trtllm]; default is trtllm
sla: # SLA profiling targets (all fields optional with defaults)
itl: 10 # Inter-Token Latency target in milliseconds (default: 10)
ttft: 50 # Time To First Token target in milliseconds (default: 50)
isl: 3000 # Input Sequence Length (default: 3000)
osl: 500 # Output Sequence Length (default: 500)
gpu: # optional
type: h200_sxm
minNumGPUsPerEngine: 1 # default is 1
maxNumGPUsPerEngine: 8 # default is 8
online: false # true for online profiler, false for AIC profiler
# Optional: Automatically create DynamoGraphDeployment after profiling
autoApply: true # default is false
# Optional: Override metadata for auto-created DGD (only used when autoApply: true)
# deploymentOverrides:
# name: my-custom-dgd-name
# namespace: production
# labels:
# team: ml-platform
# annotations:
# description: "Auto-generated from DGDR"
# profilingConfig is currently required for both online and offline (AIC) profiling, but will be removed in the future
profilingConfig:
configMapRef:
name: my-profiling-config
key: disagg.yaml # default is "disagg.yaml"
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"bytes"
"context"
"errors"
"fmt"
"text/template"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/yaml"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
)
// Constants shared by the DynamoGraphDeploymentRequest controller.
const (
// State constants: the values written to status.state. StateEmpty is the
// initial state before the resource has been initialized.
StateEmpty = ""
StatePending = "Pending"
StateProfiling = "Profiling"
StateDeploying = "Deploying"
StateReady = "Ready"
StateDeploymentDeleted = "DeploymentDeleted"
StateFailed = "Failed"
// Condition types: the Type values of entries in status.conditions.
ConditionTypeValidation = "Validation"
ConditionTypeProfiling = "Profiling"
ConditionTypeSpecGenerated = "SpecGenerated"
ConditionTypeDeploymentReady = "DeploymentReady"
// Event reasons
EventReasonInitialized = "Initialized"
EventReasonValidationFailed = "ValidationFailed"
EventReasonProfilingJobCreated = "ProfilingJobCreated"
EventReasonProfilingJobFailed = "ProfilingJobFailed"
EventReasonAIConfiguratorFailed = "AIConfiguratorFailed"
EventReasonSpecGenerated = "SpecGenerated"
EventReasonSpecChangeRejected = "SpecChangeRejected"
EventReasonDeploymentCreated = "DeploymentCreated"
EventReasonDeploymentReady = "DeploymentReady"
EventReasonDeploymentDegraded = "DeploymentDegraded"
EventReasonDeploymentDeleted = "DeploymentDeleted"
// Label keys.
// NOTE(review): sidecarScriptTemplate hard-codes the literals
// "dgdr.nvidia.com/name" and "nvidia.com/managed-by" instead of referencing
// these constants; keep the two in sync when changing either.
LabelApp = "app"
LabelDGDR = "dgdr"
LabelDGDRName = "dgdr.nvidia.com/name"
LabelDGDRNamespace = "dgdr.nvidia.com/namespace"
LabelManagedBy = "nvidia.com/managed-by"
// Label values
LabelValueDynamoProfiler = "dynamo-profiler"
LabelValueAICProfiler = "aic-profiler"
LabelValueDynamoOperator = "dynamo-operator"
// Job naming: name prefixes, one per profiling mode (online vs. AIC).
JobNamePrefixOnline = "profile-online-"
JobNamePrefixAIC = "profile-aic-"
// Container names
ContainerNameProfiler = "profiler"
ContainerNameOutputCopier = "output-copier"
// ServiceAccount
ServiceAccountProfilingJob = "dgdr-profiling-job"
// ConfigMap naming
ConfigMapOutputPrefix = "dgdr-output-"
// Sidecar image.
// NOTE(review): the tag is the floating "latest", so the image contents are
// not pinned to a specific version.
SidecarImage = "bitnami/kubectl:latest"
// Volume names
VolumeNameProfilingConfig = "profiling-config"
VolumeNameProfilingOutput = "profiling-output"
// Volume paths and the file names expected inside them.
ProfilingOutputPath = "/data"
ProfilingOutputFile = "config_with_planner.yaml"
ProfilingConfigPath = "/config"
ProfilingConfigFile = "disagg.yaml"
// Command line arguments
ArgModel = "--model"
ArgBackend = "--backend"
ArgTTFT = "--ttft"
ArgITL = "--itl"
ArgConfig = "--config"
// Messages. Entries containing %s are format strings and must be expanded
// (e.g. with fmt.Sprintf) before use.
MessageInitialized = "DGDR initialized successfully"
MessageProfilingJobCreated = "Profiling job created"
MessageAICProfilingJobCreated = "AIC profiling job created"
MessageProfilingInProgress = "Profiling is in progress"
MessageSpecGenerated = "DynamoGraphDeployment spec generated successfully"
MessageSpecAvailable = "Generated spec is available in status.generatedDeployment"
MessageDeploymentCreated = "DynamoGraphDeployment %s created successfully"
MessageDeploymentReady = "DynamoGraphDeployment %s is ready"
MessageDeploymentDegraded = "DynamoGraphDeployment %s degraded from Ready to %s"
MessageDeploymentDeleted = "DGD %s was deleted. DGDR will not recreate it. Delete this DGDR and create a new one to redeploy."
MessageInvalidState = "Invalid state"
MessageSpecChangeRejected = "Cannot modify spec in state '%s'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead."
// NOTE(review): the next six values are CamelCase identifiers rather than
// sentences; presumably they are used as reason strings — confirm at call sites.
MessageJobCreationFailed = "JobCreationFailed"
MessageDeploymentCreationFailed = "DeploymentCreationFailed"
MessageResultsRetrievalFailed = "ResultsRetrievalFailed"
MessageGenerationFailed = "GenerationFailed"
MessageAIConfiguratorCheckFailed = "AIConfiguratorCheckFailed"
MessageProfilingCheckFailed = "ProfilingCheckFailed"
MessageConfigMapNotFound = "ConfigMap %s not found in namespace %s"
MessageConfigMapKeyNotFound = "key %s not found in ConfigMap %s"
// Validation messages. ValidationErrorInvalidBackend is a format string
// taking the rejected backend value.
ValidationErrorModelNameRequired = "modelName is required"
ValidationErrorITLPositive = "sla.itl must be positive"
ValidationErrorTTFTPositive = "sla.ttft must be positive"
ValidationErrorInvalidBackend = "invalid backend: %s (must be vllm, sglang, or trtllm)"
// Valid backend values
BackendVLLM = "vllm"
BackendSGLang = "sglang"
BackendTRTLLM = "trtllm"
)
// sidecarScriptTemplate is the template source for the shell script executed by
// the output copier sidecar container of the profiling Job. It is parsed and
// rendered in createProfilingJob with the fields OutputPath, OutputFile,
// ConfigMapName, Namespace and DGDRName.
//
// The rendered script:
//  1. polls every 2 seconds until {{.OutputPath}}/{{.OutputFile}} exists,
//  2. writes a ConfigMap manifest to /tmp/cm.yaml that embeds that file under
//     the key {{.OutputFile}} and labels it with the DGDR name,
//  3. appends the base64 encoding of every <dir>/interpolation/raw_data.npz
//     found under {{.OutputPath}} as the key "<dir>_raw_data.npz", and
//  4. applies the manifest with kubectl.
//
// NOTE(review): the wait loop has no timeout, so the sidecar presumably keeps
// polling if the profiler never writes the output file - confirm how a failed
// profiler run is expected to terminate the pod.
const sidecarScriptTemplate = `
set -e
set -o pipefail
while [ ! -f {{.OutputPath}}/{{.OutputFile}} ]; do sleep 2; done
# Start building ConfigMap YAML with DGD spec
cat >/tmp/cm.yaml <<EOF
apiVersion: v1
kind: ConfigMap
metadata:
name: {{.ConfigMapName}}
namespace: {{.Namespace}}
labels:
dgdr.nvidia.com/name: {{.DGDRName}}
nvidia.com/managed-by: dynamo-operator
data:
{{.OutputFile}}: |
EOF
sed 's/^/ /' {{.OutputPath}}/{{.OutputFile}} >> /tmp/cm.yaml
# Add profiling data directories to ConfigMap for long-term storage
# Find all interpolation directories and add their raw_data.npz files
for dir in {{.OutputPath}}/*/interpolation; do
if [ -d "$dir" ]; then
dirname=$(basename $(dirname "$dir"))
if [ -f "$dir/raw_data.npz" ]; then
echo " ${dirname}_raw_data.npz: |" >> /tmp/cm.yaml
base64 "$dir/raw_data.npz" | sed 's/^/ /' >> /tmp/cm.yaml
fi
fi
done
kubectl apply -f /tmp/cm.yaml
echo "Saved profiling output to ConfigMap {{.ConfigMapName}}"
`
// DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object.
// It embeds client.Client, so the Get/Create/Status calls used by the handlers
// are invoked directly on the reconciler.
type DynamoGraphDeploymentRequestReconciler struct {
client.Client
// Recorder emits Kubernetes events on the DGDR being reconciled.
Recorder record.EventRecorder
// Config is the shared operator configuration. createProfilingJob reads
// RestrictedNamespace and RBAC.DGDRProfilingClusterRoleName from it.
Config commonController.Config
// ProfilerImage is the container image to use for profiling jobs (both online and offline/AIC).
// createProfilingJob returns an error when it is empty.
ProfilerImage string
// RBACManager handles RBAC setup for profiling jobs
RBACManager RBACManager
}
// RBACManager is the interface the reconciler uses to manage RBAC resources.
//
// createProfilingJob calls EnsureServiceAccountWithRBAC with the DGDR's
// namespace, the profiling ServiceAccount name and the configured cluster role
// name before creating the profiling Job; a non-nil error aborts job creation.
type RBACManager interface {
EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
}
// GetRecorder implements the commonController.Reconciler interface.
// It returns the reconciler's Recorder field unchanged.
func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecorder {
return r.Recorder
}
// FinalizeResource implements the commonController.Finalizer interface.
//
// It runs cleanupProfilingResources for the DGDR and returns that call's error
// unchanged, so a failed cleanup is reported back to the finalizer handling.
func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
	l := log.FromContext(ctx)
	l.Info("Finalizing DGDR", "name", dgdr.Name)

	cleanupErr := r.cleanupProfilingResources(ctx, dgdr)
	if cleanupErr == nil {
		l.Info("DGDR finalized successfully", "name", dgdr.Name)
		return nil
	}

	l.Error(cleanupErr, "Failed to cleanup profiling resources")
	return cleanupErr
}
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentrequests,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentrequests/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentrequests/finalizers,verbs=update
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
// Reconcile runs one pass of the DynamoGraphDeploymentRequest state machine.
//
// Flow:
//  1. Load the DGDR; a NotFound result is treated as "already deleted" and
//     ends the pass without error.
//  2. Delegate finalizer handling to commonController.HandleFinalizer; stop if
//     it reports an error or that the resource was finalized.
//  3. Enforce spec immutability: when a non-zero status.observedGeneration
//     differs from metadata.generation and the DGDR is in Profiling, Deploying,
//     Ready or DeploymentDeleted, emit a warning event and return before the
//     state machine runs. observedGeneration is left as is, so later passes
//     keep rejecting the change.
//  4. Dispatch to the handler for the current status.state; an unrecognised
//     state moves the DGDR to Failed.
func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)
	logger.Info("Reconciling DynamoGraphDeploymentRequest", "name", req.Name, "namespace", req.Namespace)

	// Load the DGDR named by the request.
	dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}
	getErr := r.Get(ctx, req.NamespacedName, dgdr)
	if apierrors.IsNotFound(getErr) {
		logger.Info("DGDR resource not found, ignoring since object must be deleted")
		return ctrl.Result{}, nil
	}
	if getErr != nil {
		logger.Error(getErr, "Failed to get DGDR")
		return ctrl.Result{}, getErr
	}

	// Shared finalizer handling. Both "error" and "finalized" end this pass;
	// err is nil in the finalized case.
	finalized, err := commonController.HandleFinalizer(ctx, dgdr, r.Client, r)
	if err != nil || finalized {
		return ctrl.Result{}, err
	}

	// Immutability enforcement: reject spec edits once processing has started.
	observed := dgdr.Status.ObservedGeneration
	if observed > 0 && observed != dgdr.Generation {
		switch dgdr.Status.State {
		case StateProfiling, StateDeploying, StateReady, StateDeploymentDeleted:
			logger.Info("Spec change detected in immutable state",
				"state", dgdr.Status.State,
				"observedGeneration", observed,
				"currentGeneration", dgdr.Generation)
			r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonSpecChangeRejected,
				fmt.Sprintf(MessageSpecChangeRejected, dgdr.Status.State))
			// The stale observedGeneration is kept on purpose and no state
			// transition happens.
			return ctrl.Result{}, nil
		}
	}

	// State machine dispatch.
	switch dgdr.Status.State {
	case StateEmpty:
		return r.handleInitialState(ctx, dgdr)
	case StatePending:
		return r.handlePendingState(ctx, dgdr)
	case StateProfiling:
		return r.handleProfilingState(ctx, dgdr)
	case StateDeploying:
		return r.handleDeployingState(ctx, dgdr)
	case StateReady:
		return r.handleReadyState(ctx, dgdr)
	case StateDeploymentDeleted:
		return r.handleDeploymentDeletedState(ctx, dgdr)
	case StateFailed:
		return r.handleFailedState(ctx, dgdr)
	}

	logger.Info("Unknown state", "state", dgdr.Status.State)
	return r.updateStateAndRequeue(ctx, dgdr, StateFailed, MessageInvalidState)
}
// handleInitialState processes a DGDR whose status.state is still empty.
//
// The spec is validated first. On failure a warning event is recorded and the
// DGDR moves to Failed with a Validation condition carrying the error text.
// On success status.observedGeneration is set to the current generation, an
// Initialized event is recorded and the DGDR moves to Pending.
func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
	log.FromContext(ctx).Info("Handling initial state", "name", dgdr.Name)

	if validationErr := r.validateSpec(ctx, dgdr); validationErr != nil {
		detail := validationErr.Error()
		r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonValidationFailed, detail)
		return r.updateStateWithCondition(ctx, dgdr, StateFailed, ConditionTypeValidation, metav1.ConditionFalse, EventReasonValidationFailed, detail)
	}

	// Remember which spec generation is being processed; Reconcile compares
	// against this value to detect later spec edits.
	dgdr.Status.ObservedGeneration = dgdr.Generation

	r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonInitialized, MessageInitialized)
	return r.updateStateAndRequeue(ctx, dgdr, StatePending, MessageInitialized)
}
// handlePendingState starts profiling for a DGDR in the Pending state.
//
// It creates the profiling Job (online or AIC, depending on spec.online). If
// that fails, the DGDR moves to Failed with a Profiling condition holding the
// error. Otherwise a "job created" event is recorded and the DGDR moves to
// Profiling with the Profiling condition set to False/ProfilingRunning.
func (r *DynamoGraphDeploymentRequestReconciler) handlePendingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
	log.FromContext(ctx).Info("Handling pending state", "name", dgdr.Name)

	if jobErr := r.createProfilingJob(ctx, dgdr); jobErr != nil {
		detail := jobErr.Error()
		r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonProfilingJobFailed, detail)
		return r.updateStateWithCondition(ctx, dgdr, StateFailed, ConditionTypeProfiling, metav1.ConditionFalse, MessageJobCreationFailed, detail)
	}

	// The event message depends on the profiling mode.
	createdMsg := MessageAICProfilingJobCreated
	if dgdr.Spec.Online {
		createdMsg = MessageProfilingJobCreated
	}
	r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonProfilingJobCreated, createdMsg)

	// Profiling has started but is not complete yet, hence ConditionFalse.
	return r.updateStateWithCondition(ctx, dgdr, StateProfiling, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingRunning", MessageProfilingInProgress)
}
// handleProfilingState watches the profiling Job and, once it has completed,
// generates the DynamoGraphDeployment spec from its output.
//
// Outcomes:
//   - checkProfilingJobStatus returns an error: the DGDR moves to Failed.
//     Note that this covers every error that helper returns, including a
//     failed lookup of the Job, not only a Job reporting the Failed condition.
//   - Job not finished: return without requeueing; the controller's Job watch
//     is relied on to trigger the next pass.
//   - Job complete: mark the Profiling condition True, generate the spec, then
//     move to Deploying (spec.autoApply) or Ready.
func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
	logger := log.FromContext(ctx)
	logger.Info("Handling profiling state", "name", dgdr.Name)

	// Both online and offline/AIC profiling run as Jobs.
	done, statusErr := r.checkProfilingJobStatus(ctx, dgdr)
	if statusErr != nil {
		detail := statusErr.Error()
		r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageProfilingCheckFailed, detail)
		return r.updateStateWithCondition(ctx, dgdr, StateFailed, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingFailed", detail)
	}
	if !done {
		logger.Info("Profiling job still running", "name", dgdr.Name)
		// No requeue: a Job update event will bring us back here.
		return ctrl.Result{}, nil
	}

	// Profiling finished successfully.
	profilingDone := metav1.Condition{
		Type:               ConditionTypeProfiling,
		Status:             metav1.ConditionTrue,
		ObservedGeneration: dgdr.Generation,
		Reason:             "ProfilingCompleted",
		Message:            "Profiling job completed successfully",
	}
	meta.SetStatusCondition(&dgdr.Status.Conditions, profilingDone)

	// Turn the profiling output into a generated DGD spec.
	if genErr := r.generateDGDSpec(ctx, dgdr); genErr != nil {
		detail := genErr.Error()
		r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageGenerationFailed, detail)
		return r.updateStateWithCondition(ctx, dgdr, StateFailed, ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, detail)
	}
	r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonSpecGenerated, MessageSpecGenerated)

	// With autoApply the controller goes on to deploy; otherwise the generated
	// spec is simply made available and the DGDR is Ready.
	nextState, conditionMsg := StateReady, MessageSpecAvailable
	if dgdr.Spec.AutoApply {
		logger.Info("AutoApply enabled, transitioning to Deploying state")
		nextState, conditionMsg = StateDeploying, MessageSpecGenerated
	}
	return r.updateStateWithCondition(ctx, dgdr, nextState, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, conditionMsg)
}
// handleReadyState handles a DGDR in the Ready state.
//
// Without spec.autoApply there is nothing to monitor. With autoApply, the DGD
// referenced by status.deployment is looked up:
//   - not found: handled as a user deletion via handleDGDDeleted;
//   - any other lookup error: returned to the caller;
//   - found: its state is mirrored into status.deployment.state, and if it is
//     no longer "Ready" the DGDR goes back to Deploying with a warning event
//     and a DeploymentReady=False condition.
//
// The status is written back whenever the DGD was found.
//
// NOTE(review): status.deployment is dereferenced without a nil check; this
// assumes every autoApply DGDR reaches Ready through the Deploying state,
// which populates it - confirm.
func (r *DynamoGraphDeploymentRequestReconciler) handleReadyState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
	logger := log.FromContext(ctx)
	logger.Info("DGDR is ready", "name", dgdr.Name)

	if !dgdr.Spec.AutoApply {
		return ctrl.Result{}, nil
	}

	// Look up the DGD this DGDR created.
	dgdKey := types.NamespacedName{
		Namespace: dgdr.Status.Deployment.Namespace,
		Name:      dgdr.Status.Deployment.Name,
	}
	dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
	switch err := r.Get(ctx, dgdKey, dgd); {
	case apierrors.IsNotFound(err):
		// The DGD was deleted by the user.
		return r.handleDGDDeleted(ctx, dgdr)
	case err != nil:
		return ctrl.Result{}, err
	}

	// Mirror the DGD state into the DGDR status.
	observedState := dgd.Status.State
	dgdr.Status.Deployment.State = observedState

	if observedState != "Ready" {
		logger.Info("DGD degraded, transitioning back to Deploying",
			"dgdState", observedState)
		dgdr.Status.State = StateDeploying
		r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDegraded,
			fmt.Sprintf(MessageDeploymentDegraded, dgd.Name, observedState))
		degraded := metav1.Condition{
			Type:    ConditionTypeDeploymentReady,
			Status:  metav1.ConditionFalse,
			Reason:  EventReasonDeploymentDegraded,
			Message: fmt.Sprintf("Deployment degraded to %s", observedState),
		}
		meta.SetStatusCondition(&dgdr.Status.Conditions, degraded)
	}

	return ctrl.Result{}, r.Status().Update(ctx, dgdr)
}
// handleDeployingState creates the DynamoGraphDeployment for an autoApply DGDR
// and then tracks it until it reports "Ready".
//
//   - spec.autoApply false: the DGDR should not be in this state, so it is
//     moved straight to Ready.
//   - status.deployment missing or not marked Created: createDGD is called.
//   - otherwise the DGD is fetched: NotFound is handled as a user deletion,
//     other errors are returned, and on success the DGD state is mirrored into
//     status.deployment.state. When that state is "Ready" the DGDR moves to
//     Ready with an event and a DeploymentReady=True condition.
func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
	logger := log.FromContext(ctx)
	logger.Info("Handling deploying state", "name", dgdr.Name)

	if !dgdr.Spec.AutoApply {
		// Deploying is only meaningful with autoApply.
		logger.Info("AutoApply not enabled, transitioning to Ready")
		dgdr.Status.State = StateReady
		return ctrl.Result{}, r.Status().Update(ctx, dgdr)
	}

	// Nothing created yet: create the DGD now.
	deployment := dgdr.Status.Deployment
	if deployment == nil || !deployment.Created {
		return r.createDGD(ctx, dgdr)
	}

	// The DGD exists (or existed): check on it.
	dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
	lookupErr := r.Get(ctx, types.NamespacedName{
		Name:      deployment.Name,
		Namespace: deployment.Namespace,
	}, dgd)
	if lookupErr != nil {
		if apierrors.IsNotFound(lookupErr) {
			// The DGD was deleted by the user.
			return r.handleDGDDeleted(ctx, dgdr)
		}
		return ctrl.Result{}, lookupErr
	}

	// deployment aliases dgdr.Status.Deployment, so this updates the status.
	deployment.State = dgd.Status.State

	if dgd.Status.State == "Ready" {
		logger.Info("DGD is Ready, transitioning to Ready state")
		dgdr.Status.State = StateReady
		readyMsg := fmt.Sprintf(MessageDeploymentReady, dgd.Name)
		r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonDeploymentReady, readyMsg)
		meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
			Type:    ConditionTypeDeploymentReady,
			Status:  metav1.ConditionTrue,
			Reason:  EventReasonDeploymentReady,
			Message: readyMsg,
		})
	}

	return ctrl.Result{}, r.Status().Update(ctx, dgdr)
}
// handleDeploymentDeletedState handles the terminal DeploymentDeleted state,
// which is entered when the auto-created DGD has been deleted.
// It ignores both arguments and always returns an empty result with no error,
// so the controller takes no further action for the DGDR.
func (r *DynamoGraphDeploymentRequestReconciler) handleDeploymentDeletedState(_ context.Context, _ *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
// Terminal state - nothing to do
// User must delete this DGDR and create a new one to redeploy
return ctrl.Result{}, nil
}
// handleDGDDeleted records that the auto-created DGD no longer exists.
//
// It moves the DGDR to DeploymentDeleted, marks status.deployment.state as
// "Deleted", emits a warning event naming the DGD, sets DeploymentReady=False
// and writes the status back. Callers must ensure status.deployment is set.
func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
	log.FromContext(ctx).Info("DGD was deleted by user, transitioning to DeploymentDeleted state")

	dgdr.Status.State = StateDeploymentDeleted
	deployment := dgdr.Status.Deployment
	deployment.State = "Deleted"

	eventMsg := fmt.Sprintf(MessageDeploymentDeleted, deployment.Name)
	r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDeleted, eventMsg)

	deleted := metav1.Condition{
		Type:    ConditionTypeDeploymentReady,
		Status:  metav1.ConditionFalse,
		Reason:  EventReasonDeploymentDeleted,
		Message: "Deployment was deleted by user. Create a new DGDR to redeploy.",
	}
	meta.SetStatusCondition(&dgdr.Status.Conditions, deleted)

	return ctrl.Result{}, r.Status().Update(ctx, dgdr)
}
// createDGD creates a DynamoGraphDeployment from status.generatedDeployment.
//
// The generated DGD is taken from the RawExtension's Object when it is already
// decoded, otherwise unmarshalled from Raw. Name and namespace default to the
// generated DGD's name and the DGDR's namespace and can be replaced through
// spec.deploymentOverrides.
//
// Label precedence (lowest to highest):
//  1. labels from the generated DGD,
//  2. labels from spec.deploymentOverrides,
//  3. the operator's tracking labels (LabelDGDRName, LabelDGDRNamespace,
//     LabelManagedBy).
//
// The tracking labels are applied last because they are the only link between
// a DGD and its DGDR (no owner reference is set); a user-supplied override must
// not be able to clobber them. Annotations are the generated DGD's annotations
// overlaid with the override annotations.
//
// If a DGD with the target name already exists, it is adopted: the status is
// pointed at it and no event is recorded. Any other create error is reported
// as a warning event and returned. On success status.deployment is populated,
// an event is recorded and DeploymentReady is set to False until the DGD
// becomes Ready.
func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	// Extract the DGD from the RawExtension.
	generated := dgdr.Status.GeneratedDeployment
	if generated == nil {
		return ctrl.Result{}, fmt.Errorf("generatedDeployment is not set")
	}
	generatedDGD := &nvidiacomv1alpha1.DynamoGraphDeployment{}
	switch {
	case generated.Object != nil:
		// Already decoded in memory.
		typed, ok := generated.Object.(*nvidiacomv1alpha1.DynamoGraphDeployment)
		if !ok {
			return ctrl.Result{}, fmt.Errorf("generatedDeployment.Object is not a DynamoGraphDeployment")
		}
		generatedDGD = typed
	case generated.Raw != nil:
		// Serialized bytes as read back from the API server.
		if err := yaml.Unmarshal(generated.Raw, generatedDGD); err != nil {
			return ctrl.Result{}, fmt.Errorf("failed to unmarshal generated deployment: %w", err)
		}
	default:
		return ctrl.Result{}, fmt.Errorf("generatedDeployment has neither Object nor Raw set")
	}

	// Determine the DGD name and namespace.
	overrides := dgdr.Spec.DeploymentOverrides
	dgdName, dgdNamespace := generatedDGD.Name, dgdr.Namespace
	if overrides != nil {
		if overrides.Name != "" {
			dgdName = overrides.Name
		}
		if overrides.Namespace != "" {
			dgdNamespace = overrides.Namespace
		}
	}

	// Labels: generated < overrides < managed. Ranging over a nil map is a
	// no-op, so no nil guards are needed on the source maps.
	labels := make(map[string]string)
	for k, v := range generatedDGD.Labels {
		labels[k] = v
	}
	if overrides != nil {
		for k, v := range overrides.Labels {
			labels[k] = v
		}
	}
	labels[LabelDGDRName] = dgdr.Name
	labels[LabelDGDRNamespace] = dgdr.Namespace
	labels[LabelManagedBy] = LabelValueDynamoOperator

	// Annotations: generated < overrides.
	annotations := make(map[string]string)
	for k, v := range generatedDGD.Annotations {
		annotations[k] = v
	}
	if overrides != nil {
		for k, v := range overrides.Annotations {
			annotations[k] = v
		}
	}

	dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:        dgdName,
			Namespace:   dgdNamespace,
			Labels:      labels,
			Annotations: annotations,
		},
		Spec: generatedDGD.Spec,
	}

	// Status recorded both when the DGD is created and when it already exists.
	pendingStatus := &nvidiacomv1alpha1.DeploymentStatus{
		Name:      dgdName,
		Namespace: dgdNamespace,
		State:     "Pending",
		Created:   true,
	}

	// Note: no owner reference is set on the DGD. If the DGDR is deleted, the
	// DGD may be serving traffic and should persist independently; the
	// relationship is tracked through labels (LabelDGDRName) instead.
	logger.Info("Creating DynamoGraphDeployment", "name", dgdName, "namespace", dgdNamespace)
	if err := r.Create(ctx, dgd); err != nil {
		if apierrors.IsAlreadyExists(err) {
			// The DGD already exists: just point the status at it.
			logger.Info("DGD already exists, updating status")
			dgdr.Status.Deployment = pendingStatus
			return ctrl.Result{}, r.Status().Update(ctx, dgdr)
		}
		r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageDeploymentCreationFailed, err.Error())
		return ctrl.Result{}, err
	}

	dgdr.Status.Deployment = pendingStatus
	r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonDeploymentCreated,
		fmt.Sprintf(MessageDeploymentCreated, dgdName))
	meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
		Type:    ConditionTypeDeploymentReady,
		Status:  metav1.ConditionFalse,
		Reason:  EventReasonDeploymentCreated,
		Message: fmt.Sprintf("DGD %s created, waiting for Ready", dgdName),
	})
	logger.Info("DynamoGraphDeployment created successfully", "name", dgdName)

	return ctrl.Result{}, r.Status().Update(ctx, dgdr)
}
// handleFailedState handles a DGDR in the Failed state.
//
// It attempts to clean up profiling resources. A cleanup error is only logged,
// never returned, so the DGDR is not requeued; no retry is performed.
func (r *DynamoGraphDeploymentRequestReconciler) handleFailedState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
	l := log.FromContext(ctx)
	l.Info("DGDR is in failed state", "name", dgdr.Name)

	// Best-effort cleanup: failures are logged and otherwise ignored.
	cleanupErr := r.cleanupProfilingResources(ctx, dgdr)
	if cleanupErr != nil {
		l.Error(cleanupErr, "Failed to cleanup profiling resources")
	}

	// Retry logic could be added here if desired.
	return ctrl.Result{}, nil
}
// getProfilingJobName returns the profiling Job name for a DGDR: the DGDR name
// prefixed with JobNamePrefixOnline when spec.online is set, and with
// JobNamePrefixAIC otherwise.
func getProfilingJobName(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) string {
	prefix := JobNamePrefixAIC
	if dgdr.Spec.Online {
		prefix = JobNamePrefixOnline
	}
	return prefix + dgdr.Name
}
// getOutputConfigMapName returns the name of the ConfigMap that holds the
// profiling output for a DGDR: ConfigMapOutputPrefix followed by the DGDR name.
func getOutputConfigMapName(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) string {
	return ConfigMapOutputPrefix + dgdr.Name
}
// validateSpec validates the DGDR spec and returns the first problem found.
//
// Checks, in order:
//  1. spec.modelName is non-empty;
//  2. spec.sla.itl is positive;
//  3. spec.sla.ttft is positive;
//  4. spec.backend is either empty or one of vllm, sglang, trtllm;
//  5. when spec.profilingConfig.configMapRef is set, the referenced ConfigMap
//     exists in the DGDR's namespace and contains the referenced key
//     ("disagg.yaml" when no key is given).
//
// A lookup error other than NotFound is returned unchanged.
func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
	spec := &dgdr.Spec

	// Required scalar fields.
	switch {
	case spec.ModelName == "":
		return errors.New(ValidationErrorModelNameRequired)
	case spec.SLA.ITL <= 0:
		return errors.New(ValidationErrorITLPositive)
	case spec.SLA.TTFT <= 0:
		return errors.New(ValidationErrorTTFTPositive)
	}

	// Backend: an empty value is accepted, anything else must be known.
	switch spec.Backend {
	case "", BackendVLLM, BackendSGLang, BackendTRTLLM:
	default:
		return fmt.Errorf(ValidationErrorInvalidBackend, spec.Backend)
	}

	// The ConfigMap check applies to both online and offline/AIC profiling,
	// but only when a reference was supplied.
	if spec.ProfilingConfig == nil || spec.ProfilingConfig.ConfigMapRef == nil {
		return nil
	}
	ref := spec.ProfilingConfig.ConfigMapRef

	cm := &corev1.ConfigMap{}
	lookupErr := r.Get(ctx, types.NamespacedName{Name: ref.Name, Namespace: dgdr.Namespace}, cm)
	if apierrors.IsNotFound(lookupErr) {
		return fmt.Errorf(MessageConfigMapNotFound, ref.Name, dgdr.Namespace)
	}
	if lookupErr != nil {
		return lookupErr
	}

	// The key must be present; fall back to the default file name.
	wantKey := ref.Key
	if wantKey == "" {
		wantKey = "disagg.yaml"
	}
	if _, found := cm.Data[wantKey]; !found {
		return fmt.Errorf(MessageConfigMapKeyNotFound, wantKey, cm.Name)
	}
	return nil
}
// createProfilingJob creates or updates the Kubernetes Job that profiles the
// model for a DGDR, delegating the create/update to commonController.SyncResource.
//
// Steps:
//  1. When r.Config.RestrictedNamespace is empty, ensure the profiling
//     ServiceAccount and its RBAC exist in the DGDR's namespace.
//  2. Require r.ProfilerImage to be set; the same image is used for online
//     and offline (AIC) profiling.
//  3. Build a Job whose pod runs two containers:
//     - the profiler (python -m benchmarks.profiler.profile_sla) with SLA,
//     GPU-range and, in offline mode, AI Configurator arguments;
//     - the output copier sidecar running the rendered sidecarScriptTemplate.
//     Profiling output is written to the "dynamo-pvc" claim; an optional
//     profiling ConfigMap is mounted read-only and passed via --config.
//
// It returns an error when RBAC setup fails, the image is not configured, the
// sidecar script cannot be rendered, or SyncResource fails.
func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
	logger := log.FromContext(ctx)

	// In cluster-wide mode, make sure the profiling job RBAC exists.
	if r.Config.RestrictedNamespace == "" {
		rbacErr := r.RBACManager.EnsureServiceAccountWithRBAC(
			ctx,
			dgdr.Namespace,
			ServiceAccountProfilingJob,
			r.Config.RBAC.DGDRProfilingClusterRoleName,
		)
		if rbacErr != nil {
			logger.Error(rbacErr, "Failed to ensure profiling job RBAC")
			return fmt.Errorf("failed to ensure profiling job RBAC: %w", rbacErr)
		}
	}

	// One image serves both online and offline (AIC) profiling.
	image := r.ProfilerImage
	if image == "" {
		return fmt.Errorf("profiler image not configured: the operator's profilerImage must be set in the Helm chart values (dynamo-operator.dynamo.dgdr.profilerImage). The image must contain the ai-dynamo profiler (python -m benchmarks.profiler.profile_sla entrypoint). For development, build from the ai-dynamo repository Dockerfile and push to your registry. A public image will be available in release 0.6.1")
	}
	logger.Info("Using profiler image", "image", image, "online", dgdr.Spec.Online)

	// The app label distinguishes the two profiling modes.
	appLabel := LabelValueAICProfiler
	if dgdr.Spec.Online {
		appLabel = LabelValueDynamoProfiler
	}

	// SyncResource creates or updates the Job produced by this generator.
	modified, job, err := commonController.SyncResource(ctx, r, dgdr, func(ctx context.Context) (*batchv1.Job, bool, error) {
		// Render the sidecar script first so template problems surface early.
		scriptTmpl, err := template.New("sidecar").Parse(sidecarScriptTemplate)
		if err != nil {
			return nil, false, fmt.Errorf("failed to parse sidecar script template: %w", err)
		}
		var script bytes.Buffer
		err = scriptTmpl.Execute(&script, map[string]string{
			"OutputPath":    ProfilingOutputPath,
			"OutputFile":    ProfilingOutputFile,
			"ConfigMapName": getOutputConfigMapName(dgdr),
			"Namespace":     dgdr.Namespace,
			"DGDRName":      dgdr.Name,
		})
		if err != nil {
			return nil, false, fmt.Errorf("failed to execute sidecar script template: %w", err)
		}

		// GPU search range: defaults of 1..8, overridden by positive spec values.
		minGPUs, maxGPUs := 1, 8
		if gpu := dgdr.Spec.GPU; gpu != nil {
			if gpu.MinNumGPUsPerEngine > 0 {
				minGPUs = gpu.MinNumGPUsPerEngine
			}
			if gpu.MaxNumGPUsPerEngine > 0 {
				maxGPUs = gpu.MaxNumGPUsPerEngine
			}
		}

		// Arguments shared by online and offline profiling.
		args := []string{
			"--namespace", dgdr.Namespace,
			"--backend", dgdr.Spec.Backend,
			"--ttft", fmt.Sprintf("%d", dgdr.Spec.SLA.TTFT),
			"--itl", fmt.Sprintf("%d", dgdr.Spec.SLA.ITL),
			"--isl", fmt.Sprintf("%d", dgdr.Spec.SLA.ISL),
			"--osl", fmt.Sprintf("%d", dgdr.Spec.SLA.OSL),
			"--output-dir", ProfilingOutputPath,
			"--min-num-gpus-per-engine", fmt.Sprintf("%d", minGPUs),
			"--max-num-gpus-per-engine", fmt.Sprintf("%d", maxGPUs),
		}

		// Offline (AIC) profiling adds the AI Configurator arguments.
		if !dgdr.Spec.Online {
			args = append(args,
				"--use-ai-configurator",
				"--aic-model-name", dgdr.Spec.ModelName,
				"--aic-backend-version", "0.20.0", // TODO: don't hardcode this
			)
			if dgdr.Spec.GPU != nil && dgdr.Spec.GPU.Type != "" {
				args = append(args, "--aic-system", dgdr.Spec.GPU.Type)
			}
		}

		// The output volume is always present. It is backed by dynamo-pvc so
		// the data persists for the Planner.
		mounts := []corev1.VolumeMount{{
			Name:      VolumeNameProfilingOutput,
			MountPath: ProfilingOutputPath,
		}}
		volumes := []corev1.Volume{{
			Name: VolumeNameProfilingOutput,
			VolumeSource: corev1.VolumeSource{
				PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
					ClaimName: "dynamo-pvc",
				},
			},
		}}

		// Optional profiling config (both modes): pass --config, mount the
		// ConfigMap read-only and project the chosen key to a fixed file name.
		if dgdr.Spec.ProfilingConfig != nil && dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
			ref := dgdr.Spec.ProfilingConfig.ConfigMapRef

			args = append(args, "--config", fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile))
			mounts = append(mounts, corev1.VolumeMount{
				Name:      VolumeNameProfilingConfig,
				MountPath: ProfilingConfigPath,
				ReadOnly:  true,
			})

			configKey := ref.Key
			if configKey == "" {
				configKey = ProfilingConfigFile
			}
			volumes = append(volumes, corev1.Volume{
				Name: VolumeNameProfilingConfig,
				VolumeSource: corev1.VolumeSource{
					ConfigMap: &corev1.ConfigMapVolumeSource{
						LocalObjectReference: corev1.LocalObjectReference{
							Name: ref.Name,
						},
						Items: []corev1.KeyToPath{{
							Key:  configKey,
							Path: ProfilingConfigFile,
						}},
					},
				},
			})
		}

		// Profiler container.
		profiler := corev1.Container{
			Name:    ContainerNameProfiler,
			Image:   image,
			Command: []string{"python", "-m", "benchmarks.profiler.profile_sla"},
			Args:    args,
			Resources: corev1.ResourceRequirements{
				Requests: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("16"),
					corev1.ResourceMemory: resource.MustParse("10Gi"),
				},
			},
			Env: []corev1.EnvVar{
				{
					Name: "HUGGING_FACE_HUB_TOKEN",
					ValueFrom: &corev1.EnvVarSource{
						SecretKeyRef: &corev1.SecretKeySelector{
							LocalObjectReference: corev1.LocalObjectReference{
								Name: "hf-token-secret",
							},
							Key: "HF_TOKEN",
						},
					},
				},
				{
					Name:  "NATS_SERVER",
					Value: fmt.Sprintf("nats://%s-nats:4222", dgdr.Namespace),
				},
				{
					Name:  "ETCD_ENDPOINTS",
					Value: fmt.Sprintf("%s-etcd:2379", dgdr.Namespace),
				},
			},
			VolumeMounts: mounts,
		}

		// Sidecar that copies the profiling output into a ConfigMap.
		outputCopier := corev1.Container{
			Name:    ContainerNameOutputCopier,
			Image:   SidecarImage,
			Command: []string{"/bin/sh", "-c"},
			Args:    []string{script.String()},
			VolumeMounts: []corev1.VolumeMount{{
				Name:      VolumeNameProfilingOutput,
				MountPath: ProfilingOutputPath,
				ReadOnly:  true,
			}},
		}

		// Bound the number of retries so a broken job cannot loop forever.
		retries := int32(3)

		return &batchv1.Job{
			ObjectMeta: metav1.ObjectMeta{
				Name:      getProfilingJobName(dgdr),
				Namespace: dgdr.Namespace,
				Labels: map[string]string{
					LabelApp:       appLabel,
					LabelDGDR:      dgdr.Name,
					LabelManagedBy: LabelValueDynamoOperator,
				},
			},
			Spec: batchv1.JobSpec{
				BackoffLimit: &retries,
				Template: corev1.PodTemplateSpec{
					Spec: corev1.PodSpec{
						ServiceAccountName: ServiceAccountProfilingJob,
						RestartPolicy:      corev1.RestartPolicyNever,
						Containers:         []corev1.Container{profiler, outputCopier},
						Volumes:            volumes,
						ImagePullSecrets: []corev1.LocalObjectReference{
							{Name: "nvcr-imagepullsecret"},
						},
					},
				},
			},
		}, false, nil
	})
	if err != nil {
		return err
	}
	if !modified {
		return nil
	}

	// Only log when SyncResource reports that it changed something.
	logMsg := "Offline (AIC) profiling job created/updated"
	if dgdr.Spec.Online {
		logMsg = "Online profiling job created/updated"
	}
	logger.Info(logMsg, "job", job.Name)
	return nil
}
// checkProfilingJobStatus reports whether the DGDR's profiling Job has finished.
//
// Returns:
//   - (true, nil) when the Job has a Complete condition with status True;
//   - (false, err) when the Job has a Failed condition with status True (the
//     error carries the condition message) or when the Job cannot be fetched;
//   - (false, nil) when neither condition is set yet.
//
// Conditions are examined in the order the Job reports them; the first
// matching one decides the result.
func (r *DynamoGraphDeploymentRequestReconciler) checkProfilingJobStatus(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (bool, error) {
	logger := log.FromContext(ctx)
	jobName := getProfilingJobName(dgdr)

	job := &batchv1.Job{}
	key := types.NamespacedName{Name: jobName, Namespace: dgdr.Namespace}
	if getErr := r.Get(ctx, key, job); getErr != nil {
		return false, getErr
	}

	for _, cond := range job.Status.Conditions {
		// Only conditions that currently hold are relevant.
		if cond.Status != corev1.ConditionTrue {
			continue
		}
		switch cond.Type {
		case batchv1.JobComplete:
			logger.Info("Profiling job completed", "job", jobName)
			return true, nil
		case batchv1.JobFailed:
			return false, fmt.Errorf("profiling job failed: %s", cond.Message)
		}
	}

	// Still running (or not started).
	return false, nil
}
// generateDGDSpec builds status.generatedDeployment from the profiling output
// (online or offline/AIC).
//
// It reads the output ConfigMap written by the sidecar, takes the YAML stored
// under ProfilingOutputFile, parses it into a DynamoGraphDeployment and stores
// that object in status.generatedDeployment. status.profilingResults is set to
// "configmap/<name>" and the status is written back.
//
// An error is returned when the ConfigMap or the key is missing, the YAML does
// not parse, or the status update fails.
func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
	logger := log.FromContext(ctx)
	logger.Info("Generating DGD spec from profiling results", "name", dgdr.Name, "online", dgdr.Spec.Online)

	// Fetch the ConfigMap the sidecar created.
	cmName := getOutputConfigMapName(dgdr)
	cm := &corev1.ConfigMap{}
	getErr := r.Get(ctx, types.NamespacedName{Name: cmName, Namespace: dgdr.Namespace}, cm)
	if apierrors.IsNotFound(getErr) {
		return fmt.Errorf("output ConfigMap %s not found - profiling may not have completed yet", cmName)
	}
	if getErr != nil {
		return fmt.Errorf("failed to get output ConfigMap: %w", getErr)
	}

	// Pull the generated YAML out of the ConfigMap.
	rawYAML, found := cm.Data[ProfilingOutputFile]
	if !found {
		return fmt.Errorf("key %s not found in ConfigMap %s", ProfilingOutputFile, cmName)
	}
	logger.Info("Found profiling output in ConfigMap", "configMap", cmName, "size", len(rawYAML))

	// Parse into a full DynamoGraphDeployment to validate it and obtain its name.
	parsed := &nvidiacomv1alpha1.DynamoGraphDeployment{}
	if parseErr := yaml.Unmarshal([]byte(rawYAML), parsed); parseErr != nil {
		return fmt.Errorf("failed to parse %s: %w", ProfilingOutputFile, parseErr)
	}
	logger.Info("Parsed DGD from ConfigMap", "dgdName", parsed.Name)

	// Keep the whole typed object (metadata included) in the RawExtension's
	// Object field.
	dgdr.Status.GeneratedDeployment = &runtime.RawExtension{Object: parsed}

	// Record where the profiling results live.
	dgdr.Status.ProfilingResults = fmt.Sprintf("configmap/%s", cmName)

	logger.Info("Successfully generated DGD from profiling output", "dgdName", parsed.Name)
	return r.Status().Update(ctx, dgdr)
}
// cleanupProfilingResources is the cleanup hook for a DGDR's profiling
// resources. It currently only logs and always returns nil.
//
// What happens to related objects when a DGDR is deleted:
//   - Profiling Job: removed automatically through its ownerReference (set by
//     SyncResource).
//   - Output ConfigMap: kept (no ownerReference); it holds valuable profiling
//     data.
//   - Auto-created DGD: kept (no ownerReference); it may be serving traffic.
//
// Relationships are tracked with labels (LabelDGDRName) rather than cascade
// delete. ConfigMaps and DGDs can be removed manually with label selectors:
//
//	kubectl delete configmap -l dgdr.nvidia.com/name=<dgdr-name>
//	kubectl delete dynamographdeployment -l dgdr.nvidia.com/name=<dgdr-name>
func (r *DynamoGraphDeploymentRequestReconciler) cleanupProfilingResources(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
	l := log.FromContext(ctx)
	l.Info("Cleaning up profiling resources", "name", dgdr.Name)
	l.Info("Profiling job will be automatically deleted via ownerReference")
	return nil
}
// updateStateAndRequeue stores state in dgdr.Status.State, persists the status
// through the status subresource client, and asks for an immediate requeue.
//
// The final (unnamed) string parameter is accepted but ignored.
// If the status update fails, the error is returned with an empty Result.
func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, state, _ string) (ctrl.Result, error) {
	var result ctrl.Result

	dgdr.Status.State = state
	if updateErr := r.Status().Update(ctx, dgdr); updateErr != nil {
		return result, updateErr
	}

	result.Requeue = true
	return result, nil
}
// updateStateWithCondition sets dgdr.Status.State to state, records a
// condition built from the remaining arguments via dgdr.AddStatusCondition,
// persists the status, and asks for an immediate requeue.
//
// The condition carries the DGDR's current Generation as ObservedGeneration
// and the current time as LastTransitionTime. If the status update fails, the
// error is returned with an empty Result.
func (r *DynamoGraphDeploymentRequestReconciler) updateStateWithCondition(
	ctx context.Context,
	dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest,
	state string,
	conditionType string,
	status metav1.ConditionStatus,
	reason string,
	message string,
) (ctrl.Result, error) {
	dgdr.Status.State = state
	dgdr.AddStatusCondition(metav1.Condition{
		Type:               conditionType,
		Status:             status,
		ObservedGeneration: dgdr.Generation,
		LastTransitionTime: metav1.Now(),
		Reason:             reason,
		Message:            message,
	})

	var result ctrl.Result
	if updateErr := r.Status().Update(ctx, dgdr); updateErr != nil {
		return result, updateErr
	}

	result.Requeue = true
	return result, nil
}
// SetupWithManager registers the reconciler with mgr and declares its watches:
//   - DynamoGraphDeploymentRequest objects (the primary resource);
//   - batch Jobs owned by a DGDR (via ownerReference);
//   - DynamoGraphDeployments labelled with LabelDGDRName and
//     LabelDGDRNamespace, which are mapped back to the DGDR those labels name.
//
// For Jobs and DGDs, create events are filtered out while delete, update and
// generic events trigger a reconcile. It returns whatever error the controller
// builder reports.
func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error {
	// Ignore creation because we don't want to be called again right after we
	// create the Job / DGD ourselves. The same filter is used for both watches.
	ignoreCreate := predicate.Funcs{
		CreateFunc:  func(event.CreateEvent) bool { return false },
		DeleteFunc:  func(event.DeleteEvent) bool { return true },
		UpdateFunc:  func(event.UpdateEvent) bool { return true },
		GenericFunc: func(event.GenericEvent) bool { return true },
	}

	// Find the DGDR by label instead of owner reference.
	dgdToDGDR := func(_ context.Context, obj client.Object) []ctrl.Request {
		// Checked assertion: an unexpected object type must not panic the
		// watch handler; it simply produces no request.
		dgd, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeployment)
		if !ok {
			return nil
		}

		// A missing label and an empty label value are treated alike: neither
		// identifies a DGDR, so nothing is enqueued.
		dgdrName := dgd.Labels[LabelDGDRName]
		dgdrNamespace := dgd.Labels[LabelDGDRNamespace]
		if dgdrName == "" || dgdrNamespace == "" {
			return nil
		}

		return []ctrl.Request{{
			NamespacedName: types.NamespacedName{
				Name:      dgdrName,
				Namespace: dgdrNamespace,
			},
		}}
	}

	return ctrl.NewControllerManagedBy(mgr).
		For(&nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}).
		// Watch Jobs created by this controller (via ownerReference).
		Owns(&batchv1.Job{}, builder.WithPredicates(ignoreCreate)).
		// Watch DGDs created by this controller (via label).
		Watches(
			&nvidiacomv1alpha1.DynamoGraphDeployment{},
			handler.EnqueueRequestsFromMapFunc(dgdToDGDR),
			builder.WithPredicates(ignoreCreate),
		).
		Complete(r)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"context"
"time"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)
// MockRBACManager implements RBACManager for testing.
//
// Its behavior is controlled by an optional hook; with no hook set, every call
// succeeds.
type MockRBACManager struct {
	// EnsureServiceAccountWithRBACFunc, when non-nil, receives every call made
	// to EnsureServiceAccountWithRBAC and supplies its return value.
	EnsureServiceAccountWithRBACFunc func(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
}

// EnsureServiceAccountWithRBAC forwards its arguments unchanged to
// EnsureServiceAccountWithRBACFunc and returns that hook's result. When no
// hook is configured it returns nil.
func (m *MockRBACManager) EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error {
	hook := m.EnsureServiceAccountWithRBACFunc
	if hook == nil {
		return nil
	}
	return hook(ctx, targetNamespace, serviceAccountName, clusterRoleName)
}
var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
const (
timeout = time.Second * 10
interval = time.Millisecond * 250
)
var (
reconciler *DynamoGraphDeploymentRequestReconciler
recorder *record.FakeRecorder
)
// Every spec starts with a fresh fake event recorder and a reconciler wired to
// the shared k8sClient, a fixed profiler image, a test ClusterRole name, and a
// no-op RBAC manager mock.
BeforeEach(func() {
	fakeRecorder := record.NewFakeRecorder(100)
	rbacCfg := commonController.RBACConfig{
		DGDRProfilingClusterRoleName: "test-cluster-role",
	}

	recorder = fakeRecorder
	reconciler = &DynamoGraphDeploymentRequestReconciler{
		Client:        k8sClient,
		Recorder:      fakeRecorder,
		ProfilerImage: "test-profiler:latest",
		Config: commonController.Config{
			RestrictedNamespace: "",
			RBAC:                rbacCfg,
		},
		RBACManager: &MockRBACManager{},
	}
})
Context("When reconciling initial DGDR", func() {
// Scenario: a freshly created, valid DGDR is reconciled once and must end up
// in StatePending with Status.ObservedGeneration equal to its Generation.
It("Should validate spec and transition to Pending", func() {
	ctx := context.Background()
	dgdrName := "test-dgdr-initial"
	namespace := "default"
	key := types.NamespacedName{Name: dgdrName, Namespace: namespace}

	dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{
			Name:      dgdrName,
			Namespace: namespace,
		},
		Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
			ModelName: "test-model",
			Backend:   BackendVLLM,
			SLA: nvidiacomv1alpha1.SLASpec{
				TTFT: 100,
				ITL:  1500,
				ISL:  3000,
				OSL:  5,
			},
			Online: true,
		},
	}
	Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
	// Best-effort cleanup: a failed delete must not mask the spec's own result.
	defer func() { _ = k8sClient.Delete(ctx, dgdr) }()

	// First reconcile: Empty -> Pending
	_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
	Expect(err).NotTo(HaveOccurred())

	// Check status; a failed read counts as "not there yet" and is retried.
	Eventually(func() string {
		var polled nvidiacomv1alpha1.DynamoGraphDeploymentRequest
		if getErr := k8sClient.Get(ctx, key, &polled); getErr != nil {
			return ""
		}
		return polled.Status.State
	}, timeout, interval).Should(Equal(StatePending))

	// Verify observedGeneration is set. The Get must succeed: on a zero-valued
	// object both generations are 0 and the comparison would pass vacuously.
	var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
	Expect(k8sClient.Get(ctx, key, &updated)).Should(Succeed())
	Expect(updated.Status.ObservedGeneration).Should(Equal(updated.Generation))
})
// Scenario: a DGDR created without a modelName is reconciled once; the
// reconcile call itself must not error, and the status must reach StateFailed.
It("Should fail validation with missing modelName", func() {
	const (
		dgdrName  = "test-dgdr-invalid"
		namespace = "default"
	)
	ctx := context.Background()
	key := types.NamespacedName{Name: dgdrName, Namespace: namespace}

	sla := nvidiacomv1alpha1.SLASpec{
		TTFT: 100,
		ITL:  1500,
		ISL:  3000,
		OSL:  5,
	}
	dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{Name: dgdrName, Namespace: namespace},
		Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
			Backend: BackendVLLM,
			SLA:     sla,
		},
	}
	Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, dgdr) }()

	// Reconcile
	_, reconcileErr := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
	Expect(reconcileErr).NotTo(HaveOccurred())

	// Check status transitions to Failed
	readState := func() string {
		var fetched nvidiacomv1alpha1.DynamoGraphDeploymentRequest
		// Read error deliberately discarded: the poll just sees an empty state.
		_ = k8sClient.Get(ctx, key, &fetched)
		return fetched.Status.State
	}
	Eventually(readState, timeout, interval).Should(Equal(StateFailed))
})
})
Context("When creating profiling job", func() {
// Scenario: an online DGDR referencing a profiling ConfigMap is reconciled
// twice; the second reconcile is expected to create the profiling Job, which
// is then checked for labels, its two containers, and the output PVC volume.
It("Should create online profiling job", func() {
	ctx := context.Background()
	dgdrName := "test-dgdr-profiling-online"
	namespace := "default"
	request := reconcile.Request{
		NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
	}

	// Create ConfigMap for profiling config
	configMap := &corev1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-config",
			Namespace: namespace,
		},
		Data: map[string]string{
			"disagg.yaml": "test: config",
		},
	}
	Expect(k8sClient.Create(ctx, configMap)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, configMap) }()

	// Create ServiceAccount
	sa := &corev1.ServiceAccount{
		ObjectMeta: metav1.ObjectMeta{
			Name:      ServiceAccountProfilingJob,
			Namespace: namespace,
		},
	}
	Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, sa) }()

	dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{
			Name:      dgdrName,
			Namespace: namespace,
		},
		Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
			ModelName: "test-model",
			Backend:   BackendVLLM,
			SLA: nvidiacomv1alpha1.SLASpec{
				TTFT: 100,
				ITL:  1500,
				ISL:  3000,
				OSL:  5,
			},
			Online: true,
			ProfilingConfig: &nvidiacomv1alpha1.ProfilingConfigSpec{
				ConfigMapRef: &nvidiacomv1alpha1.ConfigMapKeySelector{
					Name: "test-config",
					Key:  "disagg.yaml",
				},
			},
		},
	}
	Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, dgdr) }()

	// Reconcile multiple times to move through states
	_, err := reconciler.Reconcile(ctx, request)
	Expect(err).NotTo(HaveOccurred())

	// Second reconcile: Pending -> Profiling
	_, err = reconciler.Reconcile(ctx, request)
	Expect(err).NotTo(HaveOccurred())

	// Verify profiling job was created. The object fetched by the successful
	// poll is the one asserted on below, so no unchecked second Get is needed.
	job := &batchv1.Job{}
	jobKey := types.NamespacedName{Name: getProfilingJobName(dgdr), Namespace: namespace}
	Eventually(func() error {
		return k8sClient.Get(ctx, jobKey, job)
	}, timeout, interval).Should(Succeed())
	// Clean up job even when one of the assertions below fails.
	defer func() { _ = k8sClient.Delete(ctx, job) }()

	// Verify job has correct labels
	Expect(job.Labels[LabelApp]).Should(Equal(LabelValueDynamoProfiler))
	Expect(job.Labels[LabelDGDR]).Should(Equal(dgdrName))

	// Verify job has profiler container
	Expect(job.Spec.Template.Spec.Containers).Should(HaveLen(2))
	Expect(job.Spec.Template.Spec.Containers[0].Name).Should(Equal(ContainerNameProfiler))
	Expect(job.Spec.Template.Spec.Containers[1].Name).Should(Equal(ContainerNameOutputCopier))

	// Verify PVC volume mount
	Expect(job.Spec.Template.Spec.Volumes).Should(ContainElement(
		corev1.Volume{
			Name: VolumeNameProfilingOutput,
			VolumeSource: corev1.VolumeSource{
				PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
					ClaimName: "dynamo-pvc",
				},
			},
		},
	))
})
It("Should create offline (AIC) profiling job", func() {
ctx := context.Background()
dgdrName := "test-dgdr-profiling-aic"
namespace := "default"
// Create ServiceAccount
sa := &corev1.ServiceAccount{
ObjectMeta: metav1.ObjectMeta{
Name: ServiceAccountProfilingJob,
Namespace: namespace,
},
}
_ = k8sClient.Create(ctx, sa)
defer k8sClient.Delete(ctx, sa)
dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
ModelName: "QWEN3_32B",
Backend: BackendTRTLLM,
SLA: nvidiacomv1alpha1.SLASpec{
TTFT: 100,
ITL: 1500,
ISL: 3000,
OSL: 5,
},
Online: false, // Offline profiling
GPU: &nvidiacomv1alpha1.GPUSpec{
Type: "h200_sxm",
MinNumGPUsPerEngine: 1,
MaxNumGPUsPerEngine: 8,
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer k8sClient.Delete(ctx, dgdr)
// Reconcile
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
_, err = reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// Verify job was created with AIC label
Eventually(func() string {
jobName := getProfilingJobName(dgdr)
job := &batchv1.Job{}
if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job); err != nil {
return ""
}
return job.Labels[LabelApp]
}, timeout, interval).Should(Equal(LabelValueAICProfiler))
// Clean up
jobName := getProfilingJobName(dgdr)
job := &batchv1.Job{}
if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job); err == nil {
k8sClient.Delete(ctx, job)
}
})
})
Context("When profiling completes", func() {
// Scenario: a DGDR already in StateProfiling has a completed profiling Job and
// an output ConfigMap holding a DGD manifest. One reconcile must populate
// Status.GeneratedDeployment and move the DGDR to StateReady.
It("Should generate DGD spec from ConfigMap", func() {
	ctx := context.Background()
	dgdrName := "test-dgdr-profiling-complete"
	namespace := "default"
	key := types.NamespacedName{Name: dgdrName, Namespace: namespace}

	dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{
			Name:      dgdrName,
			Namespace: namespace,
		},
		Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
			ModelName: "test-model",
			Backend:   BackendVLLM,
			SLA: nvidiacomv1alpha1.SLASpec{
				TTFT: 100,
				ITL:  1500,
				ISL:  3000,
				OSL:  5,
			},
			Online: true,
		},
	}
	Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, dgdr) }()

	// Update status to Profiling using Status subresource
	dgdr.Status.State = StateProfiling
	Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())

	// Create the profiling job. Its status is written separately below through
	// the Status subresource, so it is not repeated in the create payload.
	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      getProfilingJobName(dgdr),
			Namespace: namespace,
		},
		Spec: batchv1.JobSpec{
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{{
						Name:  "test",
						Image: "test",
					}},
					RestartPolicy: corev1.RestartPolicyNever,
				},
			},
		},
	}
	Expect(k8sClient.Create(ctx, job)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, job) }()

	// Update job status to completed using Status subresource
	job.Status.Conditions = []batchv1.JobCondition{{
		Type:   batchv1.JobComplete,
		Status: corev1.ConditionTrue,
	}}
	Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())

	// Create output ConfigMap with DGD spec. The manifest must keep its YAML
	// nesting: name belongs under metadata, services under spec.
	dgdYAML := `apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: test-dgd
spec:
  services:
    Frontend:
      replicas: 1`

	cm := &corev1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{
			Name:      getOutputConfigMapName(dgdr),
			Namespace: namespace,
		},
		Data: map[string]string{
			ProfilingOutputFile: dgdYAML,
		},
	}
	Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, cm) }()

	// Reconcile to process the profiling completion
	_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
	Expect(err).NotTo(HaveOccurred())

	// Get the updated DGDR
	var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
	Expect(k8sClient.Get(ctx, key, &updated)).Should(Succeed())

	// Check that DGD spec was generated
	Expect(updated.Status.GeneratedDeployment).NotTo(BeNil())

	// Verify state transitioned to Ready (since autoApply is false by default)
	Expect(updated.Status.State).Should(Equal(StateReady))
})
})
Context("When autoApply is enabled", func() {
// Scenario: with AutoApply enabled, a DGDR in StateProfiling whose profiling
// Job is complete is reconciled twice. The first reconcile must move it to
// StateDeploying; the second must create the DGD named in the profiling
// output and record it in Status.Deployment.
It("Should create DGD after profiling", func() {
	ctx := context.Background()
	dgdrName := "test-dgdr-autoapply"
	namespace := "default"
	dgdrKey := types.NamespacedName{Name: dgdrName, Namespace: namespace}
	dgdKey := types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}
	request := reconcile.Request{NamespacedName: dgdrKey}

	dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{
			Name:      dgdrName,
			Namespace: namespace,
		},
		Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
			ModelName: "test-model",
			Backend:   BackendVLLM,
			SLA: nvidiacomv1alpha1.SLASpec{
				TTFT: 100,
				ITL:  1500,
				ISL:  3000,
				OSL:  5,
			},
			Online:    true,
			AutoApply: true,
		},
	}
	Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, dgdr) }()

	// Update status to Profiling using Status subresource
	dgdr.Status.State = StateProfiling
	Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())

	// Create the profiling job. Its status is written separately below through
	// the Status subresource, so it is not repeated in the create payload.
	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      getProfilingJobName(dgdr),
			Namespace: namespace,
		},
		Spec: batchv1.JobSpec{
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{{
						Name:  "test",
						Image: "test",
					}},
					RestartPolicy: corev1.RestartPolicyNever,
				},
			},
		},
	}
	Expect(k8sClient.Create(ctx, job)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, job) }()

	// Update job status to completed using Status subresource
	job.Status.Conditions = []batchv1.JobCondition{{
		Type:   batchv1.JobComplete,
		Status: corev1.ConditionTrue,
	}}
	Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())

	// Create output ConfigMap. The manifest must keep its YAML nesting: name
	// belongs under metadata (it is the DGD name looked up below) and services
	// under spec.
	dgdYAML := `apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: test-dgd-auto
spec:
  services:
    Frontend:
      replicas: 1`

	cm := &corev1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{
			Name:      getOutputConfigMapName(dgdr),
			Namespace: namespace,
		},
		Data: map[string]string{
			ProfilingOutputFile: dgdYAML,
		},
	}
	Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, cm) }()

	// Reconcile to generate spec (transitions to Deploying because autoApply=true)
	_, err := reconciler.Reconcile(ctx, request)
	Expect(err).NotTo(HaveOccurred())

	// Get updated DGDR and check state is Deploying
	var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
	Expect(k8sClient.Get(ctx, dgdrKey, &updated)).Should(Succeed())
	Expect(updated.Status.State).Should(Equal(StateDeploying))

	// Reconcile again to create DGD
	_, err = reconciler.Reconcile(ctx, request)
	Expect(err).NotTo(HaveOccurred())

	// Verify DGD was created
	dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
	Expect(k8sClient.Get(ctx, dgdKey, dgd)).Should(Succeed())
	// Clean up DGD even when one of the assertions below fails.
	defer func() { _ = k8sClient.Delete(ctx, dgd) }()

	// Get final DGDR status; the read must succeed so the assertions below
	// are made against fresh data rather than the earlier snapshot.
	Expect(k8sClient.Get(ctx, dgdrKey, &updated)).Should(Succeed())
	Expect(updated.Status.Deployment).NotTo(BeNil())
	Expect(updated.Status.Deployment.Created).Should(BeTrue())
	Expect(updated.Status.Deployment.Name).Should(Equal("test-dgd-auto"))
})
})
Context("When enforcing spec immutability", func() {
// Scenario: after a DGDR has been initialized and forced into StateProfiling,
// its spec is modified. The next reconcile must leave ObservedGeneration and
// State untouched and emit a SpecChangeRejected warning event.
It("Should reject spec changes after profiling starts", func() {
	ctx := context.Background()
	dgdrName := "test-dgdr-immutable"
	namespace := "default"
	key := types.NamespacedName{Name: dgdrName, Namespace: namespace}
	request := reconcile.Request{NamespacedName: key}

	dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{
			Name:      dgdrName,
			Namespace: namespace,
		},
		Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
			ModelName: "test-model",
			Backend:   BackendVLLM,
			SLA: nvidiacomv1alpha1.SLASpec{
				TTFT: 100,
				ITL:  1500,
				ISL:  3000,
				OSL:  5,
			},
			Online: true,
		},
	}
	Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, dgdr) }()

	// Reconcile to initialize
	_, err := reconciler.Reconcile(ctx, request)
	Expect(err).NotTo(HaveOccurred())

	// Get current generation
	var current nvidiacomv1alpha1.DynamoGraphDeploymentRequest
	Expect(k8sClient.Get(ctx, key, &current)).Should(Succeed())
	initialGeneration := current.Generation
	observedGeneration := current.Status.ObservedGeneration

	// Manually set state to Profiling to simulate in-progress profiling.
	// Every setup write is asserted so that a failed write is reported here
	// instead of surfacing later as a misleading assertion failure.
	current.Status.State = StateProfiling
	Expect(k8sClient.Status().Update(ctx, &current)).Should(Succeed())

	// Try to modify spec
	Expect(k8sClient.Get(ctx, key, &current)).Should(Succeed())
	current.Spec.SLA.TTFT = 200
	Expect(k8sClient.Update(ctx, &current)).Should(Succeed())

	// Reconcile
	_, err = reconciler.Reconcile(ctx, request)
	Expect(err).NotTo(HaveOccurred())

	// Verify generation changed but observedGeneration stayed the same
	Expect(k8sClient.Get(ctx, key, &current)).Should(Succeed())
	Expect(current.Generation).Should(BeNumerically(">", initialGeneration))
	Expect(current.Status.ObservedGeneration).Should(Equal(observedGeneration))
	Expect(current.Status.State).Should(Equal(StateProfiling)) // State unchanged

	// Verify event was recorded. Each poll consumes at most one queued event,
	// so unrelated earlier events are drained until the expected one appears.
	Eventually(func() bool {
		select {
		case event := <-recorder.Events:
			return event == "Warning SpecChangeRejected Cannot modify spec in state 'Profiling'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead."
		default:
			return false
		}
	}, timeout, interval).Should(BeTrue())
})
})
Context("When handling DGD deletion", func() {
// Scenario: a DGDR whose status says a DGD named "test-dgd-to-delete" was
// created and is Ready is reconciled while no such DGD has been created by
// this spec. The DGDR must move to StateDeploymentDeleted.
It("Should transition to DeploymentDeleted state", func() {
	const (
		dgdrName  = "test-dgdr-dgd-deleted"
		namespace = "default"
	)
	ctx := context.Background()
	key := types.NamespacedName{Name: dgdrName, Namespace: namespace}

	spec := nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
		ModelName: "test-model",
		Backend:   BackendVLLM,
		SLA: nvidiacomv1alpha1.SLASpec{
			TTFT: 100,
			ITL:  1500,
			ISL:  3000,
			OSL:  5,
		},
		Online:    true,
		AutoApply: true,
	}
	dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
		ObjectMeta: metav1.ObjectMeta{Name: dgdrName, Namespace: namespace},
		Spec:       spec,
	}
	Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
	defer func() { _ = k8sClient.Delete(ctx, dgdr) }()

	// Update status to Ready with Deployment info using Status subresource
	deployed := &nvidiacomv1alpha1.DeploymentStatus{
		Name:      "test-dgd-to-delete",
		Namespace: namespace,
		Created:   true,
		State:     "Ready",
	}
	dgdr.Status.State = StateReady
	dgdr.Status.Deployment = deployed
	Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())

	// Reconcile when DGD doesn't exist (simulating deletion)
	_, reconcileErr := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
	Expect(reconcileErr).NotTo(HaveOccurred())

	// Get updated DGDR and check state transitioned to DeploymentDeleted
	var after nvidiacomv1alpha1.DynamoGraphDeploymentRequest
	Expect(k8sClient.Get(ctx, key, &after)).Should(Succeed())
	Expect(after.Status.State).Should(Equal(StateDeploymentDeleted))
})
})
})
// Specs for the pure naming helpers: the profiling Job name depends on the
// Online flag, and the output ConfigMap name depends only on the DGDR name.
var _ = Describe("DGDR Helper Functions", func() {
	// newDGDR returns an in-memory DGDR named "test-dgdr" with the given
	// Online flag; nothing is created on the API server.
	newDGDR := func(online bool) *nvidiacomv1alpha1.DynamoGraphDeploymentRequest {
		return &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
			ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr"},
			Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
				Online: online,
			},
		}
	}

	Context("getProfilingJobName", func() {
		It("Should return correct job name for online profiling", func() {
			name := getProfilingJobName(newDGDR(true))
			Expect(name).Should(Equal("profile-online-test-dgdr"))
		})

		It("Should return correct job name for offline profiling", func() {
			name := getProfilingJobName(newDGDR(false))
			Expect(name).Should(Equal("profile-aic-test-dgdr"))
		})
	})

	Context("getOutputConfigMapName", func() {
		It("Should return correct ConfigMap name", func() {
			name := getOutputConfigMapName(newDGDR(false))
			Expect(name).Should(Equal("dgdr-output-test-dgdr"))
		})
	})
})
// Specs for validateSpec: one accepted spec, then one rejected spec per rule
// under test, each checked for a telltale substring in the error text.
var _ = Describe("DGDR Validation", func() {
	var reconciler *DynamoGraphDeploymentRequestReconciler

	BeforeEach(func() {
		reconciler = &DynamoGraphDeploymentRequestReconciler{Client: k8sClient}
	})

	// validate wraps spec in an otherwise empty DGDR and runs validateSpec on
	// it with a background context, using the reconciler set by BeforeEach.
	validate := func(spec nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec) error {
		candidate := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{Spec: spec}
		return reconciler.validateSpec(context.Background(), candidate)
	}

	Context("validateSpec", func() {
		It("Should pass validation for valid spec", func() {
			result := validate(nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
				ModelName: "test-model",
				Backend:   BackendVLLM,
				SLA: nvidiacomv1alpha1.SLASpec{
					TTFT: 100,
					ITL:  1500,
					ISL:  3000,
					OSL:  5,
				},
			})
			Expect(result).NotTo(HaveOccurred())
		})

		It("Should fail validation when modelName is empty", func() {
			result := validate(nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
				Backend: BackendVLLM,
				SLA: nvidiacomv1alpha1.SLASpec{
					TTFT: 100,
					ITL:  1500,
				},
			})
			Expect(result).To(HaveOccurred())
			Expect(result.Error()).Should(ContainSubstring("modelName"))
		})

		It("Should fail validation when TTFT is zero", func() {
			result := validate(nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
				ModelName: "test-model",
				Backend:   BackendVLLM,
				SLA: nvidiacomv1alpha1.SLASpec{
					TTFT: 0,
					ITL:  1500,
					ISL:  3000,
					OSL:  500,
				},
			})
			Expect(result).To(HaveOccurred())
			Expect(result.Error()).Should(ContainSubstring("ttft"))
		})

		It("Should fail validation when TTFT is negative", func() {
			result := validate(nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
				ModelName: "test-model",
				Backend:   BackendVLLM,
				SLA: nvidiacomv1alpha1.SLASpec{
					TTFT: -1,
					ITL:  1500,
				},
			})
			Expect(result).To(HaveOccurred())
			Expect(result.Error()).Should(ContainSubstring("ttft"))
		})

		It("Should fail validation when ITL is zero", func() {
			result := validate(nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
				ModelName: "test-model",
				Backend:   BackendVLLM,
				SLA: nvidiacomv1alpha1.SLASpec{
					TTFT: 100,
					ITL:  0,
					ISL:  3000,
					OSL:  500,
				},
			})
			Expect(result).To(HaveOccurred())
			Expect(result.Error()).Should(ContainSubstring("itl"))
		})

		It("Should fail validation when ITL is negative", func() {
			result := validate(nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
				ModelName: "test-model",
				Backend:   BackendVLLM,
				SLA: nvidiacomv1alpha1.SLASpec{
					TTFT: 100,
					ITL:  -1,
				},
			})
			Expect(result).To(HaveOccurred())
			Expect(result.Error()).Should(ContainSubstring("itl"))
		})

		It("Should fail validation for invalid backend", func() {
			result := validate(nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
				ModelName: "test-model",
				Backend:   "invalid-backend",
				SLA: nvidiacomv1alpha1.SLASpec{
					TTFT: 100,
					ITL:  1500,
				},
			})
			Expect(result).To(HaveOccurred())
			Expect(result.Error()).Should(ContainSubstring("invalid backend"))
		})
	})
})
......@@ -80,7 +80,7 @@ var _ = BeforeSuite(func() {
filepath.Join(".", "testing", "run.ai"),
filepath.Join(".", "testing", "nvidia"),
},
ErrorIfCRDPathMissing: true,
ErrorIfCRDPathMissing: false,
// The BinaryAssetsDirectory is only required if you want to run the tests directly
// without call the makefile target test. If not informed it will look for the
......
......@@ -74,6 +74,8 @@ type Config struct {
// RBACConfig groups the names of the ClusterRoles referenced by the operator.
// Per the field notes below, both names apply to cluster-wide mode only.
type RBACConfig struct {
// PlannerClusterRoleName is the name of the ClusterRole for planner (cluster-wide mode only)
PlannerClusterRoleName string
// DGDRProfilingClusterRoleName is the name of the ClusterRole for DGDR profiling jobs (cluster-wide mode only)
DGDRProfilingClusterRoleName string
}
type IngressConfig struct {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment