Unverified Commit f3aa1e01 authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

feat: introducing ChReK (Checkpoint Restore in K8s) (#4978)


Signed-off-by: default avatarJulien Mancuso <jmancuso@nvidia.com>
parent 44986bf5
...@@ -683,6 +683,87 @@ spec: ...@@ -683,6 +683,87 @@ spec:
- vllm - vllm
- trtllm - trtllm
type: string type: string
checkpoint:
description: |-
Checkpoint configures container checkpointing for this service.
When enabled, pods can be restored from checkpoint files for faster cold start.
properties:
checkpointRef:
description: |-
CheckpointRef references an existing Checkpoint CR to use
If specified, Identity is ignored and this checkpoint is used directly
type: string
enabled:
default: false
description: Enabled indicates whether checkpointing is enabled for this service
type: boolean
identity:
description: |-
Identity defines the checkpoint identity for hash computation
Used when Mode is Auto or when looking up existing checkpoints
Required when checkpointRef is not specified
properties:
backendFramework:
description: BackendFramework is the runtime framework (vllm, sglang, trtllm)
enum:
- vllm
- sglang
- trtllm
type: string
dtype:
description: Dtype is the data type (fp16, bf16, fp8, etc.)
type: string
dynamoVersion:
description: |-
DynamoVersion is the Dynamo platform version (optional)
If not specified, version is not included in identity hash
This ensures checkpoint compatibility across Dynamo releases
type: string
extraParameters:
additionalProperties:
type: string
description: |-
ExtraParameters are additional parameters that affect the checkpoint hash
Use for any framework-specific or custom parameters not covered above
type: object
maxModelLen:
description: MaxModelLen is the maximum sequence length
format: int32
minimum: 1
type: integer
model:
description: Model is the model identifier (e.g., "meta-llama/Llama-3-70B")
type: string
pipelineParallelSize:
default: 1
description: PipelineParallelSize is the pipeline parallel configuration
format: int32
minimum: 1
type: integer
tensorParallelSize:
default: 1
description: TensorParallelSize is the tensor parallel configuration
format: int32
minimum: 1
type: integer
required:
- backendFramework
- model
type: object
mode:
default: Auto
description: |-
Mode defines how checkpoint creation is handled
- Auto: DGD controller creates Checkpoint CR automatically
- Manual: User must create Checkpoint CR
enum:
- Auto
- Manual
type: string
type: object
x-kubernetes-validations:
- message: When enabled, either checkpointRef or both identity.model and identity.backendFramework must be specified
rule: '!self.enabled || (has(self.checkpointRef) && size(self.checkpointRef) > 0) || (has(self.identity) && has(self.identity.model) && has(self.identity.backendFramework))'
componentType: componentType:
description: ComponentType indicates the role of this component (for example, "main"). description: ComponentType indicates the role of this component (for example, "main").
type: string type: string
......
...@@ -892,6 +892,87 @@ spec: ...@@ -892,6 +892,87 @@ spec:
description: 'Deprecated: This field is ignored.' description: 'Deprecated: This field is ignored.'
type: integer type: integer
type: object type: object
checkpoint:
description: |-
Checkpoint configures container checkpointing for this service.
When enabled, pods can be restored from checkpoint files for faster cold start.
properties:
checkpointRef:
description: |-
CheckpointRef references an existing Checkpoint CR to use
If specified, Identity is ignored and this checkpoint is used directly
type: string
enabled:
default: false
description: Enabled indicates whether checkpointing is enabled for this service
type: boolean
identity:
description: |-
Identity defines the checkpoint identity for hash computation
Used when Mode is Auto or when looking up existing checkpoints
Required when checkpointRef is not specified
properties:
backendFramework:
description: BackendFramework is the runtime framework (vllm, sglang, trtllm)
enum:
- vllm
- sglang
- trtllm
type: string
dtype:
description: Dtype is the data type (fp16, bf16, fp8, etc.)
type: string
dynamoVersion:
description: |-
DynamoVersion is the Dynamo platform version (optional)
If not specified, version is not included in identity hash
This ensures checkpoint compatibility across Dynamo releases
type: string
extraParameters:
additionalProperties:
type: string
description: |-
ExtraParameters are additional parameters that affect the checkpoint hash
Use for any framework-specific or custom parameters not covered above
type: object
maxModelLen:
description: MaxModelLen is the maximum sequence length
format: int32
minimum: 1
type: integer
model:
description: Model is the model identifier (e.g., "meta-llama/Llama-3-70B")
type: string
pipelineParallelSize:
default: 1
description: PipelineParallelSize is the pipeline parallel configuration
format: int32
minimum: 1
type: integer
tensorParallelSize:
default: 1
description: TensorParallelSize is the tensor parallel configuration
format: int32
minimum: 1
type: integer
required:
- backendFramework
- model
type: object
mode:
default: Auto
description: |-
Mode defines how checkpoint creation is handled
- Auto: DGD controller creates Checkpoint CR automatically
- Manual: User must create Checkpoint CR
enum:
- Auto
- Manual
type: string
type: object
x-kubernetes-validations:
- message: When enabled, either checkpointRef or both identity.model and identity.backendFramework must be specified
rule: '!self.enabled || (has(self.checkpointRef) && size(self.checkpointRef) > 0) || (has(self.identity) && has(self.identity.model) && has(self.identity.backendFramework))'
componentType: componentType:
description: ComponentType indicates the role of this component (for example, "main"). description: ComponentType indicates the role of this component (for example, "main").
type: string type: string
...@@ -11212,6 +11293,24 @@ spec: ...@@ -11212,6 +11293,24 @@ spec:
status: status:
description: Status reflects the current observed state of this graph deployment. description: Status reflects the current observed state of this graph deployment.
properties: properties:
checkpoints:
additionalProperties:
description: ServiceCheckpointStatus contains checkpoint information for a single service.
properties:
checkpointName:
description: CheckpointName is the name of the associated Checkpoint CR
type: string
identityHash:
description: IdentityHash is the computed hash of the checkpoint identity
type: string
ready:
description: Ready indicates if the checkpoint is ready for use
type: boolean
type: object
description: |-
Checkpoints contains per-service checkpoint status information.
The map key is the service name from spec.services.
type: object
conditions: conditions:
description: |- description: |-
Conditions contains the latest observed conditions of the graph deployment. Conditions contains the latest observed conditions of the graph deployment.
......
...@@ -149,6 +149,26 @@ The chart includes built-in validation to prevent all operator conflicts: ...@@ -149,6 +149,26 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.webhook.certManager.certificate.renewBefore | string | `"360h"` | Time before certificate expiration to trigger renewal (e.g., "360h" for 15 days). cert-manager will attempt to renew the certificate when this threshold is reached. | | dynamo-operator.webhook.certManager.certificate.renewBefore | string | `"360h"` | Time before certificate expiration to trigger renewal (e.g., "360h" for 15 days). cert-manager will attempt to renew the certificate when this threshold is reached. |
| dynamo-operator.webhook.certManager.certificate.rootCA.duration | string | `"87600h"` | Duration for the root CA certificate (e.g., "87600h" for 10 years). The root CA typically has a much longer lifetime than the leaf certificates it signs. | | dynamo-operator.webhook.certManager.certificate.rootCA.duration | string | `"87600h"` | Duration for the root CA certificate (e.g., "87600h" for 10 years). The root CA typically has a much longer lifetime than the leaf certificates it signs. |
| dynamo-operator.webhook.certManager.certificate.rootCA.renewBefore | string | `"720h"` | Time before root CA expiration to trigger renewal (e.g., "720h" for 30 days). Renewing a CA can be disruptive as all signed certificates must be reissued. | | dynamo-operator.webhook.certManager.certificate.rootCA.renewBefore | string | `"720h"` | Time before root CA expiration to trigger renewal (e.g., "720h" for 30 days). Renewing a CA can be disruptive as all signed certificates must be reissued. |
| dynamo-operator.checkpoint.enabled | bool | `false` | Whether to enable checkpoint/restore functionality. When enabled, deploys the checkpoint-agent DaemonSet for creating container checkpoints. |
| dynamo-operator.checkpoint.storage.type | string | `"pvc"` | Storage backend type. Options: "pvc" (PersistentVolumeClaim), "s3" (S3-compatible object storage), "oci" (OCI registry) |
| dynamo-operator.checkpoint.storage.signalHostPath | string | `"/var/lib/chrek/signals"` | Host path for signal files used for communication between checkpoint job pods and the DaemonSet. Both components mount this path to coordinate checkpoint completion. |
| dynamo-operator.checkpoint.storage.pvc.pvcName | string | `"chrek-pvc"` | Name of an existing PVC for storing checkpoint tar files. This PVC must be created separately with RWX (ReadWriteMany) access mode to allow multiple nodes to read checkpoints. |
| dynamo-operator.checkpoint.storage.pvc.basePath | string | `"/checkpoints"` | Base path within the PVC for storing checkpoint tar files. Each checkpoint is stored as {basePath}/{identityHash}.tar |
| dynamo-operator.checkpoint.storage.s3.uri | string | `""` | S3 URI in format: s3://[endpoint/]bucket/prefix. Examples: "s3://my-bucket/checkpoints" (AWS S3), "s3://minio.example.com/my-bucket/checkpoints" (MinIO) |
| dynamo-operator.checkpoint.storage.s3.credentialsSecretRef | string | `""` | Reference to a secret containing AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and optionally AWS_REGION. If not provided, uses IRSA/Workload Identity for authentication. |
| dynamo-operator.checkpoint.storage.oci.uri | string | `""` | OCI URI in format: oci://registry/repository. Examples: "oci://myregistry.io/checkpoints", "oci://ghcr.io/myorg/checkpoints" |
| dynamo-operator.checkpoint.storage.oci.credentialsSecretRef | string | `""` | Reference to a docker config secret for registry authentication |
| dynamo-operator.checkpoint.agent.image.repository | string | `"nvcr.io/nvidia/ai-dynamo/checkpoint-agent"` | Container image repository for the checkpoint agent |
| dynamo-operator.checkpoint.agent.image.tag | string | `"latest"` | Container image tag for the checkpoint agent |
| dynamo-operator.checkpoint.agent.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy for the checkpoint agent |
| dynamo-operator.checkpoint.agent.resources | object | `{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"100m","memory":"128Mi"}}` | Resource limits and requests for checkpoint agent containers |
| dynamo-operator.checkpoint.agent.nodeSelector | object | `{}` | Node selector for checkpoint agent pods. Use this to restrict checkpoint agents to specific nodes (e.g., GPU nodes). |
| dynamo-operator.checkpoint.agent.tolerations | list | `[]` | Node tolerations for checkpoint agent pods |
| dynamo-operator.checkpoint.agent.affinity | object | `{}` | Affinity rules for checkpoint agent pods |
| dynamo-operator.checkpoint.agent.podLabels | object | `{}` | Additional labels to add to checkpoint agent pods |
| dynamo-operator.checkpoint.agent.podAnnotations | object | `{}` | Additional annotations to add to checkpoint agent pods |
| dynamo-operator.checkpoint.agent.imagePullSecrets | list | `[]` | Image pull secrets for the checkpoint agent container image |
| dynamo-operator.checkpoint.agent.containerRuntimeSocket | string | `"/run/containerd/containerd.sock"` | Path to the container runtime socket. The checkpoint agent needs access to the container runtime to perform checkpoint operations. Change to /var/run/docker.sock for Docker runtime. |
| grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide | | grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide |
| grove.tolerations | list | `[]` | Node tolerations for Grove pods | | grove.tolerations | list | `[]` | Node tolerations for Grove pods |
| grove.affinity | object | `{}` | Affinity rules for Grove pods | | grove.affinity | object | `{}` | Affinity rules for Grove pods |
......
...@@ -146,6 +146,29 @@ spec: ...@@ -146,6 +146,29 @@ spec:
{{- if .Values.webhook.enabled }} {{- if .Values.webhook.enabled }}
- --enable-webhooks=true - --enable-webhooks=true
{{- end }} {{- end }}
{{- if .Values.checkpoint.enabled }}
{{- /* Checkpoint/restore flags passed to the operator manager. Every templated
       value is rendered through printf + quote so that the resulting list item
       is always a single, valid YAML string, even if the value contains
       characters such as " #" or ": ". */}}
- --checkpoint-enabled=true
- {{ printf "--checkpoint-storage-type=%v" .Values.checkpoint.storage.type | quote }}
- {{ printf "--checkpoint-signal-host-path=%v" .Values.checkpoint.storage.signalHostPath | quote }}
- {{ printf "--checkpoint-criu-timeout=%v" .Values.checkpoint.criu.timeout | quote }}
- {{ printf "--checkpoint-init-container-image=%v" .Values.checkpoint.initContainerImage | quote }}
{{- /* Only the flags for the selected storage backend are emitted. */}}
{{- if eq .Values.checkpoint.storage.type "pvc" }}
- {{ printf "--checkpoint-pvc-name=%v" .Values.checkpoint.storage.pvc.pvcName | quote }}
- {{ printf "--checkpoint-pvc-base-path=%v" .Values.checkpoint.storage.pvc.basePath | quote }}
{{- end }}
{{- if eq .Values.checkpoint.storage.type "s3" }}
- {{ printf "--checkpoint-s3-uri=%v" .Values.checkpoint.storage.s3.uri | quote }}
{{- /* The credentials flag is omitted entirely when no secret is configured. */}}
{{- if .Values.checkpoint.storage.s3.credentialsSecretRef }}
- {{ printf "--checkpoint-s3-credentials-secret=%v" .Values.checkpoint.storage.s3.credentialsSecretRef | quote }}
{{- end }}
{{- end }}
{{- if eq .Values.checkpoint.storage.type "oci" }}
- {{ printf "--checkpoint-oci-uri=%v" .Values.checkpoint.storage.oci.uri | quote }}
{{- if .Values.checkpoint.storage.oci.credentialsSecretRef }}
- {{ printf "--checkpoint-oci-credentials-secret=%v" .Values.checkpoint.storage.oci.credentialsSecretRef | quote }}
{{- end }}
{{- end }}
{{- end }}
command: command:
- /manager - /manager
env: env:
......
...@@ -378,6 +378,7 @@ rules: ...@@ -378,6 +378,7 @@ rules:
- apiGroups: - apiGroups:
- nvidia.com - nvidia.com
resources: resources:
- dynamocheckpoints
- dynamocomponentdeployments - dynamocomponentdeployments
- dynamographdeploymentrequests - dynamographdeploymentrequests
- dynamographdeployments - dynamographdeployments
...@@ -394,6 +395,7 @@ rules: ...@@ -394,6 +395,7 @@ rules:
- apiGroups: - apiGroups:
- nvidia.com - nvidia.com
resources: resources:
- dynamocheckpoints/finalizers
- dynamocomponentdeployments/finalizers - dynamocomponentdeployments/finalizers
- dynamographdeploymentrequests/finalizers - dynamographdeploymentrequests/finalizers
- dynamographdeployments/finalizers - dynamographdeployments/finalizers
...@@ -403,6 +405,7 @@ rules: ...@@ -403,6 +405,7 @@ rules:
- apiGroups: - apiGroups:
- nvidia.com - nvidia.com
resources: resources:
- dynamocheckpoints/status
- dynamocomponentdeployments/status - dynamocomponentdeployments/status
- dynamographdeploymentrequests/status - dynamographdeploymentrequests/status
- dynamographdeployments/status - dynamographdeployments/status
......
...@@ -146,6 +146,64 @@ nats: ...@@ -146,6 +146,64 @@ nats:
modelExpressURL: "" modelExpressURL: ""
# Checkpoint configuration for fast pod restore
# NOTE: The checkpoint infrastructure (PVC + DaemonSet) must be installed separately
# using the chrek Helm chart in each namespace where checkpointing is needed.
checkpoint:
# Enable checkpoint/restore functionality
enabled: false
# Image used for init containers in checkpoint jobs (e.g., signal file cleanup)
# Defaults to busybox:latest if not specified
initContainerImage: "busybox:latest"
# Storage configuration
# These settings tell the operator where to find checkpoint storage
# Must match the configuration in the chrek chart
storage:
# Storage backend type: pvc, s3, or oci
type: pvc
# Host path for signal files (communication between checkpoint pod and DaemonSet)
signalHostPath: "/var/lib/chrek/signals"
# PVC configuration (used when type=pvc)
pvc:
# Name of the PVC created by the chrek chart
# Must match the PVC name in the chrek chart
pvcName: "chrek-pvc"
# Base path within the PVC for storing checkpoints
basePath: "/checkpoints"
# S3 configuration (used when type=s3)
s3:
# S3 URI in format: s3://[endpoint/]bucket/prefix
# Examples:
# - s3://my-bucket/checkpoints (AWS S3)
# - s3://minio.example.com/my-bucket/checkpoints (MinIO/custom endpoint)
uri: ""
# Reference to a secret containing AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and optionally AWS_REGION
# If not provided, uses IRSA/Workload Identity for authentication
credentialsSecretRef: ""
# OCI registry configuration (used when type=oci)
oci:
# OCI URI in format: oci://registry/repository
# Examples:
# - oci://myregistry.io/checkpoints
# - oci://ghcr.io/myorg/checkpoints
uri: ""
# Reference to a docker config secret for registry authentication
credentialsSecretRef: ""
# CRIU timeout configuration (shared across checkpoint and restore)
criu:
# CRIU operation timeout in seconds
# Maximum time to wait for checkpoint/restore to complete
# Increase for models with very large memory footprints
# 21600s (6 hours) is recommended for large LLMs (70B+)
timeout: "21600"
# Webhook configuration # Webhook configuration
webhook: webhook:
# Enable admission webhooks for validation # Enable admission webhooks for validation
......
...@@ -68,11 +68,11 @@ dynamo-operator: ...@@ -68,11 +68,11 @@ dynamo-operator:
# Container image configuration for the operator manager # Container image configuration for the operator manager
image:
  # -- Official NVIDIA Dynamo operator image repository
  repository: "nvcr.io/nvidia/ai-dynamo/kubernetes-operator"
  # -- Image tag (leave empty to use chart default)
  tag: ""
  # -- Image pull policy - when to pull the image
  pullPolicy: IfNotPresent
# Command line arguments for the operator manager # Command line arguments for the operator manager
args: args:
...@@ -82,7 +82,8 @@ dynamo-operator: ...@@ -82,7 +82,8 @@ dynamo-operator:
- --metrics-bind-address=127.0.0.1:8080 - --metrics-bind-address=127.0.0.1:8080
# -- Secrets for pulling private container images
imagePullSecrets: []
# Core Dynamo platform configuration # Core Dynamo platform configuration
dynamo: dynamo:
...@@ -209,6 +210,49 @@ dynamo-operator: ...@@ -209,6 +210,49 @@ dynamo-operator:
# -- Time before root CA expiration to trigger renewal (e.g., "720h" for 30 days). Renewing a CA can be disruptive as all signed certificates must be reissued. # -- Time before root CA expiration to trigger renewal (e.g., "720h" for 30 days). Renewing a CA can be disruptive as all signed certificates must be reissued.
renewBefore: "720h" renewBefore: "720h"
# Checkpoint configuration for fast pod restore using CRIU/cuda-checkpoint
# NOTE: The checkpoint infrastructure (PVC + DaemonSet) must be installed separately
# using the chrek Helm chart in each namespace where checkpointing is needed.
checkpoint:
# -- Whether to enable checkpoint/restore functionality. Disabled by default: the checkpoint infrastructure (PVC + DaemonSet) must first be installed separately using the chrek Helm chart.
enabled: false
# Storage configuration
# These settings tell the operator where to find checkpoint storage
# Must match the configuration in the chrek chart
storage:
# -- Storage backend type: pvc, s3, or oci
type: pvc
# -- Host path for signal files (communication between checkpoint pod and DaemonSet)
signalHostPath: "/var/lib/chrek/signals"
# PVC storage configuration (used when type=pvc)
pvc:
# -- Name of the PVC created by the chrek chart
pvcName: "chrek-pvc"
# -- Base path within the PVC for storing checkpoints
basePath: "/checkpoints"
# S3 storage configuration (used when type=s3)
s3:
# -- S3 URI in format: s3://[endpoint/]bucket/prefix
uri: ""
# -- Reference to a secret containing AWS credentials
credentialsSecretRef: ""
# OCI registry storage configuration (used when type=oci)
oci:
# -- OCI URI in format: oci://registry/repository
uri: ""
# -- Reference to a docker config secret for registry authentication
credentialsSecretRef: ""
# CRIU timeout configuration (shared across checkpoint and restore)
criu:
# -- CRIU operation timeout in seconds. Default: 21600 (6 hours)
timeout: "21600"
# Grove component - distributed inference orchestration # Grove component - distributed inference orchestration
grove: grove:
......
...@@ -32,4 +32,12 @@ resources: ...@@ -32,4 +32,12 @@ resources:
kind: DynamoModel kind: DynamoModel
path: github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1 path: github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1
version: v1alpha1 version: v1alpha1
- api:
    crdVersion: v1
    namespaced: true
  controller: true
  domain: nvidia.com
  kind: DynamoCheckpoint
  path: github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1
  version: v1alpha1
version: "3" version: "3"
...@@ -135,3 +135,41 @@ type ScalingAdapter struct { ...@@ -135,3 +135,41 @@ type ScalingAdapter struct {
// +kubebuilder:default=false // +kubebuilder:default=false
Enabled bool `json:"enabled,omitempty"` Enabled bool `json:"enabled,omitempty"`
} }
// CheckpointMode defines how checkpoint creation is handled for a service.
// Only the values declared below ("Auto" and "Manual") pass API validation.
// +kubebuilder:validation:Enum=Auto;Manual
type CheckpointMode string

const (
	// CheckpointModeAuto means the DGD controller will automatically create
	// the DynamoCheckpoint CR.
	CheckpointModeAuto CheckpointMode = "Auto"
	// CheckpointModeManual means the user must create the DynamoCheckpoint CR
	// themselves.
	CheckpointModeManual CheckpointMode = "Manual"
)
// ServiceCheckpointConfig configures checkpointing for a DGD service.
//
// The validation rule below only constrains objects whose Enabled field is
// true: in that case either a non-empty CheckpointRef, or an Identity that
// carries both model and backendFramework, must be provided.
// +kubebuilder:validation:XValidation:rule="!self.enabled || (has(self.checkpointRef) && size(self.checkpointRef) > 0) || (has(self.identity) && has(self.identity.model) && has(self.identity.backendFramework))",message="When enabled, either checkpointRef or both identity.model and identity.backendFramework must be specified"
type ServiceCheckpointConfig struct {
	// Enabled indicates whether checkpointing is enabled for this service
	// +optional
	// +kubebuilder:default=false
	Enabled bool `json:"enabled,omitempty"`
	// Mode defines how checkpoint creation is handled
	// - Auto: DGD controller creates Checkpoint CR automatically
	// - Manual: User must create Checkpoint CR
	// +optional
	// +kubebuilder:default=Auto
	Mode CheckpointMode `json:"mode,omitempty"`
	// CheckpointRef references an existing Checkpoint CR to use
	// If specified, Identity is ignored and this checkpoint is used directly
	// +optional
	CheckpointRef *string `json:"checkpointRef,omitempty"`
	// Identity defines the checkpoint identity for hash computation
	// Used when Mode is Auto or when looking up existing checkpoints
	// Required when checkpointRef is not specified
	// +optional
	Identity *DynamoCheckpointIdentity `json:"identity,omitempty"`
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package v1alpha1
import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// DynamoCheckpointPhase represents the current phase of the checkpoint lifecycle.
// Only the four values declared below pass API validation.
// +kubebuilder:validation:Enum=Pending;Creating;Ready;Failed
type DynamoCheckpointPhase string

const (
	// DynamoCheckpointPhasePending indicates the checkpoint CR has been created but the Job has not started
	DynamoCheckpointPhasePending DynamoCheckpointPhase = "Pending"
	// DynamoCheckpointPhaseCreating indicates the checkpoint Job is running
	DynamoCheckpointPhaseCreating DynamoCheckpointPhase = "Creating"
	// DynamoCheckpointPhaseReady indicates the checkpoint is available in the
	// storage backend (see DynamoCheckpointStorageType)
	DynamoCheckpointPhaseReady DynamoCheckpointPhase = "Ready"
	// DynamoCheckpointPhaseFailed indicates the checkpoint creation failed
	DynamoCheckpointPhaseFailed DynamoCheckpointPhase = "Failed"
)

// DynamoCheckpointStorageType defines the supported storage backends for checkpoints.
// The accepted values are "pvc", "s3" and "oci".
// +kubebuilder:validation:Enum=pvc;s3;oci
type DynamoCheckpointStorageType string
// DynamoCheckpointIdentity defines the inputs that determine checkpoint equivalence
// Two checkpoints with the same identity hash are considered equivalent
//
// Model and BackendFramework are the only required fields; every other field
// is optional.
type DynamoCheckpointIdentity struct {
	// Model is the model identifier (e.g., "meta-llama/Llama-3-70B")
	// +kubebuilder:validation:Required
	Model string `json:"model"`
	// BackendFramework is the runtime framework (vllm, sglang, trtllm)
	// +kubebuilder:validation:Required
	// +kubebuilder:validation:Enum=vllm;sglang;trtllm
	BackendFramework string `json:"backendFramework"`
	// DynamoVersion is the Dynamo platform version (optional)
	// If not specified, version is not included in identity hash
	// This ensures checkpoint compatibility across Dynamo releases
	// +optional
	DynamoVersion string `json:"dynamoVersion,omitempty"`
	// TensorParallelSize is the tensor parallel configuration
	// +optional
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:default=1
	TensorParallelSize int32 `json:"tensorParallelSize,omitempty"`
	// PipelineParallelSize is the pipeline parallel configuration
	// +optional
	// +kubebuilder:validation:Minimum=1
	// +kubebuilder:default=1
	PipelineParallelSize int32 `json:"pipelineParallelSize,omitempty"`
	// Dtype is the data type (fp16, bf16, fp8, etc.)
	// +optional
	Dtype string `json:"dtype,omitempty"`
	// MaxModelLen is the maximum sequence length
	// +optional
	// +kubebuilder:validation:Minimum=1
	MaxModelLen int32 `json:"maxModelLen,omitempty"`
	// ExtraParameters are additional parameters that affect the checkpoint hash
	// Use for any framework-specific or custom parameters not covered above
	// +optional
	ExtraParameters map[string]string `json:"extraParameters,omitempty"`
}
// DynamoCheckpointJobConfig defines the configuration for the checkpoint creation Job
//
// Only PodTemplateSpec is required. The three optional fields are pointers
// and each has a default declared via its kubebuilder marker.
type DynamoCheckpointJobConfig struct {
	// PodTemplateSpec allows customizing the checkpoint Job pod
	// This should include the container that runs the workload to be checkpointed
	// +kubebuilder:validation:Required
	PodTemplateSpec corev1.PodTemplateSpec `json:"podTemplateSpec"`
	// ActiveDeadlineSeconds specifies the maximum time the Job can run
	// +optional
	// +kubebuilder:default=3600
	ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`
	// BackoffLimit specifies the number of retries before marking the Job failed
	// +optional
	// +kubebuilder:default=3
	BackoffLimit *int32 `json:"backoffLimit,omitempty"`
	// TTLSecondsAfterFinished specifies how long to keep the Job after completion
	// +optional
	// +kubebuilder:default=300
	TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"`
}
// DynamoCheckpointSpec defines the desired state of DynamoCheckpoint.
// Both fields are required.
type DynamoCheckpointSpec struct {
	// Identity defines the inputs that determine checkpoint equivalence
	// +kubebuilder:validation:Required
	Identity DynamoCheckpointIdentity `json:"identity"`
	// Job defines the configuration for the checkpoint creation Job
	// +kubebuilder:validation:Required
	Job DynamoCheckpointJobConfig `json:"job"`
}
// DynamoCheckpointConditionType defines the types of conditions for DynamoCheckpoint.
// NOTE(review): presumably used as the Type of entries in
// DynamoCheckpointStatus.Conditions; confirm against the controller.
type DynamoCheckpointConditionType string

const (
	// DynamoCheckpointConditionJobCreated indicates whether the checkpoint Job has been created
	DynamoCheckpointConditionJobCreated DynamoCheckpointConditionType = "JobCreated"
	// DynamoCheckpointConditionJobCompleted indicates whether the checkpoint Job has completed
	DynamoCheckpointConditionJobCompleted DynamoCheckpointConditionType = "JobCompleted"
	// DynamoCheckpointConditionTarAvailable indicates whether the checkpoint tar file exists
	DynamoCheckpointConditionTarAvailable DynamoCheckpointConditionType = "TarAvailable"
)
// DynamoCheckpointStatus defines the observed state of DynamoCheckpoint.
// Every field is optional and is omitted from the serialized object when empty.
type DynamoCheckpointStatus struct {
	// Phase represents the current phase of the checkpoint lifecycle
	// +optional
	Phase DynamoCheckpointPhase `json:"phase,omitempty"`
	// IdentityHash is the computed hash of the checkpoint identity
	// This hash is used to identify equivalent checkpoints
	// +optional
	IdentityHash string `json:"identityHash,omitempty"`
	// Location is the full URI/path to the checkpoint in the storage backend
	// For PVC: path of the tar file inside the volume (e.g., /checkpoints/{hash}.tar)
	// For S3: s3://bucket/prefix/{hash}.tar
	// For OCI: oci://registry/repo:{hash}
	// +optional
	Location string `json:"location,omitempty"`
	// StorageType indicates the storage backend type used for this checkpoint
	// +optional
	StorageType DynamoCheckpointStorageType `json:"storageType,omitempty"`
	// JobName is the name of the checkpoint creation Job
	// +optional
	JobName string `json:"jobName,omitempty"`
	// CreatedAt is the timestamp when the checkpoint tar was created
	// +optional
	CreatedAt *metav1.Time `json:"createdAt,omitempty"`
	// Message provides additional information about the current state
	// +optional
	Message string `json:"message,omitempty"`
	// Conditions represent the latest available observations of the checkpoint's state
	// +optional
	Conditions []metav1.Condition `json:"conditions,omitempty"`
}
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:shortName=dckpt
// +kubebuilder:printcolumn:name="Model",type="string",JSONPath=".spec.identity.model",description="Model identifier"
// +kubebuilder:printcolumn:name="Backend",type="string",JSONPath=".spec.identity.backendFramework",description="Backend framework"
// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase",description="Current phase of the checkpoint"
// +kubebuilder:printcolumn:name="Hash",type="string",JSONPath=".status.identityHash",description="Identity hash of the checkpoint"
// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
// +kubebuilder:validation:XValidation:rule="!has(oldSelf.spec.identity) || self.spec.identity == oldSelf.spec.identity",message="spec.identity is immutable after creation"
// DynamoCheckpoint is the Schema for the dynamocheckpoints API
// It represents a container checkpoint that can be used to restore pods to a warm state
// The validation rule above rejects any update that changes spec.identity once it has been set
type DynamoCheckpoint struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`
	// Spec is the desired state of the checkpoint
	Spec DynamoCheckpointSpec `json:"spec,omitempty"`
	// Status is the observed state of the checkpoint; it is exposed as a status subresource
	Status DynamoCheckpointStatus `json:"status,omitempty"`
}
// +kubebuilder:object:root=true
// DynamoCheckpointList contains a list of DynamoCheckpoint
type DynamoCheckpointList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata,omitempty"`
	// Items holds the DynamoCheckpoint objects in this list
	Items []DynamoCheckpoint `json:"items"`
}
// init registers the DynamoCheckpoint kinds with the package SchemeBuilder.
func init() {
	// Register the single-object kind and its list kind one after the other.
	SchemeBuilder.Register(&DynamoCheckpoint{})
	SchemeBuilder.Register(&DynamoCheckpointList{})
}
...@@ -130,6 +130,11 @@ type DynamoComponentDeploymentSharedSpec struct { ...@@ -130,6 +130,11 @@ type DynamoComponentDeploymentSharedSpec struct {
// Only applicable when ComponentType is "epp". // Only applicable when ComponentType is "epp".
// +optional // +optional
EPPConfig *EPPConfig `json:"eppConfig,omitempty"` EPPConfig *EPPConfig `json:"eppConfig,omitempty"`
// Checkpoint configures container checkpointing for this service.
// When enabled, pods can be restored from checkpoint files for faster cold start.
// +optional
Checkpoint *ServiceCheckpointConfig `json:"checkpoint,omitempty"`
} }
type MultinodeSpec struct { type MultinodeSpec struct {
......
...@@ -113,6 +113,23 @@ type DynamoGraphDeploymentStatus struct { ...@@ -113,6 +113,23 @@ type DynamoGraphDeploymentStatus struct {
// Restart contains the status of the restart of the graph deployment. // Restart contains the status of the restart of the graph deployment.
// +optional // +optional
Restart *RestartStatus `json:"restart,omitempty"` Restart *RestartStatus `json:"restart,omitempty"`
// Checkpoints contains per-service checkpoint status information.
// The map key is the service name from spec.services.
// +optional
Checkpoints map[string]ServiceCheckpointStatus `json:"checkpoints,omitempty"`
}
// ServiceCheckpointStatus contains checkpoint information for a single service.
type ServiceCheckpointStatus struct {
// CheckpointName is the name of the associated Checkpoint CR
// +optional
CheckpointName string `json:"checkpointName,omitempty"`
// IdentityHash is the computed hash of the checkpoint identity
// +optional
IdentityHash string `json:"identityHash,omitempty"`
// Ready indicates if the checkpoint is ready for use
// +optional
Ready bool `json:"ready,omitempty"`
} }
// RestartStatus contains the status of the restart of the graph deployment. // RestartStatus contains the status of the restart of the graph deployment.
......
...@@ -219,6 +219,161 @@ func (in *DeploymentTargetHPAConf) DeepCopy() *DeploymentTargetHPAConf { ...@@ -219,6 +219,161 @@ func (in *DeploymentTargetHPAConf) DeepCopy() *DeploymentTargetHPAConf {
return out return out
} }
// DeepCopyInto is an autogenerated deepcopy function. It writes a deep copy of the
// receiver into out; in must be non-nil.
func (in *DynamoCheckpoint) DeepCopyInto(out *DynamoCheckpoint) {
	// Begin with a shallow struct copy, then redo each member through its own
	// deep-copy routine so out does not share nested data with in.
	*out = *in
	out.TypeMeta = in.TypeMeta
	in.Status.DeepCopyInto(&out.Status)
	in.Spec.DeepCopyInto(&out.Spec)
	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
}
// DeepCopy is an autogenerated deepcopy function. It returns a newly allocated
// DynamoCheckpoint holding a deep copy of the receiver, or nil for a nil receiver.
func (in *DynamoCheckpoint) DeepCopy() *DynamoCheckpoint {
	if in != nil {
		clone := &DynamoCheckpoint{}
		in.DeepCopyInto(clone)
		return clone
	}
	return nil
}
// DeepCopyObject is an autogenerated deepcopy function. It returns a deep copy of
// the receiver as a runtime.Object.
func (in *DynamoCheckpoint) DeepCopyObject() runtime.Object {
	clone := in.DeepCopy()
	if clone == nil {
		// Return an untyped nil so the interface value itself is nil rather than
		// an interface wrapping a nil *DynamoCheckpoint.
		return nil
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function. It writes a deep copy of the
// receiver into out; in must be non-nil.
func (in *DynamoCheckpointIdentity) DeepCopyInto(out *DynamoCheckpointIdentity) {
	*out = *in
	if in.ExtraParameters == nil {
		// Nothing beyond the shallow copy is needed; the nil map is preserved.
		return
	}
	// Rebuild the map so the copy does not alias the source's ExtraParameters.
	params := make(map[string]string, len(in.ExtraParameters))
	for k, v := range in.ExtraParameters {
		params[k] = v
	}
	out.ExtraParameters = params
}
// DeepCopy is an autogenerated deepcopy function. It returns a newly allocated
// DynamoCheckpointIdentity holding a deep copy of the receiver, or nil for a nil receiver.
func (in *DynamoCheckpointIdentity) DeepCopy() *DynamoCheckpointIdentity {
	if in != nil {
		clone := &DynamoCheckpointIdentity{}
		in.DeepCopyInto(clone)
		return clone
	}
	return nil
}
// DeepCopyInto is an autogenerated deepcopy function. It writes a deep copy of the
// receiver into out; in must be non-nil.
func (in *DynamoCheckpointJobConfig) DeepCopyInto(out *DynamoCheckpointJobConfig) {
	*out = *in
	in.PodTemplateSpec.DeepCopyInto(&out.PodTemplateSpec)

	// Each optional pointer field gets its own freshly allocated value so the copy
	// never points at the source's storage; nil pointers stay nil.
	if src := in.ActiveDeadlineSeconds; src != nil {
		deadline := *src
		out.ActiveDeadlineSeconds = &deadline
	}
	if src := in.BackoffLimit; src != nil {
		limit := *src
		out.BackoffLimit = &limit
	}
	if src := in.TTLSecondsAfterFinished; src != nil {
		ttl := *src
		out.TTLSecondsAfterFinished = &ttl
	}
}
// DeepCopy is an autogenerated deepcopy function. It returns a newly allocated
// DynamoCheckpointJobConfig holding a deep copy of the receiver, or nil for a nil receiver.
func (in *DynamoCheckpointJobConfig) DeepCopy() *DynamoCheckpointJobConfig {
	if in != nil {
		clone := &DynamoCheckpointJobConfig{}
		in.DeepCopyInto(clone)
		return clone
	}
	return nil
}
// DeepCopyInto is an autogenerated deepcopy function. It writes a deep copy of the
// receiver into out; in must be non-nil.
func (in *DynamoCheckpointList) DeepCopyInto(out *DynamoCheckpointList) {
	*out = *in
	out.TypeMeta = in.TypeMeta
	in.ListMeta.DeepCopyInto(&out.ListMeta)
	if in.Items == nil {
		// A nil Items slice is preserved as nil by the shallow copy above.
		return
	}
	// Allocate a new backing array and deep-copy every element into it.
	items := make([]DynamoCheckpoint, len(in.Items))
	for idx := range in.Items {
		in.Items[idx].DeepCopyInto(&items[idx])
	}
	out.Items = items
}
// DeepCopy is an autogenerated deepcopy function. It returns a newly allocated
// DynamoCheckpointList holding a deep copy of the receiver, or nil for a nil receiver.
func (in *DynamoCheckpointList) DeepCopy() *DynamoCheckpointList {
	if in != nil {
		clone := &DynamoCheckpointList{}
		in.DeepCopyInto(clone)
		return clone
	}
	return nil
}
// DeepCopyObject is an autogenerated deepcopy function. It returns a deep copy of
// the receiver as a runtime.Object.
func (in *DynamoCheckpointList) DeepCopyObject() runtime.Object {
	clone := in.DeepCopy()
	if clone == nil {
		// Return an untyped nil so the interface value itself is nil rather than
		// an interface wrapping a nil *DynamoCheckpointList.
		return nil
	}
	return clone
}
// DeepCopyInto is an autogenerated deepcopy function. It writes a deep copy of the
// receiver into out; in must be non-nil.
func (in *DynamoCheckpointSpec) DeepCopyInto(out *DynamoCheckpointSpec) {
	// Shallow copy first, then delegate to the nested types' own deep-copy routines.
	*out = *in
	in.Job.DeepCopyInto(&out.Job)
	in.Identity.DeepCopyInto(&out.Identity)
}
// DeepCopy is an autogenerated deepcopy function. It returns a newly allocated
// DynamoCheckpointSpec holding a deep copy of the receiver, or nil for a nil receiver.
func (in *DynamoCheckpointSpec) DeepCopy() *DynamoCheckpointSpec {
	if in != nil {
		clone := &DynamoCheckpointSpec{}
		in.DeepCopyInto(clone)
		return clone
	}
	return nil
}
// DeepCopyInto is an autogenerated deepcopy function. It writes a deep copy of the
// receiver into out; in must be non-nil.
func (in *DynamoCheckpointStatus) DeepCopyInto(out *DynamoCheckpointStatus) {
	*out = *in
	if in.CreatedAt != nil {
		// Replace the shared pointer with an independent copy of the timestamp.
		out.CreatedAt = in.CreatedAt.DeepCopy()
	}
	if in.Conditions != nil {
		// Allocate a new backing array and deep-copy every condition into it.
		conds := make([]metav1.Condition, len(in.Conditions))
		for idx := range in.Conditions {
			in.Conditions[idx].DeepCopyInto(&conds[idx])
		}
		out.Conditions = conds
	}
}
// DeepCopy is an autogenerated deepcopy function. It returns a newly allocated
// DynamoCheckpointStatus holding a deep copy of the receiver, or nil for a nil receiver.
func (in *DynamoCheckpointStatus) DeepCopy() *DynamoCheckpointStatus {
	if in != nil {
		clone := &DynamoCheckpointStatus{}
		in.DeepCopyInto(clone)
		return clone
	}
	return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *DynamoComponentDeployment) DeepCopyInto(out *DynamoComponentDeployment) { func (in *DynamoComponentDeployment) DeepCopyInto(out *DynamoComponentDeployment) {
*out = *in *out = *in
...@@ -382,6 +537,11 @@ func (in *DynamoComponentDeploymentSharedSpec) DeepCopyInto(out *DynamoComponent ...@@ -382,6 +537,11 @@ func (in *DynamoComponentDeploymentSharedSpec) DeepCopyInto(out *DynamoComponent
*out = new(EPPConfig) *out = new(EPPConfig)
(*in).DeepCopyInto(*out) (*in).DeepCopyInto(*out)
} }
if in.Checkpoint != nil {
in, out := &in.Checkpoint, &out.Checkpoint
*out = new(ServiceCheckpointConfig)
(*in).DeepCopyInto(*out)
}
} }
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoComponentDeploymentSharedSpec. // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoComponentDeploymentSharedSpec.
...@@ -796,6 +956,13 @@ func (in *DynamoGraphDeploymentStatus) DeepCopyInto(out *DynamoGraphDeploymentSt ...@@ -796,6 +956,13 @@ func (in *DynamoGraphDeploymentStatus) DeepCopyInto(out *DynamoGraphDeploymentSt
*out = new(RestartStatus) *out = new(RestartStatus)
(*in).DeepCopyInto(*out) (*in).DeepCopyInto(*out)
} }
if in.Checkpoints != nil {
in, out := &in.Checkpoints, &out.Checkpoints
*out = make(map[string]ServiceCheckpointStatus, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
} }
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentStatus. // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentStatus.
...@@ -1327,6 +1494,46 @@ func (in *ScalingAdapter) DeepCopy() *ScalingAdapter { ...@@ -1327,6 +1494,46 @@ func (in *ScalingAdapter) DeepCopy() *ScalingAdapter {
return out return out
} }
// DeepCopyInto is an autogenerated deepcopy function. It writes a deep copy of the
// receiver into out; in must be non-nil.
func (in *ServiceCheckpointConfig) DeepCopyInto(out *ServiceCheckpointConfig) {
	*out = *in
	if src := in.CheckpointRef; src != nil {
		// Copy the referenced string into fresh storage owned by out.
		ref := *src
		out.CheckpointRef = &ref
	}
	if src := in.Identity; src != nil {
		// Allocate a separate identity and deep-copy into it (it holds a map).
		identity := new(DynamoCheckpointIdentity)
		src.DeepCopyInto(identity)
		out.Identity = identity
	}
}
// DeepCopy is an autogenerated deepcopy function. It returns a newly allocated
// ServiceCheckpointConfig holding a deep copy of the receiver, or nil for a nil receiver.
func (in *ServiceCheckpointConfig) DeepCopy() *ServiceCheckpointConfig {
	if in != nil {
		clone := &ServiceCheckpointConfig{}
		in.DeepCopyInto(clone)
		return clone
	}
	return nil
}
// DeepCopyInto is an autogenerated deepcopy function. It writes a copy of the
// receiver into out; in must be non-nil.
func (in *ServiceCheckpointStatus) DeepCopyInto(out *ServiceCheckpointStatus) {
	// The struct holds only two strings and a bool, so a plain struct
	// assignment already yields an independent copy.
	snapshot := *in
	*out = snapshot
}
// DeepCopy is an autogenerated deepcopy function. It returns a newly allocated
// ServiceCheckpointStatus holding a copy of the receiver, or nil for a nil receiver.
func (in *ServiceCheckpointStatus) DeepCopy() *ServiceCheckpointStatus {
	if in != nil {
		clone := &ServiceCheckpointStatus{}
		in.DeepCopyInto(clone)
		return clone
	}
	return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ServiceReplicaStatus) DeepCopyInto(out *ServiceReplicaStatus) { func (in *ServiceReplicaStatus) DeepCopyInto(out *ServiceReplicaStatus) {
*out = *in *out = *in
......
...@@ -155,6 +155,18 @@ func main() { ...@@ -155,6 +155,18 @@ func main() {
var operatorVersion string var operatorVersion string
var discoveryBackend string var discoveryBackend string
var enableWebhooks bool var enableWebhooks bool
// Checkpoint configuration
var checkpointEnabled bool
var checkpointStorageType string
var checkpointSignalHostPath string
var checkpointCRIUTimeout string
var checkpointPVCName string
var checkpointPVCBasePath string
var checkpointS3URI string
var checkpointS3CredentialsSecret string
var checkpointOCIURI string
var checkpointOCICredentialsSecret string
var checkpointInitContainerImage string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false, flag.BoolVar(&enableLeaderElection, "leader-elect", false,
...@@ -210,6 +222,29 @@ func main() { ...@@ -210,6 +222,29 @@ func main() {
"Version of the operator (used in lease holder identity)") "Version of the operator (used in lease holder identity)")
flag.StringVar(&discoveryBackend, "discovery-backend", "kubernetes", flag.StringVar(&discoveryBackend, "discovery-backend", "kubernetes",
"Discovery backend to use: 'kubernetes' (default, uses Kubernetes API) or 'etcd' (uses ETCD)") "Discovery backend to use: 'kubernetes' (default, uses Kubernetes API) or 'etcd' (uses ETCD)")
// Checkpoint flags
flag.BoolVar(&checkpointEnabled, "checkpoint-enabled", false,
"Enable checkpoint/restore functionality")
flag.StringVar(&checkpointStorageType, "checkpoint-storage-type", commonController.CheckpointStorageTypePVC,
"Checkpoint storage backend type: pvc, s3, or oci")
flag.StringVar(&checkpointSignalHostPath, "checkpoint-signal-host-path", "",
"Host path for signal files used for checkpoint job coordination")
flag.StringVar(&checkpointCRIUTimeout, "checkpoint-criu-timeout", "21600",
"CRIU timeout in seconds (required for CUDA checkpoints/restores, default: 21600 = 6 hours)")
flag.StringVar(&checkpointPVCName, "checkpoint-pvc-name", "checkpoint-storage",
"Name of the PVC for checkpoint storage (used when storage-type=pvc)")
flag.StringVar(&checkpointPVCBasePath, "checkpoint-pvc-base-path", "/checkpoints",
"Base path within the PVC for storing checkpoints (used when storage-type=pvc)")
flag.StringVar(&checkpointS3URI, "checkpoint-s3-uri", "",
"S3 URI for checkpoint storage: s3://[endpoint/]bucket/prefix (used when storage-type=s3)")
flag.StringVar(&checkpointS3CredentialsSecret, "checkpoint-s3-credentials-secret", "",
"Secret name containing AWS credentials (used when storage-type=s3)")
flag.StringVar(&checkpointOCIURI, "checkpoint-oci-uri", "",
"OCI URI for checkpoint storage: oci://registry/repository (used when storage-type=oci)")
flag.StringVar(&checkpointOCICredentialsSecret, "checkpoint-oci-credentials-secret", "",
"Docker config secret name for OCI registry auth (used when storage-type=oci)")
flag.StringVar(&checkpointInitContainerImage, "checkpoint-init-container-image", "busybox:latest",
"Image to use for checkpoint init containers (e.g., signal file cleanup)")
opts := zap.Options{ opts := zap.Options{
Development: true, Development: true,
} }
...@@ -279,6 +314,27 @@ func main() { ...@@ -279,6 +314,27 @@ func main() {
EPPClusterRoleName: eppClusterRoleName, EPPClusterRoleName: eppClusterRoleName,
}, },
DiscoveryBackend: discoveryBackend, DiscoveryBackend: discoveryBackend,
Checkpoint: commonController.CheckpointConfig{
Enabled: checkpointEnabled,
CRIUTimeout: checkpointCRIUTimeout,
InitContainerImage: checkpointInitContainerImage,
Storage: commonController.CheckpointStorageConfig{
Type: checkpointStorageType,
SignalHostPath: checkpointSignalHostPath,
PVC: commonController.CheckpointPVCConfig{
PVCName: checkpointPVCName,
BasePath: checkpointPVCBasePath,
},
S3: commonController.CheckpointS3Config{
URI: checkpointS3URI,
CredentialsSecretRef: checkpointS3CredentialsSecret,
},
OCI: commonController.CheckpointOCIConfig{
URI: checkpointOCIURI,
CredentialsSecretRef: checkpointOCICredentialsSecret,
},
},
},
} }
mainCtx := ctrl.SetupSignalHandler() mainCtx := ctrl.SetupSignalHandler()
...@@ -620,6 +676,15 @@ func main() { ...@@ -620,6 +676,15 @@ func main() {
os.Exit(1) os.Exit(1)
} }
if err = (&controller.CheckpointReconciler{
Client: mgr.GetClient(),
Config: ctrlConfig,
Recorder: mgr.GetEventRecorderFor("checkpoint"),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DynamoCheckpoint")
os.Exit(1)
}
// Set webhooks enabled flag in config // Set webhooks enabled flag in config
ctrlConfig.WebhooksEnabled = enableWebhooks ctrlConfig.WebhooksEnabled = enableWebhooks
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -683,6 +683,87 @@ spec: ...@@ -683,6 +683,87 @@ spec:
- vllm - vllm
- trtllm - trtllm
type: string type: string
checkpoint:
description: |-
Checkpoint configures container checkpointing for this service.
When enabled, pods can be restored from checkpoint files for faster cold start.
properties:
checkpointRef:
description: |-
CheckpointRef references an existing Checkpoint CR to use
If specified, Identity is ignored and this checkpoint is used directly
type: string
enabled:
default: false
description: Enabled indicates whether checkpointing is enabled for this service
type: boolean
identity:
description: |-
Identity defines the checkpoint identity for hash computation
Used when Mode is Auto or when looking up existing checkpoints
Required when checkpointRef is not specified
properties:
backendFramework:
description: BackendFramework is the runtime framework (vllm, sglang, trtllm)
enum:
- vllm
- sglang
- trtllm
type: string
dtype:
description: Dtype is the data type (fp16, bf16, fp8, etc.)
type: string
dynamoVersion:
description: |-
DynamoVersion is the Dynamo platform version (optional)
If not specified, version is not included in identity hash
This ensures checkpoint compatibility across Dynamo releases
type: string
extraParameters:
additionalProperties:
type: string
description: |-
ExtraParameters are additional parameters that affect the checkpoint hash
Use for any framework-specific or custom parameters not covered above
type: object
maxModelLen:
description: MaxModelLen is the maximum sequence length
format: int32
minimum: 1
type: integer
model:
description: Model is the model identifier (e.g., "meta-llama/Llama-3-70B")
type: string
pipelineParallelSize:
default: 1
description: PipelineParallelSize is the pipeline parallel configuration
format: int32
minimum: 1
type: integer
tensorParallelSize:
default: 1
description: TensorParallelSize is the tensor parallel configuration
format: int32
minimum: 1
type: integer
required:
- backendFramework
- model
type: object
mode:
default: Auto
description: |-
Mode defines how checkpoint creation is handled
- Auto: DGD controller creates Checkpoint CR automatically
- Manual: User must create Checkpoint CR
enum:
- Auto
- Manual
type: string
type: object
x-kubernetes-validations:
- message: When enabled, either checkpointRef or both identity.model and identity.backendFramework must be specified
rule: '!self.enabled || (has(self.checkpointRef) && size(self.checkpointRef) > 0) || (has(self.identity) && has(self.identity.model) && has(self.identity.backendFramework))'
componentType: componentType:
description: ComponentType indicates the role of this component (for example, "main"). description: ComponentType indicates the role of this component (for example, "main").
type: string type: string
......
...@@ -892,6 +892,87 @@ spec: ...@@ -892,6 +892,87 @@ spec:
description: 'Deprecated: This field is ignored.' description: 'Deprecated: This field is ignored.'
type: integer type: integer
type: object type: object
checkpoint:
description: |-
Checkpoint configures container checkpointing for this service.
When enabled, pods can be restored from checkpoint files for faster cold start.
properties:
checkpointRef:
description: |-
CheckpointRef references an existing Checkpoint CR to use
If specified, Identity is ignored and this checkpoint is used directly
type: string
enabled:
default: false
description: Enabled indicates whether checkpointing is enabled for this service
type: boolean
identity:
description: |-
Identity defines the checkpoint identity for hash computation
Used when Mode is Auto or when looking up existing checkpoints
Required when checkpointRef is not specified
properties:
backendFramework:
description: BackendFramework is the runtime framework (vllm, sglang, trtllm)
enum:
- vllm
- sglang
- trtllm
type: string
dtype:
description: Dtype is the data type (fp16, bf16, fp8, etc.)
type: string
dynamoVersion:
description: |-
DynamoVersion is the Dynamo platform version (optional)
If not specified, version is not included in identity hash
This ensures checkpoint compatibility across Dynamo releases
type: string
extraParameters:
additionalProperties:
type: string
description: |-
ExtraParameters are additional parameters that affect the checkpoint hash
Use for any framework-specific or custom parameters not covered above
type: object
maxModelLen:
description: MaxModelLen is the maximum sequence length
format: int32
minimum: 1
type: integer
model:
description: Model is the model identifier (e.g., "meta-llama/Llama-3-70B")
type: string
pipelineParallelSize:
default: 1
description: PipelineParallelSize is the pipeline parallel configuration
format: int32
minimum: 1
type: integer
tensorParallelSize:
default: 1
description: TensorParallelSize is the tensor parallel configuration
format: int32
minimum: 1
type: integer
required:
- backendFramework
- model
type: object
mode:
default: Auto
description: |-
Mode defines how checkpoint creation is handled
- Auto: DGD controller creates Checkpoint CR automatically
- Manual: User must create Checkpoint CR
enum:
- Auto
- Manual
type: string
type: object
x-kubernetes-validations:
- message: When enabled, either checkpointRef or both identity.model and identity.backendFramework must be specified
rule: '!self.enabled || (has(self.checkpointRef) && size(self.checkpointRef) > 0) || (has(self.identity) && has(self.identity.model) && has(self.identity.backendFramework))'
componentType: componentType:
description: ComponentType indicates the role of this component (for example, "main"). description: ComponentType indicates the role of this component (for example, "main").
type: string type: string
...@@ -11212,6 +11293,24 @@ spec: ...@@ -11212,6 +11293,24 @@ spec:
status: status:
description: Status reflects the current observed state of this graph deployment. description: Status reflects the current observed state of this graph deployment.
properties: properties:
checkpoints:
additionalProperties:
description: ServiceCheckpointStatus contains checkpoint information for a single service.
properties:
checkpointName:
description: CheckpointName is the name of the associated Checkpoint CR
type: string
identityHash:
description: IdentityHash is the computed hash of the checkpoint identity
type: string
ready:
description: Ready indicates if the checkpoint is ready for use
type: boolean
type: object
description: |-
Checkpoints contains per-service checkpoint status information.
The map key is the service name from spec.services.
type: object
conditions: conditions:
description: |- description: |-
Conditions contains the latest observed conditions of the graph deployment. Conditions contains the latest observed conditions of the graph deployment.
......
...@@ -20,6 +20,7 @@ resources: ...@@ -20,6 +20,7 @@ resources:
- bases/nvidia.com_dynamocomponentdeployments.yaml - bases/nvidia.com_dynamocomponentdeployments.yaml
- bases/nvidia.com_dynamographdeployments.yaml - bases/nvidia.com_dynamographdeployments.yaml
- bases/nvidia.com_dynamomodels.yaml - bases/nvidia.com_dynamomodels.yaml
- bases/nvidia.com_dynamocheckpoints.yaml
#+kubebuilder:scaffold:crdkustomizeresource #+kubebuilder:scaffold:crdkustomizeresource
patches: [] patches: []
......
...@@ -191,6 +191,7 @@ rules: ...@@ -191,6 +191,7 @@ rules:
- apiGroups: - apiGroups:
- nvidia.com - nvidia.com
resources: resources:
- dynamocheckpoints
- dynamocomponentdeployments - dynamocomponentdeployments
- dynamographdeploymentrequests - dynamographdeploymentrequests
- dynamographdeployments - dynamographdeployments
...@@ -207,6 +208,7 @@ rules: ...@@ -207,6 +208,7 @@ rules:
- apiGroups: - apiGroups:
- nvidia.com - nvidia.com
resources: resources:
- dynamocheckpoints/finalizers
- dynamocomponentdeployments/finalizers - dynamocomponentdeployments/finalizers
- dynamographdeploymentrequests/finalizers - dynamographdeploymentrequests/finalizers
- dynamographdeployments/finalizers - dynamographdeployments/finalizers
...@@ -216,6 +218,7 @@ rules: ...@@ -216,6 +218,7 @@ rules:
- apiGroups: - apiGroups:
- nvidia.com - nvidia.com
resources: resources:
- dynamocheckpoints/status
- dynamocomponentdeployments/status - dynamocomponentdeployments/status
- dynamographdeploymentrequests/status - dynamographdeploymentrequests/status
- dynamographdeployments/status - dynamographdeployments/status
......
...@@ -20,4 +20,5 @@ resources: ...@@ -20,4 +20,5 @@ resources:
- nvidia.com_v1alpha1_dynamographdeployment.yaml - nvidia.com_v1alpha1_dynamographdeployment.yaml
- nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml - nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
- nvidia.com_v1alpha1_dynamomodel.yaml - nvidia.com_v1alpha1_dynamomodel.yaml
- nvidia.com_v1alpha1_dynamocheckpoint.yaml
#+kubebuilder:scaffold:manifestskustomizesamples #+kubebuilder:scaffold:manifestskustomizesamples
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment