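// Package consts defines shared constants (ports, Kubernetes label and
// annotation keys, environment variable names, and default values) together
// with GroupVersionResources for external APIs.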
package consts

3
4
5
6
7
import (
	"time"

	"k8s.io/apimachinery/pkg/runtime/schema"
)
8

9
10
11
12
const (
	// Default user and organization identifiers.
	DefaultUserId = "default"
	DefaultOrgId  = "default"

	// Main Dynamo service port, plus the names given to the service port
	// and the container port.
	DynamoServicePort       = 8000
	DynamoServicePortName   = "http"
	DynamoContainerPortName = "http"

	// Planner metrics port and the name of the metrics port.
	DynamoPlannerMetricsPort = 9085
	DynamoMetricsPortName    = "metrics"

	// System port and its name.
	DynamoSystemPort     = 9090
	DynamoSystemPortName = "system"

	// EPP (Endpoint Picker Plugin) ports
	EPPGRPCPort     = 9002
	EPPGRPCPortName = "grpc"

	// MpiRunSshPort is the SSH port for mpirun (purpose inferred from the name).
	MpiRunSshPort = 2222

	// Default security context values
	// These provide secure defaults for running containers as non-root
	// Users can override these via extraPodSpec.securityContext in their DynamoGraphDeployment
	DefaultSecurityContextFSGroup = 1000

	// EnvDynamoServicePort is the name of the environment variable for the
	// Dynamo service port.
	EnvDynamoServicePort = "DYNAMO_PORT"

	// KubeLabelDynamoSelector is the selector label key.
	KubeLabelDynamoSelector = "nvidia.com/selector"

	// KubeAnnotationEnableGrove is the annotation key for enabling Grove.
	KubeAnnotationEnableGrove = "nvidia.com/enable-grove"

	// Annotation keys for image pull secret discovery and the discovery backend.
	KubeAnnotationDisableImagePullSecretDiscovery = "nvidia.com/disable-image-pull-secret-discovery"
	KubeAnnotationDynamoDiscoveryBackend          = "nvidia.com/dynamo-discovery-backend"

	// Label keys describing a Dynamo deployment and its components.
	// Note that KubeAnnotationDynamoBaseModel carries the same key string as
	// KubeLabelDynamoBaseModel, and KubeLabelDynamoDiscoveryBackend the same
	// key string as KubeAnnotationDynamoDiscoveryBackend.
	KubeLabelDynamoGraphDeploymentName  = "nvidia.com/dynamo-graph-deployment-name"
	KubeLabelDynamoComponent            = "nvidia.com/dynamo-component"
	KubeLabelDynamoNamespace            = "nvidia.com/dynamo-namespace"
	KubeLabelDynamoDeploymentTargetType = "nvidia.com/dynamo-deployment-target-type"
	KubeLabelDynamoComponentType        = "nvidia.com/dynamo-component-type"
	KubeLabelDynamoSubComponentType     = "nvidia.com/dynamo-sub-component-type"
	KubeLabelDynamoBaseModel            = "nvidia.com/dynamo-base-model"
	KubeLabelDynamoBaseModelHash        = "nvidia.com/dynamo-base-model-hash"
	KubeAnnotationDynamoBaseModel       = "nvidia.com/dynamo-base-model"
	KubeLabelDynamoDiscoveryBackend     = "nvidia.com/dynamo-discovery-backend"
	KubeLabelDynamoDiscoveryEnabled     = "nvidia.com/dynamo-discovery-enabled"

	// String forms of boolean label values.
	KubeLabelValueFalse = "false"
	KubeLabelValueTrue  = "true"

	// KubeLabelDynamoComponentPod is the component-pod label key.
	KubeLabelDynamoComponentPod = "nvidia.com/dynamo-component-pod"

	// KubeResourceGPUNvidia is the NVIDIA GPU resource name.
	KubeResourceGPUNvidia = "nvidia.com/gpu"

	// Names of the DYN_* environment variables.
	DynamoDeploymentConfigEnvVar = "DYN_DEPLOYMENT_CONFIG"
	DynamoNamespaceEnvVar        = "DYN_NAMESPACE"
	DynamoComponentEnvVar        = "DYN_COMPONENT"
	DynamoDiscoveryBackendEnvVar = "DYN_DISCOVERY_BACKEND"

	// GlobalDynamoNamespace is the global Dynamo namespace name.
	GlobalDynamoNamespace = "dynamo"

	// Component type values.
	ComponentTypePlanner  = "planner"
	ComponentTypeFrontend = "frontend"
	ComponentTypeWorker   = "worker"
	ComponentTypePrefill  = "prefill"
	ComponentTypeDecode   = "decode"
	ComponentTypeEPP      = "epp"
	ComponentTypeDefault  = "default"

	// Service account and cluster role names for the planner and EPP.
	PlannerServiceAccountName = "planner-serviceaccount"
	EPPServiceAccountName     = "epp-serviceaccount"
	EPPClusterRoleName        = "epp-cluster-role"

	// DefaultIngressSuffix is the default ingress suffix.
	DefaultIngressSuffix = "local"

	// DefaultGroveTerminationDelay is the default Grove termination delay.
	DefaultGroveTerminationDelay = 15 * time.Minute

	// Metrics related constants
	KubeAnnotationEnableMetrics = "nvidia.com/enable-metrics"  // User-provided annotation to control metrics
	KubeLabelMetricsEnabled     = "nvidia.com/metrics-enabled" // Controller-managed label for pod selection

	// Shared memory name, default mount path, and default size.
	KubeValueNameSharedMemory    = "shared-memory"
	DefaultSharedMemoryMountPath = "/dev/shm"
	DefaultSharedMemorySize      = "8Gi"

	// Compilation cache default mount points
	DefaultVLLMCacheMountPoint = "/root/.cache/vllm"

	// Kai-scheduler related constants
	KubeAnnotationKaiSchedulerQueue = "nvidia.com/kai-scheduler-queue" // User-provided annotation to specify queue name
	KubeLabelKaiSchedulerQueue      = "kai.scheduler/queue"            // Label injected into pods for kai-scheduler
	KaiSchedulerName                = "kai-scheduler"                  // Scheduler name for kai-scheduler
	DefaultKaiSchedulerQueue        = "dynamo"                         // Default queue name when none specified

	// Grove multinode role suffixes
	GroveRoleSuffixLeader = "ldr"
	GroveRoleSuffixWorker = "wkr"

	// MainContainerName is the name of the main container.
	MainContainerName = "main"

	// RestartAnnotation is the restart annotation key.
	RestartAnnotation = "nvidia.com/restartAt"

	// Resource type constants - match Kubernetes Kind names
	// Used consistently across controllers, webhooks, and metrics
	ResourceTypeDynamoGraphDeployment               = "DynamoGraphDeployment"
	ResourceTypeDynamoComponentDeployment           = "DynamoComponentDeployment"
	ResourceTypeDynamoModel                         = "DynamoModel"
	ResourceTypeDynamoGraphDeploymentRequest        = "DynamoGraphDeploymentRequest"
	ResourceTypeDynamoGraphDeploymentScalingAdapter = "DynamoGraphDeploymentScalingAdapter"

	// Resource state constants - used in status reporting and metrics
	ResourceStateReady    = "ready"
	ResourceStateNotReady = "not_ready"
	ResourceStateUnknown  = "unknown"

	// Checkpoint related constants
	KubeLabelCheckpointSource = "nvidia.com/checkpoint-source"
	KubeLabelCheckpointHash   = "nvidia.com/checkpoint-hash"
	KubeLabelCheckpointName   = "nvidia.com/checkpoint-name"

	// EnvCheckpointStorageType indicates the storage backend type (pvc, s3, oci)
	EnvCheckpointStorageType = "DYN_CHECKPOINT_STORAGE_TYPE"
	// EnvCheckpointLocation is the source location of the checkpoint
	// For PVC: same as path (e.g., /checkpoints/{hash}.tar)
	// For S3: s3://bucket/prefix/{hash}.tar
	// For OCI: oci://registry/repo:{hash}
	EnvCheckpointLocation = "DYN_CHECKPOINT_LOCATION"
	// EnvCheckpointPath is the local path to the checkpoint tar file
	// For PVC: same as location
	// For S3/OCI: download destination (e.g., /tmp/{hash}.tar)
	EnvCheckpointPath = "DYN_CHECKPOINT_PATH"
	// EnvCheckpointHash is the identity hash (for debugging/observability)
	EnvCheckpointHash = "DYN_CHECKPOINT_HASH"
	// EnvCheckpointSignalFile is the full path to the signal file
	// The DaemonSet writes this file after checkpoint is complete
	// The checkpoint job pod waits for this file, then exits successfully
	EnvCheckpointSignalFile = "DYN_CHECKPOINT_SIGNAL_FILE"

	// EnvCheckpointReadyFile is the full path to a file the worker creates
	// when the model is loaded and ready for checkpointing.
	// The readiness probe watches this file to trigger DaemonSet checkpoint.
	EnvCheckpointReadyFile = "DYN_CHECKPOINT_READY_FILE"

	// CRIU-related environment variables for restore operations
	// EnvRestoreMarkerFile is the file created by CRIU after successful restore
	EnvRestoreMarkerFile = "DYN_RESTORE_MARKER_FILE"
	// EnvCRIUWorkDir is the working directory for CRIU operations
	EnvCRIUWorkDir = "CRIU_WORK_DIR"
	// EnvCRIULogDir is the directory where CRIU writes logs
	EnvCRIULogDir = "CRIU_LOG_DIR"
	// EnvCUDAPluginDir is the directory containing CRIU CUDA plugins
	EnvCUDAPluginDir = "CUDA_PLUGIN_DIR"
	// EnvCRIUTimeout is the timeout for CRIU operations
	EnvCRIUTimeout = "CRIU_TIMEOUT"

	// CheckpointReadyFilePath is the default path for the ready file
	CheckpointReadyFilePath = "/tmp/checkpoint-ready"
	// RestoreMarkerFilePath is the default path for the restore marker
	RestoreMarkerFilePath = "/tmp/dynamo-restored"
	// CRIUWorkDirPath is the default CRIU work directory
	CRIUWorkDirPath = "/var/criu-work"
	// CRIULogDirPath is the default CRIU log directory
	CRIULogDirPath = "/checkpoints/restore-logs"
	// CUDAPluginDirPath is the default CUDA plugin directory
	CUDAPluginDirPath = "/usr/local/lib/criu"
	// DefaultCRIUTimeout is the default CRIU timeout in seconds (6 hours)
	DefaultCRIUTimeout = "21600"

	// Checkpoint volume names and paths.
	CheckpointVolumeName       = "checkpoint-storage"
	CheckpointSignalVolumeName = "checkpoint-signal"
	CheckpointBasePath         = "/checkpoints"
	CheckpointSignalHostPath   = "/var/lib/dynamo-checkpoint/signals"
	CheckpointSignalMountPath  = "/checkpoint-signal"

	// PodInfo volume for Downward API (critical for CRIU restore)
	// After CRIU restore, environment variables contain stale values from checkpoint pod.
	// The Downward API files at /etc/podinfo always have current pod identity.
	PodInfoVolumeName = "podinfo"
	PodInfoMountPath  = "/etc/podinfo"

	// Downward API field paths
	PodInfoFieldPodName      = "metadata.name"
	PodInfoFieldPodUID       = "metadata.uid"
	PodInfoFieldPodNamespace = "metadata.namespace"

	// Downward API file names for DGD annotations
	PodInfoFileDynNamespace        = "dyn_namespace"
	PodInfoFileDynComponent        = "dyn_component"
	PodInfoFileDynParentDGDName    = "dyn_parent_dgd_name"
	PodInfoFileDynParentDGDNS      = "dyn_parent_dgd_namespace"
	PodInfoFileDynDiscoveryBackend = "dyn_discovery_backend"

	// Annotation keys for DGD info (exposed via Downward API)
	AnnotationDynNamespace        = "nvidia.com/dyn-namespace"
	AnnotationDynComponent        = "nvidia.com/dyn-component"
	AnnotationDynParentDGDName    = "nvidia.com/dyn-parent-dgd-name"
	AnnotationDynParentDGDNS      = "nvidia.com/dyn-parent-dgd-namespace"
	AnnotationDynDiscoveryBackend = "nvidia.com/dyn-discovery-backend"
)

// MultinodeDeploymentType is a string-backed type naming a multinode
// deployment kind.
type MultinodeDeploymentType string

// Known MultinodeDeploymentType values.
const (
	MultinodeDeploymentTypeGrove MultinodeDeploymentType = "grove"
	MultinodeDeploymentTypeLWS   MultinodeDeploymentType = "lws"
)
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232

// GroupVersionResources identifying resources that belong to external APIs.
var (
	// PodCliqueGVR is the Grove "podcliques" resource, used for scaling operations.
	PodCliqueGVR = schema.GroupVersionResource{Group: "grove.io", Version: "v1alpha1", Resource: "podcliques"}

	// PodCliqueScalingGroupGVR is the Grove "podcliquescalinggroups" resource,
	// used for scaling operations.
	PodCliqueScalingGroupGVR = schema.GroupVersionResource{Group: "grove.io", Version: "v1alpha1", Resource: "podcliquescalinggroups"}

	// QueueGVR is the KAI-Scheduler "queues" resource, used for queue validation.
	QueueGVR = schema.GroupVersionResource{Group: "scheduling.run.ai", Version: "v2", Resource: "queues"}
)