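// Package consts declares Dynamo-related constants: service ports, Kubernetes
// label and annotation keys, environment variable names, default values, and
// GroupVersionResources for external APIs.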
package consts

3
4
5
6
7
import (
	"time"

	"k8s.io/apimachinery/pkg/runtime/schema"
)
8

9
10
11
12
// Constants for Dynamo resources: ports, Kubernetes label/annotation keys,
// environment variable names, component types, and assorted defaults.
const (
	// Default user and organization identifiers.
	DefaultUserId = "default"
	DefaultOrgId  = "default"

	// Dynamo service port and the port names used with it.
	DynamoServicePort       = 8000
	DynamoServicePortName   = "http"
	DynamoContainerPortName = "http"

	// Planner metrics port and the metrics port name.
	DynamoPlannerMetricsPort = 9085
	DynamoMetricsPortName    = "metrics"

	// System port and its port name.
	DynamoSystemPort     = 9090
	DynamoSystemPortName = "system"

	// EPP (Endpoint Picker Plugin) ports
	EPPGRPCPort     = 9002
	EPPGRPCPortName = "grpc"

	// NOTE(review): presumably the SSH port used by mpirun between nodes — confirm against callers.
	MpiRunSshPort = 2222

	// Default security context values
	// These provide secure defaults for running containers as non-root
	// Users can override these via extraPodSpec.securityContext in their DynamoGraphDeployment
	DefaultSecurityContextFSGroup = 1000

	// Environment variable name for the Dynamo service port.
	EnvDynamoServicePort = "DYNAMO_PORT"

	KubeLabelDynamoSelector = "nvidia.com/selector"

	KubeAnnotationEnableGrove = "nvidia.com/enable-grove"

	KubeAnnotationDisableImagePullSecretDiscovery = "nvidia.com/disable-image-pull-secret-discovery"
	KubeAnnotationDynamoDiscoveryBackend          = "nvidia.com/dynamo-discovery-backend"

	// Label and annotation keys under the nvidia.com/ prefix.
	// Note that some label/annotation pairs share the same key string:
	// KubeLabelDynamoBaseModel and KubeAnnotationDynamoBaseModel, and
	// KubeLabelDynamoDiscoveryBackend and KubeAnnotationDynamoDiscoveryBackend.
	KubeLabelDynamoGraphDeploymentName  = "nvidia.com/dynamo-graph-deployment-name"
	KubeLabelDynamoComponent            = "nvidia.com/dynamo-component"
	KubeLabelDynamoNamespace            = "nvidia.com/dynamo-namespace"
	KubeLabelDynamoDeploymentTargetType = "nvidia.com/dynamo-deployment-target-type"
	KubeLabelDynamoComponentType        = "nvidia.com/dynamo-component-type"
	KubeLabelDynamoSubComponentType     = "nvidia.com/dynamo-sub-component-type"
	KubeLabelDynamoBaseModel            = "nvidia.com/dynamo-base-model"
	KubeLabelDynamoBaseModelHash        = "nvidia.com/dynamo-base-model-hash"
	KubeAnnotationDynamoBaseModel       = "nvidia.com/dynamo-base-model"
	KubeLabelDynamoDiscoveryBackend     = "nvidia.com/dynamo-discovery-backend"
	KubeLabelDynamoDiscoveryEnabled     = "nvidia.com/dynamo-discovery-enabled"

	// String forms of boolean label values.
	KubeLabelValueFalse = "false"
	KubeLabelValueTrue  = "true"

	KubeLabelDynamoComponentPod = "nvidia.com/dynamo-component-pod"

	// Resource name for NVIDIA GPUs.
	KubeResourceGPUNvidia = "nvidia.com/gpu"

	// DYN_* environment variable names.
	DynamoDeploymentConfigEnvVar = "DYN_DEPLOYMENT_CONFIG"
	DynamoNamespaceEnvVar        = "DYN_NAMESPACE"
	DynamoComponentEnvVar        = "DYN_COMPONENT"
	DynamoDiscoveryBackendEnvVar = "DYN_DISCOVERY_BACKEND"

	GlobalDynamoNamespace = "dynamo"

	// Component type values, followed by service account and cluster role names.
	ComponentTypePlanner      = "planner"
	ComponentTypeFrontend     = "frontend"
	ComponentTypeWorker       = "worker"
	ComponentTypePrefill      = "prefill"
	ComponentTypeDecode       = "decode"
	ComponentTypeEPP          = "epp"
	ComponentTypeDefault      = "default"
	PlannerServiceAccountName = "planner-serviceaccount"
	EPPServiceAccountName     = "epp-serviceaccount"
	EPPClusterRoleName        = "epp-cluster-role"

	DefaultIngressSuffix = "local"

	// Default Grove termination delay (a time.Duration of 15 minutes).
	DefaultGroveTerminationDelay = 15 * time.Minute

	// Operator origin version: stamped on DGD at creation time by mutating webhook.
	// Records which operator version created the resource, enabling version-gated behavior changes.
	KubeAnnotationDynamoOperatorOriginVersion = "nvidia.com/dynamo-operator-origin-version"

	// Metrics related constants
	KubeAnnotationEnableMetrics = "nvidia.com/enable-metrics"  // User-provided annotation to control metrics
	KubeLabelMetricsEnabled     = "nvidia.com/metrics-enabled" // Controller-managed label for pod selection

	// Shared memory volume name and defaults.
	KubeValueNameSharedMemory    = "shared-memory"
	DefaultSharedMemoryMountPath = "/dev/shm"
	DefaultSharedMemorySize      = "8Gi"

	// Compilation cache default mount points
	DefaultVLLMCacheMountPoint = "/root/.cache/vllm"

	// Kai-scheduler related constants
	KubeAnnotationKaiSchedulerQueue = "nvidia.com/kai-scheduler-queue" // User-provided annotation to specify queue name
	KubeLabelKaiSchedulerQueue      = "kai.scheduler/queue"            // Label injected into pods for kai-scheduler
	KaiSchedulerName                = "kai-scheduler"                  // Scheduler name for kai-scheduler
	DefaultKaiSchedulerQueue        = "dynamo"                         // Default queue name when none specified

	// Grove multinode role suffixes
	GroveRoleSuffixLeader = "ldr"
	GroveRoleSuffixWorker = "wkr"

	// Name of the main container.
	MainContainerName = "main"

	RestartAnnotation = "nvidia.com/restartAt"

	// Resource type constants - match Kubernetes Kind names
	// Used consistently across controllers, webhooks, and metrics
	ResourceTypeDynamoGraphDeployment               = "DynamoGraphDeployment"
	ResourceTypeDynamoComponentDeployment           = "DynamoComponentDeployment"
	ResourceTypeDynamoModel                         = "DynamoModel"
	ResourceTypeDynamoGraphDeploymentRequest        = "DynamoGraphDeploymentRequest"
	ResourceTypeDynamoGraphDeploymentScalingAdapter = "DynamoGraphDeploymentScalingAdapter"

	// Resource state constants - used in status reporting and metrics
	ResourceStateReady    = "ready"
	ResourceStateNotReady = "not_ready"
	ResourceStateUnknown  = "unknown"

	// Checkpoint/restore constants
	// CROSS-REFERENCE: Some constants below are duplicated in the chrek package at
	// deploy/chrek/pkg/config/constants.go. If you change a value here, update there too.

	// Kubernetes labels
	KubeLabelCheckpointSource = "nvidia.com/checkpoint-source" // Pod label that triggers DaemonSet auto-checkpoint
	KubeLabelCheckpointHash   = "nvidia.com/checkpoint-hash"   // Checkpoint identity hash for deduplication
	KubeLabelCheckpointName   = "nvidia.com/checkpoint-name"   // DynamoCheckpoint CR name reference

	// Environment variables injected into pods
	EnvCheckpointStorageType  = "DYN_CHECKPOINT_STORAGE_TYPE"   // Storage backend (pvc, s3, oci) — checkpoint job pods only
	EnvCheckpointLocation     = "DYN_CHECKPOINT_LOCATION"       // Full checkpoint URI — future S3/OCI; for PVC, use PATH+HASH instead
	EnvCheckpointPath         = "DYN_CHECKPOINT_PATH"           // Base checkpoint directory (e.g., /checkpoints) — PVC restored pods
	EnvCheckpointHash         = "DYN_CHECKPOINT_HASH"           // Identity hash — all checkpoint-related pods
	EnvCheckpointSignalFile   = "DYN_CHECKPOINT_SIGNAL_FILE"    // Signal file path — checkpoint job pods
	EnvReadyForCheckpointFile = "DYN_READY_FOR_CHECKPOINT_FILE" // Ready-for-checkpoint file path — checkpoint job pods
	EnvRestoreMarkerFile      = "DYN_RESTORE_MARKER_FILE"       // Restore marker path — injected into restore and checkpoint job pods
	EnvSkipWaitForCheckpoint  = "SKIP_WAIT_FOR_CHECKPOINT"      // Skip polling, check once — restored/DGD pods

	// Checkpoint pod-internal constants
	CheckpointVolumeName               = "checkpoint-storage"  // Pod-internal volume name for checkpoint PVC
	CheckpointSignalVolumeName         = "checkpoint-signal"   // Pod-internal volume name for signal hostPath
	CheckpointSignalMountPath          = "/checkpoint-signal"  // Mount path for signal volume inside pods
	SignalFileCleanupInitContainerName = "cleanup-signal-file" // Init container that removes stale signal files before job starts

	// SeccompProfilePath is the localhost seccomp profile that blocks io_uring syscalls.
	// Deployed to nodes by the chrek DaemonSet init container.
	SeccompProfilePath = "profiles/block-iouring.json"

	// Pod identity (Downward API)
	// After CRIU restore, env vars contain stale values from the checkpoint pod.
	// The Downward API files at /etc/podinfo always reflect the current pod.
	PodInfoVolumeName = "podinfo"
	PodInfoMountPath  = "/etc/podinfo"

	// Downward API field paths
	PodInfoFieldPodName      = "metadata.name"
	PodInfoFieldPodUID       = "metadata.uid"
	PodInfoFieldPodNamespace = "metadata.namespace"

	// Downward API file names for DGD annotations
	PodInfoFileDynNamespace        = "dyn_namespace"
	PodInfoFileDynComponent        = "dyn_component"
	PodInfoFileDynParentDGDName    = "dyn_parent_dgd_name"
	PodInfoFileDynParentDGDNS      = "dyn_parent_dgd_namespace"
	PodInfoFileDynDiscoveryBackend = "dyn_discovery_backend"

	// Annotation keys for DGD info (exposed via Downward API)
	AnnotationDynNamespace        = "nvidia.com/dyn-namespace"
	AnnotationDynComponent        = "nvidia.com/dyn-component"
	AnnotationDynParentDGDName    = "nvidia.com/dyn-parent-dgd-name"
	AnnotationDynParentDGDNS      = "nvidia.com/dyn-parent-dgd-namespace"
	AnnotationDynDiscoveryBackend = "nvidia.com/dyn-discovery-backend"
)

// MultinodeDeploymentType is a string identifier for a multinode deployment type.
type MultinodeDeploymentType string

// Known MultinodeDeploymentType values.
const (
	MultinodeDeploymentTypeGrove MultinodeDeploymentType = "grove"
	MultinodeDeploymentTypeLWS   MultinodeDeploymentType = "lws"
)
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206

// GroupVersionResources for APIs defined outside this project.

// PodCliqueGVR is the Grove "podcliques" resource (grove.io/v1alpha1),
// used for scaling operations.
var PodCliqueGVR = schema.GroupVersionResource{Group: "grove.io", Version: "v1alpha1", Resource: "podcliques"}

// PodCliqueScalingGroupGVR is the Grove "podcliquescalinggroups" resource
// (grove.io/v1alpha1), used for scaling operations.
var PodCliqueScalingGroupGVR = schema.GroupVersionResource{Group: "grove.io", Version: "v1alpha1", Resource: "podcliquescalinggroups"}

// QueueGVR is the KAI-Scheduler "queues" resource (scheduling.run.ai/v2),
// used for queue validation.
var QueueGVR = schema.GroupVersionResource{Group: "scheduling.run.ai", Version: "v2", Resource: "queues"}