consts.go 8.83 KB
Newer Older
1
2
package consts

3
4
5
6
7
import (
	"time"

	"k8s.io/apimachinery/pkg/runtime/schema"
)
8

9
10
11
12
// Shared constants: ports, Kubernetes label/annotation keys, environment
// variable names, component types, and checkpoint/restore settings.
const (
	// Default user and organization identifiers.
	DefaultUserId = "default"
	DefaultOrgId  = "default"

	// Dynamo service port, plus the names given to the service and container ports.
	DynamoServicePort       = 8000
	DynamoServicePortName   = "http"
	DynamoContainerPortName = "http"

	// Planner metrics port and the metrics port name.
	DynamoPlannerMetricsPort = 9085
	DynamoMetricsPortName    = "metrics"

	// System port and its port name.
	DynamoSystemPort     = 9090
	DynamoSystemPortName = "system"

	// EPP (Endpoint Picker Plugin) ports
	EPPGRPCPort     = 9002
	EPPGRPCPortName = "grpc"

	// NIXL port and its port name.
	DynamoNixlPort     = 19090
	DynamoNixlPortName = "nixl"

	// NOTE(review): presumably the SSH port used for mpirun between pods — confirm against callers.
	MpiRunSshPort = 2222

	// Default security context values
	// These provide secure defaults for running containers as non-root
	// Users can override these via extraPodSpec.securityContext in their DynamoGraphDeployment
	DefaultSecurityContextFSGroup = 1000

	// Name of the environment variable carrying the Dynamo service port.
	EnvDynamoServicePort = "DYNAMO_PORT"

	KubeLabelDynamoSelector = "nvidia.com/selector"

	KubeAnnotationEnableGrove = "nvidia.com/enable-grove"

	// Annotation keys. KubeAnnotationDynamoDiscoveryBackend has the same key
	// string as KubeLabelDynamoDiscoveryBackend.
	KubeAnnotationDisableImagePullSecretDiscovery = "nvidia.com/disable-image-pull-secret-discovery"
	KubeAnnotationDynamoDiscoveryBackend          = "nvidia.com/dynamo-discovery-backend"

	// Dynamo label keys. KubeAnnotationDynamoBaseModel is declared in this group
	// and has the same key string as KubeLabelDynamoBaseModel.
	KubeLabelDynamoGraphDeploymentName  = "nvidia.com/dynamo-graph-deployment-name"
	KubeLabelDynamoComponent            = "nvidia.com/dynamo-component"
	KubeLabelDynamoNamespace            = "nvidia.com/dynamo-namespace"
	KubeLabelDynamoDeploymentTargetType = "nvidia.com/dynamo-deployment-target-type"
	KubeLabelDynamoComponentType        = "nvidia.com/dynamo-component-type"
	KubeLabelDynamoSubComponentType     = "nvidia.com/dynamo-sub-component-type"
	KubeLabelDynamoBaseModel            = "nvidia.com/dynamo-base-model"
	KubeLabelDynamoBaseModelHash        = "nvidia.com/dynamo-base-model-hash"
	KubeAnnotationDynamoBaseModel       = "nvidia.com/dynamo-base-model"
	KubeLabelDynamoDiscoveryBackend     = "nvidia.com/dynamo-discovery-backend"
	KubeLabelDynamoDiscoveryEnabled     = "nvidia.com/dynamo-discovery-enabled"

	// String forms of boolean label values.
	KubeLabelValueFalse = "false"
	KubeLabelValueTrue  = "true"

	KubeLabelDynamoComponentPod = "nvidia.com/dynamo-component-pod"

	// Resource name for NVIDIA GPUs.
	KubeResourceGPUNvidia = "nvidia.com/gpu"

	// Names of the DYN_* environment variables.
	DynamoDeploymentConfigEnvVar = "DYN_DEPLOYMENT_CONFIG"
	DynamoNamespaceEnvVar        = "DYN_NAMESPACE"
	DynamoComponentEnvVar        = "DYN_COMPONENT"
	DynamoDiscoveryBackendEnvVar = "DYN_DISCOVERY_BACKEND"

	GlobalDynamoNamespace = "dynamo"

	// Component type values, followed by the service account and cluster role
	// names for the planner and EPP components.
	ComponentTypePlanner      = "planner"
	ComponentTypeFrontend     = "frontend"
	ComponentTypeWorker       = "worker"
	ComponentTypePrefill      = "prefill"
	ComponentTypeDecode       = "decode"
	ComponentTypeEPP          = "epp"
	ComponentTypeDefault      = "default"
	PlannerServiceAccountName = "planner-serviceaccount"
	EPPServiceAccountName     = "epp-serviceaccount"
	EPPClusterRoleName        = "epp-cluster-role"

	DefaultIngressSuffix = "local"

	// DefaultGroveTerminationDelay is a time.Duration of 15 minutes.
	DefaultGroveTerminationDelay = 15 * time.Minute

	// Operator origin version: stamped on DGD at creation time by mutating webhook.
	// Records which operator version created the resource, enabling version-gated behavior changes.
	KubeAnnotationDynamoOperatorOriginVersion = "nvidia.com/dynamo-operator-origin-version"

	// Metrics related constants
	KubeAnnotationEnableMetrics = "nvidia.com/enable-metrics"  // User-provided annotation to control metrics
	KubeLabelMetricsEnabled     = "nvidia.com/metrics-enabled" // Controller-managed label for pod selection

	// Shared memory defaults: name, mount path, and size.
	KubeValueNameSharedMemory    = "shared-memory"
	DefaultSharedMemoryMountPath = "/dev/shm"
	DefaultSharedMemorySize      = "8Gi"

	// Compilation cache default mount points
	DefaultVLLMCacheMountPoint = "/root/.cache/vllm"

	// Kai-scheduler related constants
	KubeAnnotationKaiSchedulerQueue = "nvidia.com/kai-scheduler-queue" // User-provided annotation to specify queue name
	KubeLabelKaiSchedulerQueue      = "kai.scheduler/queue"            // Label injected into pods for kai-scheduler
	KaiSchedulerName                = "kai-scheduler"                  // Scheduler name for kai-scheduler
	DefaultKaiSchedulerQueue        = "dynamo"                         // Default queue name when none specified

	// Grove multinode role suffixes
	GroveRoleSuffixLeader = "ldr"
	GroveRoleSuffixWorker = "wkr"

	MainContainerName = "main"

	RestartAnnotation = "nvidia.com/restartAt"

	// Resource type constants - match Kubernetes Kind names
	// Used consistently across controllers, webhooks, and metrics
	ResourceTypeDynamoGraphDeployment               = "DynamoGraphDeployment"
	ResourceTypeDynamoComponentDeployment           = "DynamoComponentDeployment"
	ResourceTypeDynamoModel                         = "DynamoModel"
	ResourceTypeDynamoGraphDeploymentRequest        = "DynamoGraphDeploymentRequest"
	ResourceTypeDynamoGraphDeploymentScalingAdapter = "DynamoGraphDeploymentScalingAdapter"

	// Resource state constants - used in status reporting and metrics
	ResourceStateReady    = "ready"
	ResourceStateNotReady = "not_ready"
	ResourceStateUnknown  = "unknown"

	// Checkpoint/restore constants
	// CROSS-REFERENCE: Some constants below are duplicated in the chrek package at
	// deploy/chrek/pkg/config/constants.go. If you change a value here, update there too.

	// Kubernetes labels
	KubeLabelCheckpointSource = "nvidia.com/checkpoint-source" // Pod label that triggers DaemonSet auto-checkpoint
	KubeLabelCheckpointHash   = "nvidia.com/checkpoint-hash"   // Checkpoint identity hash for deduplication
	KubeLabelCheckpointName   = "nvidia.com/checkpoint-name"   // DynamoCheckpoint CR name reference

	// Environment variables injected into pods
	EnvCheckpointStorageType  = "DYN_CHECKPOINT_STORAGE_TYPE"   // Storage backend (pvc, s3, oci) — checkpoint job pods only
	EnvCheckpointLocation     = "DYN_CHECKPOINT_LOCATION"       // Full checkpoint URI — future S3/OCI; for PVC, use PATH+HASH instead
	EnvCheckpointPath         = "DYN_CHECKPOINT_PATH"           // Base checkpoint directory (e.g., /checkpoints) — PVC restored pods
	EnvCheckpointHash         = "DYN_CHECKPOINT_HASH"           // Identity hash — all checkpoint-related pods
	EnvCheckpointSignalFile   = "DYN_CHECKPOINT_SIGNAL_FILE"    // Signal file path — checkpoint job pods
	EnvReadyForCheckpointFile = "DYN_READY_FOR_CHECKPOINT_FILE" // Ready-for-checkpoint file path — checkpoint job pods
	EnvRestoreMarkerFile      = "DYN_RESTORE_MARKER_FILE"       // Restore marker path — injected into restore and checkpoint job pods
	EnvSkipWaitForCheckpoint  = "SKIP_WAIT_FOR_CHECKPOINT"      // Skip polling, check once — restored/DGD pods
	// Checkpoint pod-internal constants
	CheckpointVolumeName               = "checkpoint-storage"  // Pod-internal volume name for checkpoint PVC
	CheckpointSignalVolumeName         = "checkpoint-signal"   // Pod-internal volume name for signal hostPath
	CheckpointSignalMountPath          = "/checkpoint-signal"  // Mount path for signal volume inside pods
	SignalFileCleanupInitContainerName = "cleanup-signal-file" // Init container that removes stale signal files before job starts

	// SeccompProfilePath is the localhost seccomp profile that blocks io_uring syscalls.
	// Deployed to nodes by the chrek DaemonSet init container.
	SeccompProfilePath = "profiles/block-iouring.json"

	// Pod identity (Downward API)
	// After CRIU restore, env vars contain stale values from the checkpoint pod.
	// The Downward API files at /etc/podinfo always reflect the current pod.
	PodInfoVolumeName = "podinfo"
	PodInfoMountPath  = "/etc/podinfo"

	// Downward API field paths
	PodInfoFieldPodName      = "metadata.name"
	PodInfoFieldPodUID       = "metadata.uid"
	PodInfoFieldPodNamespace = "metadata.namespace"

	// Downward API file names for DGD annotations
	PodInfoFileDynNamespace        = "dyn_namespace"
	PodInfoFileDynComponent        = "dyn_component"
	PodInfoFileDynParentDGDName    = "dyn_parent_dgd_name"
	PodInfoFileDynParentDGDNS      = "dyn_parent_dgd_namespace"
	PodInfoFileDynDiscoveryBackend = "dyn_discovery_backend"

	// Annotation keys for DGD info (exposed via Downward API)
	AnnotationDynNamespace        = "nvidia.com/dyn-namespace"
	AnnotationDynComponent        = "nvidia.com/dyn-component"
	AnnotationDynParentDGDName    = "nvidia.com/dyn-parent-dgd-name"
	AnnotationDynParentDGDNS      = "nvidia.com/dyn-parent-dgd-namespace"
	AnnotationDynDiscoveryBackend = "nvidia.com/dyn-discovery-backend"
)

// MultinodeDeploymentType is a string identifier for a multinode deployment type.
type MultinodeDeploymentType string

// Known MultinodeDeploymentType values.
const (
	// MultinodeDeploymentTypeGrove is the "grove" multinode deployment type.
	MultinodeDeploymentTypeGrove MultinodeDeploymentType = "grove"
	// MultinodeDeploymentTypeLWS is the "lws" multinode deployment type.
	MultinodeDeploymentTypeLWS MultinodeDeploymentType = "lws"
)
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209

// GroupVersionResource identifiers for APIs defined outside this project.
var (
	// PodCliqueGVR is the Grove "podcliques" resource (grove.io/v1alpha1),
	// used for scaling operations.
	PodCliqueGVR = schema.GroupVersionResource{Group: "grove.io", Version: "v1alpha1", Resource: "podcliques"}

	// PodCliqueScalingGroupGVR is the Grove "podcliquescalinggroups" resource
	// (grove.io/v1alpha1), used for scaling operations.
	PodCliqueScalingGroupGVR = schema.GroupVersionResource{Group: "grove.io", Version: "v1alpha1", Resource: "podcliquescalinggroups"}

	// QueueGVR is the KAI-Scheduler "queues" resource (scheduling.run.ai/v2),
	// used for queue validation.
	QueueGVR = schema.GroupVersionResource{Group: "scheduling.run.ai", Version: "v2", Resource: "queues"}
)