Unverified commit 043c80c4, authored by mohammedabdulwahhab and committed by GitHub
Browse files

fix: make kubernetes backed discovery default (#5024)


Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
parent 4d0b1a11
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: dynamoworkermetadatas.nvidia.com
spec:
  group: nvidia.com
  names:
    kind: DynamoWorkerMetadata
    listKind: DynamoWorkerMetadataList
    plural: dynamoworkermetadatas
    singular: dynamoworkermetadata
    shortNames:
      - dwm
  scope: Namespaced
  versions:
    - name: v1alpha1
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          type: object
          description: DynamoWorkerMetadata stores discovery metadata for a Dynamo worker pod
          properties:
            apiVersion:
              type: string
              description: APIVersion defines the versioned schema of this representation
            kind:
              type: string
              description: Kind is a string value representing the REST resource
            metadata:
              type: object
            spec:
              type: object
              description: Spec contains the worker metadata
              required:
                - data
              properties:
                data:
                  type: object
                  description: Raw JSON blob containing DiscoveryMetadata
                  x-kubernetes-preserve-unknown-fields: true
      additionalPrinterColumns:
        - name: Age
          type: date
          jsonPath: .metadata.creationTimestamp
......@@ -104,11 +104,11 @@ Validation for configuration consistency
{{- end -}}
{{/*
Validation for discoverBackend configuration
Validation for discoveryBackend configuration
*/}}
{{- define "dynamo-operator.validateDiscoveryBackend" -}}
{{- $discoveryBackend := .Values.discoveryBackend -}}
{{- if and (ne $discoveryBackend "") (ne $discoveryBackend "kubernetes") -}}
{{- fail (printf "VALIDATION ERROR: discoveryBackend must be either an empty string (defaults to ETCD) or 'kubernetes'. Got: '%s'" $discoveryBackend) -}}
{{- if and (ne $discoveryBackend "kubernetes") (ne $discoveryBackend "etcd") -}}
{{- fail (printf "VALIDATION ERROR: discoveryBackend must be 'kubernetes' (default) or 'etcd'. Got: '%s'" $discoveryBackend) -}}
{{- end -}}
{{- end -}}
......@@ -42,8 +42,8 @@ dynamo-operator:
# Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
renewInterval: 10s
# -- The Dynamo discovery backend to use. By default, will rely on ETCD for discovery. Can be set to "kubernetes" to use Kubernetes API for service discovery. --
discoveryBackend: ""
# -- The Dynamo discovery backend to use. Default is "kubernetes" for Kubernetes API service discovery. Set to "etcd" to use ETCD for discovery. --
discoveryBackend: "kubernetes"
# Controller manager configuration
controllerManager:
......
......@@ -201,8 +201,8 @@ func main() {
"Interval for renewing namespace scope marker lease (namespace-restricted mode only)")
flag.StringVar(&operatorVersion, "operator-version", "unknown",
"Version of the operator (used in lease holder identity)")
flag.StringVar(&discoveryBackend, "discovery-backend", "",
"Discovery backend to use: empty string (default, uses ETCD) or 'kubernetes' (uses Kubernetes API)")
flag.StringVar(&discoveryBackend, "discovery-backend", "kubernetes",
"Discovery backend to use: 'kubernetes' (default, uses Kubernetes API) or 'etcd' (uses ETCD)")
opts := zap.Options{
Development: true,
}
......@@ -215,16 +215,12 @@ func main() {
os.Exit(1)
}
// Validate discoverBackend value
if discoveryBackend != "" && discoveryBackend != "kubernetes" {
setupLog.Error(nil, "invalid discover-backend value, must be empty string or 'kubernetes'", "value", discoveryBackend)
// Validate discoveryBackend value
if discoveryBackend != "kubernetes" && discoveryBackend != "etcd" {
setupLog.Error(nil, "invalid discovery-backend value, must be 'kubernetes' or 'etcd'", "value", discoveryBackend)
os.Exit(1)
}
if discoveryBackend != "" {
setupLog.Info("Discovery backend configured", "backend", discoveryBackend)
} else {
setupLog.Info("Discovery backend configured", "backend", "etcd (default)")
}
setupLog.Info("Discovery backend configured", "backend", discoveryBackend)
// Validate modelExpressURL if provided
if modelExpressURL != "" {
......
......@@ -828,6 +828,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Args: []string{"ray start --head --port=6379 && some dynamo command --tensor-parallel-size 4 --pipeline-parallel-size 1"},
Env: []corev1.EnvVar{
{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default"},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
......@@ -845,6 +846,11 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
FieldPath: "metadata.namespace",
},
}},
{Name: "POD_UID", ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
}},
{Name: "TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC", Value: "test_value_from_dynamo_component_deployment_spec"},
{Name: "TEST_ENV_FROM_EXTRA_POD_SPEC", Value: "test_value_from_extra_pod_spec"},
},
......@@ -957,6 +963,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Args: []string{"ray start --address=$LWS_LEADER_ADDRESS:6379 --block"},
Env: []corev1.EnvVar{
{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default"},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
......@@ -974,6 +981,11 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
FieldPath: "metadata.namespace",
},
}},
{Name: "POD_UID", ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
}},
{Name: "TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC", Value: "test_value_from_dynamo_component_deployment_spec"},
{Name: "TEST_ENV_FROM_EXTRA_POD_SPEC", Value: "test_value_from_extra_pod_spec"},
},
......
......@@ -77,7 +77,7 @@ type Config struct {
// ExcludedNamespaces is a thread-safe set of namespaces to exclude (cluster-wide mode only)
ExcludedNamespaces ExcludedNamespacesInterface
// DiscoveryBackend is the discovery backend to use. By default, will rely on ETCD for discovery. Can be set to "kubernetes" to use Kubernetes API for service discovery.
// DiscoveryBackend is the discovery backend to use. Default is "kubernetes" for Kubernetes API service discovery. Set to "etcd" to use ETCD for discovery.
DiscoveryBackend string
// WebhooksEnabled indicates whether admission webhooks are enabled
......
......@@ -17,6 +17,7 @@ const (
kindServiceAccount = "ServiceAccount"
apiGroupRBAC = "rbac.authorization.k8s.io"
apiGroupCore = ""
apiGroupNvidia = "nvidia.com"
)
func GetK8sDiscoveryServiceAccountName(dgdName string) string {
......@@ -62,6 +63,11 @@ func GetK8sDiscoveryRole(dgdName string, namespace string) *rbacv1.Role {
Resources: []string{"endpointslices"},
Verbs: []string{"get", "list", "watch"},
},
{
APIGroups: []string{apiGroupNvidia},
Resources: []string{"dynamoworkermetadatas"},
Verbs: []string{"create", "get", "list", "watch", "update", "patch", "delete"},
},
},
}
}
......
......@@ -104,12 +104,21 @@ func (b *BaseComponentDefaults) getCommonContainer(context ComponentContext) cor
},
},
},
{
Name: "POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
}
if context.DiscoveryBackend != "" {
// Set discovery backend env var to "kubernetes" unless explicitly set to "etcd"
if context.DiscoveryBackend != "etcd" {
container.Env = append(container.Env, corev1.EnvVar{
Name: commonconsts.DynamoDiscoveryBackendEnvVar,
Value: context.DiscoveryBackend,
Value: "kubernetes",
})
}
......
......@@ -64,6 +64,12 @@ func TestPlannerDefaults_GetBaseContainer(t *testing.T) {
FieldPath: "metadata.namespace",
},
}},
{Name: "POD_UID", ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
}},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "PLANNER_PROMETHEUS_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoPlannerMetricsPort)},
},
},
......
......@@ -433,10 +433,7 @@ func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.Dy
if isK8sDiscoveryEnabled {
service.Labels = map[string]string{
commonconsts.KubeLabelDynamoDiscoveryBackend: "kubernetes",
}
// Discovery is enabled for non frontend components
if component.ComponentType != commonconsts.ComponentTypeFrontend {
service.Labels[commonconsts.KubeLabelDynamoDiscoveryEnabled] = commonconsts.KubeLabelValueTrue
commonconsts.KubeLabelDynamoDiscoveryEnabled: commonconsts.KubeLabelValueTrue,
}
}
return service, nil
......
......@@ -1411,6 +1411,14 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
},
{
Name: "POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
{
Name: "ETCD_ENDPOINTS",
Value: "etcd-address",
......@@ -1423,6 +1431,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeFrontend,
},
{
Name: commonconsts.DynamoDiscoveryBackendEnvVar,
Value: "kubernetes",
},
{
Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment",
......@@ -1574,6 +1586,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypePlanner,
},
{
Name: commonconsts.DynamoDiscoveryBackendEnvVar,
Value: "kubernetes",
},
{
Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment",
......@@ -1610,6 +1626,14 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
},
{
Name: "POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -1963,6 +1987,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeWorker,
},
{
Name: commonconsts.DynamoDiscoveryBackendEnvVar,
Value: "kubernetes",
},
{
Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "true",
......@@ -1991,6 +2019,14 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
},
{
Name: "POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -2144,6 +2180,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeWorker,
},
{
Name: commonconsts.DynamoDiscoveryBackendEnvVar,
Value: "kubernetes",
},
{
Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "true",
......@@ -2172,6 +2212,14 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
},
{
Name: "POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -2303,6 +2351,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeFrontend,
},
{
Name: commonconsts.DynamoDiscoveryBackendEnvVar,
Value: "kubernetes",
},
{
Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment",
......@@ -2327,6 +2379,14 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
},
{
Name: "POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -2461,6 +2521,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypePlanner,
},
{
Name: commonconsts.DynamoDiscoveryBackendEnvVar,
Value: "kubernetes",
},
{
Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment",
......@@ -2489,6 +2553,14 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
},
{
Name: "POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -2872,6 +2944,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeWorker,
},
{
Name: commonconsts.DynamoDiscoveryBackendEnvVar,
Value: "kubernetes",
},
{
Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "true",
......@@ -2900,6 +2976,14 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
},
{
Name: "POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -3040,6 +3124,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeWorker,
},
{
Name: commonconsts.DynamoDiscoveryBackendEnvVar,
Value: "kubernetes",
},
{
Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "true",
......@@ -3068,6 +3156,14 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
},
{
Name: "POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -3199,6 +3295,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeFrontend,
},
{
Name: commonconsts.DynamoDiscoveryBackendEnvVar,
Value: "kubernetes",
},
{
Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment",
......@@ -3223,6 +3323,14 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
},
{
Name: "POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -3364,6 +3472,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypePlanner,
},
{
Name: commonconsts.DynamoDiscoveryBackendEnvVar,
Value: "kubernetes",
},
{
Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment",
......@@ -3392,6 +3504,14 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
},
{
Name: "POD_UID",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -4892,7 +5012,7 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
wantEnvVar string
}{
{
name: "Discover backend should be set",
name: "Kubernetes discovery backend should set env var to kubernetes",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{
commonconsts.KubeAnnotationDynamoDiscoveryBackend: "kubernetes",
......@@ -4901,29 +5021,39 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
wantEnvVar: "kubernetes",
},
{
name: "Discover backend should override the controller config",
name: "Kubernetes discovery from controller config should set env var to kubernetes",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{},
},
controllerConfig: controller_common.Config{
DiscoveryBackend: "kubernetes",
},
wantEnvVar: "kubernetes",
},
{
name: "Etcd discovery backend annotation should not set env var",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{
commonconsts.KubeAnnotationDynamoDiscoveryBackend: "test",
commonconsts.KubeAnnotationDynamoDiscoveryBackend: "etcd",
},
},
controllerConfig: controller_common.Config{
DiscoveryBackend: "etcd",
DiscoveryBackend: "kubernetes",
},
wantEnvVar: "test",
wantEnvVar: "", // etcd is the runtime default, no env var needed
},
{
name: "Discover backend should be set by the controller config",
name: "Etcd discovery from controller config should not set env var",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{},
},
controllerConfig: controller_common.Config{
DiscoveryBackend: "etcd",
},
wantEnvVar: "etcd",
wantEnvVar: "", // etcd is the runtime default, no env var needed
},
{
name: "Discover backend empty string",
name: "Empty discovery backend defaults to kubernetes",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{
commonconsts.KubeAnnotationDynamoDiscoveryBackend: "",
......@@ -4932,10 +5062,12 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
controllerConfig: controller_common.Config{
DiscoveryBackend: "",
},
wantEnvVar: "kubernetes", // empty defaults to kubernetes
},
{
name: "Discover backend not set",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{},
name: "Discovery backend not set defaults to kubernetes",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{},
wantEnvVar: "kubernetes", // not set defaults to kubernetes
},
}
secretsRetriever := &mockSecretsRetriever{}
......@@ -5005,6 +5137,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
{Name: "ANOTHER_COMPONENTENV", Value: "true"},
{Name: "ANOTHER_CONTAINER_ENV", Value: "true"},
{Name: commonconsts.DynamoComponentEnvVar, Value: "worker"},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
{Name: commonconsts.DynamoNamespaceEnvVar, Value: ""},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"},
......@@ -5022,6 +5155,11 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
FieldPath: "metadata.namespace",
},
}},
{Name: "POD_UID", ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.uid",
},
}},
},
VolumeMounts: []corev1.VolumeMount{
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment