fix: k8s native discovery (#4305)

334ce551 · Thomas Montfort · GitHub · 9a765d6d · 334ce551 · 334ce551
Unverified Commit 334ce551 authored Nov 14, 2025 by Thomas Montfort Committed by GitHub Nov 14, 2025
8 changed files
--- a/deploy/cloud/operator/internal/consts/consts.go
+++ b/deploy/cloud/operator/internal/consts/consts.go
@@ -43,6 +43,7 @@ const (
 	KubeLabelDynamoBaseModelHash        = "nvidia.com/dynamo-base-model-hash"
 	KubeAnnotationDynamoBaseModel       = "nvidia.com/dynamo-base-model"
 	KubeLabelDynamoDiscoveryBackend     = "nvidia.com/dynamo-discovery-backend"
+	KubeLabelDynamoDiscoveryEnabled     = "nvidia.com/dynamo-discovery-enabled"

 	KubeLabelValueFalse = "false"
 	KubeLabelValueTrue  = "true"

--- a/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go
+++ b/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go
@@ -1298,6 +1298,10 @@ func (r *DynamoComponentDeploymentReconciler) generateService(opt generateResour
 	if isK8sDiscovery {
 		labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = "kubernetes"
 	}
+	// Discovery is enabled for non frontend components
+	if isK8sDiscovery && !opt.dynamoComponentDeployment.IsFrontendComponent() {
+		labels[commonconsts.KubeLabelDynamoDiscoveryEnabled] = commonconsts.KubeLabelValueTrue
+	}

 	var servicePort corev1.ServicePort
 	if opt.dynamoComponentDeployment.IsFrontendComponent() {

--- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go
+++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go
@@ -368,8 +368,9 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co

 		// if k8s discovery is enabled, create a service for each component
 		// else, only create for the frontend component
-		if r.Config.IsK8sDiscoveryEnabled(dynamoDeployment.Annotations) || component.ComponentType == consts.ComponentTypeFrontend {
-			mainComponentService, err := dynamo.GenerateComponentService(ctx, dynamoDeployment, component, componentName)
+		isK8sDiscoveryEnabled := r.Config.IsK8sDiscoveryEnabled(dynamoDeployment.Annotations)
+		if isK8sDiscoveryEnabled || component.ComponentType == consts.ComponentTypeFrontend {
+			mainComponentService, err := dynamo.GenerateComponentService(ctx, dynamoDeployment, component, componentName, isK8sDiscoveryEnabled)
 			if err != nil {
 				logger.Error(err, "failed to generate the main component service")
 				return "", "", "", fmt.Errorf("failed to generate the main component service: %w", err)

--- a/deploy/cloud/operator/internal/dynamo/graph.go
+++ b/deploy/cloud/operator/internal/dynamo/graph.go
@@ -396,7 +396,7 @@ func getCliqueStartupDependencies(
 	return nil
 }

-func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.DynamoGraphDeployment, component *v1alpha1.DynamoComponentDeploymentSharedSpec, componentName string) (*corev1.Service, error) {
+func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.DynamoGraphDeployment, component *v1alpha1.DynamoComponentDeploymentSharedSpec, componentName string, isK8sDiscoveryEnabled bool) (*corev1.Service, error) {
 	if component.DynamoNamespace == nil {
 		return nil, fmt.Errorf("expected DynamoComponentDeployment %s to have a dynamoNamespace", componentName)
 	}
@@ -431,6 +431,15 @@ func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.Dy
 			Ports: []corev1.ServicePort{servicePort},
 		},
 	}
+	if isK8sDiscoveryEnabled {
+		service.Labels = map[string]string{
+			commonconsts.KubeLabelDynamoDiscoveryBackend: "kubernetes",
+		}
+		// Discovery is enabled for non frontend components
+		if component.ComponentType != commonconsts.ComponentTypeFrontend {
+			service.Labels[commonconsts.KubeLabelDynamoDiscoveryEnabled] = commonconsts.KubeLabelValueTrue
+		}
+	}
 	return service, nil
 }


--- a/deploy/discovery/README.md
+++ b/deploy/discovery/README.md
@@ -16,7 +16,7 @@ When deploying with the Dynamo operator, simply add the annotation to your DGD m
 ```yaml
 metadata:
  annotations:
-    nvidia.com/dynamo-discover-backend: kubernetes
+    nvidia.com/dynamo-discovery-backend: kubernetes
 ```

 The operator will automatically configure the required EndpointSlices, labels, and pod environment variables. See [`dgd.yaml`](./dgd.yaml) for a complete example.

--- a/deploy/discovery/dgd.yaml
+++ b/deploy/discovery/dgd.yaml
@@ -17,9 +17,6 @@ spec:
      dynamoNamespace: dynamo
      componentType: frontend
      replicas: 1
-      envs:
-        - name: DYN_SYSTEM_PORT
-          value: "9090"
      extraPodSpec:
        mainContainer:
          image: ${IMAGE}
@@ -30,29 +27,6 @@ spec:
      resources:
        limits:
          gpu: "1"
-      envs:
-        - name: DYN_SYSTEM_PORT
-          value: "9090"
-        - name: DYN_SYSTEM_STARTING_HEALTH_STATUS
-          value: "notready"
-        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
-          value: "[\"generate\", \"clear_kv_blocks\"]"
-      readinessProbe:
-        httpGet:
-          path: /health
-          port: 9090
-        initialDelaySeconds: 60
-        periodSeconds: 10
-        timeoutSeconds: 5
-        failureThreshold: 60
-      livenessProbe:
-        httpGet:
-          path: /live
-          port: 9090
-        initialDelaySeconds: 60
-        periodSeconds: 10
-        timeoutSeconds: 5
-        failureThreshold: 60
      extraPodSpec:
        terminationGracePeriodSeconds: 120
        mainContainer:
@@ -72,29 +46,6 @@ spec:
      resources:
        limits:
          gpu: "1"
-      envs:
-        - name: DYN_SYSTEM_PORT
-          value: "9090"
-        - name: DYN_SYSTEM_STARTING_HEALTH_STATUS
-          value: "notready"
-        - name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
-          value: "[\"generate\", \"clear_kv_blocks\"]"
-      readinessProbe:
-        httpGet:
-          path: /health
-          port: 9090
-        initialDelaySeconds: 60
-        periodSeconds: 10
-        timeoutSeconds: 5
-        failureThreshold: 60
-      livenessProbe:
-        httpGet:
-          path: /live
-          port: 9090
-        initialDelaySeconds: 60
-        periodSeconds: 10
-        timeoutSeconds: 5
-        failureThreshold: 60
      extraPodSpec:
        terminationGracePeriodSeconds: 120
        mainContainer:

--- a/lib/runtime/src/discovery/kube/daemon.rs
+++ b/lib/runtime/src/discovery/kube/daemon.rs
@@ -68,11 +68,12 @@ impl DiscoveryDaemon {
        let (reader, writer) = reflector::store();

        // Apply label selector to only watch discovery-enabled EndpointSlices
-        let watch_config =
-            Config::default().labels("nvidia.com/dynamo-discovery-backend=kubernetes");
+        let watch_config = Config::default()
+            .labels("nvidia.com/dynamo-discovery-backend=kubernetes")
+            .labels("nvidia.com/dynamo-discovery-enabled=true");

        tracing::info!(
-            "Daemon watching EndpointSlices with label: nvidia.com/dynamo-discovery-backend=kubernetes"
+            "Daemon watching EndpointSlices with labels: nvidia.com/dynamo-discovery-backend=kubernetes, nvidia.com/dynamo-discovery-enabled=true"
        );

        // Spawn reflector task (runs independently)
@@ -181,8 +182,8 @@ impl DiscoveryDaemon {
    ) -> Result<MetadataSnapshot> {
        let start = std::time::Instant::now();

-        // Extract ALL ready endpoints (instance_id, pod_name, pod_ip) directly from reflector
-        let all_endpoints: Vec<(u64, String, String)> = reader
+        // Extract ALL ready endpoints (instance_id, pod_name, pod_ip, system_port) directly from reflector
+        let all_endpoints: Vec<(u64, String, String, u16)> = reader
            .state()
            .iter()
            .flat_map(|arc_slice| extract_endpoint_info(arc_slice.as_ref()))
@@ -194,25 +195,26 @@ impl DiscoveryDaemon {
        );

        // Concurrent fetch: Fetch metadata for all endpoints in parallel
-        let fetch_futures = all_endpoints
-            .into_iter()
-            .map(|(instance_id, pod_name, pod_ip)| {
-                let daemon = self.clone();
-                async move {
-                    match daemon.fetch_metadata(&pod_name, &pod_ip).await {
-                        Ok(metadata) => Some((instance_id, metadata)),
-                        Err(e) => {
-                            tracing::warn!(
-                                "Failed to fetch metadata for pod {} (instance_id={:x}): {}",
-                                pod_name,
-                                instance_id,
-                                e
-                            );
-                            None
+        let fetch_futures =
+            all_endpoints
+                .into_iter()
+                .map(|(instance_id, pod_name, pod_ip, system_port)| {
+                    let daemon = self.clone();
+                    async move {
+                        match daemon.fetch_metadata(&pod_name, &pod_ip, system_port).await {
+                            Ok(metadata) => Some((instance_id, metadata)),
+                            Err(e) => {
+                                tracing::warn!(
+                                    "Failed to fetch metadata for pod {} (instance_id={:x}): {}",
+                                    pod_name,
+                                    instance_id,
+                                    e
+                                );
+                                None
+                            }
                        }
                    }
-                }
-            });
+                });

        // Execute fetches concurrently with bounded parallelism
        let results: Vec<_> = futures::stream::iter(fetch_futures)
@@ -241,7 +243,12 @@ impl DiscoveryDaemon {
    }

    /// Fetch metadata for a single pod (with caching)
-    async fn fetch_metadata(&self, pod_name: &str, pod_ip: &str) -> Result<Arc<DiscoveryMetadata>> {
+    async fn fetch_metadata(
+        &self,
+        pod_name: &str,
+        pod_ip: &str,
+        system_port: u16,
+    ) -> Result<Arc<DiscoveryMetadata>> {
        let instance_id = hash_pod_name(pod_name);

        // Check cache
@@ -258,7 +265,7 @@ impl DiscoveryDaemon {
        }

        // Cache miss: fetch from HTTP
-        let url = format!("http://{}:{}/metadata", pod_ip, self.pod_info.system_port);
+        let url = format!("http://{}:{}/metadata", pod_ip, system_port);

        tracing::debug!("Fetching metadata from {url}");


--- a/lib/runtime/src/discovery/kube/utils.rs
+++ b/lib/runtime/src/discovery/kube/utils.rs
@@ -13,11 +13,34 @@ pub fn hash_pod_name(pod_name: &str) -> u64 {
    hasher.finish()
 }

+/// Extract the system port from an EndpointSlice's ports
+/// Looks for a port with name "system", returns None if not found
+fn extract_system_port(slice: &EndpointSlice) -> Option<u16> {
+    slice.ports.as_ref().and_then(|ports| {
+        ports
+            .iter()
+            .find(|p| p.name.as_deref() == Some("system"))
+            .and_then(|p| p.port.map(|port| port as u16))
+    })
+}
+
 /// Extract endpoint information from an EndpointSlice
-/// Returns (instance_id, pod_name, pod_ip) tuples for ready endpoints
-pub(super) fn extract_endpoint_info(slice: &EndpointSlice) -> Vec<(u64, String, String)> {
-    let mut result = Vec::new();
+/// Returns (instance_id, pod_name, pod_ip, system_port) tuples for ready endpoints
+pub(super) fn extract_endpoint_info(slice: &EndpointSlice) -> Vec<(u64, String, String, u16)> {
+    // Extract system port from EndpointSlice ports
+    let system_port = match extract_system_port(slice) {
+        Some(port) => port,
+        None => {
+            let slice_name = slice.metadata.name.as_deref().unwrap_or("unknown");
+            tracing::warn!(
+                "EndpointSlice '{}' did not have a system port defined",
+                slice_name
+            );
+            return Vec::new();
+        }
+    };

+    let mut result = Vec::new();
    let endpoints = &slice.endpoints;

    for endpoint in endpoints {
@@ -46,7 +69,7 @@ pub(super) fn extract_endpoint_info(slice: &EndpointSlice) -> Vec<(u64, String,

        // Get first IP only (avoid duplicate instance IDs)
        if let Some(ip) = endpoint.addresses.first() {
-            result.push((instance_id, pod_name.to_string(), ip.clone()));
+            result.push((instance_id, pod_name.to_string(), ip.clone(), system_port));
        }
    }