Unverified commit dad4237d, authored by Julien Mancuso, committed by GitHub
Browse files

chore: remove DCD debug features and consolidate k8s service generation (#6397)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent 0688d584
...@@ -115,8 +115,6 @@ The chart includes built-in validation to prevent all operator conflicts: ...@@ -115,8 +115,6 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.controllerManager.manager.args[1] | string | `"--metrics-bind-address=127.0.0.1:8080"` | Metrics endpoint for Prometheus scraping (localhost only for security) | | dynamo-operator.controllerManager.manager.args[1] | string | `"--metrics-bind-address=127.0.0.1:8080"` | Metrics endpoint for Prometheus scraping (localhost only for security) |
| dynamo-operator.imagePullSecrets | list | `[]` | Secrets for pulling private container images | | dynamo-operator.imagePullSecrets | list | `[]` | Secrets for pulling private container images |
| dynamo-operator.dynamo.groveTerminationDelay | string | `"4h"` | How long to wait before forcefully terminating Grove instances | | dynamo-operator.dynamo.groveTerminationDelay | string | `"4h"` | How long to wait before forcefully terminating Grove instances |
| dynamo-operator.dynamo.internalImages.debugger | string | `"python:3.12-slim"` | Debugger image for troubleshooting deployments |
| dynamo-operator.dynamo.enableRestrictedSecurityContext | bool | `false` | Whether to enable restricted security contexts for enhanced security |
| dynamo-operator.dynamo.dockerRegistry.useKubernetesSecret | bool | `false` | Whether to use Kubernetes secrets for registry authentication | | dynamo-operator.dynamo.dockerRegistry.useKubernetesSecret | bool | `false` | Whether to use Kubernetes secrets for registry authentication |
| dynamo-operator.dynamo.dockerRegistry.server | string | `nil` | Docker registry server URL | | dynamo-operator.dynamo.dockerRegistry.server | string | `nil` | Docker registry server URL |
| dynamo-operator.dynamo.dockerRegistry.username | string | `nil` | Registry username | | dynamo-operator.dynamo.dockerRegistry.username | string | `nil` | Registry username |
......
...@@ -177,9 +177,6 @@ spec: ...@@ -177,9 +177,6 @@ spec:
env: env:
- name: KUBERNETES_CLUSTER_DOMAIN - name: KUBERNETES_CLUSTER_DOMAIN
value: {{ quote .Values.kubernetesClusterDomain }} value: {{ quote .Values.kubernetesClusterDomain }}
envFrom:
- secretRef:
name: dynamo-deployment-env
imagePullPolicy: {{ .Values.controllerManager.manager.image.pullPolicy | quote }} imagePullPolicy: {{ .Values.controllerManager.manager.image.pullPolicy | quote }}
image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag
| default .Chart.AppVersion }} | default .Chart.AppVersion }}
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Opaque Secret "dynamo-deployment-env", created in the release namespace and
# labelled with the output of the chart's "dynamo-operator.labels" helper.
# Keys rendered into stringData:
#   INTERNAL_IMAGES_DEBUGGER            - always present; quoted value of
#                                         .Values.dynamo.internalImages.debugger
#   ENABLE_RESTRICTED_SECURITY_CONTEXT  - "true"; emitted only when
#                                         .Values.dynamo.enableRestrictedSecurityContext is truthy
#   DYNAMO_INGRESS_SUFFIX               - quoted .Values.dynamo.dynamoIngressSuffix;
#                                         emitted only when that value is truthy
# NOTE(review): presumably loaded into the operator container through an envFrom
# secretRef - confirm against the deployment template.
apiVersion: v1
kind: Secret
metadata:
name: dynamo-deployment-env
namespace: {{ .Release.Namespace }}
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
type: Opaque
stringData:
INTERNAL_IMAGES_DEBUGGER: {{ .Values.dynamo.internalImages.debugger | quote }}
{{- if .Values.dynamo.enableRestrictedSecurityContext }}
ENABLE_RESTRICTED_SECURITY_CONTEXT: "true"
{{- end }}
{{- if .Values.dynamo.dynamoIngressSuffix }}
DYNAMO_INGRESS_SUFFIX: {{ .Values.dynamo.dynamoIngressSuffix | quote }}
{{- end }}
\ No newline at end of file
...@@ -109,11 +109,6 @@ dynamo: ...@@ -109,11 +109,6 @@ dynamo:
groveTerminationDelay: 15m groveTerminationDelay: 15m
internalImages:
debugger: python:3.12-slim
enableRestrictedSecurityContext: false
dockerRegistry: dockerRegistry:
server: '' server: ''
# set to true if you want to use the kubernetes secret for the registry credentials # set to true if you want to use the kubernetes secret for the registry credentials
......
...@@ -96,14 +96,6 @@ dynamo-operator: ...@@ -96,14 +96,6 @@ dynamo-operator:
# -- How long to wait before forcefully terminating Grove instances # -- How long to wait before forcefully terminating Grove instances
groveTerminationDelay: 4h groveTerminationDelay: 4h
# Internal utility images used by the platform
internalImages:
# -- Debugger image for troubleshooting deployments
debugger: python:3.12-slim
# -- Whether to enable restricted security contexts for enhanced security
enableRestrictedSecurityContext: false
# Docker registry configuration for private repositories # Docker registry configuration for private repositories
dockerRegistry: dockerRegistry:
# -- Whether to use Kubernetes secrets for registry authentication # -- Whether to use Kubernetes secrets for registry authentication
......
...@@ -43,18 +43,17 @@ const ( ...@@ -43,18 +43,17 @@ const (
KubeAnnotationDisableImagePullSecretDiscovery = "nvidia.com/disable-image-pull-secret-discovery" KubeAnnotationDisableImagePullSecretDiscovery = "nvidia.com/disable-image-pull-secret-discovery"
KubeAnnotationDynamoDiscoveryBackend = "nvidia.com/dynamo-discovery-backend" KubeAnnotationDynamoDiscoveryBackend = "nvidia.com/dynamo-discovery-backend"
KubeLabelDynamoGraphDeploymentName = "nvidia.com/dynamo-graph-deployment-name" KubeLabelDynamoGraphDeploymentName = "nvidia.com/dynamo-graph-deployment-name"
KubeLabelDynamoComponent = "nvidia.com/dynamo-component" KubeLabelDynamoComponent = "nvidia.com/dynamo-component"
KubeLabelDynamoNamespace = "nvidia.com/dynamo-namespace" KubeLabelDynamoNamespace = "nvidia.com/dynamo-namespace"
KubeLabelDynamoDeploymentTargetType = "nvidia.com/dynamo-deployment-target-type" KubeLabelDynamoComponentType = "nvidia.com/dynamo-component-type"
KubeLabelDynamoComponentType = "nvidia.com/dynamo-component-type" KubeLabelDynamoSubComponentType = "nvidia.com/dynamo-sub-component-type"
KubeLabelDynamoSubComponentType = "nvidia.com/dynamo-sub-component-type" KubeLabelDynamoBaseModel = "nvidia.com/dynamo-base-model"
KubeLabelDynamoBaseModel = "nvidia.com/dynamo-base-model" KubeLabelDynamoBaseModelHash = "nvidia.com/dynamo-base-model-hash"
KubeLabelDynamoBaseModelHash = "nvidia.com/dynamo-base-model-hash" KubeAnnotationDynamoBaseModel = "nvidia.com/dynamo-base-model"
KubeAnnotationDynamoBaseModel = "nvidia.com/dynamo-base-model" KubeLabelDynamoDiscoveryBackend = "nvidia.com/dynamo-discovery-backend"
KubeLabelDynamoDiscoveryBackend = "nvidia.com/dynamo-discovery-backend" KubeLabelDynamoDiscoveryEnabled = "nvidia.com/dynamo-discovery-enabled"
KubeLabelDynamoDiscoveryEnabled = "nvidia.com/dynamo-discovery-enabled" KubeLabelDynamoWorkerHash = "nvidia.com/dynamo-worker-hash"
KubeLabelDynamoWorkerHash = "nvidia.com/dynamo-worker-hash"
KubeLabelValueFalse = "false" KubeLabelValueFalse = "false"
KubeLabelValueTrue = "true" KubeLabelValueTrue = "true"
......
...@@ -629,7 +629,18 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co ...@@ -629,7 +629,18 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
// else, only create for the frontend component // else, only create for the frontend component
isK8sDiscoveryEnabled := r.Config.IsK8sDiscoveryEnabled(dynamoDeployment.Annotations) isK8sDiscoveryEnabled := r.Config.IsK8sDiscoveryEnabled(dynamoDeployment.Annotations)
if isK8sDiscoveryEnabled || component.ComponentType == consts.ComponentTypeFrontend { if isK8sDiscoveryEnabled || component.ComponentType == consts.ComponentTypeFrontend {
mainComponentService, err := dynamo.GenerateComponentService(ctx, dynamoDeployment, component, componentName, isK8sDiscoveryEnabled) if component.DynamoNamespace == nil {
return ReconcileResult{}, fmt.Errorf("expected component %s to have a dynamoNamespace", componentName)
}
mainComponentService, err := dynamo.GenerateComponentService(dynamo.ComponentServiceParams{
ServiceName: dynamo.GetDCDResourceName(dynamoDeployment, componentName, ""),
Namespace: dynamoDeployment.Namespace,
ComponentType: component.ComponentType,
DynamoNamespace: *component.DynamoNamespace,
ComponentName: componentName,
Labels: component.Labels,
IsK8sDiscovery: isK8sDiscoveryEnabled,
})
if err != nil { if err != nil {
logger.Error(err, "failed to generate the main component service") logger.Error(err, "failed to generate the main component service")
return ReconcileResult{}, fmt.Errorf("failed to generate the main component service: %w", err) return ReconcileResult{}, fmt.Errorf("failed to generate the main component service: %w", err)
......
...@@ -612,15 +612,23 @@ func getCliqueStartupDependencies( ...@@ -612,15 +612,23 @@ func getCliqueStartupDependencies(
return nil return nil
} }
func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.DynamoGraphDeployment, component *v1alpha1.DynamoComponentDeploymentSharedSpec, componentName string, isK8sDiscoveryEnabled bool) (*corev1.Service, error) { // ComponentServiceParams contains all the fields needed to generate a Kubernetes
if component.DynamoNamespace == nil { // Service for a Dynamo component, independent of whether the caller is the DGD
return nil, fmt.Errorf("expected DynamoComponentDeployment %s to have a dynamoNamespace", componentName) // (Grove) or DCD controller.
} type ComponentServiceParams struct {
// DNS-safe service resource name: "{dgd-name}-{lowercase(componentName)}" ServiceName string
kubeServiceName := GetDCDResourceName(dynamoDeployment, componentName, "") Namespace string
ComponentType string
DynamoNamespace string
ComponentName string // original user-provided name, used in selector
Labels map[string]string
Annotations map[string]string
IsK8sDiscovery bool
}
func GenerateComponentService(params ComponentServiceParams) (*corev1.Service, error) {
var servicePort corev1.ServicePort var servicePort corev1.ServicePort
switch component.ComponentType { switch params.ComponentType {
case commonconsts.ComponentTypeFrontend: case commonconsts.ComponentTypeFrontend:
servicePort = corev1.ServicePort{ servicePort = corev1.ServicePort{
Name: commonconsts.DynamoServicePortName, Name: commonconsts.DynamoServicePortName,
...@@ -629,7 +637,6 @@ func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.Dy ...@@ -629,7 +637,6 @@ func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.Dy
Protocol: corev1.ProtocolTCP, Protocol: corev1.ProtocolTCP,
} }
case commonconsts.ComponentTypeEPP: case commonconsts.ComponentTypeEPP:
// EPP only exposes the gRPC endpoint for InferencePool communication
servicePort = corev1.ServicePort{ servicePort = corev1.ServicePort{
Name: commonconsts.EPPGRPCPortName, Name: commonconsts.EPPGRPCPortName,
Port: commonconsts.EPPGRPCPort, Port: commonconsts.EPPGRPCPort,
...@@ -646,33 +653,36 @@ func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.Dy ...@@ -646,33 +653,36 @@ func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.Dy
} }
} }
// Start with user-defined labels from component.Labels
labels := make(map[string]string) labels := make(map[string]string)
for k, v := range component.Labels { for k, v := range params.Labels {
labels[k] = v labels[k] = v
} }
if params.IsK8sDiscovery {
// Add k8s discovery labels (these take precedence over user labels)
if isK8sDiscoveryEnabled {
labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = "kubernetes" labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = "kubernetes"
labels[commonconsts.KubeLabelDynamoDiscoveryEnabled] = commonconsts.KubeLabelValueTrue labels[commonconsts.KubeLabelDynamoDiscoveryEnabled] = commonconsts.KubeLabelValueTrue
} }
selector := map[string]string{
commonconsts.KubeLabelDynamoComponentType: params.ComponentType,
commonconsts.KubeLabelDynamoNamespace: params.DynamoNamespace,
commonconsts.KubeLabelDynamoComponent: params.ComponentName,
}
annotations := make(map[string]string)
for k, v := range params.Annotations {
annotations[k] = v
}
service := &corev1.Service{ service := &corev1.Service{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: kubeServiceName, Name: params.ServiceName,
Namespace: dynamoDeployment.Namespace, Namespace: params.Namespace,
Labels: labels, Labels: labels,
Annotations: annotations,
}, },
Spec: corev1.ServiceSpec{ Spec: corev1.ServiceSpec{
Selector: map[string]string{ Selector: selector,
commonconsts.KubeLabelDynamoComponentType: component.ComponentType, // e.g "worker" Ports: []corev1.ServicePort{servicePort},
commonconsts.KubeLabelDynamoNamespace: *component.DynamoNamespace, // result of ComputeDynamoNamespace(k8sNamespace, dgdName)
// The original user provided component name (the service map key, e.g. "VllmDecodeWorker" in the DGD).
// Needed to disambiguate amongst distinct components with the same component type within a DGD (e.g prefill/decode workers).
commonconsts.KubeLabelDynamoComponent: componentName,
},
Ports: []corev1.ServicePort{servicePort},
}, },
} }
return service, nil return service, nil
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment