Unverified Commit dad4237d authored by Julien Mancuso, committed by GitHub
Browse files

chore: remove DCD debug features and consolidate k8s service generation (#6397)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent 0688d584
......@@ -115,8 +115,6 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.controllerManager.manager.args[1] | string | `"--metrics-bind-address=127.0.0.1:8080"` | Metrics endpoint for Prometheus scraping (localhost only for security) |
| dynamo-operator.imagePullSecrets | list | `[]` | Secrets for pulling private container images |
| dynamo-operator.dynamo.groveTerminationDelay | string | `"4h"` | How long to wait before forcefully terminating Grove instances |
| dynamo-operator.dynamo.internalImages.debugger | string | `"python:3.12-slim"` | Debugger image for troubleshooting deployments |
| dynamo-operator.dynamo.enableRestrictedSecurityContext | bool | `false` | Whether to enable restricted security contexts for enhanced security |
| dynamo-operator.dynamo.dockerRegistry.useKubernetesSecret | bool | `false` | Whether to use Kubernetes secrets for registry authentication |
| dynamo-operator.dynamo.dockerRegistry.server | string | `nil` | Docker registry server URL |
| dynamo-operator.dynamo.dockerRegistry.username | string | `nil` | Registry username |
......
......@@ -177,9 +177,6 @@ spec:
env:
- name: KUBERNETES_CLUSTER_DOMAIN
value: {{ quote .Values.kubernetesClusterDomain }}
envFrom:
- secretRef:
name: dynamo-deployment-env
imagePullPolicy: {{ .Values.controllerManager.manager.image.pullPolicy | quote }}
image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag
| default .Chart.AppVersion }}
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Opaque Secret named "dynamo-deployment-env", created in the release namespace.
# Each stringData key below is a configuration value derived from the chart's
# .Values.dynamo settings.
# NOTE(review): the Secret is looked up by its literal name (e.g. through an
# envFrom/secretRef) — check all consumers before renaming or removing it.
apiVersion: v1
kind: Secret
metadata:
name: dynamo-deployment-env
namespace: {{ .Release.Namespace }}
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
type: Opaque
stringData:
# Always emitted: the debugger image taken from .Values.dynamo.internalImages.debugger.
INTERNAL_IMAGES_DEBUGGER: {{ .Values.dynamo.internalImages.debugger | quote }}
# Emitted only when .Values.dynamo.enableRestrictedSecurityContext is truthy; the key is absent otherwise.
{{- if .Values.dynamo.enableRestrictedSecurityContext }}
ENABLE_RESTRICTED_SECURITY_CONTEXT: "true"
{{- end }}
# Emitted only when .Values.dynamo.dynamoIngressSuffix is set to a non-empty value.
{{- if .Values.dynamo.dynamoIngressSuffix }}
DYNAMO_INGRESS_SUFFIX: {{ .Values.dynamo.dynamoIngressSuffix | quote }}
{{- end }}
\ No newline at end of file
......@@ -109,11 +109,6 @@ dynamo:
groveTerminationDelay: 15m
internalImages:
debugger: python:3.12-slim
enableRestrictedSecurityContext: false
dockerRegistry:
server: ''
# set to true if you want to use the kubernetes secret for the registry credentials
......
......@@ -96,14 +96,6 @@ dynamo-operator:
# -- How long to wait before forcefully terminating Grove instances
groveTerminationDelay: 4h
# Internal utility images used by the platform
internalImages:
# -- Debugger image for troubleshooting deployments
debugger: python:3.12-slim
# -- Whether to enable restricted security contexts for enhanced security
enableRestrictedSecurityContext: false
# Docker registry configuration for private repositories
dockerRegistry:
# -- Whether to use Kubernetes secrets for registry authentication
......
......@@ -46,7 +46,6 @@ const (
KubeLabelDynamoGraphDeploymentName = "nvidia.com/dynamo-graph-deployment-name"
KubeLabelDynamoComponent = "nvidia.com/dynamo-component"
KubeLabelDynamoNamespace = "nvidia.com/dynamo-namespace"
KubeLabelDynamoDeploymentTargetType = "nvidia.com/dynamo-deployment-target-type"
KubeLabelDynamoComponentType = "nvidia.com/dynamo-component-type"
KubeLabelDynamoSubComponentType = "nvidia.com/dynamo-sub-component-type"
KubeLabelDynamoBaseModel = "nvidia.com/dynamo-base-model"
......
......@@ -629,7 +629,18 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
// else, only create for the frontend component
isK8sDiscoveryEnabled := r.Config.IsK8sDiscoveryEnabled(dynamoDeployment.Annotations)
if isK8sDiscoveryEnabled || component.ComponentType == consts.ComponentTypeFrontend {
mainComponentService, err := dynamo.GenerateComponentService(ctx, dynamoDeployment, component, componentName, isK8sDiscoveryEnabled)
if component.DynamoNamespace == nil {
return ReconcileResult{}, fmt.Errorf("expected component %s to have a dynamoNamespace", componentName)
}
mainComponentService, err := dynamo.GenerateComponentService(dynamo.ComponentServiceParams{
ServiceName: dynamo.GetDCDResourceName(dynamoDeployment, componentName, ""),
Namespace: dynamoDeployment.Namespace,
ComponentType: component.ComponentType,
DynamoNamespace: *component.DynamoNamespace,
ComponentName: componentName,
Labels: component.Labels,
IsK8sDiscovery: isK8sDiscoveryEnabled,
})
if err != nil {
logger.Error(err, "failed to generate the main component service")
return ReconcileResult{}, fmt.Errorf("failed to generate the main component service: %w", err)
......
......@@ -612,15 +612,23 @@ func getCliqueStartupDependencies(
return nil
}
func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.DynamoGraphDeployment, component *v1alpha1.DynamoComponentDeploymentSharedSpec, componentName string, isK8sDiscoveryEnabled bool) (*corev1.Service, error) {
if component.DynamoNamespace == nil {
return nil, fmt.Errorf("expected DynamoComponentDeployment %s to have a dynamoNamespace", componentName)
}
// DNS-safe service resource name: "{dgd-name}-{lowercase(componentName)}"
kubeServiceName := GetDCDResourceName(dynamoDeployment, componentName, "")
// ComponentServiceParams contains all the fields needed to generate a Kubernetes
// Service for a Dynamo component, independent of whether the caller is the DGD
// (Grove) or DCD controller.
//
// It is the sole argument of GenerateComponentService. Callers resolve every
// value up front (including dereferencing an optional dynamo namespace), so the
// generator needs no access to the owning custom resource.
type ComponentServiceParams struct {
// ServiceName is used verbatim as the generated Service's metadata name.
ServiceName string
// Namespace is used verbatim as the generated Service's metadata namespace.
Namespace string
// ComponentType picks which port the Service exposes (e.g. the frontend or
// EPP port) and is the value of the component-type selector label.
ComponentType string
// DynamoNamespace is the value of the dynamo-namespace selector label.
DynamoNamespace string
// ComponentName is the value of the dynamo-component selector label.
ComponentName string // original user-provided name, used in selector
// Labels are copied onto the Service. When IsK8sDiscovery is true, the
// discovery labels overwrite any entries with the same keys.
Labels map[string]string
// Annotations are copied onto the Service's metadata; a nil map yields an
// empty annotation set.
Annotations map[string]string
// IsK8sDiscovery, when true, adds the Kubernetes discovery backend/enabled
// labels to the Service.
IsK8sDiscovery bool
}
func GenerateComponentService(params ComponentServiceParams) (*corev1.Service, error) {
var servicePort corev1.ServicePort
switch component.ComponentType {
switch params.ComponentType {
case commonconsts.ComponentTypeFrontend:
servicePort = corev1.ServicePort{
Name: commonconsts.DynamoServicePortName,
......@@ -629,7 +637,6 @@ func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.Dy
Protocol: corev1.ProtocolTCP,
}
case commonconsts.ComponentTypeEPP:
// EPP only exposes the gRPC endpoint for InferencePool communication
servicePort = corev1.ServicePort{
Name: commonconsts.EPPGRPCPortName,
Port: commonconsts.EPPGRPCPort,
......@@ -646,32 +653,35 @@ func GenerateComponentService(ctx context.Context, dynamoDeployment *v1alpha1.Dy
}
}
// Start with user-defined labels from component.Labels
labels := make(map[string]string)
for k, v := range component.Labels {
for k, v := range params.Labels {
labels[k] = v
}
// Add k8s discovery labels (these take precedence over user labels)
if isK8sDiscoveryEnabled {
if params.IsK8sDiscovery {
labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = "kubernetes"
labels[commonconsts.KubeLabelDynamoDiscoveryEnabled] = commonconsts.KubeLabelValueTrue
}
selector := map[string]string{
commonconsts.KubeLabelDynamoComponentType: params.ComponentType,
commonconsts.KubeLabelDynamoNamespace: params.DynamoNamespace,
commonconsts.KubeLabelDynamoComponent: params.ComponentName,
}
annotations := make(map[string]string)
for k, v := range params.Annotations {
annotations[k] = v
}
service := &corev1.Service{
ObjectMeta: metav1.ObjectMeta{
Name: kubeServiceName,
Namespace: dynamoDeployment.Namespace,
Name: params.ServiceName,
Namespace: params.Namespace,
Labels: labels,
Annotations: annotations,
},
Spec: corev1.ServiceSpec{
Selector: map[string]string{
commonconsts.KubeLabelDynamoComponentType: component.ComponentType, // e.g "worker"
commonconsts.KubeLabelDynamoNamespace: *component.DynamoNamespace, // result of ComputeDynamoNamespace(k8sNamespace, dgdName)
// The original user provided component name (the service map key, e.g. "VllmDecodeWorker" in the DGD).
// Needed to disambiguate amongst distinct components with the same component type within a DGD (e.g prefill/decode workers).
commonconsts.KubeLabelDynamoComponent: componentName,
},
Selector: selector,
Ports: []corev1.ServicePort{servicePort},
},
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment