Unverified Commit 44ecfda5 authored by julienmancuso's avatar julienmancuso Committed by GitHub
Browse files

fix: revisit grove and LWS selection (#2564)


Signed-off-by: default avatarJulien Mancuso <jmancuso@nvidia.com>
parent 60a6a96a
......@@ -48,7 +48,6 @@ export ISTIO_ENABLED="${ISTIO_ENABLED:=false}"
export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}"
export INGRESS_CLASS="${INGRESS_CLASS:=nginx}"
export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}"
export ENABLE_LWS="${ENABLE_LWS:=false}"
export DOCKER_REGISTRY_USE_KUBERNETES_SECRET="${DOCKER_REGISTRY_USE_KUBERNETES_SECRET:=false}"
# Add command line options
......@@ -167,7 +166,7 @@ echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS"
echo "INSTALL_CRDS: $INSTALL_CRDS"
echo "DOCKER_REGISTRY_USE_KUBERNETES_SECRET: $DOCKER_REGISTRY_USE_KUBERNETES_SECRET"
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS} ${DOCKER_REGISTRY_USE_KUBERNETES_SECRET}' < dynamo-platform-values.yaml > generated-values.yaml
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${DOCKER_REGISTRY_USE_KUBERNETES_SECRET}' < dynamo-platform-values.yaml > generated-values.yaml
echo "generated file contents:"
cat generated-values.yaml
......
......@@ -23,7 +23,6 @@ dynamo-operator:
- name: ${DOCKER_SECRET_NAME}
dynamo:
enableLWS: ${ENABLE_LWS}
ingress:
enabled: ${INGRESS_ENABLED}
className: ${INGRESS_CLASS}
......
......@@ -19,11 +19,11 @@ maintainers:
url: https://www.nvidia.com
description: A Helm chart for NVIDIA Dynamo Platform.
type: application
version: 0.4.1
version: 0.5.0
home: https://nvidia.com
dependencies:
- name: dynamo-operator
version: 0.4.1
version: 0.5.0
repository: file://components/operator
condition: dynamo-operator.enabled
- name: nats
......
......@@ -27,9 +27,9 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.4.1
version: 0.5.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.4.1"
appVersion: "0.5.0"
......@@ -101,9 +101,6 @@ spec:
{{- if .Values.dynamo.virtualServiceSupportsHTTPS }}
- --virtual-service-supports-https={{ .Values.dynamo.virtualServiceSupportsHTTPS }}
{{- end }}
{{- if .Values.dynamo.enableLWS }}
- --enable-lws
{{- end }}
{{- if .Values.dynamo.groveTerminationDelay }}
- --grove-termination-delay={{ .Values.dynamo.groveTerminationDelay }}
{{- end }}
......
......@@ -425,7 +425,6 @@ rules:
- patch
- update
- watch
{{- if .Values.dynamo.enableLWS }}
- apiGroups:
- leaderworkerset.x-k8s.io
resources:
......@@ -450,7 +449,6 @@ rules:
- patch
- update
- watch
{{- end }}
---
apiVersion: rbac.authorization.k8s.io/v1
{{- if .Values.namespaceRestriction.enabled }}
......
......@@ -82,7 +82,6 @@ dynamo:
serviceAccount:
annotations: {}
enableLWS: false
groveTerminationDelay: 15m
internalImages:
......
......@@ -34,7 +34,6 @@ dynamo-operator:
- --metrics-bind-address=127.0.0.1:8080
imagePullSecrets: []
dynamo:
enableLWS: false
groveTerminationDelay: 15m
internalImages:
debugger: python:3.12-slim
......
......@@ -110,3 +110,13 @@ func (s *DynamoGraphDeployment) AddStatusCondition(condition metav1.Condition) {
// If no matching condition found, append the new one
s.Status.Conditions = append(s.Status.Conditions, condition)
}
// HasAnyMultinodeService reports whether any service in the graph is configured with more than one node.
func (s *DynamoGraphDeployment) HasAnyMultinodeService() bool {
for _, svc := range s.Spec.Services {
if svc != nil && svc.GetNumberOfNodes() > 1 {
return true
}
}
return false
}
......@@ -129,7 +129,6 @@ func main() {
var ingressControllerClassName string
var ingressControllerTLSSecretName string
var ingressHostSuffix string
var enableLWS bool
var groveTerminationDelay time.Duration
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
......@@ -156,8 +155,6 @@ func main() {
"The name of the ingress controller TLS secret to use")
flag.StringVar(&ingressHostSuffix, "ingress-host-suffix", "",
"The suffix to use for the ingress host")
flag.BoolVar(&enableLWS, "enable-lws", false,
"If set, enable leader worker set")
flag.DurationVar(&groveTerminationDelay, "grove-termination-delay", consts.DefaultGroveTerminationDelay,
"The termination delay for Grove PodGangSets")
opts := zap.Options{
......@@ -168,11 +165,13 @@ func main() {
ctrlConfig := commonController.Config{
RestrictedNamespace: restrictedNamespace,
EnableLWS: enableLWS,
Grove: commonController.GroveConfig{
Enabled: false, // Will be set after Grove discovery
TerminationDelay: groveTerminationDelay,
},
LWS: commonController.LWSConfig{
Enabled: false, // Will be set after LWS discovery
},
EtcdAddress: etcdAddr,
NatsAddress: natsAddr,
IngressConfig: commonController.IngressConfig{
......@@ -240,10 +239,13 @@ func main() {
os.Exit(1)
}
// Detect Grove availability using discovery client
// Detect orchestrators availability using discovery client
setupLog.Info("Detecting Grove availability...")
groveEnabled := commonController.DetectGroveAvailability(mainCtx, mgr)
ctrlConfig.Grove.Enabled = groveEnabled
setupLog.Info("Detecting LWS availability...")
lwsEnabled := commonController.DetectLWSAvailability(mainCtx, mgr)
ctrlConfig.LWS.Enabled = lwsEnabled
// Create etcd client
cli, err := clientv3.New(clientv3.Config{
......
......@@ -199,7 +199,7 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
// Create the appropriate workload resource based on deployment type
var leaderWorkerSets []*leaderworkersetv1.LeaderWorkerSet
var deployment *appsv1.Deployment
if r.Config.EnableLWS && dynamoComponentDeployment.IsMultinode() {
if r.Config.LWS.Enabled && dynamoComponentDeployment.IsMultinode() {
desiredReplicas := int32(1)
if dynamoComponentDeployment.Spec.Replicas != nil {
desiredReplicas = *dynamoComponentDeployment.Spec.Replicas
......@@ -1356,7 +1356,7 @@ func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager)
Owns(&corev1.PersistentVolumeClaim{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
WithEventFilter(controller_common.EphemeralDeploymentEventFilter(r.Config))
if r.Config.EnableLWS {
if r.Config.LWS.Enabled {
m.Owns(&leaderworkersetv1.LeaderWorkerSet{}, builder.WithPredicates(predicate.Funcs{
// ignore creation cause we don't want to be called again after we create the LeaderWorkerSet
CreateFunc: func(ce event.CreateEvent) bool { return false },
......
......@@ -169,14 +169,27 @@ type Resource interface {
func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
logger := log.FromContext(ctx)
if r.Config.Grove.Enabled {
// check if explicit opt out of grove
if dynamoDeployment.Annotations[consts.KubeAnnotationEnableGrove] == consts.KubeLabelValueFalse {
logger.Info("Grove is explicitly disabled for this deployment, skipping grove resources reconciliation")
return r.reconcileDynamoComponentsDeployments(ctx, dynamoDeployment)
// Orchestrator selection via single boolean annotation: nvidia.com/enable-grove
// Unset or not "false": Grove if available; else component mode
// "false": component mode (multinode -> LWS; single-node -> standard)
enableGrove := true
if dynamoDeployment.Annotations != nil && strings.ToLower(dynamoDeployment.Annotations[consts.KubeAnnotationEnableGrove]) == consts.KubeLabelValueFalse {
enableGrove = false
}
// Determine if any service is multinode
hasMultinode := dynamoDeployment.HasAnyMultinodeService()
if enableGrove && r.Config.Grove.Enabled {
logger.Info("Reconciling Grove resources", "enableGrove", enableGrove, "groveEnabled", r.Config.Grove.Enabled, "hasMultinode", hasMultinode, "lwsEnabled", r.Config.LWS.Enabled)
return r.reconcileGroveResources(ctx, dynamoDeployment)
}
if hasMultinode && !r.Config.LWS.Enabled {
err := fmt.Errorf("no multinode orchestrator available")
logger.Error(err, err.Error(), "hasMultinode", hasMultinode, "lwsEnabled", r.Config.LWS.Enabled, "enableGrove", enableGrove, "groveEnabled", r.Config.Grove.Enabled)
return "", "", "", err
}
logger.Info("Reconciling Dynamo components deployments", "hasMultinode", hasMultinode, "lwsEnabled", r.Config.LWS.Enabled, "enableGrove", enableGrove, "groveEnabled", r.Config.Grove.Enabled)
return r.reconcileDynamoComponentsDeployments(ctx, dynamoDeployment)
}
......@@ -285,7 +298,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
// Handle Grove scaling operations after structural changes
if err := r.reconcileGroveScaling(ctx, dynamoDeployment); err != nil {
logger.Error(err, "failed to reconcile Grove scaling")
return FailedState, "grove_scaling_failed", Message(err.Error()), err
return "", "", "", fmt.Errorf("failed to reconcile Grove scaling: %w", err)
}
resources := []Resource{groveGangSetAsResource}
......
......@@ -37,11 +37,16 @@ type GroveConfig struct {
TerminationDelay time.Duration
}
type LWSConfig struct {
// Enabled is automatically determined by checking if LWS CRDs are installed in the cluster
Enabled bool
}
type Config struct {
// Enable resources filtering, only the resources belonging to the given namespace will be handled.
RestrictedNamespace string
EnableLWS bool
Grove GroveConfig
LWS LWSConfig
EtcdAddress string
NatsAddress string
IngressConfig IngressConfig
......@@ -61,40 +66,47 @@ func (i *IngressConfig) UseVirtualService() bool {
// DetectGroveAvailability checks if Grove is available by checking if the Grove API group is registered
// This approach uses the discovery client which is simpler and more reliable
func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool {
return detectAPIGroupAvailability(ctx, mgr, "grove.io")
}
// DetectLWSAvailability checks if LWS is available by checking if the LWS API group is registered
// This approach uses the discovery client which is simpler and more reliable
func DetectLWSAvailability(ctx context.Context, mgr ctrl.Manager) bool {
return detectAPIGroupAvailability(ctx, mgr, "leaderworkerset.x-k8s.io")
}
// detectAPIGroupAvailability checks if a specific API group is registered in the cluster
func detectAPIGroupAvailability(ctx context.Context, mgr ctrl.Manager, groupName string) bool {
logger := log.FromContext(ctx)
// Use the discovery client to check if Grove API groups are available
cfg := mgr.GetConfig()
if cfg == nil {
logger.Info("Grove detection failed, no discovery client available")
logger.Info("detection failed, no discovery client available", "group", groupName)
return false
}
// Try to create a discovery client
discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg)
if err != nil {
logger.Error(err, "Grove detection failed, could not create discovery client")
logger.Error(err, "detection failed, could not create discovery client", "group", groupName)
return false
}
// Check if grove.io API group is available
apiGroups, err := discoveryClient.ServerGroups()
if err != nil {
logger.Error(err, "Grove detection failed, could not list server groups")
logger.Error(err, "detection failed, could not list server groups", "group", groupName)
return false
}
for _, group := range apiGroups.Groups {
if group.Name == "grove.io" {
logger.Info("Grove is available, grove.io API group found")
if group.Name == groupName {
logger.Info("API group is available", "group", groupName)
return true
}
}
logger.Info("Grove not available, grove.io API group not found")
logger.Info("API group not available", "group", groupName)
return false
}
func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
return predicate.NewPredicateFuncs(func(o client.Object) bool {
l := log.FromContext(context.Background())
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment