Unverified Commit 44ecfda5 authored by julienmancuso's avatar julienmancuso Committed by GitHub
Browse files

fix: revisit grove and LWS selection (#2564)


Signed-off-by: default avatarJulien Mancuso <jmancuso@nvidia.com>
parent 60a6a96a
...@@ -48,7 +48,6 @@ export ISTIO_ENABLED="${ISTIO_ENABLED:=false}" ...@@ -48,7 +48,6 @@ export ISTIO_ENABLED="${ISTIO_ENABLED:=false}"
export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}" export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}"
export INGRESS_CLASS="${INGRESS_CLASS:=nginx}" export INGRESS_CLASS="${INGRESS_CLASS:=nginx}"
export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}" export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}"
export ENABLE_LWS="${ENABLE_LWS:=false}"
export DOCKER_REGISTRY_USE_KUBERNETES_SECRET="${DOCKER_REGISTRY_USE_KUBERNETES_SECRET:=false}" export DOCKER_REGISTRY_USE_KUBERNETES_SECRET="${DOCKER_REGISTRY_USE_KUBERNETES_SECRET:=false}"
# Add command line options # Add command line options
...@@ -167,7 +166,7 @@ echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS" ...@@ -167,7 +166,7 @@ echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS"
echo "INSTALL_CRDS: $INSTALL_CRDS" echo "INSTALL_CRDS: $INSTALL_CRDS"
echo "DOCKER_REGISTRY_USE_KUBERNETES_SECRET: $DOCKER_REGISTRY_USE_KUBERNETES_SECRET" echo "DOCKER_REGISTRY_USE_KUBERNETES_SECRET: $DOCKER_REGISTRY_USE_KUBERNETES_SECRET"
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS} ${DOCKER_REGISTRY_USE_KUBERNETES_SECRET}' < dynamo-platform-values.yaml > generated-values.yaml envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${DOCKER_REGISTRY_USE_KUBERNETES_SECRET}' < dynamo-platform-values.yaml > generated-values.yaml
echo "generated file contents:" echo "generated file contents:"
cat generated-values.yaml cat generated-values.yaml
......
...@@ -23,7 +23,6 @@ dynamo-operator: ...@@ -23,7 +23,6 @@ dynamo-operator:
- name: ${DOCKER_SECRET_NAME} - name: ${DOCKER_SECRET_NAME}
dynamo: dynamo:
enableLWS: ${ENABLE_LWS}
ingress: ingress:
enabled: ${INGRESS_ENABLED} enabled: ${INGRESS_ENABLED}
className: ${INGRESS_CLASS} className: ${INGRESS_CLASS}
......
...@@ -19,11 +19,11 @@ maintainers: ...@@ -19,11 +19,11 @@ maintainers:
url: https://www.nvidia.com url: https://www.nvidia.com
description: A Helm chart for NVIDIA Dynamo Platform. description: A Helm chart for NVIDIA Dynamo Platform.
type: application type: application
version: 0.4.1 version: 0.5.0
home: https://nvidia.com home: https://nvidia.com
dependencies: dependencies:
- name: dynamo-operator - name: dynamo-operator
version: 0.4.1 version: 0.5.0
repository: file://components/operator repository: file://components/operator
condition: dynamo-operator.enabled condition: dynamo-operator.enabled
- name: nats - name: nats
......
...@@ -27,9 +27,9 @@ type: application ...@@ -27,9 +27,9 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes # This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version. # to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/) # Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.4.1 version: 0.5.0
# This is the version number of the application being deployed. This version number should be # This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to # incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using. # follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes. # It is recommended to use it with quotes.
appVersion: "0.4.1" appVersion: "0.5.0"
...@@ -101,9 +101,6 @@ spec: ...@@ -101,9 +101,6 @@ spec:
{{- if .Values.dynamo.virtualServiceSupportsHTTPS }} {{- if .Values.dynamo.virtualServiceSupportsHTTPS }}
- --virtual-service-supports-https={{ .Values.dynamo.virtualServiceSupportsHTTPS }} - --virtual-service-supports-https={{ .Values.dynamo.virtualServiceSupportsHTTPS }}
{{- end }} {{- end }}
{{- if .Values.dynamo.enableLWS }}
- --enable-lws
{{- end }}
{{- if .Values.dynamo.groveTerminationDelay }} {{- if .Values.dynamo.groveTerminationDelay }}
- --grove-termination-delay={{ .Values.dynamo.groveTerminationDelay }} - --grove-termination-delay={{ .Values.dynamo.groveTerminationDelay }}
{{- end }} {{- end }}
......
...@@ -425,7 +425,6 @@ rules: ...@@ -425,7 +425,6 @@ rules:
- patch - patch
- update - update
- watch - watch
{{- if .Values.dynamo.enableLWS }}
- apiGroups: - apiGroups:
- leaderworkerset.x-k8s.io - leaderworkerset.x-k8s.io
resources: resources:
...@@ -450,7 +449,6 @@ rules: ...@@ -450,7 +449,6 @@ rules:
- patch - patch
- update - update
- watch - watch
{{- end }}
--- ---
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
{{- if .Values.namespaceRestriction.enabled }} {{- if .Values.namespaceRestriction.enabled }}
......
...@@ -82,7 +82,6 @@ dynamo: ...@@ -82,7 +82,6 @@ dynamo:
serviceAccount: serviceAccount:
annotations: {} annotations: {}
enableLWS: false
groveTerminationDelay: 15m groveTerminationDelay: 15m
internalImages: internalImages:
......
...@@ -34,7 +34,6 @@ dynamo-operator: ...@@ -34,7 +34,6 @@ dynamo-operator:
- --metrics-bind-address=127.0.0.1:8080 - --metrics-bind-address=127.0.0.1:8080
imagePullSecrets: [] imagePullSecrets: []
dynamo: dynamo:
enableLWS: false
groveTerminationDelay: 15m groveTerminationDelay: 15m
internalImages: internalImages:
debugger: python:3.12-slim debugger: python:3.12-slim
......
...@@ -110,3 +110,13 @@ func (s *DynamoGraphDeployment) AddStatusCondition(condition metav1.Condition) { ...@@ -110,3 +110,13 @@ func (s *DynamoGraphDeployment) AddStatusCondition(condition metav1.Condition) {
// If no matching condition found, append the new one // If no matching condition found, append the new one
s.Status.Conditions = append(s.Status.Conditions, condition) s.Status.Conditions = append(s.Status.Conditions, condition)
} }
// HasAnyMultinodeService reports whether any service in the graph is configured with more than one node.
func (s *DynamoGraphDeployment) HasAnyMultinodeService() bool {
for _, svc := range s.Spec.Services {
if svc != nil && svc.GetNumberOfNodes() > 1 {
return true
}
}
return false
}
...@@ -129,7 +129,6 @@ func main() { ...@@ -129,7 +129,6 @@ func main() {
var ingressControllerClassName string var ingressControllerClassName string
var ingressControllerTLSSecretName string var ingressControllerTLSSecretName string
var ingressHostSuffix string var ingressHostSuffix string
var enableLWS bool
var groveTerminationDelay time.Duration var groveTerminationDelay time.Duration
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
...@@ -156,8 +155,6 @@ func main() { ...@@ -156,8 +155,6 @@ func main() {
"The name of the ingress controller TLS secret to use") "The name of the ingress controller TLS secret to use")
flag.StringVar(&ingressHostSuffix, "ingress-host-suffix", "", flag.StringVar(&ingressHostSuffix, "ingress-host-suffix", "",
"The suffix to use for the ingress host") "The suffix to use for the ingress host")
flag.BoolVar(&enableLWS, "enable-lws", false,
"If set, enable leader worker set")
flag.DurationVar(&groveTerminationDelay, "grove-termination-delay", consts.DefaultGroveTerminationDelay, flag.DurationVar(&groveTerminationDelay, "grove-termination-delay", consts.DefaultGroveTerminationDelay,
"The termination delay for Grove PodGangSets") "The termination delay for Grove PodGangSets")
opts := zap.Options{ opts := zap.Options{
...@@ -168,11 +165,13 @@ func main() { ...@@ -168,11 +165,13 @@ func main() {
ctrlConfig := commonController.Config{ ctrlConfig := commonController.Config{
RestrictedNamespace: restrictedNamespace, RestrictedNamespace: restrictedNamespace,
EnableLWS: enableLWS,
Grove: commonController.GroveConfig{ Grove: commonController.GroveConfig{
Enabled: false, // Will be set after Grove discovery Enabled: false, // Will be set after Grove discovery
TerminationDelay: groveTerminationDelay, TerminationDelay: groveTerminationDelay,
}, },
LWS: commonController.LWSConfig{
Enabled: false, // Will be set after LWS discovery
},
EtcdAddress: etcdAddr, EtcdAddress: etcdAddr,
NatsAddress: natsAddr, NatsAddress: natsAddr,
IngressConfig: commonController.IngressConfig{ IngressConfig: commonController.IngressConfig{
...@@ -240,10 +239,13 @@ func main() { ...@@ -240,10 +239,13 @@ func main() {
os.Exit(1) os.Exit(1)
} }
// Detect Grove availability using discovery client // Detect orchestrators availability using discovery client
setupLog.Info("Detecting Grove availability...") setupLog.Info("Detecting Grove availability...")
groveEnabled := commonController.DetectGroveAvailability(mainCtx, mgr) groveEnabled := commonController.DetectGroveAvailability(mainCtx, mgr)
ctrlConfig.Grove.Enabled = groveEnabled ctrlConfig.Grove.Enabled = groveEnabled
setupLog.Info("Detecting LWS availability...")
lwsEnabled := commonController.DetectLWSAvailability(mainCtx, mgr)
ctrlConfig.LWS.Enabled = lwsEnabled
// Create etcd client // Create etcd client
cli, err := clientv3.New(clientv3.Config{ cli, err := clientv3.New(clientv3.Config{
......
...@@ -199,7 +199,7 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req ...@@ -199,7 +199,7 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
// Create the appropriate workload resource based on deployment type // Create the appropriate workload resource based on deployment type
var leaderWorkerSets []*leaderworkersetv1.LeaderWorkerSet var leaderWorkerSets []*leaderworkersetv1.LeaderWorkerSet
var deployment *appsv1.Deployment var deployment *appsv1.Deployment
if r.Config.EnableLWS && dynamoComponentDeployment.IsMultinode() { if r.Config.LWS.Enabled && dynamoComponentDeployment.IsMultinode() {
desiredReplicas := int32(1) desiredReplicas := int32(1)
if dynamoComponentDeployment.Spec.Replicas != nil { if dynamoComponentDeployment.Spec.Replicas != nil {
desiredReplicas = *dynamoComponentDeployment.Spec.Replicas desiredReplicas = *dynamoComponentDeployment.Spec.Replicas
...@@ -1356,7 +1356,7 @@ func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) ...@@ -1356,7 +1356,7 @@ func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager)
Owns(&corev1.PersistentVolumeClaim{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). Owns(&corev1.PersistentVolumeClaim{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
WithEventFilter(controller_common.EphemeralDeploymentEventFilter(r.Config)) WithEventFilter(controller_common.EphemeralDeploymentEventFilter(r.Config))
if r.Config.EnableLWS { if r.Config.LWS.Enabled {
m.Owns(&leaderworkersetv1.LeaderWorkerSet{}, builder.WithPredicates(predicate.Funcs{ m.Owns(&leaderworkersetv1.LeaderWorkerSet{}, builder.WithPredicates(predicate.Funcs{
// ignore creation cause we don't want to be called again after we create the LeaderWorkerSet // ignore creation cause we don't want to be called again after we create the LeaderWorkerSet
CreateFunc: func(ce event.CreateEvent) bool { return false }, CreateFunc: func(ce event.CreateEvent) bool { return false },
......
...@@ -169,14 +169,27 @@ type Resource interface { ...@@ -169,14 +169,27 @@ type Resource interface {
func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) { func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
if r.Config.Grove.Enabled { // Orchestrator selection via single boolean annotation: nvidia.com/enable-grove
// check if explicit opt out of grove // Unset or not "false": Grove if available; else component mode
if dynamoDeployment.Annotations[consts.KubeAnnotationEnableGrove] == consts.KubeLabelValueFalse { // "false": component mode (multinode -> LWS; single-node -> standard)
logger.Info("Grove is explicitly disabled for this deployment, skipping grove resources reconciliation") enableGrove := true
return r.reconcileDynamoComponentsDeployments(ctx, dynamoDeployment) if dynamoDeployment.Annotations != nil && strings.ToLower(dynamoDeployment.Annotations[consts.KubeAnnotationEnableGrove]) == consts.KubeLabelValueFalse {
} enableGrove = false
}
// Determine if any service is multinode
hasMultinode := dynamoDeployment.HasAnyMultinodeService()
if enableGrove && r.Config.Grove.Enabled {
logger.Info("Reconciling Grove resources", "enableGrove", enableGrove, "groveEnabled", r.Config.Grove.Enabled, "hasMultinode", hasMultinode, "lwsEnabled", r.Config.LWS.Enabled)
return r.reconcileGroveResources(ctx, dynamoDeployment) return r.reconcileGroveResources(ctx, dynamoDeployment)
} }
if hasMultinode && !r.Config.LWS.Enabled {
err := fmt.Errorf("no multinode orchestrator available")
logger.Error(err, err.Error(), "hasMultinode", hasMultinode, "lwsEnabled", r.Config.LWS.Enabled, "enableGrove", enableGrove, "groveEnabled", r.Config.Grove.Enabled)
return "", "", "", err
}
logger.Info("Reconciling Dynamo components deployments", "hasMultinode", hasMultinode, "lwsEnabled", r.Config.LWS.Enabled, "enableGrove", enableGrove, "groveEnabled", r.Config.Grove.Enabled)
return r.reconcileDynamoComponentsDeployments(ctx, dynamoDeployment) return r.reconcileDynamoComponentsDeployments(ctx, dynamoDeployment)
} }
...@@ -285,7 +298,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co ...@@ -285,7 +298,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
// Handle Grove scaling operations after structural changes // Handle Grove scaling operations after structural changes
if err := r.reconcileGroveScaling(ctx, dynamoDeployment); err != nil { if err := r.reconcileGroveScaling(ctx, dynamoDeployment); err != nil {
logger.Error(err, "failed to reconcile Grove scaling") logger.Error(err, "failed to reconcile Grove scaling")
return FailedState, "grove_scaling_failed", Message(err.Error()), err return "", "", "", fmt.Errorf("failed to reconcile Grove scaling: %w", err)
} }
resources := []Resource{groveGangSetAsResource} resources := []Resource{groveGangSetAsResource}
......
...@@ -37,11 +37,16 @@ type GroveConfig struct { ...@@ -37,11 +37,16 @@ type GroveConfig struct {
TerminationDelay time.Duration TerminationDelay time.Duration
} }
type LWSConfig struct {
// Enabled is automatically determined by checking if LWS CRDs are installed in the cluster
Enabled bool
}
type Config struct { type Config struct {
// Enable resources filtering, only the resources belonging to the given namespace will be handled. // Enable resources filtering, only the resources belonging to the given namespace will be handled.
RestrictedNamespace string RestrictedNamespace string
EnableLWS bool
Grove GroveConfig Grove GroveConfig
LWS LWSConfig
EtcdAddress string EtcdAddress string
NatsAddress string NatsAddress string
IngressConfig IngressConfig IngressConfig IngressConfig
...@@ -61,40 +66,47 @@ func (i *IngressConfig) UseVirtualService() bool { ...@@ -61,40 +66,47 @@ func (i *IngressConfig) UseVirtualService() bool {
// DetectGroveAvailability checks if Grove is available by checking if the Grove API group is registered // DetectGroveAvailability checks if Grove is available by checking if the Grove API group is registered
// This approach uses the discovery client which is simpler and more reliable // This approach uses the discovery client which is simpler and more reliable
func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool { func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool {
return detectAPIGroupAvailability(ctx, mgr, "grove.io")
}
// DetectLWSAvailability checks if LWS is available by checking if the LWS API group is registered
// This approach uses the discovery client which is simpler and more reliable
func DetectLWSAvailability(ctx context.Context, mgr ctrl.Manager) bool {
return detectAPIGroupAvailability(ctx, mgr, "leaderworkerset.x-k8s.io")
}
// detectAPIGroupAvailability checks if a specific API group is registered in the cluster
func detectAPIGroupAvailability(ctx context.Context, mgr ctrl.Manager, groupName string) bool {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
// Use the discovery client to check if Grove API groups are available
cfg := mgr.GetConfig() cfg := mgr.GetConfig()
if cfg == nil { if cfg == nil {
logger.Info("Grove detection failed, no discovery client available") logger.Info("detection failed, no discovery client available", "group", groupName)
return false return false
} }
// Try to create a discovery client
discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg) discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg)
if err != nil { if err != nil {
logger.Error(err, "Grove detection failed, could not create discovery client") logger.Error(err, "detection failed, could not create discovery client", "group", groupName)
return false return false
} }
// Check if grove.io API group is available
apiGroups, err := discoveryClient.ServerGroups() apiGroups, err := discoveryClient.ServerGroups()
if err != nil { if err != nil {
logger.Error(err, "Grove detection failed, could not list server groups") logger.Error(err, "detection failed, could not list server groups", "group", groupName)
return false return false
} }
for _, group := range apiGroups.Groups { for _, group := range apiGroups.Groups {
if group.Name == "grove.io" { if group.Name == groupName {
logger.Info("Grove is available, grove.io API group found") logger.Info("API group is available", "group", groupName)
return true return true
} }
} }
logger.Info("Grove not available, grove.io API group not found") logger.Info("API group not available", "group", groupName)
return false return false
} }
func EphemeralDeploymentEventFilter(config Config) predicate.Predicate { func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
return predicate.NewPredicateFuncs(func(o client.Object) bool { return predicate.NewPredicateFuncs(func(o client.Object) bool {
l := log.FromContext(context.Background()) l := log.FromContext(context.Background())
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment