Unverified Commit dbd33df6 authored by julienmancuso's avatar julienmancuso Committed by GitHub
Browse files

fix: handle groveTerminationDelay and auto-detect grove installation (#2190)

parent f14f59c2
...@@ -49,7 +49,6 @@ export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}" ...@@ -49,7 +49,6 @@ export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}"
export INGRESS_CLASS="${INGRESS_CLASS:=nginx}" export INGRESS_CLASS="${INGRESS_CLASS:=nginx}"
export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}" export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}"
export ENABLE_LWS="${ENABLE_LWS:=false}" export ENABLE_LWS="${ENABLE_LWS:=false}"
export ENABLE_GROVE="${ENABLE_GROVE:=false}"
# Add command line options # Add command line options
INTERACTIVE=false INTERACTIVE=false
...@@ -165,7 +164,7 @@ echo "DYNAMO_INGRESS_SUFFIX: $DYNAMO_INGRESS_SUFFIX" ...@@ -165,7 +164,7 @@ echo "DYNAMO_INGRESS_SUFFIX: $DYNAMO_INGRESS_SUFFIX"
echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS" echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS"
echo "INSTALL_CRDS: $INSTALL_CRDS" echo "INSTALL_CRDS: $INSTALL_CRDS"
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS} ${ENABLE_GROVE}' < dynamo-platform-values.yaml > generated-values.yaml envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS}' < dynamo-platform-values.yaml > generated-values.yaml
echo "generated file contents:" echo "generated file contents:"
cat generated-values.yaml cat generated-values.yaml
......
...@@ -24,7 +24,6 @@ dynamo-operator: ...@@ -24,7 +24,6 @@ dynamo-operator:
dynamo: dynamo:
enableLWS: ${ENABLE_LWS} enableLWS: ${ENABLE_LWS}
enableGrove: ${ENABLE_GROVE}
ingress: ingress:
enabled: ${INGRESS_ENABLED} enabled: ${INGRESS_ENABLED}
className: ${INGRESS_CLASS} className: ${INGRESS_CLASS}
......
...@@ -100,8 +100,8 @@ spec: ...@@ -100,8 +100,8 @@ spec:
{{- if .Values.dynamo.enableLWS }} {{- if .Values.dynamo.enableLWS }}
- --enable-lws - --enable-lws
{{- end }} {{- end }}
{{- if .Values.dynamo.enableGrove }} {{- if .Values.dynamo.groveTerminationDelay }}
- --enable-grove - --grove-termination-delay={{ .Values.dynamo.groveTerminationDelay }}
{{- end }} {{- end }}
command: command:
- /manager - /manager
......
...@@ -116,7 +116,6 @@ rules: ...@@ -116,7 +116,6 @@ rules:
- patch - patch
- update - update
- watch - watch
{{- if .Values.dynamo.enableGrove }}
- apiGroups: - apiGroups:
- grove.io - grove.io
resources: resources:
...@@ -129,7 +128,6 @@ rules: ...@@ -129,7 +128,6 @@ rules:
- patch - patch
- update - update
- watch - watch
{{- end }}
- apiGroups: - apiGroups:
- apps - apps
resources: resources:
......
...@@ -82,7 +82,7 @@ dynamo: ...@@ -82,7 +82,7 @@ dynamo:
annotations: {} annotations: {}
enableLWS: false enableLWS: false
enableGrove: false groveTerminationDelay: 15m
internalImages: internalImages:
debugger: python:3.12-slim debugger: python:3.12-slim
......
...@@ -34,7 +34,7 @@ dynamo-operator: ...@@ -34,7 +34,7 @@ dynamo-operator:
imagePullSecrets: [] imagePullSecrets: []
dynamo: dynamo:
enableLWS: false enableLWS: false
enableGrove: false groveTerminationDelay: 15m
internalImages: internalImages:
debugger: python:3.12-slim debugger: python:3.12-slim
enableRestrictedSecurityContext: false enableRestrictedSecurityContext: false
......
...@@ -30,6 +30,7 @@ import ( ...@@ -30,6 +30,7 @@ import (
// to ensure that exec-entrypoint and run can make use of them. // to ensure that exec-entrypoint and run can make use of them.
clientv3 "go.etcd.io/etcd/client/v3" clientv3 "go.etcd.io/etcd/client/v3"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/client-go/informers" "k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes"
_ "k8s.io/client-go/plugin/pkg/client/auth" _ "k8s.io/client-go/plugin/pkg/client/auth"
...@@ -50,6 +51,7 @@ import ( ...@@ -50,6 +51,7 @@ import (
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1" grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common" commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/etcd" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/etcd"
...@@ -73,6 +75,10 @@ func init() { ...@@ -73,6 +75,10 @@ func init() {
utilruntime.Must(volcanoscheme.AddToScheme(scheme)) utilruntime.Must(volcanoscheme.AddToScheme(scheme))
utilruntime.Must(grovev1alpha1.AddToScheme(scheme)) utilruntime.Must(grovev1alpha1.AddToScheme(scheme))
utilruntime.Must(apiextensionsv1.AddToScheme(scheme))
utilruntime.Must(istioclientsetscheme.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme //+kubebuilder:scaffold:scheme
} }
...@@ -92,7 +98,7 @@ func main() { ...@@ -92,7 +98,7 @@ func main() {
var ingressControllerTLSSecretName string var ingressControllerTLSSecretName string
var ingressHostSuffix string var ingressHostSuffix string
var enableLWS bool var enableLWS bool
var enableGrove bool var groveTerminationDelay time.Duration
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false, flag.BoolVar(&enableLeaderElection, "leader-elect", false,
...@@ -120,20 +126,21 @@ func main() { ...@@ -120,20 +126,21 @@ func main() {
"The suffix to use for the ingress host") "The suffix to use for the ingress host")
flag.BoolVar(&enableLWS, "enable-lws", false, flag.BoolVar(&enableLWS, "enable-lws", false,
"If set, enable leader worker set") "If set, enable leader worker set")
flag.BoolVar(&enableGrove, "enable-grove", false, flag.DurationVar(&groveTerminationDelay, "grove-termination-delay", consts.DefaultGroveTerminationDelay,
"If set, enable grove") "The termination delay for Grove PodGangSets")
opts := zap.Options{ opts := zap.Options{
Development: true, Development: true,
} }
opts.BindFlags(flag.CommandLine) opts.BindFlags(flag.CommandLine)
flag.Parse() flag.Parse()
utilruntime.Must(istioclientsetscheme.AddToScheme(scheme))
ctrlConfig := commonController.Config{ ctrlConfig := commonController.Config{
RestrictedNamespace: restrictedNamespace, RestrictedNamespace: restrictedNamespace,
EnableLWS: enableLWS, EnableLWS: enableLWS,
EnableGrove: enableGrove, Grove: commonController.GroveConfig{
Enabled: false, // Will be set after Grove discovery
TerminationDelay: groveTerminationDelay,
},
EtcdAddress: etcdAddr, EtcdAddress: etcdAddr,
NatsAddress: natsAddr, NatsAddress: natsAddr,
IngressConfig: commonController.IngressConfig{ IngressConfig: commonController.IngressConfig{
...@@ -201,6 +208,11 @@ func main() { ...@@ -201,6 +208,11 @@ func main() {
os.Exit(1) os.Exit(1)
} }
// Detect Grove availability using discovery client
setupLog.Info("Detecting Grove availability...")
groveEnabled := commonController.DetectGroveAvailability(mainCtx, mgr)
ctrlConfig.Grove.Enabled = groveEnabled
// Create etcd client // Create etcd client
cli, err := clientv3.New(clientv3.Config{ cli, err := clientv3.New(clientv3.Config{
Endpoints: []string{etcdAddr}, Endpoints: []string{etcdAddr},
......
package consts package consts
import "time"
const ( const (
HPACPUDefaultAverageUtilization = 80 HPACPUDefaultAverageUtilization = 80
...@@ -37,4 +39,6 @@ const ( ...@@ -37,4 +39,6 @@ const (
PlannerServiceAccountName = "planner-serviceaccount" PlannerServiceAccountName = "planner-serviceaccount"
DefaultIngressSuffix = "local" DefaultIngressSuffix = "local"
DefaultGroveTerminationDelay = 15 * time.Minute
) )
...@@ -144,7 +144,7 @@ type Resource interface { ...@@ -144,7 +144,7 @@ type Resource interface {
func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) { func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
if r.Config.EnableGrove { if r.Config.Grove.Enabled {
// check if explicit opt out of grove // check if explicit opt out of grove
if dynamoDeployment.Annotations[consts.KubeAnnotationEnableGrove] == consts.KubeLabelValueFalse { if dynamoDeployment.Annotations[consts.KubeAnnotationEnableGrove] == consts.KubeLabelValueFalse {
logger.Info("Grove is explicitly disabled for this deployment, skipping grove resources reconciliation") logger.Info("Grove is explicitly disabled for this deployment, skipping grove resources reconciliation")
...@@ -308,7 +308,7 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err ...@@ -308,7 +308,7 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
GenericFunc: func(ge event.GenericEvent) bool { return true }, GenericFunc: func(ge event.GenericEvent) bool { return true },
})). })).
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)) WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config))
if r.Config.EnableGrove { if r.Config.Grove.Enabled {
ctrlBuilder = ctrlBuilder.Owns(&grovev1alpha1.PodGangSet{}, builder.WithPredicates(predicate.Funcs{ ctrlBuilder = ctrlBuilder.Owns(&grovev1alpha1.PodGangSet{}, builder.WithPredicates(predicate.Funcs{
// ignore creation cause we don't want to be called again after we create the pod gang set // ignore creation cause we don't want to be called again after we create the pod gang set
CreateFunc: func(ce event.CreateEvent) bool { return false }, CreateFunc: func(ce event.CreateEvent) bool { return false },
......
...@@ -20,18 +20,28 @@ package controller_common ...@@ -20,18 +20,28 @@ package controller_common
import ( import (
"context" "context"
"strings" "strings"
"time"
"k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/meta"
"k8s.io/client-go/discovery"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/predicate"
) )
type GroveConfig struct {
// Enabled is automatically determined by checking if Grove CRDs are installed in the cluster
Enabled bool
// TerminationDelay configures the termination delay for Grove PodGangSets
TerminationDelay time.Duration
}
type Config struct { type Config struct {
// Enable resources filtering, only the resources belonging to the given namespace will be handled. // Enable resources filtering, only the resources belonging to the given namespace will be handled.
RestrictedNamespace string RestrictedNamespace string
EnableLWS bool EnableLWS bool
EnableGrove bool Grove GroveConfig
EtcdAddress string EtcdAddress string
NatsAddress string NatsAddress string
IngressConfig IngressConfig IngressConfig IngressConfig
...@@ -48,6 +58,43 @@ func (i *IngressConfig) UseVirtualService() bool { ...@@ -48,6 +58,43 @@ func (i *IngressConfig) UseVirtualService() bool {
return i.VirtualServiceGateway != "" return i.VirtualServiceGateway != ""
} }
// DetectGroveAvailability checks if Grove is available by checking if the Grove API group is registered
// This approach uses the discovery client which is simpler and more reliable
func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool {
logger := log.FromContext(ctx)
// Use the discovery client to check if Grove API groups are available
cfg := mgr.GetConfig()
if cfg == nil {
logger.Info("Grove detection failed, no discovery client available")
return false
}
// Try to create a discovery client
discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg)
if err != nil {
logger.Error(err, "Grove detection failed, could not create discovery client")
return false
}
// Check if grove.io API group is available
apiGroups, err := discoveryClient.ServerGroups()
if err != nil {
logger.Error(err, "Grove detection failed, could not list server groups")
return false
}
for _, group := range apiGroups.Groups {
if group.Name == "grove.io" {
logger.Info("Grove is available, grove.io API group found")
return true
}
}
logger.Info("Grove not available, grove.io API group not found")
return false
}
func EphemeralDeploymentEventFilter(config Config) predicate.Predicate { func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
return predicate.NewPredicateFuncs(func(o client.Object) bool { return predicate.NewPredicateFuncs(func(o client.Object) bool {
l := log.FromContext(context.Background()) l := log.FromContext(context.Background())
......
...@@ -316,6 +316,9 @@ func GenerateGrovePodGangSet(ctx context.Context, dynamoDeployment *v1alpha1.Dyn ...@@ -316,6 +316,9 @@ func GenerateGrovePodGangSet(ctx context.Context, dynamoDeployment *v1alpha1.Dyn
gangSet.Name = dynamoDeployment.Name gangSet.Name = dynamoDeployment.Name
gangSet.Namespace = dynamoDeployment.Namespace gangSet.Namespace = dynamoDeployment.Namespace
gangSet.Spec.Replicas = 1 gangSet.Spec.Replicas = 1
if controllerConfig.Grove.TerminationDelay > 0 {
gangSet.Spec.Template.TerminationDelay = &metav1.Duration{Duration: controllerConfig.Grove.TerminationDelay}
}
for componentName, component := range dynamoDeployment.Spec.Services { for componentName, component := range dynamoDeployment.Spec.Services {
container := corev1.Container{ container := corev1.Container{
Name: "main", Name: "main",
......
...@@ -23,6 +23,7 @@ import ( ...@@ -23,6 +23,7 @@ import (
"reflect" "reflect"
"sort" "sort"
"testing" "testing"
"time"
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1" grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
...@@ -1136,6 +1137,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1136,6 +1137,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
controllerConfig: controller_common.Config{ controllerConfig: controller_common.Config{
EtcdAddress: "etcd-address", EtcdAddress: "etcd-address",
NatsAddress: "nats-address", NatsAddress: "nats-address",
Grove: controller_common.GroveConfig{
TerminationDelay: 15 * time.Minute,
},
}, },
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{ dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
...@@ -1272,6 +1276,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1272,6 +1276,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Spec: grovev1alpha1.PodGangSetSpec{ Spec: grovev1alpha1.PodGangSetSpec{
Replicas: 1, Replicas: 1,
Template: grovev1alpha1.PodGangSetTemplateSpec{ Template: grovev1alpha1.PodGangSetTemplateSpec{
TerminationDelay: &metav1.Duration{Duration: 15 * time.Minute},
Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{ Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{
{ {
Name: "frontend", Name: "frontend",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment