Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
dbd33df6
Unverified
Commit
dbd33df6
authored
Jul 31, 2025
by
julienmancuso
Committed by
GitHub
Jul 31, 2025
Browse files
fix: handle groveTerminationDelay and auto-detect grove installation (#2190)
parent
f14f59c2
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
87 additions
and
20 deletions
+87
-20
deploy/cloud/helm/deploy.sh
deploy/cloud/helm/deploy.sh
+1
-2
deploy/cloud/helm/dynamo-platform-values.yaml
deploy/cloud/helm/dynamo-platform-values.yaml
+0
-1
deploy/cloud/helm/platform/components/operator/templates/deployment.yaml
...lm/platform/components/operator/templates/deployment.yaml
+2
-2
deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml
.../platform/components/operator/templates/manager-rbac.yaml
+0
-2
deploy/cloud/helm/platform/components/operator/values.yaml
deploy/cloud/helm/platform/components/operator/values.yaml
+1
-1
deploy/cloud/helm/platform/values.yaml
deploy/cloud/helm/platform/values.yaml
+1
-1
deploy/cloud/operator/cmd/main.go
deploy/cloud/operator/cmd/main.go
+20
-8
deploy/cloud/operator/internal/consts/consts.go
deploy/cloud/operator/internal/consts/consts.go
+4
-0
deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go
...r/internal/controller/dynamographdeployment_controller.go
+2
-2
deploy/cloud/operator/internal/controller_common/predicate.go
...oy/cloud/operator/internal/controller_common/predicate.go
+48
-1
deploy/cloud/operator/internal/dynamo/graph.go
deploy/cloud/operator/internal/dynamo/graph.go
+3
-0
deploy/cloud/operator/internal/dynamo/graph_test.go
deploy/cloud/operator/internal/dynamo/graph_test.go
+5
-0
No files found.
deploy/cloud/helm/deploy.sh
View file @
dbd33df6
...
...
@@ -49,7 +49,6 @@ export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}"
export
INGRESS_CLASS
=
"
${
INGRESS_CLASS
:
=nginx
}
"
export
VIRTUAL_SERVICE_SUPPORTS_HTTPS
=
"
${
VIRTUAL_SERVICE_SUPPORTS_HTTPS
:
=false
}
"
export
ENABLE_LWS
=
"
${
ENABLE_LWS
:
=false
}
"
export
ENABLE_GROVE
=
"
${
ENABLE_GROVE
:
=false
}
"
# Add command line options
INTERACTIVE
=
false
...
...
@@ -165,7 +164,7 @@ echo "DYNAMO_INGRESS_SUFFIX: $DYNAMO_INGRESS_SUFFIX"
echo
"VIRTUAL_SERVICE_SUPPORTS_HTTPS:
$VIRTUAL_SERVICE_SUPPORTS_HTTPS
"
echo
"INSTALL_CRDS:
$INSTALL_CRDS
"
envsubst
'${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS}
${ENABLE_GROVE}
'
< dynamo-platform-values.yaml
>
generated-values.yaml
envsubst
'${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS}'
< dynamo-platform-values.yaml
>
generated-values.yaml
echo
"generated file contents:"
cat
generated-values.yaml
...
...
deploy/cloud/helm/dynamo-platform-values.yaml
View file @
dbd33df6
...
...
@@ -24,7 +24,6 @@ dynamo-operator:
dynamo
:
enableLWS
:
${ENABLE_LWS}
enableGrove
:
${ENABLE_GROVE}
ingress
:
enabled
:
${INGRESS_ENABLED}
className
:
${INGRESS_CLASS}
...
...
deploy/cloud/helm/platform/components/operator/templates/deployment.yaml
View file @
dbd33df6
...
...
@@ -100,8 +100,8 @@ spec:
{{
- if .Values.dynamo.enableLWS
}}
-
--enable-lws
{{
- end
}}
{{
- if .Values.dynamo.
enableGrove
}}
-
--
enable-grove
{{
- if .Values.dynamo.
groveTerminationDelay
}}
-
--
grove-termination-delay={{ .Values.dynamo.groveTerminationDelay }}
{{
- end
}}
command
:
-
/manager
...
...
deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml
View file @
dbd33df6
...
...
@@ -116,7 +116,6 @@ rules:
-
patch
-
update
-
watch
{{
- if .Values.dynamo.enableGrove
}}
-
apiGroups
:
-
grove.io
resources
:
...
...
@@ -129,7 +128,6 @@ rules:
-
patch
-
update
-
watch
{{
- end
}}
-
apiGroups
:
-
apps
resources
:
...
...
deploy/cloud/helm/platform/components/operator/values.yaml
View file @
dbd33df6
...
...
@@ -82,7 +82,7 @@ dynamo:
annotations
:
{}
enableLWS
:
false
enableGrove
:
false
groveTerminationDelay
:
15m
internalImages
:
debugger
:
python:3.12-slim
...
...
deploy/cloud/helm/platform/values.yaml
View file @
dbd33df6
...
...
@@ -34,7 +34,7 @@ dynamo-operator:
imagePullSecrets
:
[]
dynamo
:
enableLWS
:
false
enableGrove
:
false
groveTerminationDelay
:
15m
internalImages
:
debugger
:
python:3.12-slim
enableRestrictedSecurityContext
:
false
...
...
deploy/cloud/operator/cmd/main.go
View file @
dbd33df6
...
...
@@ -30,6 +30,7 @@ import (
// to ensure that exec-entrypoint and run can make use of them.
clientv3
"go.etcd.io/etcd/client/v3"
corev1
"k8s.io/api/core/v1"
apiextensionsv1
"k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
_
"k8s.io/client-go/plugin/pkg/client/auth"
...
...
@@ -50,6 +51,7 @@ import (
grovev1alpha1
"github.com/NVIDIA/grove/operator/api/core/v1alpha1"
nvidiacomv1alpha1
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller"
commonController
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/etcd"
...
...
@@ -73,6 +75,10 @@ func init() {
utilruntime
.
Must
(
volcanoscheme
.
AddToScheme
(
scheme
))
utilruntime
.
Must
(
grovev1alpha1
.
AddToScheme
(
scheme
))
utilruntime
.
Must
(
apiextensionsv1
.
AddToScheme
(
scheme
))
utilruntime
.
Must
(
istioclientsetscheme
.
AddToScheme
(
scheme
))
//+kubebuilder:scaffold:scheme
}
...
...
@@ -92,7 +98,7 @@ func main() {
var
ingressControllerTLSSecretName
string
var
ingressHostSuffix
string
var
enableLWS
bool
var
enableGrove
bool
var
groveTerminationDelay
time
.
Duration
flag
.
StringVar
(
&
metricsAddr
,
"metrics-bind-address"
,
":8080"
,
"The address the metric endpoint binds to."
)
flag
.
StringVar
(
&
probeAddr
,
"health-probe-bind-address"
,
":8081"
,
"The address the probe endpoint binds to."
)
flag
.
BoolVar
(
&
enableLeaderElection
,
"leader-elect"
,
false
,
...
...
@@ -120,20 +126,21 @@ func main() {
"The suffix to use for the ingress host"
)
flag
.
BoolVar
(
&
enableLWS
,
"enable-lws"
,
false
,
"If set, enable leader worker set"
)
flag
.
BoolVar
(
&
enableGrove
,
"enable-grove"
,
false
,
"
If set, enable grove
"
)
flag
.
DurationVar
(
&
groveTerminationDelay
,
"grove-termination-delay"
,
consts
.
DefaultGroveTerminationDelay
,
"
The termination delay for Grove PodGangSets
"
)
opts
:=
zap
.
Options
{
Development
:
true
,
}
opts
.
BindFlags
(
flag
.
CommandLine
)
flag
.
Parse
()
utilruntime
.
Must
(
istioclientsetscheme
.
AddToScheme
(
scheme
))
ctrlConfig
:=
commonController
.
Config
{
RestrictedNamespace
:
restrictedNamespace
,
EnableLWS
:
enableLWS
,
EnableGrove
:
enableGrove
,
Grove
:
commonController
.
GroveConfig
{
Enabled
:
false
,
// Will be set after Grove discovery
TerminationDelay
:
groveTerminationDelay
,
},
EtcdAddress
:
etcdAddr
,
NatsAddress
:
natsAddr
,
IngressConfig
:
commonController
.
IngressConfig
{
...
...
@@ -201,6 +208,11 @@ func main() {
os
.
Exit
(
1
)
}
// Detect Grove availability using discovery client
setupLog
.
Info
(
"Detecting Grove availability..."
)
groveEnabled
:=
commonController
.
DetectGroveAvailability
(
mainCtx
,
mgr
)
ctrlConfig
.
Grove
.
Enabled
=
groveEnabled
// Create etcd client
cli
,
err
:=
clientv3
.
New
(
clientv3
.
Config
{
Endpoints
:
[]
string
{
etcdAddr
},
...
...
deploy/cloud/operator/internal/consts/consts.go
View file @
dbd33df6
package
consts
import
"time"
const
(
HPACPUDefaultAverageUtilization
=
80
...
...
@@ -37,4 +39,6 @@ const (
PlannerServiceAccountName
=
"planner-serviceaccount"
DefaultIngressSuffix
=
"local"
DefaultGroveTerminationDelay
=
15
*
time
.
Minute
)
deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go
View file @
dbd33df6
...
...
@@ -144,7 +144,7 @@ type Resource interface {
func
(
r
*
DynamoGraphDeploymentReconciler
)
reconcileResources
(
ctx
context
.
Context
,
dynamoDeployment
*
nvidiacomv1alpha1
.
DynamoGraphDeployment
)
(
State
,
Reason
,
Message
,
error
)
{
logger
:=
log
.
FromContext
(
ctx
)
if
r
.
Config
.
Enable
Grove
{
if
r
.
Config
.
Grove
.
Enable
d
{
// check if explicit opt out of grove
if
dynamoDeployment
.
Annotations
[
consts
.
KubeAnnotationEnableGrove
]
==
consts
.
KubeLabelValueFalse
{
logger
.
Info
(
"Grove is explicitly disabled for this deployment, skipping grove resources reconciliation"
)
...
...
@@ -308,7 +308,7 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
true
},
}))
.
WithEventFilter
(
commonController
.
EphemeralDeploymentEventFilter
(
r
.
Config
))
if
r
.
Config
.
Enable
Grove
{
if
r
.
Config
.
Grove
.
Enable
d
{
ctrlBuilder
=
ctrlBuilder
.
Owns
(
&
grovev1alpha1
.
PodGangSet
{},
builder
.
WithPredicates
(
predicate
.
Funcs
{
// ignore creation cause we don't want to be called again after we create the pod gang set
CreateFunc
:
func
(
ce
event
.
CreateEvent
)
bool
{
return
false
},
...
...
deploy/cloud/operator/internal/controller_common/predicate.go
View file @
dbd33df6
...
...
@@ -20,18 +20,28 @@ package controller_common
import
(
"context"
"strings"
"time"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/client-go/discovery"
ctrl
"sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
)
type
GroveConfig
struct
{
// Enabled is automatically determined by checking if Grove CRDs are installed in the cluster
Enabled
bool
// TerminationDelay configures the termination delay for Grove PodGangSets
TerminationDelay
time
.
Duration
}
type
Config
struct
{
// Enable resources filtering, only the resources belonging to the given namespace will be handled.
RestrictedNamespace
string
EnableLWS
bool
EnableGrove
bool
Grove
GroveConfig
EtcdAddress
string
NatsAddress
string
IngressConfig
IngressConfig
...
...
@@ -48,6 +58,43 @@ func (i *IngressConfig) UseVirtualService() bool {
return
i
.
VirtualServiceGateway
!=
""
}
// DetectGroveAvailability checks if Grove is available by checking if the Grove API group is registered
// This approach uses the discovery client which is simpler and more reliable
func
DetectGroveAvailability
(
ctx
context
.
Context
,
mgr
ctrl
.
Manager
)
bool
{
logger
:=
log
.
FromContext
(
ctx
)
// Use the discovery client to check if Grove API groups are available
cfg
:=
mgr
.
GetConfig
()
if
cfg
==
nil
{
logger
.
Info
(
"Grove detection failed, no discovery client available"
)
return
false
}
// Try to create a discovery client
discoveryClient
,
err
:=
discovery
.
NewDiscoveryClientForConfig
(
cfg
)
if
err
!=
nil
{
logger
.
Error
(
err
,
"Grove detection failed, could not create discovery client"
)
return
false
}
// Check if grove.io API group is available
apiGroups
,
err
:=
discoveryClient
.
ServerGroups
()
if
err
!=
nil
{
logger
.
Error
(
err
,
"Grove detection failed, could not list server groups"
)
return
false
}
for
_
,
group
:=
range
apiGroups
.
Groups
{
if
group
.
Name
==
"grove.io"
{
logger
.
Info
(
"Grove is available, grove.io API group found"
)
return
true
}
}
logger
.
Info
(
"Grove not available, grove.io API group not found"
)
return
false
}
func
EphemeralDeploymentEventFilter
(
config
Config
)
predicate
.
Predicate
{
return
predicate
.
NewPredicateFuncs
(
func
(
o
client
.
Object
)
bool
{
l
:=
log
.
FromContext
(
context
.
Background
())
...
...
deploy/cloud/operator/internal/dynamo/graph.go
View file @
dbd33df6
...
...
@@ -316,6 +316,9 @@ func GenerateGrovePodGangSet(ctx context.Context, dynamoDeployment *v1alpha1.Dyn
gangSet
.
Name
=
dynamoDeployment
.
Name
gangSet
.
Namespace
=
dynamoDeployment
.
Namespace
gangSet
.
Spec
.
Replicas
=
1
if
controllerConfig
.
Grove
.
TerminationDelay
>
0
{
gangSet
.
Spec
.
Template
.
TerminationDelay
=
&
metav1
.
Duration
{
Duration
:
controllerConfig
.
Grove
.
TerminationDelay
}
}
for
componentName
,
component
:=
range
dynamoDeployment
.
Spec
.
Services
{
container
:=
corev1
.
Container
{
Name
:
"main"
,
...
...
deploy/cloud/operator/internal/dynamo/graph_test.go
View file @
dbd33df6
...
...
@@ -23,6 +23,7 @@ import (
"reflect"
"sort"
"testing"
"time"
grovev1alpha1
"github.com/NVIDIA/grove/operator/api/core/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
...
...
@@ -1136,6 +1137,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
controllerConfig
:
controller_common
.
Config
{
EtcdAddress
:
"etcd-address"
,
NatsAddress
:
"nats-address"
,
Grove
:
controller_common
.
GroveConfig
{
TerminationDelay
:
15
*
time
.
Minute
,
},
},
dynamoDeployment
:
&
v1alpha1
.
DynamoGraphDeployment
{
ObjectMeta
:
metav1
.
ObjectMeta
{
...
...
@@ -1272,6 +1276,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Spec
:
grovev1alpha1
.
PodGangSetSpec
{
Replicas
:
1
,
Template
:
grovev1alpha1
.
PodGangSetTemplateSpec
{
TerminationDelay
:
&
metav1
.
Duration
{
Duration
:
15
*
time
.
Minute
},
Cliques
:
[]
*
grovev1alpha1
.
PodCliqueTemplateSpec
{
{
Name
:
"frontend"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment