Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
c68d159f
Unverified
Commit
c68d159f
authored
Apr 20, 2026
by
Julien Mancuso
Committed by
GitHub
Apr 20, 2026
Browse files
fix(operator): reconcile DynamoGraphDeployment on PodCliqueScalingGroup status changes (#8328)
parent
44190094
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
184 additions
and
2 deletions
+184
-2
deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml
.../platform/components/operator/templates/manager-rbac.yaml
+9
-0
deploy/operator/config/rbac/role.yaml
deploy/operator/config/rbac/role.yaml
+2
-0
deploy/operator/internal/controller/dynamographdeployment_controller.go
...r/internal/controller/dynamographdeployment_controller.go
+78
-2
deploy/operator/internal/controller/dynamographdeployment_controller_test.go
...ernal/controller/dynamographdeployment_controller_test.go
+95
-0
No files found.
deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml
View file @
c68d159f
...
@@ -136,6 +136,15 @@ rules:
...
@@ -136,6 +136,15 @@ rules:
-
patch
-
patch
-
update
-
update
-
watch
-
watch
-
apiGroups
:
-
grove.io
resources
:
-
podcliques
-
podcliquescalinggroups
verbs
:
-
get
-
list
-
watch
-
apiGroups
:
-
apiGroups
:
-
grove.io
-
grove.io
resources
:
resources
:
...
...
deploy/operator/config/rbac/role.yaml
View file @
c68d159f
...
@@ -130,6 +130,8 @@ rules:
...
@@ -130,6 +130,8 @@ rules:
-
grove.io
-
grove.io
resources
:
resources
:
-
clustertopologies
-
clustertopologies
-
podcliques
-
podcliquescalinggroups
verbs
:
verbs
:
-
get
-
get
-
list
-
list
...
...
deploy/operator/internal/controller/dynamographdeployment_controller.go
View file @
c68d159f
...
@@ -87,7 +87,9 @@ type DynamoGraphDeploymentReconciler struct {
...
@@ -87,7 +87,9 @@ type DynamoGraphDeploymentReconciler struct {
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=grove.io,resources=podcliquesets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=grove.io,resources=podcliquesets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=grove.io,resources=podcliques,verbs=get;list;watch
// +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch
// +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch
// +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups,verbs=get;list;watch
// +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch
// +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch
// +kubebuilder:rbac:groups=grove.io,resources=clustertopologies,verbs=get;list;watch
// +kubebuilder:rbac:groups=grove.io,resources=clustertopologies,verbs=get;list;watch
// +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list
// +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list
...
@@ -1666,8 +1668,6 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
...
@@ -1666,8 +1668,6 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
true
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
true
},
}))
.
}))
.
// Watch PodClique resources - only on status changes
// Watch PodClique resources - only on status changes
// Note: We don't need to watch PodCliqueScalingGroup because it's just a container
// for PodCliques. The actual status changes happen at the PodClique level.
Watches
(
Watches
(
&
grovev1alpha1
.
PodClique
{},
&
grovev1alpha1
.
PodClique
{},
handler
.
EnqueueRequestsFromMapFunc
(
r
.
mapPodCliqueToRequests
),
handler
.
EnqueueRequestsFromMapFunc
(
r
.
mapPodCliqueToRequests
),
...
@@ -1687,6 +1687,38 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
...
@@ -1687,6 +1687,38 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
},
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
false
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
false
},
}),
}),
)
.
// Watch PodCliqueScalingGroup resources on status-replica changes.
// PCSG.Status.AvailableReplicas is independently recomputed by the PCSG
// controller and can land after the last PodClique event the DGD
// controller sees. Without this watch, the DGD aggregate
// (CheckPCSGReady reads pcsg.Status.AvailableReplicas) can stay stale
// indefinitely even though the underlying PCSG is already ready.
Watches
(
&
grovev1alpha1
.
PodCliqueScalingGroup
{},
handler
.
EnqueueRequestsFromMapFunc
(
r
.
mapPodCliqueScalingGroupToRequests
),
builder
.
WithPredicates
(
predicate
.
Funcs
{
CreateFunc
:
func
(
ce
event
.
CreateEvent
)
bool
{
return
false
},
DeleteFunc
:
func
(
de
event
.
DeleteEvent
)
bool
{
return
false
},
UpdateFunc
:
func
(
ue
event
.
UpdateEvent
)
bool
{
oldPCSG
,
okOld
:=
ue
.
ObjectOld
.
(
*
grovev1alpha1
.
PodCliqueScalingGroup
)
newPCSG
,
okNew
:=
ue
.
ObjectNew
.
(
*
grovev1alpha1
.
PodCliqueScalingGroup
)
if
!
okOld
||
!
okNew
{
return
false
}
// ObservedGeneration is tracked because CheckPCSGReady uses it as
// a readiness gate ("spec not yet processed" while
// ObservedGeneration < Generation). A PCSG spec edit that does
// not change Spec.Replicas (e.g. template/topology edits) would
// otherwise not wake the DGD when Grove catches up.
return
oldPCSG
.
Status
.
AvailableReplicas
!=
newPCSG
.
Status
.
AvailableReplicas
||
oldPCSG
.
Status
.
UpdatedReplicas
!=
newPCSG
.
Status
.
UpdatedReplicas
||
oldPCSG
.
Status
.
Replicas
!=
newPCSG
.
Status
.
Replicas
||
oldPCSG
.
Spec
.
Replicas
!=
newPCSG
.
Spec
.
Replicas
||
!
ptrInt64Equal
(
oldPCSG
.
Status
.
ObservedGeneration
,
newPCSG
.
Status
.
ObservedGeneration
)
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
false
},
}),
)
)
}
}
...
@@ -1723,3 +1755,47 @@ func (r *DynamoGraphDeploymentReconciler) mapPodCliqueToRequests(ctx context.Con
...
@@ -1723,3 +1755,47 @@ func (r *DynamoGraphDeploymentReconciler) mapPodCliqueToRequests(ctx context.Con
},
},
}}
}}
}
}
// mapPodCliqueScalingGroupToRequests maps a PodCliqueScalingGroup to reconcile
// requests for its owning DGD.
//
// The PCSG is owned by a PodCliqueSet (controller ownerRef), and Dynamo always
// creates the PodCliqueSet with the same name as the DGD
// (see graph.go: gangSet.Name = dynamoDeployment.Name), so the PodCliqueSet
// owner reference name is the DGD name.
func
(
r
*
DynamoGraphDeploymentReconciler
)
mapPodCliqueScalingGroupToRequests
(
ctx
context
.
Context
,
obj
client
.
Object
)
[]
ctrl
.
Request
{
pcsg
,
ok
:=
obj
.
(
*
grovev1alpha1
.
PodCliqueScalingGroup
)
if
!
ok
{
return
nil
}
controllerRef
:=
metav1
.
GetControllerOf
(
pcsg
)
if
controllerRef
==
nil
||
controllerRef
.
Kind
!=
"PodCliqueSet"
||
controllerRef
.
APIVersion
!=
grovev1alpha1
.
SchemeGroupVersion
.
String
()
{
log
.
FromContext
(
ctx
)
.
V
(
1
)
.
Info
(
"PodCliqueScalingGroup missing PodCliqueSet controller ownerReference"
,
"podCliqueScalingGroup"
,
pcsg
.
Name
,
"namespace"
,
pcsg
.
Namespace
)
return
nil
}
return
[]
ctrl
.
Request
{{
NamespacedName
:
types
.
NamespacedName
{
Name
:
controllerRef
.
Name
,
Namespace
:
pcsg
.
Namespace
,
},
}}
}
// ptrInt64Equal reports whether a and b represent the same optional int64.
// Two nil pointers are equal, a nil and a non-nil pointer are never equal, and
// two non-nil pointers are compared by the values they point to rather than by
// address. Used to compare optional status fields like ObservedGeneration.
func ptrInt64Equal(a, b *int64) bool {
	switch {
	case a == nil:
		// Equal only if the other side is unset as well.
		return b == nil
	case b == nil:
		return false
	default:
		return *a == *b
	}
}
deploy/operator/internal/controller/dynamographdeployment_controller_test.go
View file @
c68d159f
...
@@ -2696,3 +2696,98 @@ func TestPropagateTopologyCondition(t *testing.T) {
...
@@ -2696,3 +2696,98 @@ func TestPropagateTopologyCondition(t *testing.T) {
})
})
}
}
}
}
func
TestMapPodCliqueScalingGroupToRequests
(
t
*
testing
.
T
)
{
tests
:=
[]
struct
{
name
string
obj
client
.
Object
wantRequests
int
wantName
string
wantNs
string
}{
{
name
:
"PCSG with PodCliqueSet controller ownerRef returns DGD request"
,
obj
:
&
grovev1alpha1
.
PodCliqueScalingGroup
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"dynamo-recipe-0-worker"
,
Namespace
:
"mwieczorek-dsv32-trtllm-agg"
,
OwnerReferences
:
[]
metav1
.
OwnerReference
{
{
APIVersion
:
grovev1alpha1
.
SchemeGroupVersion
.
String
(),
Kind
:
"PodCliqueSet"
,
Name
:
"dynamo-recipe"
,
Controller
:
ptr
.
To
(
true
),
},
},
},
},
wantRequests
:
1
,
wantName
:
"dynamo-recipe"
,
wantNs
:
"mwieczorek-dsv32-trtllm-agg"
,
},
{
name
:
"PCSG with no ownerRef returns no requests"
,
obj
:
&
grovev1alpha1
.
PodCliqueScalingGroup
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"orphan-pcsg"
,
Namespace
:
"default"
,
},
},
wantRequests
:
0
,
},
{
name
:
"PCSG with non-controller PodCliqueSet ownerRef returns no requests"
,
obj
:
&
grovev1alpha1
.
PodCliqueScalingGroup
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"pcsg-with-non-controller-ref"
,
Namespace
:
"default"
,
OwnerReferences
:
[]
metav1
.
OwnerReference
{
{
APIVersion
:
grovev1alpha1
.
SchemeGroupVersion
.
String
(),
Kind
:
"PodCliqueSet"
,
Name
:
"some-pcs"
,
// Controller flag omitted: metav1.GetControllerOf must ignore this ref.
},
},
},
},
wantRequests
:
0
,
},
{
name
:
"PCSG with non-PodCliqueSet ownerRef returns no requests"
,
obj
:
&
grovev1alpha1
.
PodCliqueScalingGroup
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"weird-pcsg"
,
Namespace
:
"default"
,
OwnerReferences
:
[]
metav1
.
OwnerReference
{
{
APIVersion
:
"apps/v1"
,
Kind
:
"Deployment"
,
Name
:
"not-a-pcs"
,
},
},
},
},
wantRequests
:
0
,
},
{
name
:
"non-PCSG object returns no requests"
,
obj
:
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"foo"
,
Namespace
:
"default"
}},
wantRequests
:
0
,
},
}
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
g
:=
gomega
.
NewGomegaWithT
(
t
)
r
:=
&
DynamoGraphDeploymentReconciler
{}
reqs
:=
r
.
mapPodCliqueScalingGroupToRequests
(
context
.
Background
(),
tt
.
obj
)
g
.
Expect
(
reqs
)
.
To
(
gomega
.
HaveLen
(
tt
.
wantRequests
))
if
tt
.
wantRequests
==
1
{
g
.
Expect
(
reqs
[
0
]
.
Name
)
.
To
(
gomega
.
Equal
(
tt
.
wantName
))
g
.
Expect
(
reqs
[
0
]
.
Namespace
)
.
To
(
gomega
.
Equal
(
tt
.
wantNs
))
}
})
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment