Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
c68d159f
"lib/vscode:/vscode.git/clone" did not exist on "4f2cbec0895a9ba87fc84d0719e8a181cdb3c4c9"
Unverified
Commit
c68d159f
authored
Apr 20, 2026
by
Julien Mancuso
Committed by
GitHub
Apr 20, 2026
Browse files
fix(operator): reconcile DynamoGraphDeployment on PodCliqueScalingGroup status changes (#8328)
parent
44190094
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
184 additions
and
2 deletions
+184
-2
deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml
.../platform/components/operator/templates/manager-rbac.yaml
+9
-0
deploy/operator/config/rbac/role.yaml
deploy/operator/config/rbac/role.yaml
+2
-0
deploy/operator/internal/controller/dynamographdeployment_controller.go
...r/internal/controller/dynamographdeployment_controller.go
+78
-2
deploy/operator/internal/controller/dynamographdeployment_controller_test.go
...ernal/controller/dynamographdeployment_controller_test.go
+95
-0
No files found.
deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml
View file @
c68d159f
...
@@ -136,6 +136,15 @@ rules:
...
@@ -136,6 +136,15 @@ rules:
-
patch
-
patch
-
update
-
update
-
watch
-
watch
-
apiGroups
:
-
grove.io
resources
:
-
podcliques
-
podcliquescalinggroups
verbs
:
-
get
-
list
-
watch
-
apiGroups
:
-
apiGroups
:
-
grove.io
-
grove.io
resources
:
resources
:
...
...
deploy/operator/config/rbac/role.yaml
View file @
c68d159f
...
@@ -130,6 +130,8 @@ rules:
...
@@ -130,6 +130,8 @@ rules:
-
grove.io
-
grove.io
resources
:
resources
:
-
clustertopologies
-
clustertopologies
-
podcliques
-
podcliquescalinggroups
verbs
:
verbs
:
-
get
-
get
-
list
-
list
...
...
deploy/operator/internal/controller/dynamographdeployment_controller.go
View file @
c68d159f
...
@@ -87,7 +87,9 @@ type DynamoGraphDeploymentReconciler struct {
...
@@ -87,7 +87,9 @@ type DynamoGraphDeploymentReconciler struct {
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=grove.io,resources=podcliquesets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=grove.io,resources=podcliquesets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=grove.io,resources=podcliques,verbs=get;list;watch
// +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch
// +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch
// +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups,verbs=get;list;watch
// +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch
// +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch
// +kubebuilder:rbac:groups=grove.io,resources=clustertopologies,verbs=get;list;watch
// +kubebuilder:rbac:groups=grove.io,resources=clustertopologies,verbs=get;list;watch
// +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list
// +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list
...
@@ -1666,8 +1668,6 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
...
@@ -1666,8 +1668,6 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
true
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
true
},
}))
.
}))
.
// Watch PodClique resources - only on status changes
// Watch PodClique resources - only on status changes
// Note: We don't need to watch PodCliqueScalingGroup because it's just a container
// for PodCliques. The actual status changes happen at the PodClique level.
Watches
(
Watches
(
&
grovev1alpha1
.
PodClique
{},
&
grovev1alpha1
.
PodClique
{},
handler
.
EnqueueRequestsFromMapFunc
(
r
.
mapPodCliqueToRequests
),
handler
.
EnqueueRequestsFromMapFunc
(
r
.
mapPodCliqueToRequests
),
...
@@ -1687,6 +1687,38 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
...
@@ -1687,6 +1687,38 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
},
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
false
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
false
},
}),
}),
)
.
// Watch PodCliqueScalingGroup resources on status-replica changes.
// PCSG.Status.AvailableReplicas is independently recomputed by the PCSG
// controller and can land after the last PodClique event the DGD
// controller sees. Without this watch, the DGD aggregate
// (CheckPCSGReady reads pcsg.Status.AvailableReplicas) can stay stale
// indefinitely even though the underlying PCSG is already ready.
Watches
(
&
grovev1alpha1
.
PodCliqueScalingGroup
{},
handler
.
EnqueueRequestsFromMapFunc
(
r
.
mapPodCliqueScalingGroupToRequests
),
builder
.
WithPredicates
(
predicate
.
Funcs
{
CreateFunc
:
func
(
ce
event
.
CreateEvent
)
bool
{
return
false
},
DeleteFunc
:
func
(
de
event
.
DeleteEvent
)
bool
{
return
false
},
UpdateFunc
:
func
(
ue
event
.
UpdateEvent
)
bool
{
oldPCSG
,
okOld
:=
ue
.
ObjectOld
.
(
*
grovev1alpha1
.
PodCliqueScalingGroup
)
newPCSG
,
okNew
:=
ue
.
ObjectNew
.
(
*
grovev1alpha1
.
PodCliqueScalingGroup
)
if
!
okOld
||
!
okNew
{
return
false
}
// ObservedGeneration is tracked because CheckPCSGReady uses it as
// a readiness gate ("spec not yet processed" while
// ObservedGeneration < Generation). A PCSG spec edit that does
// not change Spec.Replicas (e.g. template/topology edits) would
// otherwise not wake the DGD when Grove catches up.
return
oldPCSG
.
Status
.
AvailableReplicas
!=
newPCSG
.
Status
.
AvailableReplicas
||
oldPCSG
.
Status
.
UpdatedReplicas
!=
newPCSG
.
Status
.
UpdatedReplicas
||
oldPCSG
.
Status
.
Replicas
!=
newPCSG
.
Status
.
Replicas
||
oldPCSG
.
Spec
.
Replicas
!=
newPCSG
.
Spec
.
Replicas
||
!
ptrInt64Equal
(
oldPCSG
.
Status
.
ObservedGeneration
,
newPCSG
.
Status
.
ObservedGeneration
)
},
GenericFunc
:
func
(
ge
event
.
GenericEvent
)
bool
{
return
false
},
}),
)
)
}
}
...
@@ -1723,3 +1755,47 @@ func (r *DynamoGraphDeploymentReconciler) mapPodCliqueToRequests(ctx context.Con
...
@@ -1723,3 +1755,47 @@ func (r *DynamoGraphDeploymentReconciler) mapPodCliqueToRequests(ctx context.Con
},
},
}}
}}
}
}
// mapPodCliqueScalingGroupToRequests maps a PodCliqueScalingGroup to reconcile
// requests for its owning DGD.
//
// The PCSG is owned by a PodCliqueSet (controller ownerRef), and Dynamo always
// creates the PodCliqueSet with the same name as the DGD
// (see graph.go: gangSet.Name = dynamoDeployment.Name), so the PodCliqueSet
// owner reference name is the DGD name.
func
(
r
*
DynamoGraphDeploymentReconciler
)
mapPodCliqueScalingGroupToRequests
(
ctx
context
.
Context
,
obj
client
.
Object
)
[]
ctrl
.
Request
{
pcsg
,
ok
:=
obj
.
(
*
grovev1alpha1
.
PodCliqueScalingGroup
)
if
!
ok
{
return
nil
}
controllerRef
:=
metav1
.
GetControllerOf
(
pcsg
)
if
controllerRef
==
nil
||
controllerRef
.
Kind
!=
"PodCliqueSet"
||
controllerRef
.
APIVersion
!=
grovev1alpha1
.
SchemeGroupVersion
.
String
()
{
log
.
FromContext
(
ctx
)
.
V
(
1
)
.
Info
(
"PodCliqueScalingGroup missing PodCliqueSet controller ownerReference"
,
"podCliqueScalingGroup"
,
pcsg
.
Name
,
"namespace"
,
pcsg
.
Namespace
)
return
nil
}
return
[]
ctrl
.
Request
{{
NamespacedName
:
types
.
NamespacedName
{
Name
:
controllerRef
.
Name
,
Namespace
:
pcsg
.
Namespace
,
},
}}
}
// ptrInt64Equal returns true when two *int64 values are equivalent, treating
// nil and a pointer to the same value as equal. Used to compare optional
// status fields like ObservedGeneration without tripping on pointer identity.
func
ptrInt64Equal
(
a
,
b
*
int64
)
bool
{
if
a
==
nil
&&
b
==
nil
{
return
true
}
if
a
==
nil
||
b
==
nil
{
return
false
}
return
*
a
==
*
b
}
deploy/operator/internal/controller/dynamographdeployment_controller_test.go
View file @
c68d159f
...
@@ -2696,3 +2696,98 @@ func TestPropagateTopologyCondition(t *testing.T) {
...
@@ -2696,3 +2696,98 @@ func TestPropagateTopologyCondition(t *testing.T) {
})
})
}
}
}
}
func
TestMapPodCliqueScalingGroupToRequests
(
t
*
testing
.
T
)
{
tests
:=
[]
struct
{
name
string
obj
client
.
Object
wantRequests
int
wantName
string
wantNs
string
}{
{
name
:
"PCSG with PodCliqueSet controller ownerRef returns DGD request"
,
obj
:
&
grovev1alpha1
.
PodCliqueScalingGroup
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"dynamo-recipe-0-worker"
,
Namespace
:
"mwieczorek-dsv32-trtllm-agg"
,
OwnerReferences
:
[]
metav1
.
OwnerReference
{
{
APIVersion
:
grovev1alpha1
.
SchemeGroupVersion
.
String
(),
Kind
:
"PodCliqueSet"
,
Name
:
"dynamo-recipe"
,
Controller
:
ptr
.
To
(
true
),
},
},
},
},
wantRequests
:
1
,
wantName
:
"dynamo-recipe"
,
wantNs
:
"mwieczorek-dsv32-trtllm-agg"
,
},
{
name
:
"PCSG with no ownerRef returns no requests"
,
obj
:
&
grovev1alpha1
.
PodCliqueScalingGroup
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"orphan-pcsg"
,
Namespace
:
"default"
,
},
},
wantRequests
:
0
,
},
{
name
:
"PCSG with non-controller PodCliqueSet ownerRef returns no requests"
,
obj
:
&
grovev1alpha1
.
PodCliqueScalingGroup
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"pcsg-with-non-controller-ref"
,
Namespace
:
"default"
,
OwnerReferences
:
[]
metav1
.
OwnerReference
{
{
APIVersion
:
grovev1alpha1
.
SchemeGroupVersion
.
String
(),
Kind
:
"PodCliqueSet"
,
Name
:
"some-pcs"
,
// Controller flag omitted: metav1.GetControllerOf must ignore this ref.
},
},
},
},
wantRequests
:
0
,
},
{
name
:
"PCSG with non-PodCliqueSet ownerRef returns no requests"
,
obj
:
&
grovev1alpha1
.
PodCliqueScalingGroup
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"weird-pcsg"
,
Namespace
:
"default"
,
OwnerReferences
:
[]
metav1
.
OwnerReference
{
{
APIVersion
:
"apps/v1"
,
Kind
:
"Deployment"
,
Name
:
"not-a-pcs"
,
},
},
},
},
wantRequests
:
0
,
},
{
name
:
"non-PCSG object returns no requests"
,
obj
:
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"foo"
,
Namespace
:
"default"
}},
wantRequests
:
0
,
},
}
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
g
:=
gomega
.
NewGomegaWithT
(
t
)
r
:=
&
DynamoGraphDeploymentReconciler
{}
reqs
:=
r
.
mapPodCliqueScalingGroupToRequests
(
context
.
Background
(),
tt
.
obj
)
g
.
Expect
(
reqs
)
.
To
(
gomega
.
HaveLen
(
tt
.
wantRequests
))
if
tt
.
wantRequests
==
1
{
g
.
Expect
(
reqs
[
0
]
.
Name
)
.
To
(
gomega
.
Equal
(
tt
.
wantName
))
g
.
Expect
(
reqs
[
0
]
.
Namespace
)
.
To
(
gomega
.
Equal
(
tt
.
wantNs
))
}
})
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment