Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
3d8e85e2
Unverified
Commit
3d8e85e2
authored
Apr 02, 2026
by
hhzhang16
Committed by
GitHub
Apr 02, 2026
Browse files
fix: use unique DGD name following DGDR (#7778)
Signed-off-by:
Hannah Zhang
<
hannahz@nvidia.com
>
parent
13058cbd
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
143 additions
and
11 deletions
+143
-11
deploy/operator/internal/controller/compute_dgd_name_test.go
deploy/operator/internal/controller/compute_dgd_name_test.go
+96
-0
deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
...nal/controller/dynamographdeploymentrequest_controller.go
+26
-1
deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
...ontroller/dynamographdeploymentrequest_controller_test.go
+21
-10
No files found.
deploy/operator/internal/controller/compute_dgd_name_test.go
0 → 100644
View file @
3d8e85e2
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package
controller
import
(
"testing"
nvidiacomv1beta1
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
metav1
"k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
)
func
TestComputeDGDName
(
t
*
testing
.
T
)
{
tests
:=
[]
struct
{
name
string
dgdr
*
nvidiacomv1beta1
.
DynamoGraphDeploymentRequest
expected
string
}{
{
name
:
"no overrides — uses DGDR name with -dgd suffix"
,
dgdr
:
&
nvidiacomv1beta1
.
DynamoGraphDeploymentRequest
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"my-dgdr"
},
},
expected
:
"my-dgdr-dgd"
,
},
{
name
:
"overrides.dgd is nil — uses DGDR name with -dgd suffix"
,
dgdr
:
&
nvidiacomv1beta1
.
DynamoGraphDeploymentRequest
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"my-dgdr"
},
Spec
:
nvidiacomv1beta1
.
DynamoGraphDeploymentRequestSpec
{
Overrides
:
&
nvidiacomv1beta1
.
OverridesSpec
{
DGD
:
nil
},
},
},
expected
:
"my-dgdr-dgd"
,
},
{
name
:
"overrides.dgd has explicit metadata.name — uses override name"
,
dgdr
:
&
nvidiacomv1beta1
.
DynamoGraphDeploymentRequest
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"my-dgdr"
},
Spec
:
nvidiacomv1beta1
.
DynamoGraphDeploymentRequestSpec
{
Overrides
:
&
nvidiacomv1beta1
.
OverridesSpec
{
DGD
:
&
runtime
.
RawExtension
{
Raw
:
[]
byte
(
`{"apiVersion":"nvidia.com/v1alpha1","kind":"DynamoGraphDeployment","metadata":{"name":"explicit-name"}}`
),
},
},
},
},
expected
:
"explicit-name"
,
},
{
name
:
"overrides.dgd has no metadata.name — falls back to DGDR name with -dgd suffix"
,
dgdr
:
&
nvidiacomv1beta1
.
DynamoGraphDeploymentRequest
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"my-dgdr"
},
Spec
:
nvidiacomv1beta1
.
DynamoGraphDeploymentRequestSpec
{
Overrides
:
&
nvidiacomv1beta1
.
OverridesSpec
{
DGD
:
&
runtime
.
RawExtension
{
Raw
:
[]
byte
(
`{"apiVersion":"nvidia.com/v1alpha1","kind":"DynamoGraphDeployment","metadata":{}}`
),
},
},
},
},
expected
:
"my-dgdr-dgd"
,
},
{
name
:
"two DGDRs with identical specs produce different names"
,
dgdr
:
&
nvidiacomv1beta1
.
DynamoGraphDeploymentRequest
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"tc-2-8-parallel-beta"
},
},
expected
:
"tc-2-8-parallel-beta-dgd"
,
},
}
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
got
:=
computeDGDName
(
tt
.
dgdr
)
if
got
!=
tt
.
expected
{
t
.
Errorf
(
"computeDGDName() = %q, want %q"
,
got
,
tt
.
expected
)
}
})
}
}
deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
View file @
3d8e85e2
...
...
@@ -1611,6 +1611,25 @@ func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx
return
""
}
// computeDGDName returns the Kubernetes name to use for the DGD that a DGDR owns.
// If the user supplied an explicit name via spec.overrides.dgd.metadata.name that
// value is returned as-is; otherwise the DGDR's own name is used with a "-dgd"
// suffix, guaranteeing uniqueness even when two DGDRs have identical specs (which
// would otherwise both produce the same profiler-generated name, e.g. "vllm-agg").
func
computeDGDName
(
dgdr
*
nvidiacomv1beta1
.
DynamoGraphDeploymentRequest
)
string
{
if
dgdr
.
Spec
.
Overrides
!=
nil
&&
dgdr
.
Spec
.
Overrides
.
DGD
!=
nil
&&
len
(
dgdr
.
Spec
.
Overrides
.
DGD
.
Raw
)
>
0
{
var
meta
struct
{
Metadata
struct
{
Name
string
`json:"name"`
}
`json:"metadata"`
}
if
err
:=
json
.
Unmarshal
(
dgdr
.
Spec
.
Overrides
.
DGD
.
Raw
,
&
meta
);
err
==
nil
&&
meta
.
Metadata
.
Name
!=
""
{
return
meta
.
Metadata
.
Name
}
}
return
dgdr
.
Name
+
"-dgd"
}
// generateDGDSpec reads profiling output from the sidecar ConfigMap, extracts the
// DynamoGraphDeployment spec and pareto configs, stores the spec in an annotation via
// r.Update, and returns the ProfilingResultsStatus and DGD name.
...
...
@@ -1651,7 +1670,13 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
return
nil
,
""
,
fmt
.
Errorf
(
"failed to extract DGD from %s: %w"
,
outputFile
,
err
)
}
logger
.
Info
(
"Parsed profiling output"
,
"dgdName"
,
dgd
.
Name
,
"additionalResources"
,
len
(
additionalResources
))
// Override the profiler-generated name with a DGDR-scoped unique name.
// The profiler emits a static topology-derived name (e.g. "vllm-agg") which
// collides when multiple DGDRs share identical specs. Derive the name from
// DGDR identity instead, respecting an explicit override if the user set one.
dgd
.
Name
=
computeDGDName
(
dgdr
)
logger
.
Info
(
"Parsed profiling output"
,
"profilerDGDName"
,
dgd
.
Name
,
"additionalResources"
,
len
(
additionalResources
))
if
len
(
additionalResources
)
>
0
{
if
err
:=
r
.
storeAdditionalResources
(
ctx
,
dgdr
,
additionalResources
);
err
!=
nil
{
...
...
deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
View file @
3d8e85e2
...
...
@@ -542,15 +542,21 @@ spec:
Expect
(
k8sClient
.
Status
()
.
Update
(
ctx
,
job
))
.
Should
(
Succeed
())
// Create output ConfigMap
// The profiler emits a static name in the YAML, but the operator must override
// it with a DGDR-scoped unique name to prevent collisions across DGDRs.
dgdYAML
:=
`apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name:
test-dgd-auto
name:
vllm-agg
spec:
services:
Frontend:
replicas: 1`
// expectedDGDName is the name the operator should assign: DGDR name + "-dgd",
// not the static "vllm-agg" that the profiler emitted.
expectedDGDName
:=
dgdrName
+
"-dgd"
outputConfigMapName
:=
getOutputConfigMapName
(
dgdr
)
cm
:=
&
corev1
.
ConfigMap
{
ObjectMeta
:
metav1
.
ObjectMeta
{
...
...
@@ -581,16 +587,16 @@ spec:
})
Expect
(
err
)
.
NotTo
(
HaveOccurred
())
// Verify DGD was created
// Verify DGD was created
with the DGDR-scoped name (not the profiler's "vllm-agg")
dgd
:=
&
dgdv1alpha1
.
DynamoGraphDeployment
{}
Expect
(
k8sClient
.
Get
(
ctx
,
types
.
NamespacedName
{
Name
:
"test-dgd-auto"
,
Namespace
:
namespace
},
dgd
))
.
Should
(
Succeed
())
Expect
(
k8sClient
.
Get
(
ctx
,
types
.
NamespacedName
{
Name
:
expectedDGDName
,
Namespace
:
namespace
},
dgd
))
.
Should
(
Succeed
())
// Get final DGDR status
Expect
(
k8sClient
.
Get
(
ctx
,
types
.
NamespacedName
{
Name
:
dgdrName
,
Namespace
:
namespace
},
&
updated
))
.
Should
(
Succeed
())
Expect
(
updated
.
Status
.
DGDName
)
.
Should
(
Equal
(
"test-dgd-auto"
))
Expect
(
updated
.
Status
.
DGDName
)
.
Should
(
Equal
(
expectedDGDName
))
// Clean up DGD
Expect
(
k8sClient
.
Get
(
ctx
,
types
.
NamespacedName
{
Name
:
"test-dgd-auto"
,
Namespace
:
namespace
},
dgd
))
.
Should
(
Succeed
())
Expect
(
k8sClient
.
Get
(
ctx
,
types
.
NamespacedName
{
Name
:
expectedDGDName
,
Namespace
:
namespace
},
dgd
))
.
Should
(
Succeed
())
_
=
k8sClient
.
Delete
(
ctx
,
dgd
)
})
})
...
...
@@ -1961,16 +1967,20 @@ spec:
}}
Expect
(
k8sClient
.
Status
()
.
Update
(
ctx
,
job
))
.
Should
(
Succeed
())
// Create output ConfigMap with mocker output file
// Create output ConfigMap with profiling output (the profiler itself handles mocker
// selection; the controller always reads from ProfilingOutputFile regardless).
dgdYAML
:=
`apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name:
test-dgd-mocker
name:
vllm-agg
spec:
services:
Frontend:
replicas: 1`
// expectedDGDName is derived from the DGDR name, not from the profiler's output.
expectedDGDName
:=
dgdrName
+
"-dgd"
cm
:=
&
corev1
.
ConfigMap
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
getOutputConfigMapName
(
dgdr
),
...
...
@@ -1983,16 +1993,17 @@ spec:
Expect
(
k8sClient
.
Create
(
ctx
,
cm
))
.
Should
(
Succeed
())
defer
func
()
{
_
=
k8sClient
.
Delete
(
ctx
,
cm
)
}()
// Reconcile — should read from mocker output file
// Reconcile — controller reads ProfilingOutputFile, then overrides the name to
// a DGDR-scoped unique name, and stores the result in the annotation.
_
,
err
:=
reconciler
.
Reconcile
(
ctx
,
reconcile
.
Request
{
NamespacedName
:
types
.
NamespacedName
{
Name
:
dgdrName
,
Namespace
:
namespace
},
})
Expect
(
err
)
.
NotTo
(
HaveOccurred
())
// Verify the generated spec
came from the mocker file
// Verify the generated spec
was stored and contains the DGDR-scoped DGD name.
var
updated
nvidiacomv1beta1
.
DynamoGraphDeploymentRequest
Expect
(
k8sClient
.
Get
(
ctx
,
types
.
NamespacedName
{
Name
:
dgdrName
,
Namespace
:
namespace
},
&
updated
))
.
Should
(
Succeed
())
Expect
(
updated
.
Annotations
[
"nvidia.com/generated-dgd-spec"
])
.
Should
(
ContainSubstring
(
"test-dgd-mocker"
))
Expect
(
updated
.
Annotations
[
"nvidia.com/generated-dgd-spec"
])
.
Should
(
ContainSubstring
(
expectedDGDName
))
})
It
(
"Should populate profilingJobName in status"
,
func
()
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment