Unverified Commit 3d8e85e2 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

fix: use unique DGD name following DGDR (#7778)


Signed-off-by: default avatarHannah Zhang <hannahz@nvidia.com>
parent 13058cbd
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"testing"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
)
func TestComputeDGDName(t *testing.T) {
tests := []struct {
name string
dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest
expected string
}{
{
name: "no overrides — uses DGDR name with -dgd suffix",
dgdr: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "my-dgdr"},
},
expected: "my-dgdr-dgd",
},
{
name: "overrides.dgd is nil — uses DGDR name with -dgd suffix",
dgdr: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "my-dgdr"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Overrides: &nvidiacomv1beta1.OverridesSpec{DGD: nil},
},
},
expected: "my-dgdr-dgd",
},
{
name: "overrides.dgd has explicit metadata.name — uses override name",
dgdr: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "my-dgdr"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Overrides: &nvidiacomv1beta1.OverridesSpec{
DGD: &runtime.RawExtension{
Raw: []byte(`{"apiVersion":"nvidia.com/v1alpha1","kind":"DynamoGraphDeployment","metadata":{"name":"explicit-name"}}`),
},
},
},
},
expected: "explicit-name",
},
{
name: "overrides.dgd has no metadata.name — falls back to DGDR name with -dgd suffix",
dgdr: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "my-dgdr"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Overrides: &nvidiacomv1beta1.OverridesSpec{
DGD: &runtime.RawExtension{
Raw: []byte(`{"apiVersion":"nvidia.com/v1alpha1","kind":"DynamoGraphDeployment","metadata":{}}`),
},
},
},
},
expected: "my-dgdr-dgd",
},
{
name: "two DGDRs with identical specs produce different names",
dgdr: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "tc-2-8-parallel-beta"},
},
expected: "tc-2-8-parallel-beta-dgd",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := computeDGDName(tt.dgdr)
if got != tt.expected {
t.Errorf("computeDGDName() = %q, want %q", got, tt.expected)
}
})
}
}
...@@ -1611,6 +1611,25 @@ func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx ...@@ -1611,6 +1611,25 @@ func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx
return "" return ""
} }
// computeDGDName returns the Kubernetes name to use for the DGD that a DGDR owns.
// If the user supplied an explicit name via spec.overrides.dgd.metadata.name that
// value is returned as-is; otherwise the DGDR's own name is used with a "-dgd"
// suffix, guaranteeing uniqueness even when two DGDRs have identical specs (which
// would otherwise both produce the same profiler-generated name, e.g. "vllm-agg").
func computeDGDName(dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) string {
if dgdr.Spec.Overrides != nil && dgdr.Spec.Overrides.DGD != nil && len(dgdr.Spec.Overrides.DGD.Raw) > 0 {
var meta struct {
Metadata struct {
Name string `json:"name"`
} `json:"metadata"`
}
if err := json.Unmarshal(dgdr.Spec.Overrides.DGD.Raw, &meta); err == nil && meta.Metadata.Name != "" {
return meta.Metadata.Name
}
}
return dgdr.Name + "-dgd"
}
// generateDGDSpec reads profiling output from the sidecar ConfigMap, extracts the // generateDGDSpec reads profiling output from the sidecar ConfigMap, extracts the
// DynamoGraphDeployment spec and pareto configs, stores the spec in an annotation via // DynamoGraphDeployment spec and pareto configs, stores the spec in an annotation via
// r.Update, and returns the ProfilingResultsStatus and DGD name. // r.Update, and returns the ProfilingResultsStatus and DGD name.
...@@ -1651,7 +1670,13 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con ...@@ -1651,7 +1670,13 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
return nil, "", fmt.Errorf("failed to extract DGD from %s: %w", outputFile, err) return nil, "", fmt.Errorf("failed to extract DGD from %s: %w", outputFile, err)
} }
logger.Info("Parsed profiling output", "dgdName", dgd.Name, "additionalResources", len(additionalResources)) // Override the profiler-generated name with a DGDR-scoped unique name.
// The profiler emits a static topology-derived name (e.g. "vllm-agg") which
// collides when multiple DGDRs share identical specs. Derive the name from
// DGDR identity instead, respecting an explicit override if the user set one.
dgd.Name = computeDGDName(dgdr)
logger.Info("Parsed profiling output", "profilerDGDName", dgd.Name, "additionalResources", len(additionalResources))
if len(additionalResources) > 0 { if len(additionalResources) > 0 {
if err := r.storeAdditionalResources(ctx, dgdr, additionalResources); err != nil { if err := r.storeAdditionalResources(ctx, dgdr, additionalResources); err != nil {
......
...@@ -542,15 +542,21 @@ spec: ...@@ -542,15 +542,21 @@ spec:
Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
// Create output ConfigMap // Create output ConfigMap
// The profiler emits a static name in the YAML, but the operator must override
// it with a DGDR-scoped unique name to prevent collisions across DGDRs.
dgdYAML := `apiVersion: nvidia.com/v1alpha1 dgdYAML := `apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment kind: DynamoGraphDeployment
metadata: metadata:
name: test-dgd-auto name: vllm-agg
spec: spec:
services: services:
Frontend: Frontend:
replicas: 1` replicas: 1`
// expectedDGDName is the name the operator should assign: DGDR name + "-dgd",
// not the static "vllm-agg" that the profiler emitted.
expectedDGDName := dgdrName + "-dgd"
outputConfigMapName := getOutputConfigMapName(dgdr) outputConfigMapName := getOutputConfigMapName(dgdr)
cm := &corev1.ConfigMap{ cm := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
...@@ -581,16 +587,16 @@ spec: ...@@ -581,16 +587,16 @@ spec:
}) })
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Verify DGD was created // Verify DGD was created with the DGDR-scoped name (not the profiler's "vllm-agg")
dgd := &dgdv1alpha1.DynamoGraphDeployment{} dgd := &dgdv1alpha1.DynamoGraphDeployment{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: expectedDGDName, Namespace: namespace}, dgd)).Should(Succeed())
// Get final DGDR status // Get final DGDR status
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.DGDName).Should(Equal("test-dgd-auto")) Expect(updated.Status.DGDName).Should(Equal(expectedDGDName))
// Clean up DGD // Clean up DGD
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: expectedDGDName, Namespace: namespace}, dgd)).Should(Succeed())
_ = k8sClient.Delete(ctx, dgd) _ = k8sClient.Delete(ctx, dgd)
}) })
}) })
...@@ -1961,16 +1967,20 @@ spec: ...@@ -1961,16 +1967,20 @@ spec:
}} }}
Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
// Create output ConfigMap with mocker output file // Create output ConfigMap with profiling output (the profiler itself handles mocker
// selection; the controller always reads from ProfilingOutputFile regardless).
dgdYAML := `apiVersion: nvidia.com/v1alpha1 dgdYAML := `apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment kind: DynamoGraphDeployment
metadata: metadata:
name: test-dgd-mocker name: vllm-agg
spec: spec:
services: services:
Frontend: Frontend:
replicas: 1` replicas: 1`
// expectedDGDName is derived from the DGDR name, not from the profiler's output.
expectedDGDName := dgdrName + "-dgd"
cm := &corev1.ConfigMap{ cm := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: getOutputConfigMapName(dgdr), Name: getOutputConfigMapName(dgdr),
...@@ -1983,16 +1993,17 @@ spec: ...@@ -1983,16 +1993,17 @@ spec:
Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, cm) }() defer func() { _ = k8sClient.Delete(ctx, cm) }()
// Reconcile — should read from mocker output file // Reconcile — controller reads ProfilingOutputFile, then overrides the name to
// a DGDR-scoped unique name, and stores the result in the annotation.
_, err := reconciler.Reconcile(ctx, reconcile.Request{ _, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
}) })
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// Verify the generated spec came from the mocker file // Verify the generated spec was stored and contains the DGDR-scoped DGD name.
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Annotations["nvidia.com/generated-dgd-spec"]).Should(ContainSubstring("test-dgd-mocker")) Expect(updated.Annotations["nvidia.com/generated-dgd-spec"]).Should(ContainSubstring(expectedDGDName))
}) })
It("Should populate profilingJobName in status", func() { It("Should populate profilingJobName in status", func() {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment