/* * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package controller import ( "context" "time" configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1" dgdv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1" commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/reconcile" ) const ( defaultNamespace = "default" ) // MockRBACManager implements RBACManager for testing type MockRBACManager struct { EnsureServiceAccountWithRBACFunc func(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error } func (m *MockRBACManager) EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error { if m.EnsureServiceAccountWithRBACFunc != nil { return m.EnsureServiceAccountWithRBACFunc(ctx, targetNamespace, serviceAccountName, clusterRoleName) } return nil } var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { const ( timeout = time.Second * 10 interval = time.Millisecond * 250 ) var ( reconciler *DynamoGraphDeploymentRequestReconciler recorder *record.FakeRecorder ) BeforeEach(func() { recorder = record.NewFakeRecorder(100) reconciler = &DynamoGraphDeploymentRequestReconciler{ Client: k8sClient, Recorder: recorder, Config: &configv1alpha1.OperatorConfiguration{ Namespace: configv1alpha1.NamespaceConfiguration{ Restricted: "", }, RBAC: configv1alpha1.RBACConfiguration{ DGDRProfilingClusterRoleName: "test-cluster-role", }, }, RuntimeConfig: &commonController.RuntimeConfig{}, RBACManager: &MockRBACManager{}, } }) Context("When reconciling initial DGDR", func() { It("Should validate spec and transition to Pending", func() { ctx := context.Background() dgdrName := "test-dgdr-initial" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", AutoApply: ptr.To(true), Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // First reconcile: Empty -> Pending _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{ Name: dgdrName, Namespace: namespace, }, }) Expect(err).NotTo(HaveOccurred()) // Check status Eventually(func() nvidiacomv1beta1.DGDRPhase { var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) return updated.Status.Phase }, timeout, interval).Should(Equal(nvidiacomv1beta1.DGDRPhasePending)) // Verify observedGeneration is set var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) Expect(updated.Status.ObservedGeneration).Should(Equal(updated.Generation)) }) It("Should pass validation with minimal config", func() { ctx := context.Background() dgdrName := "test-dgdr-minimal" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile - should succeed with minimal config _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{ Name: dgdrName, Namespace: namespace, }, }) Expect(err).NotTo(HaveOccurred()) // Check status transitions to Pending (not Failed) Eventually(func() nvidiacomv1beta1.DGDRPhase { var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) return updated.Status.Phase }, timeout, interval).Should(Equal(nvidiacomv1beta1.DGDRPhasePending)) }) }) Context("When creating profiling job", func() { It("Should create online profiling job", func() { ctx := context.Background() dgdrName := "test-dgdr-profiling-online" namespace := defaultNamespace // Create ConfigMap for DGD base config configMap := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: "test-config", Namespace: namespace, }, Data: map[string]string{ "disagg.yaml": "test: config", }, } Expect(k8sClient.Create(ctx, configMap)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, configMap) }() // Create ServiceAccount sa := &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ Name: ServiceAccountProfilingJob, Namespace: namespace, }, } Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, sa) }() dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, Annotations: map[string]string{ "nvidia.com/dgdr-config-map-ref": `{"name":"test-config","key":"disagg.yaml"}`, }, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile multiple times to move through states _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Second reconcile: Pending -> Profiling _, err = reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Verify profiling job was created Eventually(func() bool { jobName := getProfilingJobName(dgdr) job := &batchv1.Job{} err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job) return err == nil }, timeout, interval).Should(BeTrue()) // Verify job has correct labels jobName := getProfilingJobName(dgdr) job := &batchv1.Job{} _ = k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job) Expect(job.Labels[nvidiacomv1beta1.LabelApp]).Should(Equal(nvidiacomv1beta1.LabelValueDynamoProfiler)) Expect(job.Labels[nvidiacomv1beta1.LabelDGDR]).Should(Equal(dgdrName)) // Verify job has profiler container Expect(job.Spec.Template.Spec.Containers).Should(HaveLen(2)) Expect(job.Spec.Template.Spec.Containers[0].Name).Should(Equal(ContainerNameProfiler)) Expect(job.Spec.Template.Spec.Containers[1].Name).Should(Equal(ContainerNameOutputCopier)) // Verify emptyDir volume (not PVC) Expect(job.Spec.Template.Spec.Volumes).Should(ContainElement( corev1.Volume{ Name: VolumeNameProfilingOutput, VolumeSource: corev1.VolumeSource{ EmptyDir: &corev1.EmptyDirVolumeSource{}, }, }, )) // Clean up job _ = k8sClient.Delete(ctx, job) }) It("Should create offline (AIC) profiling job", func() { ctx := context.Background() dgdrName := "test-dgdr-profiling-aic" namespace := defaultNamespace // Create ServiceAccount sa := &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ Name: ServiceAccountProfilingJob, Namespace: namespace, }, } Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, sa) }() dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "trtllm", Image: "test-profiler:latest", SearchStrategy: "rapid", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) _, err = reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Verify job was created with AIC label Eventually(func() string { jobName := getProfilingJobName(dgdr) job := &batchv1.Job{} if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job); err != nil { return "" } return job.Labels[nvidiacomv1beta1.LabelApp] }, timeout, interval).Should(Equal(nvidiacomv1beta1.LabelValueDynamoProfiler)) // Clean up jobName := getProfilingJobName(dgdr) job := &batchv1.Job{} if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job); err == nil { _ = k8sClient.Delete(ctx, job) } }) }) Context("When profiling completes", func() { It("Should generate DGD spec from ConfigMap", func() { ctx := context.Background() dgdrName := "test-dgdr-profiling-complete" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Update status to Profiling using Status subresource dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) // Create completed profiling job jobName := getProfilingJobName(dgdr) job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: jobName, Namespace: namespace, }, Spec: batchv1.JobSpec{ Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ Containers: []corev1.Container{{ Name: "test", Image: "test", }}, RestartPolicy: corev1.RestartPolicyNever, }, }, }, Status: batchv1.JobStatus{ Conditions: []batchv1.JobCondition{{ Type: batchv1.JobComplete, Status: corev1.ConditionTrue, }}, }, } Expect(k8sClient.Create(ctx, job)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, job) }() // Update job status to completed using Status subresource job.Status.Conditions = []batchv1.JobCondition{{ Type: batchv1.JobComplete, Status: corev1.ConditionTrue, }} Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) // Create output ConfigMap with DGD spec dgdYAML := `apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-dgd spec: services: Frontend: replicas: 1` outputConfigMapName := getOutputConfigMapName(dgdr) cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: outputConfigMapName, Namespace: namespace, }, Data: map[string]string{ ProfilingOutputFile: dgdYAML, }, } Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, cm) }() // Reconcile to process the profiling completion _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Get the updated DGDR var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) // Check that DGD spec was generated (stored in annotation) Expect(updated.Annotations["nvidia.com/generated-dgd-spec"]).NotTo(BeEmpty()) // autoApply defaults to true in v1beta1, so after profiling the DGDR transitions to Deploying Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseDeploying)) }) }) Context("When autoApply is enabled", func() { It("Should create DGD after profiling", func() { ctx := context.Background() dgdrName := "test-dgdr-autoapply" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, AutoApply: ptr.To(true), }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Update status to Profiling using Status subresource dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) // Create completed profiling job jobName := getProfilingJobName(dgdr) job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: jobName, Namespace: namespace, }, Spec: batchv1.JobSpec{ Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ Containers: []corev1.Container{{ Name: "test", Image: "test", }}, RestartPolicy: corev1.RestartPolicyNever, }, }, }, Status: batchv1.JobStatus{ Conditions: []batchv1.JobCondition{{ Type: batchv1.JobComplete, Status: corev1.ConditionTrue, }}, }, } Expect(k8sClient.Create(ctx, job)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, job) }() // Update job status to completed using Status subresource job.Status.Conditions = []batchv1.JobCondition{{ Type: batchv1.JobComplete, Status: corev1.ConditionTrue, }} Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) // Create output ConfigMap dgdYAML := `apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-dgd-auto spec: services: Frontend: replicas: 1` outputConfigMapName := getOutputConfigMapName(dgdr) cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: outputConfigMapName, Namespace: namespace, }, Data: map[string]string{ ProfilingOutputFile: dgdYAML, }, } Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, cm) }() // Reconcile to generate spec (transitions to Deploying because autoApply=true) _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Get updated DGDR and check state is Deploying var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseDeploying)) // Reconcile again to create DGD _, err = reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Verify DGD was created dgd := &dgdv1alpha1.DynamoGraphDeployment{} Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed()) // Get final DGDR status Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.DGDName).Should(Equal("test-dgd-auto")) // Clean up DGD Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}, dgd)).Should(Succeed()) _ = k8sClient.Delete(ctx, dgd) }) }) Context("When enforcing spec immutability", func() { It("Should reject spec changes after profiling starts", func() { ctx := context.Background() dgdrName := "test-dgdr-immutable" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile to initialize _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Get current generation var current nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, ¤t)).Should(Succeed()) initialGeneration := current.Generation observedGeneration := current.Status.ObservedGeneration // Manually set state to Profiling to simulate in-progress profiling current.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling Expect(k8sClient.Status().Update(ctx, ¤t)).Should(Succeed()) // Try to modify spec Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, ¤t)).Should(Succeed()) current.Spec.Model = "modified-model" Expect(k8sClient.Update(ctx, ¤t)).Should(Succeed()) // Reconcile _, err = reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Verify generation changed but observedGeneration stayed the same Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, ¤t)).Should(Succeed()) Expect(current.Generation).Should(BeNumerically(">", initialGeneration)) Expect(current.Status.ObservedGeneration).Should(Equal(observedGeneration)) Expect(current.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseProfiling)) // State unchanged // Verify event was recorded Eventually(func() bool { select { case event := <-recorder.Events: return event == "Warning SpecChangeRejected Cannot modify spec in phase 'Profiling'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead." default: return false } }, timeout, interval).Should(BeTrue()) }) }) Context("When handling DGD deletion", func() { It("Should transition to Failed phase when DGD is deleted", func() { ctx := context.Background() dgdrName := "test-dgdr-dgd-deleted" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, AutoApply: ptr.To(true), }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Update status to Deployed with Deployment info using Status subresource dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseDeployed dgdr.Status.DGDName = "test-dgd-to-delete" Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) // Reconcile when DGD doesn't exist (simulating deletion) _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Get updated DGDR and check phase transitioned to Failed var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseFailed)) }) }) }) var _ = Describe("DGDR Helper Functions", func() { Context("getProfilingJobName", func() { It("Should return correct job name", func() { dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: "test-dgdr", }, } Expect(getProfilingJobName(dgdr)).Should(Equal("profile-test-dgdr")) }) }) Context("getOutputConfigMapName", func() { It("Should return correct ConfigMap name", func() { dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: "test-dgdr", }, } Expect(getOutputConfigMapName(dgdr)).Should(Equal("dgdr-output-test-dgdr")) }) }) Context("isOnlineProfiling", func() { It("Should always return true regardless of spec", func() { dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", }, } Expect(isOnlineProfiling(dgdr)).Should(BeTrue()) }) It("Should return true with search strategy rapid", func() { dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "trtllm", SearchStrategy: "rapid", }, } Expect(isOnlineProfiling(dgdr)).Should(BeTrue()) }) It("Should return true with search strategy thorough", func() { dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", SearchStrategy: "thorough", }, } Expect(isOnlineProfiling(dgdr)).Should(BeTrue()) }) It("Should return true with nil spec fields", func() { dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", }, } Expect(isOnlineProfiling(dgdr)).Should(BeTrue()) }) }) }) var _ = Describe("DGDR Validation", func() { var reconciler *DynamoGraphDeploymentRequestReconciler BeforeEach(func() { reconciler = &DynamoGraphDeploymentRequestReconciler{ Client: k8sClient, } }) Context("validateSpec", func() { It("Should pass validation for valid spec", func() { ctx := context.Background() dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } err := reconciler.validateSpec(ctx, dgdr) Expect(err).NotTo(HaveOccurred()) }) It("Should pass validation with minimal config", func() { ctx := context.Background() dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } // Validation should pass - profiler will auto-generate missing config err := reconciler.validateSpec(ctx, dgdr) Expect(err).NotTo(HaveOccurred()) }) }) }) var _ = Describe("DGDR Profiler Arguments", func() { var reconciler *DynamoGraphDeploymentRequestReconciler BeforeEach(func() { reconciler = &DynamoGraphDeploymentRequestReconciler{ Client: k8sClient, Recorder: record.NewFakeRecorder(100), Config: &configv1alpha1.OperatorConfiguration{ Namespace: configv1alpha1.NamespaceConfiguration{ Restricted: "", }, }, RuntimeConfig: &commonController.RuntimeConfig{}, RBACManager: &MockRBACManager{}, } }) Context("When creating profiling job with inline config", func() { It("Should pass config as --config argument for online profiling", func() { ctx := context.Background() namespace := "default" dgdrName := "test-args-online" // Create ServiceAccount sa := &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ Name: ServiceAccountProfilingJob, Namespace: namespace, }, } Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, sa) }() dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "trtllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ GPUSKU: nvidiacomv1beta1.GPUSKUTypeH200SXM, NumGPUsPerNode: ptr.To[int32](8), VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(50.0), ITL: ptr.To(10.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Re-fetch DGDR to get proper metadata from API server var fetchedDGDR nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed()) // Create profiling job with properly initialized DGDR err := reconciler.createProfilingJob(ctx, &fetchedDGDR) Expect(err).NotTo(HaveOccurred()) // Verify job was created jobName := getProfilingJobName(&fetchedDGDR) job := &batchv1.Job{} Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed()) // Verify profiler container has --config argument profilerContainer := job.Spec.Template.Spec.Containers[0] args := profilerContainer.Args // Check that --config argument is present Expect(args).Should(ContainElement("--config")) // Clean up _ = k8sClient.Delete(ctx, job) }) It("Should pass config with AI Configurator settings for offline profiling", func() { ctx := context.Background() namespace := defaultNamespace dgdrName := "test-args-offline" // Create ServiceAccount sa := &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ Name: ServiceAccountProfilingJob, Namespace: namespace, }, } Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, sa) }() dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "trtllm", Image: "test-profiler:latest", SearchStrategy: "rapid", Hardware: &nvidiacomv1beta1.HardwareSpec{ GPUSKU: nvidiacomv1beta1.GPUSKUTypeH200SXM, NumGPUsPerNode: ptr.To[int32](8), VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(50.0), ITL: ptr.To(10.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Re-fetch DGDR to get proper metadata from API server var fetchedDGDR nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed()) // Create profiling job with properly initialized DGDR err := reconciler.createProfilingJob(ctx, &fetchedDGDR) Expect(err).NotTo(HaveOccurred()) // Verify job was created jobName := getProfilingJobName(&fetchedDGDR) job := &batchv1.Job{} Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed()) // Verify profiler container has --config argument profilerContainer := job.Spec.Template.Spec.Containers[0] args := profilerContainer.Args // Check that --config argument is present Expect(args).Should(ContainElement("--config")) // Clean up _ = k8sClient.Delete(ctx, job) }) It("Should set fsGroup in pod security context for volume permissions", func() { ctx := context.Background() namespace := "default" dgdrName := "test-fsgroup" // Create ServiceAccount sa := &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ Name: ServiceAccountProfilingJob, Namespace: namespace, }, } Expect(k8sClient.Create(ctx, sa)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, sa) }() dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "trtllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(50.0), ITL: ptr.To(10.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Re-fetch DGDR to get proper metadata from API server var fetchedDGDR nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &fetchedDGDR)).Should(Succeed()) // Create profiling job with properly initialized DGDR err := reconciler.createProfilingJob(ctx, &fetchedDGDR) Expect(err).NotTo(HaveOccurred()) // Verify job was created jobName := getProfilingJobName(&fetchedDGDR) job := &batchv1.Job{} Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed()) // Verify security context has all security fields set correctly podSecurityContext := job.Spec.Template.Spec.SecurityContext Expect(podSecurityContext).NotTo(BeNil()) Expect(podSecurityContext.RunAsNonRoot).NotTo(BeNil()) Expect(*podSecurityContext.RunAsNonRoot).To(BeTrue()) Expect(podSecurityContext.RunAsUser).NotTo(BeNil()) Expect(*podSecurityContext.RunAsUser).To(Equal(int64(1000))) Expect(podSecurityContext.RunAsGroup).NotTo(BeNil()) Expect(*podSecurityContext.RunAsGroup).To(Equal(int64(1000))) Expect(podSecurityContext.FSGroup).NotTo(BeNil()) Expect(*podSecurityContext.FSGroup).To(Equal(int64(1000))) // Clean up _ = k8sClient.Delete(ctx, job) }) }) }) var _ = Describe("DGDR Error Handling", func() { var reconciler *DynamoGraphDeploymentRequestReconciler var recorder *record.FakeRecorder BeforeEach(func() { recorder = record.NewFakeRecorder(100) reconciler = &DynamoGraphDeploymentRequestReconciler{ Client: k8sClient, APIReader: k8sClient, Recorder: recorder, Config: &configv1alpha1.OperatorConfiguration{ Namespace: configv1alpha1.NamespaceConfiguration{ Restricted: "", }, }, RuntimeConfig: &commonController.RuntimeConfig{}, RBACManager: &MockRBACManager{}, } }) Context("When profiling job fails", func() { It("Should capture detailed error from pod termination state", func() { ctx := context.Background() namespace := defaultNamespace dgdrName := "test-error-capture" dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Set status to Profiling dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) // Create failed job jobName := getProfilingJobName(dgdr) job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: jobName, Namespace: namespace, }, Spec: batchv1.JobSpec{ Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ Containers: []corev1.Container{{ Name: ContainerNameProfiler, Image: "test", }}, RestartPolicy: corev1.RestartPolicyNever, }, }, }, Status: batchv1.JobStatus{ Conditions: []batchv1.JobCondition{{ Type: batchv1.JobFailed, Status: corev1.ConditionTrue, Message: "BackoffLimitExceeded", }}, }, } Expect(k8sClient.Create(ctx, job)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, job) }() // Update job status job.Status.Conditions = []batchv1.JobCondition{{ Type: batchv1.JobFailed, Status: corev1.ConditionTrue, Message: "BackoffLimitExceeded", }} Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) // Create failed pod with termination details pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: jobName + "-pod", Namespace: namespace, Labels: map[string]string{ "job-name": jobName, }, }, Spec: corev1.PodSpec{ Containers: []corev1.Container{{ Name: ContainerNameProfiler, Image: "test", }}, RestartPolicy: corev1.RestartPolicyNever, }, Status: corev1.PodStatus{ Phase: corev1.PodFailed, ContainerStatuses: []corev1.ContainerStatus{{ Name: ContainerNameProfiler, State: corev1.ContainerState{ Terminated: &corev1.ContainerStateTerminated{ ExitCode: 1, Reason: "Error", Message: "ValueError: Invalid model name for AI Configurator", }, }, }}, }, } Expect(k8sClient.Create(ctx, pod)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, pod) }() // Reconcile - should capture error details _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Verify DGDR transitioned to Failed state var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseFailed)) // Verify error condition contains detailed error condition := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeProfiling) Expect(condition).NotTo(BeNil()) Expect(condition.Status).Should(Equal(metav1.ConditionFalse)) Expect(condition.Message).Should(ContainSubstring("profiling job failed")) }) }) Context("When parsing multi-document YAML", func() { It("Should extract DGD from ConfigMap + DGD YAML", func() { // Multi-document YAML with ConfigMap first, then DGD multiDocYAML := `--- apiVersion: v1 kind: ConfigMap metadata: name: test-config namespace: default data: some-data: "value" --- apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-dgd namespace: default spec: backendFramework: vllm services: {}` dgd, err := reconciler.extractDGDFromYAML([]byte(multiDocYAML)) Expect(err).NotTo(HaveOccurred()) Expect(dgd).NotTo(BeNil()) Expect(dgd.Kind).Should(Equal("DynamoGraphDeployment")) Expect(dgd.Name).Should(Equal("test-dgd")) Expect(dgd.Spec.BackendFramework).Should(Equal("vllm")) }) It("Should extract DGD from single-document YAML", func() { // Single document YAML without separator singleDocYAML := `apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-dgd-single namespace: default spec: backendFramework: vllm services: {}` dgd, err := reconciler.extractDGDFromYAML([]byte(singleDocYAML)) Expect(err).NotTo(HaveOccurred()) Expect(dgd).NotTo(BeNil()) Expect(dgd.Kind).Should(Equal("DynamoGraphDeployment")) Expect(dgd.Name).Should(Equal("test-dgd-single")) }) It("Should handle DGD + ConfigMap order (DGD first)", func() { // Multi-document YAML with DGD first, then ConfigMap multiDocYAML := `--- apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-dgd-first namespace: default spec: backendFramework: vllm services: {} --- apiVersion: v1 kind: ConfigMap metadata: name: test-config namespace: default data: some-data: "value"` dgd, err := reconciler.extractDGDFromYAML([]byte(multiDocYAML)) Expect(err).NotTo(HaveOccurred()) Expect(dgd).NotTo(BeNil()) Expect(dgd.Kind).Should(Equal("DynamoGraphDeployment")) Expect(dgd.Name).Should(Equal("test-dgd-first")) }) It("Should return error when no DGD found", func() { // YAML with only ConfigMap configMapOnlyYAML := `--- apiVersion: v1 kind: ConfigMap metadata: name: test-config namespace: default data: some-data: "value"` _, err := reconciler.extractDGDFromYAML([]byte(configMapOnlyYAML)) Expect(err).To(HaveOccurred()) Expect(err.Error()).Should(ContainSubstring("no DynamoGraphDeployment found")) }) It("Should handle YAML with leading separator", func() { // YAML starting with --- separator yamlWithLeadingSeparator := `--- apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-dgd-leading namespace: default spec: backendFramework: vllm services: {}` dgd, err := reconciler.extractDGDFromYAML([]byte(yamlWithLeadingSeparator)) Expect(err).NotTo(HaveOccurred()) Expect(dgd).NotTo(BeNil()) Expect(dgd.Name).Should(Equal("test-dgd-leading")) }) It("Should extract DGD and additional resources correctly", func() { // Multi-document YAML with ConfigMap and DGD multiDocYAML := `--- apiVersion: v1 kind: ConfigMap metadata: name: model-config namespace: default data: model.json: '{"name": "test-model"}' --- apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-dgd namespace: default spec: backendFramework: vllm services: {}` dgd, additionalResources, err := reconciler.extractResourcesFromYAML([]byte(multiDocYAML)) Expect(err).NotTo(HaveOccurred()) Expect(dgd).NotTo(BeNil()) Expect(dgd.Name).Should(Equal("test-dgd")) Expect(additionalResources).To(HaveLen(1)) Expect(additionalResources[0].GetKind()).Should(Equal("ConfigMap")) Expect(additionalResources[0].GetName()).Should(Equal("model-config")) }) It("Should handle multiple additional resources", func() { // Multi-document YAML with multiple ConfigMaps and DGD multiDocYAML := `--- apiVersion: v1 kind: ConfigMap metadata: name: config1 data: key1: value1 --- apiVersion: v1 kind: ConfigMap metadata: name: config2 data: key2: value2 --- apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-dgd spec: backendFramework: vllm services: {}` dgd, additionalResources, err := reconciler.extractResourcesFromYAML([]byte(multiDocYAML)) Expect(err).NotTo(HaveOccurred()) Expect(dgd).NotTo(BeNil()) Expect(additionalResources).To(HaveLen(2)) Expect(additionalResources[0].GetName()).Should(Equal("config1")) Expect(additionalResources[1].GetName()).Should(Equal("config2")) }) }) Context("GPU Discovery Integration Tests", func() { It("Should use GPU discovery when nodes have GPU labels", func() { ctx := context.Background() dgdrName := "test-dgdr-gpu-discovery" namespace := defaultNamespace // Create a node with GPU labels (simulating GFD labels) gpuNode := &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "gpu-worker-1", Labels: map[string]string{ "nvidia.com/gpu.count": "8", "nvidia.com/gpu.product": "H100-SXM5-80GB", "nvidia.com/gpu.memory": "81920", }, }, } Expect(k8sClient.Create(ctx, gpuNode)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, gpuNode) }() // Create DGDR WITHOUT hardware config (should use GPU discovery) dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile - should succeed with GPU discovery _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{ Name: dgdrName, Namespace: namespace, }, }) Expect(err).NotTo(HaveOccurred()) // Should transition to Pending (validation passed) var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending)) }) It("Should respect manual hardware config over GPU discovery", func() { ctx := context.Background() dgdrName := "test-dgdr-manual-override" namespace := defaultNamespace // Create a node with H100 GPUs gpuNode := &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "gpu-worker-h100", Labels: map[string]string{ "nvidia.com/gpu.count": "8", "nvidia.com/gpu.product": "H100-SXM5-80GB", "nvidia.com/gpu.memory": "81920", }, }, } Expect(k8sClient.Create(ctx, gpuNode)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, gpuNode) }() // Create DGDR WITH manual hardware config (A100, not H100) dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](4), GPUSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM, VRAMMB: ptr.To(40960.0), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile - should succeed and use manual config _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{ Name: dgdrName, Namespace: namespace, }, }) Expect(err).NotTo(HaveOccurred()) // Should transition to Pending (validation passed with manual config) var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending)) }) It("Should succeed with GPU discovery when cluster has GPU nodes", func() { ctx := context.Background() dgdrName := "test-dgdr-with-autodiscovery" namespace := defaultNamespace // Create a GPU node so GPU discovery can succeed node := &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "gpu-worker-autodiscovery", Labels: map[string]string{ "nvidia.com/gpu.count": "8", "nvidia.com/gpu.product": "H100-SXM5-80GB", "nvidia.com/gpu.memory": "81920", }, }, } Expect(k8sClient.Create(ctx, node)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, node) }() // Create DGDR WITHOUT hardware config - should use GPU discovery dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile - should succeed with GPU discovery _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{ Name: dgdrName, Namespace: namespace, }, }) Expect(err).NotTo(HaveOccurred()) // Should transition to Pending var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending)) }) It("Should pass validation with explicit GPU ranges without GPU discovery", func() { ctx := context.Background() dgdrName := "test-dgdr-explicit-ranges" namespace := defaultNamespace // Intentionally don't create GPU nodes to test that explicit ranges work without GPU discovery // Create DGDR with explicit minNumGpusPerEngine/maxNumGpusPerEngine dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile - should succeed (explicit ranges + minimal hardware bypass GPU discovery requirement) _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{ Name: dgdrName, Namespace: namespace, }, }) Expect(err).NotTo(HaveOccurred()) // Should transition to Pending var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending)) }) It("Should use GPU discovery with heterogeneous nodes (picks best)", func() { ctx := context.Background() dgdrName := "test-dgdr-heterogeneous" namespace := defaultNamespace // Create nodes with different GPU configs nodeA100 := &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "gpu-worker-a100", Labels: map[string]string{ "nvidia.com/gpu.count": "4", "nvidia.com/gpu.product": "A100-SXM4-40GB", "nvidia.com/gpu.memory": "40960", }, }, } nodeH100 := &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "gpu-worker-h100", Labels: map[string]string{ "nvidia.com/gpu.count": "8", "nvidia.com/gpu.product": "H100-SXM5-80GB", "nvidia.com/gpu.memory": "81920", }, }, } Expect(k8sClient.Create(ctx, nodeA100)).Should(Succeed()) Expect(k8sClient.Create(ctx, nodeH100)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, nodeA100) _ = k8sClient.Delete(ctx, nodeH100) }() // Create DGDR without hardware config dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile - should pick H100 (8 GPUs > 4 GPUs) _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{ Name: dgdrName, Namespace: namespace, }, }) Expect(err).NotTo(HaveOccurred()) // Should transition to Pending (using H100 config) var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest _ = k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending)) }) }) Context("v1beta1-specific behavior", func() { It("Should transition to Deployed when DGD reaches Ready", func() { ctx := context.Background() dgdrName := "test-dgdr-deployed-phase" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Set DGDR to Deploying with a DGDName dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseDeploying dgdr.Status.DGDName = "test-dgd-deployed" Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) // Create the DGD in Ready state dgd := &dgdv1alpha1.DynamoGraphDeployment{ ObjectMeta: metav1.ObjectMeta{ Name: "test-dgd-deployed", Namespace: namespace, Labels: map[string]string{ nvidiacomv1beta1.LabelDGDRName: dgdrName, nvidiacomv1beta1.LabelDGDRNamespace: namespace, }, }, Spec: dgdv1alpha1.DynamoGraphDeploymentSpec{}, } Expect(k8sClient.Create(ctx, dgd)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgd) }() // Set DGD to Successful state dgd.Status.State = dgdv1alpha1.DGDStateSuccessful Expect(k8sClient.Status().Update(ctx, dgd)).Should(Succeed()) // Reconcile — should transition DGDR to Deployed _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseDeployed)) }) It("Should set Succeeded condition at each phase transition", func() { ctx := context.Background() dgdrName := "test-dgdr-succeeded-cond" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // First reconcile: initial validation → Pending _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) // Check that Succeeded condition exists with reason matching the phase succeededCond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeSucceeded) Expect(succeededCond).NotTo(BeNil()) Expect(succeededCond.Reason).Should(Equal(string(nvidiacomv1beta1.DGDRPhasePending))) }) It("Should set ProfilingPhase on entry to Profiling and clear on exit", func() { ctx := context.Background() dgdrName := "test-dgdr-profiling-phase" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Transition through initial validation to Pending _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Reconcile again to start profiling (creates job, transitions to Profiling) _, err = reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Check ProfilingPhase is set var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseProfiling)) Expect(updated.Status.ProfilingPhase).Should(Equal(nvidiacomv1beta1.ProfilingPhaseInitializing)) // Simulate profiling completion jobName := getProfilingJobName(&updated) job := &batchv1.Job{} Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed()) job.Status.Conditions = []batchv1.JobCondition{{ Type: batchv1.JobComplete, Status: corev1.ConditionTrue, }} Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) dgdYAML := `apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-dgd-profphase spec: services: Frontend: replicas: 1` cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: getOutputConfigMapName(&updated), Namespace: namespace, }, Data: map[string]string{ ProfilingOutputFile: dgdYAML, }, } Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, cm) }() // Reconcile to complete profiling _, err = reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.ProfilingPhase).Should(BeEmpty(), "profilingPhase must be cleared after profiling completes") Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseDeploying), "phase must be Deploying after profiling completes with autoApply=true") }) It("Should use spec.features.mocker.enabled to select mocker output", func() { ctx := context.Background() dgdrName := "test-dgdr-mocker" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", AutoApply: ptr.To(true), Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, Features: &nvidiacomv1beta1.FeaturesSpec{ Mocker: &nvidiacomv1beta1.MockerSpec{Enabled: true}, }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Transition to Profiling dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling dgdr.Status.ObservedGeneration = dgdr.Generation Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed()) // Create completed job jobName := getProfilingJobName(dgdr) job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: jobName, Namespace: namespace}, Spec: batchv1.JobSpec{ Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ Containers: []corev1.Container{{Name: "test", Image: "test"}}, RestartPolicy: corev1.RestartPolicyNever, }, }, }, } Expect(k8sClient.Create(ctx, job)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, job) }() job.Status.Conditions = []batchv1.JobCondition{{ Type: batchv1.JobComplete, Status: corev1.ConditionTrue, }} Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) // Create output ConfigMap with mocker output file dgdYAML := `apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-dgd-mocker spec: services: Frontend: replicas: 1` cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: getOutputConfigMapName(dgdr), Namespace: namespace, }, Data: map[string]string{ ProfilingOutputFile: dgdYAML, }, } Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, cm) }() // Reconcile — should read from mocker output file _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Verify the generated spec came from the mocker file var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Annotations["nvidia.com/generated-dgd-spec"]).Should(ContainSubstring("test-dgd-mocker")) }) It("Should populate profilingJobName in status", func() { ctx := context.Background() dgdrName := "test-dgdr-jobname" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](128), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile through initial validation _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Reconcile to create profiling job _, err = reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Check profilingJobName is set in status var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.ProfilingJobName).Should(Equal(getProfilingJobName(&updated))) // Clean up job job := &batchv1.Job{} _ = k8sClient.Get(ctx, types.NamespacedName{Name: updated.Status.ProfilingJobName, Namespace: namespace}, job) _ = k8sClient.Delete(ctx, job) }) It("Should clear profilingPhase, set ProfilingCompleted condition, and populate profilingResults.selectedConfig after profiling completes", func() { // Regression test for: profilingPhase staying "Initializing", Profiling condition // not updated to ProfilingCompleted, and profilingResults never populated. // Root cause was that generateDGDSpec called r.Get() before r.Status().Update(), // overwriting the in-memory status changes made by handleProfilingPhase. ctx := context.Background() dgdrName := "test-dgdr-status-regression" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", AutoApply: ptr.To(false), Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](8), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(200.0), ITL: ptr.To(30.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile: initial validation → Pending _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) // Reconcile: create profiling job → Profiling + ProfilingPhase=Initializing _, err = reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseProfiling)) Expect(updated.Status.ProfilingPhase).Should(Equal(nvidiacomv1beta1.ProfilingPhaseInitializing)) // Mark profiling job as complete jobName := getProfilingJobName(&updated) job := &batchv1.Job{} Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, job) }() job.Status.Conditions = []batchv1.JobCondition{{ Type: batchv1.JobComplete, Status: corev1.ConditionTrue, }} Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) // Create the output ConfigMap that the sidecar would have created dgdYAML := `apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-status-regression-dgd spec: services: Frontend: replicas: 1 VllmWorker: replicas: 2` cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: getOutputConfigMapName(&updated), Namespace: namespace, }, Data: map[string]string{ ProfilingOutputFile: dgdYAML, }, } Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, cm) }() // Reconcile: profiling complete → should clear profilingPhase, set // ProfilingCompleted condition, populate profilingResults.selectedConfig _, err = reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) // profilingPhase must be cleared (was staying "Initializing" before fix) Expect(updated.Status.ProfilingPhase).Should(BeEmpty(), "profilingPhase should be cleared after profiling completes") // Profiling condition must be ProfilingCompleted (was never set before fix) profilingCond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeProfiling) Expect(profilingCond).ShouldNot(BeNil(), "Profiling condition must exist") Expect(profilingCond.Status).Should(Equal(metav1.ConditionTrue), "Profiling condition status must be True") Expect(profilingCond.Reason).Should(Equal("ProfilingCompleted"), "Profiling condition reason must be ProfilingCompleted") // profilingResults.selectedConfig must be populated (was nil before fix) Expect(updated.Status.ProfilingResults).ShouldNot(BeNil(), "profilingResults must be populated after profiling") Expect(updated.Status.ProfilingResults.SelectedConfig).ShouldNot(BeNil(), "profilingResults.selectedConfig must be set") Expect(updated.Status.ProfilingResults.SelectedConfig.Raw).ShouldNot(BeEmpty(), "profilingResults.selectedConfig must not be empty JSON") // Phase must be Ready (autoApply=false → profiling complete, spec available) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseReady), "phase must be Ready after profiling completes with autoApply=false") // status.dgdName must be preserved after profiling Expect(updated.Status.DGDName).ShouldNot(BeEmpty(), "status.dgdName must be preserved after profiling") }) It("Should populate profilingResults.pareto from webui_data.json in output ConfigMap", func() { // Regression test for profilingResults.pareto never being populated. // The sidecar now includes webui_data.json in the output ConfigMap; // the controller reads it to populate status.profilingResults.pareto. ctx := context.Background() dgdrName := "test-dgdr-pareto" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", AutoApply: ptr.To(false), Hardware: &nvidiacomv1beta1.HardwareSpec{ NumGPUsPerNode: ptr.To[int32](8), GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, VRAMMB: ptr.To(81920.0), TotalGPUs: ptr.To[int32](8), }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(200.0), ITL: ptr.To(30.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Drive to Profiling phase _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) _, err = reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseProfiling)) // Mark job complete jobName := getProfilingJobName(&updated) job := &batchv1.Job{} Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, job) }() job.Status.Conditions = []batchv1.JobCondition{{ Type: batchv1.JobComplete, Status: corev1.ConditionTrue, }} Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed()) // Create output ConfigMap that includes both final_config.yaml and webui_data.json. // The webui_data.json format mirrors what the profiler writes; two rows in cost.table // represent two Pareto-optimal configurations. dgdYAML := `apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: name: test-pareto-dgd spec: services: Frontend: replicas: 1` webUIDataJSON := `{ "settings": {}, "prefill": {"chart": {}, "table": {"columns": [], "data": []}}, "decode": {"chart": {}, "table": {"columns": [], "data": []}}, "cost": { "chart": {}, "index_mapping": {"0": [0, 0], "1": [0, 1]}, "table": { "columns": ["TTFT (ms)", "Prefill Thpt", "ITL (ms)", "Decode Thpt", "Tokens/User", "GPU Hours", "Action"], "data": [ [94.38, 7946.85, 7.09, 35.26, 141.04, 16.17, "# Prefill: 4 GPU(s), TP=4\n# Decode: 4 GPU(s), TP=4\napiVersion: nvidia.com/v1alpha1\nkind: DynamoGraphDeployment\nspec:\n services:\n PrefillWorker:\n replicas: 1\n DecodeWorker:\n replicas: 1\n"], [94.38, 7946.85, 10.69, 46.77, 93.54, 6.36, "# Prefill: 4 GPU(s), TP=4\n# Decode: 2 GPU(s), TP=2\napiVersion: nvidia.com/v1alpha1\nkind: DynamoGraphDeployment\nspec:\n services:\n PrefillWorker:\n replicas: 1\n DecodeWorker:\n replicas: 1\n"] ] } } }` cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: getOutputConfigMapName(&updated), Namespace: namespace, }, Data: map[string]string{ ProfilingOutputFile: dgdYAML, "webui_data.json": webUIDataJSON, }, } Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, cm) }() // Reconcile to complete profiling and populate profilingResults _, err = reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) // profilingResults.pareto must contain both Pareto-optimal configurations Expect(updated.Status.ProfilingResults).ShouldNot(BeNil(), "profilingResults must be populated") Expect(updated.Status.ProfilingResults.Pareto).Should(HaveLen(2), "profilingResults.pareto should contain 2 entries from webui_data.json") // Each pareto entry must have non-empty Config for i, p := range updated.Status.ProfilingResults.Pareto { Expect(p.Config.Raw).ShouldNot(BeEmpty(), "pareto[%d].config must not be empty", i) } }) It("Should validate typed hardware fields without blob parsing", func() { ctx := context.Background() dgdrName := "test-dgdr-typed-hw" namespace := defaultNamespace dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ ObjectMeta: metav1.ObjectMeta{ Name: dgdrName, Namespace: namespace, }, Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Model: "test-model", Backend: "vllm", Image: "test-profiler:latest", Hardware: &nvidiacomv1beta1.HardwareSpec{ GPUSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM, }, SLA: &nvidiacomv1beta1.SLASpec{ TTFT: ptr.To(100.0), ITL: ptr.To(1500.0), }, }, } Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed()) defer func() { _ = k8sClient.Delete(ctx, dgdr) }() // Reconcile — partial hardware (GPUSKU only) should pass validation _, err := reconciler.Reconcile(ctx, reconcile.Request{ NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace}, }) Expect(err).NotTo(HaveOccurred()) var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhasePending)) }) }) })