Unverified Commit fce90859 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

fix: populate profilingResults, clear profilingPhase, and update Profiling...


fix: populate profilingResults, clear profilingPhase, and update Profiling condition after profiling completes (#7116)
Signed-off-by: default avatarHannah Zhang <hannahz@nvidia.com>
parent 218cb13c
...@@ -24,6 +24,7 @@ import ( ...@@ -24,6 +24,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"io" "io"
"strings"
"text/template" "text/template"
batchv1 "k8s.io/api/batch/v1" batchv1 "k8s.io/api/batch/v1"
...@@ -228,6 +229,12 @@ if [ -f {{.OutputPath}}/profiler_status.yaml ]; then ...@@ -228,6 +229,12 @@ if [ -f {{.OutputPath}}/profiler_status.yaml ]; then
sed 's/^/ /' {{.OutputPath}}/profiler_status.yaml >> /tmp/cm.yaml sed 's/^/ /' {{.OutputPath}}/profiler_status.yaml >> /tmp/cm.yaml
fi fi
# Add webui_data.json for pareto curve data (used by operator to populate status.profilingResults.pareto)
if [ -f {{.OutputPath}}/webui_data.json ]; then
echo " webui_data.json: |" >> /tmp/cm.yaml
sed 's/^/ /' {{.OutputPath}}/webui_data.json >> /tmp/cm.yaml
fi
# Note: Profiling data (raw_data.npz converted to JSON) is included in the # Note: Profiling data (raw_data.npz converted to JSON) is included in the
# generated DGD YAML as a separate ConfigMap by the profiler, no need to add it here # generated DGD YAML as a separate ConfigMap by the profiler, no need to add it here
...@@ -412,10 +419,17 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingPhase(ctx contex ...@@ -412,10 +419,17 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingPhase(ctx contex
return ctrl.Result{}, nil return ctrl.Result{}, nil
} }
// Profiling complete — clear the profiling sub-phase profilingResults, dgdName, err := r.generateDGDSpec(ctx, dgdr)
dgdr.ClearProfilingPhase() if err != nil {
dgdr.ClearProfilingPhase()
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageGenerationFailed, err.Error())
return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, nvidiacomv1beta1.ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, err.Error())
}
if err := r.Get(ctx, types.NamespacedName{Name: dgdr.Name, Namespace: dgdr.Namespace}, dgdr); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to refetch DGDR after generateDGDSpec: %w", err)
}
// Mark profiling as completed successfully dgdr.ClearProfilingPhase()
meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{ meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
Type: nvidiacomv1beta1.ConditionTypeProfiling, Type: nvidiacomv1beta1.ConditionTypeProfiling,
Status: metav1.ConditionTrue, Status: metav1.ConditionTrue,
...@@ -423,14 +437,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingPhase(ctx contex ...@@ -423,14 +437,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingPhase(ctx contex
Reason: "ProfilingCompleted", Reason: "ProfilingCompleted",
Message: "Profiling job completed successfully", Message: "Profiling job completed successfully",
}) })
dgdr.Status.DGDName = dgdName
dgdr.Status.ProfilingResults = profilingResults
// Retrieve profiling results and generate spec
if err := r.generateDGDSpec(ctx, dgdr); err != nil {
r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageGenerationFailed, err.Error())
return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, nvidiacomv1beta1.ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, err.Error())
}
// Record spec generation event
r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonSpecGenerated, MessageSpecGenerated) r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonSpecGenerated, MessageSpecGenerated)
// Create additional resources (ConfigMaps) immediately after profiling // Create additional resources (ConfigMaps) immediately after profiling
...@@ -476,12 +485,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingPhase(ctx contex ...@@ -476,12 +485,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingPhase(ctx contex
return ctrl.Result{}, r.Status().Update(ctx, dgdr) return ctrl.Result{}, r.Status().Update(ctx, dgdr)
} }
// Check if we need to create DGD
if dgdr.Status.DGDName == "" { if dgdr.Status.DGDName == "" {
return r.createDGD(ctx, dgdr) return r.createDGD(ctx, dgdr)
} }
// DGD was already created, check its status
dgd := &dgdv1alpha1.DynamoGraphDeployment{} dgd := &dgdv1alpha1.DynamoGraphDeployment{}
err := r.Get(ctx, types.NamespacedName{ err := r.Get(ctx, types.NamespacedName{
Name: dgdr.Status.DGDName, Name: dgdr.Status.DGDName,
...@@ -489,7 +496,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingPhase(ctx contex ...@@ -489,7 +496,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingPhase(ctx contex
}, dgd) }, dgd)
if apierrors.IsNotFound(err) { if apierrors.IsNotFound(err) {
// DGD was deleted by user // Annotation present means DGD was never created (spec ready but create not yet called).
// Annotation absent means DGD was previously created and then manually deleted.
if _, hasSpec := dgdr.Annotations["nvidia.com/generated-dgd-spec"]; hasSpec {
return r.createDGD(ctx, dgdr)
}
return r.handleDGDDeleted(ctx, dgdr) return r.handleDGDDeleted(ctx, dgdr)
} }
...@@ -664,8 +675,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, ...@@ -664,8 +675,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
if err := r.Create(ctx, dgd); err != nil { if err := r.Create(ctx, dgd); err != nil {
if apierrors.IsAlreadyExists(err) { if apierrors.IsAlreadyExists(err) {
// DGD already exists, just update status
logger.Info("DGD already exists, updating status") logger.Info("DGD already exists, updating status")
delete(dgdr.Annotations, "nvidia.com/generated-dgd-spec")
if updateErr := r.Update(ctx, dgdr); updateErr != nil {
logger.Error(updateErr, "Failed to remove generated-dgd-spec annotation on IsAlreadyExists path")
return ctrl.Result{}, updateErr
}
dgdr.Status.DGDName = dgdName dgdr.Status.DGDName = dgdName
return ctrl.Result{}, r.Status().Update(ctx, dgdr) return ctrl.Result{}, r.Status().Update(ctx, dgdr)
} }
...@@ -673,6 +688,13 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, ...@@ -673,6 +688,13 @@ func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context,
return ctrl.Result{}, err return ctrl.Result{}, err
} }
delete(dgdr.Annotations, "nvidia.com/generated-dgd-spec")
if err := r.Update(ctx, dgdr); err != nil {
// Return the error to force a retry. The DGD was created successfully, so a
// retry will hit the IsAlreadyExists path above and attempt cleanup again.
return ctrl.Result{}, fmt.Errorf("failed to remove generated-dgd-spec annotation after DGD creation: %w", err)
}
// Update status // Update status
dgdr.Status.DGDName = dgdName dgdr.Status.DGDName = dgdName
...@@ -1350,8 +1372,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx ...@@ -1350,8 +1372,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) getProfilingJobErrorDetails(ctx
return "" return ""
} }
// generateDGDSpec generates DGD spec from profiling results (online or offline/AIC) // generateDGDSpec reads profiling output from the sidecar ConfigMap, extracts the
func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error { // DynamoGraphDeployment spec and pareto configs, stores the spec in an annotation via
// r.Update, and returns the ProfilingResultsStatus and DGD name.
func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) (*nvidiacomv1beta1.ProfilingResultsStatus, string, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("Generating DGD spec from profiling results", "name", dgdr.Name, "backend", dgdr.Spec.Backend) logger.Info("Generating DGD spec from profiling results", "name", dgdr.Name, "backend", dgdr.Spec.Backend)
...@@ -1365,9 +1389,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con ...@@ -1365,9 +1389,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
if err != nil { if err != nil {
if apierrors.IsNotFound(err) { if apierrors.IsNotFound(err) {
return fmt.Errorf("output ConfigMap %s not found - profiling may not have completed yet", outputConfigMapName) return nil, "", fmt.Errorf("output ConfigMap %s not found - profiling may not have completed yet", outputConfigMapName)
} }
return fmt.Errorf("failed to get output ConfigMap: %w", err) return nil, "", fmt.Errorf("failed to get output ConfigMap: %w", err)
} }
// Select the right config file based on mocker feature flag // Select the right config file based on mocker feature flag
...@@ -1377,7 +1401,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con ...@@ -1377,7 +1401,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
// Get YAML content from ConfigMap // Get YAML content from ConfigMap
yamlContent, exists := cm.Data[outputFile] yamlContent, exists := cm.Data[outputFile]
if !exists { if !exists {
return fmt.Errorf("key %s not found in ConfigMap %s", outputFile, outputConfigMapName) return nil, "", fmt.Errorf("key %s not found in ConfigMap %s", outputFile, outputConfigMapName)
} }
logger.Info("Found profiling output in ConfigMap", "configMap", outputConfigMapName, "outputFile", outputFile, "size", len(yamlContent)) logger.Info("Found profiling output in ConfigMap", "configMap", outputConfigMapName, "outputFile", outputFile, "size", len(yamlContent))
...@@ -1385,58 +1409,125 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con ...@@ -1385,58 +1409,125 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
// Extract DGD and any supporting resources from potentially multi-document YAML (ConfigMap + DGD) // Extract DGD and any supporting resources from potentially multi-document YAML (ConfigMap + DGD)
dgd, additionalResources, err := r.extractResourcesFromYAML([]byte(yamlContent)) dgd, additionalResources, err := r.extractResourcesFromYAML([]byte(yamlContent))
if err != nil { if err != nil {
return fmt.Errorf("failed to extract DGD from %s: %w", outputFile, err) return nil, "", fmt.Errorf("failed to extract DGD from %s: %w", outputFile, err)
} }
logger.Info("Parsed profiling output", "dgdName", dgd.Name, "additionalResources", len(additionalResources)) logger.Info("Parsed profiling output", "dgdName", dgd.Name, "additionalResources", len(additionalResources))
// Store additional resources (ConfigMaps) in annotations first
if len(additionalResources) > 0 { if len(additionalResources) > 0 {
if err := r.storeAdditionalResources(ctx, dgdr, additionalResources); err != nil { if err := r.storeAdditionalResources(ctx, dgdr, additionalResources); err != nil {
logger.Error(err, "Failed to store additional resources") logger.Error(err, "Failed to store additional resources")
return err return nil, "", err
} }
// Refetch the DGDR after updating annotations to get the latest resourceVersion // storeAdditionalResources calls r.Update internally, bumping resourceVersion.
// Refetch so the subsequent r.Update for the spec annotation doesn't 409.
if err := r.Get(ctx, types.NamespacedName{Name: dgdr.Name, Namespace: dgdr.Namespace}, dgdr); err != nil { if err := r.Get(ctx, types.NamespacedName{Name: dgdr.Name, Namespace: dgdr.Namespace}, dgdr); err != nil {
return fmt.Errorf("failed to refetch DGDR after storing annotations: %w", err) return nil, "", fmt.Errorf("failed to refetch DGDR after storing additional resources: %w", err)
} }
} }
// Store the generated DGD name in status and cache the spec in an annotation for createDGD profilingResults := &nvidiacomv1beta1.ProfilingResultsStatus{}
dgdr.Status.DGDName = dgd.Name if webUIData, ok := cm.Data["webui_data.json"]; ok {
pareto, err := extractParetoFromWebUIData([]byte(webUIData))
if err != nil {
logger.Error(err, "Failed to parse webui_data.json; skipping pareto population")
} else {
profilingResults.Pareto = pareto
logger.Info("Populated ProfilingResults.Pareto", "count", len(pareto))
}
}
// Store the generated DGD in ProfilingResults.SelectedConfig for status visibility // Store the generated DGD in ProfilingResults.SelectedConfig
dgdJSON, err := json.Marshal(dgd) dgdJSON, err := json.Marshal(dgd)
if err != nil { if err != nil {
return fmt.Errorf("failed to marshal generated DGD to JSON: %w", err) return nil, "", fmt.Errorf("failed to marshal generated DGD to JSON: %w", err)
}
if dgdr.Status.ProfilingResults == nil {
dgdr.Status.ProfilingResults = &nvidiacomv1beta1.ProfilingResultsStatus{}
} }
dgdr.Status.ProfilingResults.SelectedConfig = &runtime.RawExtension{Raw: dgdJSON} profilingResults.SelectedConfig = &runtime.RawExtension{Raw: dgdJSON}
// Serialize the DGD spec to an annotation so createDGD can retrieve it // Serialize the DGD spec to an annotation so createDGD can retrieve it
dgdBytes, err := sigsyaml.Marshal(dgd) dgdBytes, err := sigsyaml.Marshal(dgd)
if err != nil { if err != nil {
return fmt.Errorf("failed to marshal generated DGD: %w", err) return nil, "", fmt.Errorf("failed to marshal generated DGD: %w", err)
} }
if dgdr.Annotations == nil { if dgdr.Annotations == nil {
dgdr.Annotations = make(map[string]string) dgdr.Annotations = make(map[string]string)
} }
dgdr.Annotations["nvidia.com/generated-dgd-spec"] = string(dgdBytes) dgdr.Annotations["nvidia.com/generated-dgd-spec"] = string(dgdBytes)
// Update the object (annotations are on the object, not status)
if err := r.Update(ctx, dgdr); err != nil { if err := r.Update(ctx, dgdr); err != nil {
return fmt.Errorf("failed to update DGDR with generated DGD annotation: %w", err) return nil, "", fmt.Errorf("failed to update DGDR with generated DGD annotation: %w", err)
}
return profilingResults, dgd.Name, nil
}
// extractParetoFromWebUIData parses webui_data.json and returns all Pareto-optimal
// deployment configurations from the cost table. Each row's last column ("Action")
// is a partial DynamoGraphDeployment YAML snippet.
func extractParetoFromWebUIData(data []byte) ([]nvidiacomv1beta1.ParetoConfig, error) {
var parsed struct {
Cost struct {
Table struct {
Data [][]json.RawMessage `json:"data"`
} `json:"table"`
} `json:"cost"`
}
if err := json.Unmarshal(data, &parsed); err != nil {
return nil, fmt.Errorf("failed to unmarshal webui_data.json: %w", err)
} }
// Refetch the DGDR after the annotation update to get the latest resourceVersion rows := parsed.Cost.Table.Data
// and avoid conflicts with concurrent modifications before updating status. if len(rows) == 0 {
if err := r.Get(ctx, types.NamespacedName{Name: dgdr.Name, Namespace: dgdr.Namespace}, dgdr); err != nil { return nil, nil
return fmt.Errorf("failed to refetch DGDR after annotation update: %w", err) }
// Schema: [TTFT(ms), PrefillThpt, ITL(ms), DecodeThpt, TokensPerUser, GPUHours, ActionYAML]
const minColumns = 7
const actionColumnIndex = 6
pareto := make([]nvidiacomv1beta1.ParetoConfig, 0, len(rows))
for _, row := range rows {
if len(row) < minColumns {
continue
}
var actionYAML string
if err := json.Unmarshal(row[actionColumnIndex], &actionYAML); err != nil {
continue
}
var configObj map[string]interface{}
if err := sigsyaml.Unmarshal([]byte(stripYAMLComments(actionYAML)), &configObj); err != nil {
continue
}
if len(configObj) == 0 {
continue
}
configJSON, err := json.Marshal(configObj)
if err != nil {
continue
}
pareto = append(pareto, nvidiacomv1beta1.ParetoConfig{
Config: runtime.RawExtension{Raw: configJSON},
})
} }
return r.Status().Update(ctx, dgdr) return pareto, nil
}
// stripYAMLComments removes comment lines (lines whose first non-whitespace character
// is '#') from a YAML string. The profiler prefixes action snippets with comment lines.
func stripYAMLComments(s string) string {
lines := strings.Split(s, "\n")
out := lines[:0] // reuse backing array; write index always <= range read index
for _, line := range lines {
if !strings.HasPrefix(strings.TrimLeft(line, " \t"), "#") {
out = append(out, line)
}
}
return strings.Join(out, "\n")
} }
// storeAdditionalResources marshals additional resources to YAML and stores them in DGDR annotations. // storeAdditionalResources marshals additional resources to YAML and stores them in DGDR annotations.
......
...@@ -1859,12 +1859,10 @@ spec: ...@@ -1859,12 +1859,10 @@ spec:
}) })
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// ProfilingPhase should be cleared after profiling completes
// Note: Due to the r.Update/r.Status().Update ordering in generateDGDSpec,
// ProfilingPhase may not be cleared in a single reconcile. Verify the phase
// transition happened correctly.
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed()) Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.Phase).ShouldNot(Equal(nvidiacomv1beta1.DGDRPhaseProfiling)) Expect(updated.Status.ProfilingPhase).Should(BeEmpty(), "profilingPhase must be cleared after profiling completes")
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseDeploying),
"phase must be Deploying after profiling completes with autoApply=true")
}) })
It("Should use spec.features.mocker.enabled to select mocker output", func() { It("Should use spec.features.mocker.enabled to select mocker output", func() {
...@@ -2014,6 +2012,254 @@ spec: ...@@ -2014,6 +2012,254 @@ spec:
_ = k8sClient.Delete(ctx, job) _ = k8sClient.Delete(ctx, job)
}) })
It("Should clear profilingPhase, set ProfilingCompleted condition, and populate profilingResults.selectedConfig after profiling completes", func() {
// Regression test for: profilingPhase staying "Initializing", Profiling condition
// not updated to ProfilingCompleted, and profilingResults never populated.
// Root cause was that generateDGDSpec called r.Get() before r.Status().Update(),
// overwriting the in-memory status changes made by handleProfilingPhase.
ctx := context.Background()
dgdrName := "test-dgdr-status-regression"
namespace := defaultNamespace
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
Image: "test-profiler:latest",
AutoApply: ptr.To(false),
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](8),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(200.0),
ITL: ptr.To(30.0),
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Reconcile: initial validation → Pending
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
// Reconcile: create profiling job → Profiling + ProfilingPhase=Initializing
_, err = reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseProfiling))
Expect(updated.Status.ProfilingPhase).Should(Equal(nvidiacomv1beta1.ProfilingPhaseInitializing))
// Mark profiling job as complete
jobName := getProfilingJobName(&updated)
job := &batchv1.Job{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, job) }()
job.Status.Conditions = []batchv1.JobCondition{{
Type: batchv1.JobComplete,
Status: corev1.ConditionTrue,
}}
Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
// Create the output ConfigMap that the sidecar would have created
dgdYAML := `apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: test-status-regression-dgd
spec:
services:
Frontend:
replicas: 1
VllmWorker:
replicas: 2`
cm := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: getOutputConfigMapName(&updated),
Namespace: namespace,
},
Data: map[string]string{
ProfilingOutputFile: dgdYAML,
},
}
Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, cm) }()
// Reconcile: profiling complete → should clear profilingPhase, set
// ProfilingCompleted condition, populate profilingResults.selectedConfig
_, err = reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
// profilingPhase must be cleared (was staying "Initializing" before fix)
Expect(updated.Status.ProfilingPhase).Should(BeEmpty(),
"profilingPhase should be cleared after profiling completes")
// Profiling condition must be ProfilingCompleted (was never set before fix)
profilingCond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeProfiling)
Expect(profilingCond).ShouldNot(BeNil(), "Profiling condition must exist")
Expect(profilingCond.Status).Should(Equal(metav1.ConditionTrue),
"Profiling condition status must be True")
Expect(profilingCond.Reason).Should(Equal("ProfilingCompleted"),
"Profiling condition reason must be ProfilingCompleted")
// profilingResults.selectedConfig must be populated (was nil before fix)
Expect(updated.Status.ProfilingResults).ShouldNot(BeNil(),
"profilingResults must be populated after profiling")
Expect(updated.Status.ProfilingResults.SelectedConfig).ShouldNot(BeNil(),
"profilingResults.selectedConfig must be set")
Expect(updated.Status.ProfilingResults.SelectedConfig.Raw).ShouldNot(BeEmpty(),
"profilingResults.selectedConfig must not be empty JSON")
// Phase must be Ready (autoApply=false → profiling complete, spec available)
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseReady),
"phase must be Ready after profiling completes with autoApply=false")
// status.dgdName must be preserved after profiling
Expect(updated.Status.DGDName).ShouldNot(BeEmpty(), "status.dgdName must be preserved after profiling")
})
It("Should populate profilingResults.pareto from webui_data.json in output ConfigMap", func() {
// Regression test for profilingResults.pareto never being populated.
// The sidecar now includes webui_data.json in the output ConfigMap;
// the controller reads it to populate status.profilingResults.pareto.
ctx := context.Background()
dgdrName := "test-dgdr-pareto"
namespace := defaultNamespace
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: dgdrName,
Namespace: namespace,
},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model",
Backend: "vllm",
Image: "test-profiler:latest",
AutoApply: ptr.To(false),
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](8),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(200.0),
ITL: ptr.To(30.0),
},
},
}
Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
// Drive to Profiling phase
_, err := reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
_, err = reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseProfiling))
// Mark job complete
jobName := getProfilingJobName(&updated)
job := &batchv1.Job{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, job) }()
job.Status.Conditions = []batchv1.JobCondition{{
Type: batchv1.JobComplete,
Status: corev1.ConditionTrue,
}}
Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
// Create output ConfigMap that includes both final_config.yaml and webui_data.json.
// The webui_data.json format mirrors what the profiler writes; two rows in cost.table
// represent two Pareto-optimal configurations.
dgdYAML := `apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: test-pareto-dgd
spec:
services:
Frontend:
replicas: 1`
webUIDataJSON := `{
"settings": {},
"prefill": {"chart": {}, "table": {"columns": [], "data": []}},
"decode": {"chart": {}, "table": {"columns": [], "data": []}},
"cost": {
"chart": {},
"index_mapping": {"0": [0, 0], "1": [0, 1]},
"table": {
"columns": ["TTFT (ms)", "Prefill Thpt", "ITL (ms)", "Decode Thpt", "Tokens/User", "GPU Hours", "Action"],
"data": [
[94.38, 7946.85, 7.09, 35.26, 141.04, 16.17,
"# Prefill: 4 GPU(s), TP=4\n# Decode: 4 GPU(s), TP=4\napiVersion: nvidia.com/v1alpha1\nkind: DynamoGraphDeployment\nspec:\n services:\n PrefillWorker:\n replicas: 1\n DecodeWorker:\n replicas: 1\n"],
[94.38, 7946.85, 10.69, 46.77, 93.54, 6.36,
"# Prefill: 4 GPU(s), TP=4\n# Decode: 2 GPU(s), TP=2\napiVersion: nvidia.com/v1alpha1\nkind: DynamoGraphDeployment\nspec:\n services:\n PrefillWorker:\n replicas: 1\n DecodeWorker:\n replicas: 1\n"]
]
}
}
}`
cm := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: getOutputConfigMapName(&updated),
Namespace: namespace,
},
Data: map[string]string{
ProfilingOutputFile: dgdYAML,
"webui_data.json": webUIDataJSON,
},
}
Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
defer func() { _ = k8sClient.Delete(ctx, cm) }()
// Reconcile to complete profiling and populate profilingResults
_, err = reconciler.Reconcile(ctx, reconcile.Request{
NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
})
Expect(err).NotTo(HaveOccurred())
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
// profilingResults.pareto must contain both Pareto-optimal configurations
Expect(updated.Status.ProfilingResults).ShouldNot(BeNil(),
"profilingResults must be populated")
Expect(updated.Status.ProfilingResults.Pareto).Should(HaveLen(2),
"profilingResults.pareto should contain 2 entries from webui_data.json")
// Each pareto entry must have non-empty Config
for i, p := range updated.Status.ProfilingResults.Pareto {
Expect(p.Config.Raw).ShouldNot(BeEmpty(),
"pareto[%d].config must not be empty", i)
}
})
It("Should validate typed hardware fields without blob parsing", func() { It("Should validate typed hardware fields without blob parsing", func() {
ctx := context.Background() ctx := context.Background()
dgdrName := "test-dgdr-typed-hw" dgdrName := "test-dgdr-typed-hw"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment