Unverified Commit 94876e58 authored by julienmancuso's avatar julienmancuso Committed by GitHub
Browse files

Fix: Canonicalize PodGangSet to prevent spurious updates (#2469)

parent 78c7e352
...@@ -13,6 +13,7 @@ require ( ...@@ -13,6 +13,7 @@ require (
github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/ginkgo/v2 v2.23.4
github.com/onsi/gomega v1.37.0 github.com/onsi/gomega v1.37.0
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2
github.com/stretchr/testify v1.10.0
go.etcd.io/etcd/client/v3 v3.5.21 go.etcd.io/etcd/client/v3 v3.5.21
istio.io/api v1.23.1 istio.io/api v1.23.1
istio.io/client-go v1.23.1 istio.io/client-go v1.23.1
...@@ -57,6 +58,7 @@ require ( ...@@ -57,6 +58,7 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_golang v1.22.0 // indirect
github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.62.0 // indirect github.com/prometheus/common v0.62.0 // indirect
......
...@@ -1217,7 +1217,7 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex ...@@ -1217,7 +1217,7 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
podLabels[commonconsts.KubeLabelDynamoSelector] = kubeName podLabels[commonconsts.KubeLabelDynamoSelector] = kubeName
podSpec := &basePodSpec podSpec := basePodSpec
podSpec.Containers = containers podSpec.Containers = containers
extraPodMetadata := opt.dynamoComponentDeployment.Spec.ExtraPodMetadata extraPodMetadata := opt.dynamoComponentDeployment.Spec.ExtraPodMetadata
......
...@@ -796,6 +796,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -796,6 +796,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
}, },
}, },
}, },
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: "main",
...@@ -895,6 +896,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -895,6 +896,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
}, },
}, },
}, },
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: "main",
......
package controller_common
import (
"sort"
corev1 "k8s.io/api/core/v1"
)
// CanonicalizePodSpec sorts the pod spec in a way that is deterministic and easy to reason about.
//
//nolint:gocyclo
func CanonicalizePodSpec(podSpec *corev1.PodSpec) *corev1.PodSpec {
// Helper function to get EnvFromSource sort key
envFromKey := func(e corev1.EnvFromSource) string {
if e.ConfigMapRef != nil {
return "cm:" + e.ConfigMapRef.Name + ":" + e.Prefix
}
if e.SecretRef != nil {
return "sec:" + e.SecretRef.Name + ":" + e.Prefix
}
return "other:" + e.Prefix
}
// Helper function to sort container-like fields (works for both Container and EphemeralContainer)
sortContainerFields := func(env []corev1.EnvVar, envFrom []corev1.EnvFromSource, ports []corev1.ContainerPort, volumeMounts []corev1.VolumeMount, securityContext *corev1.SecurityContext) {
// Sort env vars by name
if len(env) > 1 {
sort.Slice(env, func(i, j int) bool { return env[i].Name < env[j].Name })
}
// Sort envFrom by referenced source and prefix
if len(envFrom) > 1 {
sort.Slice(envFrom, func(i, j int) bool {
return envFromKey(envFrom[i]) < envFromKey(envFrom[j])
})
}
// Sort ports by name then port number
if len(ports) > 1 {
sort.Slice(ports, func(i, j int) bool {
if ports[i].Name == ports[j].Name {
return ports[i].ContainerPort < ports[j].ContainerPort
}
return ports[i].Name < ports[j].Name
})
}
// Sort volume mounts by name then mount path
if len(volumeMounts) > 1 {
sort.Slice(volumeMounts, func(i, j int) bool {
if volumeMounts[i].Name == volumeMounts[j].Name {
return volumeMounts[i].MountPath < volumeMounts[j].MountPath
}
return volumeMounts[i].Name < volumeMounts[j].Name
})
}
// Sort security context capability lists
if securityContext != nil && securityContext.Capabilities != nil {
if caps := securityContext.Capabilities.Add; len(caps) > 1 {
sort.Slice(caps, func(i, j int) bool { return string(caps[i]) < string(caps[j]) })
}
if caps := securityContext.Capabilities.Drop; len(caps) > 1 {
sort.Slice(caps, func(i, j int) bool { return string(caps[i]) < string(caps[j]) })
}
}
}
// Sort regular containers
for i := range podSpec.Containers {
c := &podSpec.Containers[i]
sortContainerFields(c.Env, c.EnvFrom, c.Ports, c.VolumeMounts, c.SecurityContext)
}
if len(podSpec.Containers) > 1 {
sort.Slice(podSpec.Containers, func(i, j int) bool {
return podSpec.Containers[i].Name < podSpec.Containers[j].Name
})
}
// Sort init containers
for i := range podSpec.InitContainers {
c := &podSpec.InitContainers[i]
sortContainerFields(c.Env, c.EnvFrom, c.Ports, c.VolumeMounts, c.SecurityContext)
}
if len(podSpec.InitContainers) > 1 {
sort.Slice(podSpec.InitContainers, func(i, j int) bool {
return podSpec.InitContainers[i].Name < podSpec.InitContainers[j].Name
})
}
// Sort ephemeral containers
for i := range podSpec.EphemeralContainers {
ec := &podSpec.EphemeralContainers[i]
sortContainerFields(ec.Env, ec.EnvFrom, ec.Ports, ec.VolumeMounts, ec.SecurityContext)
}
if len(podSpec.EphemeralContainers) > 1 {
sort.Slice(podSpec.EphemeralContainers, func(i, j int) bool {
return podSpec.EphemeralContainers[i].Name < podSpec.EphemeralContainers[j].Name
})
}
// Sort image pull secrets
if len(podSpec.ImagePullSecrets) > 1 {
sort.Slice(podSpec.ImagePullSecrets, func(i, j int) bool {
return podSpec.ImagePullSecrets[i].Name < podSpec.ImagePullSecrets[j].Name
})
}
// Sort volumes and their nested items
sortKeyToPathItems := func(items []corev1.KeyToPath) {
if len(items) > 1 {
sort.Slice(items, func(i, j int) bool {
if items[i].Key == items[j].Key {
return items[i].Path < items[j].Path
}
return items[i].Key < items[j].Key
})
}
}
for i := range podSpec.Volumes {
v := &podSpec.Volumes[i]
// ConfigMap items
if v.ConfigMap != nil {
sortKeyToPathItems(v.ConfigMap.Items)
}
// Secret items
if v.Secret != nil {
sortKeyToPathItems(v.Secret.Items)
}
// DownwardAPI items
if v.DownwardAPI != nil && len(v.DownwardAPI.Items) > 1 {
sort.Slice(v.DownwardAPI.Items, func(i, j int) bool {
return v.DownwardAPI.Items[i].Path < v.DownwardAPI.Items[j].Path
})
}
// Projected sources
if v.Projected != nil {
// Sort projected sources
if len(v.Projected.Sources) > 1 {
sort.Slice(v.Projected.Sources, func(i, j int) bool {
getProjectionKey := func(p corev1.VolumeProjection) string {
if p.ConfigMap != nil {
return "cm:" + p.ConfigMap.Name
}
if p.Secret != nil {
return "sec:" + p.Secret.Name
}
if p.DownwardAPI != nil {
return "downward:"
}
if p.ServiceAccountToken != nil {
return "sat:" + p.ServiceAccountToken.Audience
}
return "z:other"
}
return getProjectionKey(v.Projected.Sources[i]) < getProjectionKey(v.Projected.Sources[j])
})
}
// Sort nested items for each projection
for j := range v.Projected.Sources {
p := &v.Projected.Sources[j]
if p.ConfigMap != nil {
sortKeyToPathItems(p.ConfigMap.Items)
}
if p.Secret != nil {
sortKeyToPathItems(p.Secret.Items)
}
if p.DownwardAPI != nil && len(p.DownwardAPI.Items) > 1 {
sort.Slice(p.DownwardAPI.Items, func(i, j int) bool {
return p.DownwardAPI.Items[i].Path < p.DownwardAPI.Items[j].Path
})
}
}
}
}
// Sort volumes by name
if len(podSpec.Volumes) > 1 {
sort.Slice(podSpec.Volumes, func(i, j int) bool {
return podSpec.Volumes[i].Name < podSpec.Volumes[j].Name
})
}
// Sort tolerations
if len(podSpec.Tolerations) > 1 {
sort.Slice(podSpec.Tolerations, func(i, j int) bool {
a, b := podSpec.Tolerations[i], podSpec.Tolerations[j]
if a.Key != b.Key {
return a.Key < b.Key
}
if string(a.Operator) != string(b.Operator) {
return string(a.Operator) < string(b.Operator)
}
if a.Value != b.Value {
return a.Value < b.Value
}
if string(a.Effect) != string(b.Effect) {
return string(a.Effect) < string(b.Effect)
}
// Handle TolerationSeconds (could be nil)
aSec, bSec := int64(0), int64(0)
if a.TolerationSeconds != nil {
aSec = *a.TolerationSeconds
}
if b.TolerationSeconds != nil {
bSec = *b.TolerationSeconds
}
return aSec < bSec
})
}
// Sort topology spread constraints
if len(podSpec.TopologySpreadConstraints) > 1 {
sort.Slice(podSpec.TopologySpreadConstraints, func(i, j int) bool {
a, b := podSpec.TopologySpreadConstraints[i], podSpec.TopologySpreadConstraints[j]
if a.TopologyKey != b.TopologyKey {
return a.TopologyKey < b.TopologyKey
}
if string(a.WhenUnsatisfiable) != string(b.WhenUnsatisfiable) {
return string(a.WhenUnsatisfiable) < string(b.WhenUnsatisfiable)
}
return a.MaxSkew < b.MaxSkew
})
}
// Sort host aliases
if len(podSpec.HostAliases) > 1 {
// First sort hostnames within each alias
for i := range podSpec.HostAliases {
if len(podSpec.HostAliases[i].Hostnames) > 1 {
sort.Strings(podSpec.HostAliases[i].Hostnames)
}
}
// Then sort aliases by IP
sort.Slice(podSpec.HostAliases, func(i, j int) bool {
return podSpec.HostAliases[i].IP < podSpec.HostAliases[j].IP
})
}
// Sort DNS config
if podSpec.DNSConfig != nil {
// Sort DNS options
if len(podSpec.DNSConfig.Options) > 1 {
sort.Slice(podSpec.DNSConfig.Options, func(i, j int) bool {
if podSpec.DNSConfig.Options[i].Name == podSpec.DNSConfig.Options[j].Name {
vi, vj := "", ""
if podSpec.DNSConfig.Options[i].Value != nil {
vi = *podSpec.DNSConfig.Options[i].Value
}
if podSpec.DNSConfig.Options[j].Value != nil {
vj = *podSpec.DNSConfig.Options[j].Value
}
return vi < vj
}
return podSpec.DNSConfig.Options[i].Name < podSpec.DNSConfig.Options[j].Name
})
}
// Sort nameservers and search domains
if len(podSpec.DNSConfig.Nameservers) > 1 {
sort.Strings(podSpec.DNSConfig.Nameservers)
}
if len(podSpec.DNSConfig.Searches) > 1 {
sort.Strings(podSpec.DNSConfig.Searches)
}
}
return podSpec
}
This diff is collapsed.
package controller_common
import (
"sort"
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
)
func CanonicalizePodGangSet(gangSet *grovev1alpha1.PodGangSet) *grovev1alpha1.PodGangSet {
// sort cliques by name
sort.Slice(gangSet.Spec.Template.Cliques, func(i, j int) bool {
return gangSet.Spec.Template.Cliques[i].Name < gangSet.Spec.Template.Cliques[j].Name
})
// sort scaling groups by name
sort.Slice(gangSet.Spec.Template.PodCliqueScalingGroupConfigs, func(i, j int) bool {
return gangSet.Spec.Template.PodCliqueScalingGroupConfigs[i].Name < gangSet.Spec.Template.PodCliqueScalingGroupConfigs[j].Name
})
return gangSet
}
...@@ -27,6 +27,7 @@ import ( ...@@ -27,6 +27,7 @@ import (
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/api/resource"
...@@ -156,6 +157,14 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso ...@@ -156,6 +157,14 @@ func SyncResource[T client.Object](ctx context.Context, r Reconciler, parentReso
return false, resource, fmt.Errorf("failed to check if spec has changed: %w", err) return false, resource, fmt.Errorf("failed to check if spec has changed: %w", err)
} }
if newHash != nil { if newHash != nil {
// Generate and log diff before updating
diff, diffErr := generateSpecDiff(oldResource, resource)
if diffErr != nil {
logs.V(1).Info(fmt.Sprintf("Failed to generate diff for %s: %v", resourceType, diffErr))
} else if diff != "" {
logs.Info(fmt.Sprintf("%s spec changes detected", resourceType), "diff", diff)
}
// update the spec of the current object with the desired spec // update the spec of the current object with the desired spec
err = CopySpec(resource, oldResource) err = CopySpec(resource, oldResource)
if err != nil { if err != nil {
...@@ -252,6 +261,27 @@ func IsSpecChanged(current client.Object, desired client.Object) (*string, error ...@@ -252,6 +261,27 @@ func IsSpecChanged(current client.Object, desired client.Object) (*string, error
return &hashStr, nil return &hashStr, nil
} }
// generateSpecDiff creates a unified diff showing changes between old and new resource specs
func generateSpecDiff(oldResource, newResource client.Object) (string, error) {
oldSpec, err := getSpec(oldResource)
if err != nil {
return "", fmt.Errorf("failed to get old spec: %w", err)
}
newSpec, err := getSpec(newResource)
if err != nil {
return "", fmt.Errorf("failed to get new spec: %w", err)
}
// Generate diff using cmp
diff := cmp.Diff(oldSpec, newSpec)
if diff == "" {
return "", nil
}
return diff, nil
}
func GetSpecHash(obj client.Object) (string, error) { func GetSpecHash(obj client.Object) (string, error) {
spec, err := getSpec(obj) spec, err := getSpec(obj)
if err != nil { if err != nil {
......
...@@ -8,6 +8,7 @@ package dynamo ...@@ -8,6 +8,7 @@ package dynamo
import ( import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
"k8s.io/utils/ptr"
) )
// ComponentDefaults interface defines how defaults should be provided // ComponentDefaults interface defines how defaults should be provided
...@@ -50,7 +51,14 @@ func (b *BaseComponentDefaults) GetBaseContainer(context ComponentContext) (core ...@@ -50,7 +51,14 @@ func (b *BaseComponentDefaults) GetBaseContainer(context ComponentContext) (core
} }
func (b *BaseComponentDefaults) GetBasePodSpec(context ComponentContext) (corev1.PodSpec, error) { func (b *BaseComponentDefaults) GetBasePodSpec(context ComponentContext) (corev1.PodSpec, error) {
return corev1.PodSpec{}, nil return b.getCommonPodSpec(), nil
}
func (b *BaseComponentDefaults) getCommonPodSpec() corev1.PodSpec {
return corev1.PodSpec{
TerminationGracePeriodSeconds: ptr.To(int64(60)),
RestartPolicy: corev1.RestartPolicyAlways,
}
} }
func (b *BaseComponentDefaults) getCommonContainer(context ComponentContext) corev1.Container { func (b *BaseComponentDefaults) getCommonContainer(context ComponentContext) corev1.Container {
......
...@@ -39,8 +39,7 @@ func (p *PlannerDefaults) GetBaseContainer(context ComponentContext) (corev1.Con ...@@ -39,8 +39,7 @@ func (p *PlannerDefaults) GetBaseContainer(context ComponentContext) (corev1.Con
} }
func (p *PlannerDefaults) GetBasePodSpec(context ComponentContext) (corev1.PodSpec, error) { func (p *PlannerDefaults) GetBasePodSpec(context ComponentContext) (corev1.PodSpec, error) {
podSpec := corev1.PodSpec{ podSpec := p.getCommonPodSpec()
ServiceAccountName: commonconsts.PlannerServiceAccountName, podSpec.ServiceAccountName = commonconsts.PlannerServiceAccountName
}
return podSpec, nil return podSpec, nil
} }
...@@ -31,6 +31,7 @@ import ( ...@@ -31,6 +31,7 @@ import (
"k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1" grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
...@@ -687,13 +688,13 @@ func GenerateBasePodSpec( ...@@ -687,13 +688,13 @@ func GenerateBasePodSpec(
controllerConfig controller_common.Config, controllerConfig controller_common.Config,
multinodeDeploymentType commonconsts.MultinodeDeploymentType, multinodeDeploymentType commonconsts.MultinodeDeploymentType,
serviceName string, serviceName string,
) (corev1.PodSpec, error) { ) (*corev1.PodSpec, error) {
// Start with base container generated per component type // Start with base container generated per component type
componentContext := generateComponentContext(component, parentGraphDeploymentName, namespace, numberOfNodes) componentContext := generateComponentContext(component, parentGraphDeploymentName, namespace, numberOfNodes)
componentDefaults := ComponentDefaultsFactory(component.ComponentType) componentDefaults := ComponentDefaultsFactory(component.ComponentType)
container, err := componentDefaults.GetBaseContainer(componentContext) container, err := componentDefaults.GetBaseContainer(componentContext)
if err != nil { if err != nil {
return corev1.PodSpec{}, fmt.Errorf("failed to get base container: %w", err) return nil, fmt.Errorf("failed to get base container: %w", err)
} }
if component.ExtraPodSpec != nil && component.ExtraPodSpec.MainContainer != nil { if component.ExtraPodSpec != nil && component.ExtraPodSpec.MainContainer != nil {
...@@ -702,7 +703,7 @@ func GenerateBasePodSpec( ...@@ -702,7 +703,7 @@ func GenerateBasePodSpec(
// merge the extraPodSpec from the parent deployment with the extraPodSpec from the service // merge the extraPodSpec from the parent deployment with the extraPodSpec from the service
err = mergo.Merge(&container, *main, mergo.WithOverride) err = mergo.Merge(&container, *main, mergo.WithOverride)
if err != nil { if err != nil {
return corev1.PodSpec{}, fmt.Errorf("failed to merge extraPodSpec: %w", err) return nil, fmt.Errorf("failed to merge extraPodSpec: %w", err)
} }
// main container fields that require special handling // main container fields that require special handling
...@@ -725,7 +726,7 @@ func GenerateBasePodSpec( ...@@ -725,7 +726,7 @@ func GenerateBasePodSpec(
overrideResources, err := controller_common.GetResourcesConfig(component.Resources) overrideResources, err := controller_common.GetResourcesConfig(component.Resources)
if err != nil { if err != nil {
return corev1.PodSpec{}, fmt.Errorf("failed to get resources config: %w", err) return nil, fmt.Errorf("failed to get resources config: %w", err)
} }
// Requests // Requests
if overrideResources != nil && len(overrideResources.Requests) > 0 { if overrideResources != nil && len(overrideResources.Requests) > 0 {
...@@ -784,32 +785,32 @@ func GenerateBasePodSpec( ...@@ -784,32 +785,32 @@ func GenerateBasePodSpec(
// Apply backend-specific container modifications // Apply backend-specific container modifications
multinodeDeployer := MultinodeDeployerFactory(multinodeDeploymentType) multinodeDeployer := MultinodeDeployerFactory(multinodeDeploymentType)
if multinodeDeployer == nil { if multinodeDeployer == nil {
return corev1.PodSpec{}, fmt.Errorf("unsupported multinode deployment type: %s", multinodeDeploymentType) return nil, fmt.Errorf("unsupported multinode deployment type: %s", multinodeDeploymentType)
} }
backend := BackendFactory(backendFramework) backend := BackendFactory(backendFramework)
if backend == nil { if backend == nil {
return corev1.PodSpec{}, fmt.Errorf("unsupported backend framework: %s", backendFramework) return nil, fmt.Errorf("unsupported backend framework: %s", backendFramework)
} }
backend.UpdateContainer(&container, numberOfNodes, role, component, serviceName, multinodeDeployer) backend.UpdateContainer(&container, numberOfNodes, role, component, serviceName, multinodeDeployer)
// get base podspec from component // get base podspec from component
podSpec, err := componentDefaults.GetBasePodSpec(componentContext) podSpec, err := componentDefaults.GetBasePodSpec(componentContext)
if err != nil { if err != nil {
return corev1.PodSpec{}, fmt.Errorf("failed to get base podspec: %w", err) return nil, fmt.Errorf("failed to get base podspec: %w", err)
} }
if component.ExtraPodSpec != nil && component.ExtraPodSpec.PodSpec != nil { if component.ExtraPodSpec != nil && component.ExtraPodSpec.PodSpec != nil {
// merge extraPodSpec PodSpec with base podspec // merge extraPodSpec PodSpec with base podspec
err := mergo.Merge(&podSpec, component.ExtraPodSpec.PodSpec.DeepCopy(), mergo.WithOverride) err := mergo.Merge(&podSpec, component.ExtraPodSpec.PodSpec.DeepCopy(), mergo.WithOverride)
if err != nil { if err != nil {
return corev1.PodSpec{}, fmt.Errorf("failed to merge extraPodSpec: %w", err) return nil, fmt.Errorf("failed to merge extraPodSpec: %w", err)
} }
} }
podSpec.Containers = append(podSpec.Containers, container) podSpec.Containers = append(podSpec.Containers, container)
podSpec.Volumes = append(podSpec.Volumes, volumes...) podSpec.Volumes = append(podSpec.Volumes, volumes...)
podSpec.ImagePullSecrets = append(podSpec.ImagePullSecrets, imagePullSecrets...) podSpec.ImagePullSecrets = append(podSpec.ImagePullSecrets, imagePullSecrets...)
backend.UpdatePodSpec(&podSpec, numberOfNodes, role, component, serviceName) backend.UpdatePodSpec(&podSpec, numberOfNodes, role, component, serviceName)
return podSpec, nil return controller_common.CanonicalizePodSpec(&podSpec), nil
} }
func setMetricsLabels(labels map[string]string, dynamoGraphDeployment *v1alpha1.DynamoGraphDeployment) { func setMetricsLabels(labels map[string]string, dynamoGraphDeployment *v1alpha1.DynamoGraphDeployment) {
...@@ -846,13 +847,13 @@ func GeneratePodSpecForComponent( ...@@ -846,13 +847,13 @@ func GeneratePodSpecForComponent(
controllerConfig controller_common.Config, controllerConfig controller_common.Config,
multinodeDeploymentType commonconsts.MultinodeDeploymentType, multinodeDeploymentType commonconsts.MultinodeDeploymentType,
serviceName string, serviceName string,
) (corev1.PodSpec, error) { ) (*corev1.PodSpec, error) {
if len(dynamoDeployment.Spec.Envs) > 0 { if len(dynamoDeployment.Spec.Envs) > 0 {
component.Envs = MergeEnvs(dynamoDeployment.Spec.Envs, component.Envs) component.Envs = MergeEnvs(dynamoDeployment.Spec.Envs, component.Envs)
} }
podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, controllerConfig, multinodeDeploymentType, serviceName) podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, controllerConfig, multinodeDeploymentType, serviceName)
if err != nil { if err != nil {
return corev1.PodSpec{}, err return nil, err
} }
return podSpec, nil return podSpec, nil
} }
...@@ -871,6 +872,7 @@ func GenerateGrovePodGangSet( ...@@ -871,6 +872,7 @@ func GenerateGrovePodGangSet(
gangSet.Spec.Template.HeadlessServiceConfig = &grovev1alpha1.HeadlessServiceConfig{ gangSet.Spec.Template.HeadlessServiceConfig = &grovev1alpha1.HeadlessServiceConfig{
PublishNotReadyAddresses: true, PublishNotReadyAddresses: true,
} }
gangSet.Spec.Template.StartupType = ptr.To(grovev1alpha1.CliqueStartupTypeAnyOrder)
if controllerConfig.Grove.TerminationDelay > 0 { if controllerConfig.Grove.TerminationDelay > 0 {
gangSet.Spec.Template.TerminationDelay = &metav1.Duration{Duration: controllerConfig.Grove.TerminationDelay} gangSet.Spec.Template.TerminationDelay = &metav1.Duration{Duration: controllerConfig.Grove.TerminationDelay}
} }
...@@ -911,9 +913,10 @@ func GenerateGrovePodGangSet( ...@@ -911,9 +913,10 @@ func GenerateGrovePodGangSet(
clique := &grovev1alpha1.PodCliqueTemplateSpec{ clique := &grovev1alpha1.PodCliqueTemplateSpec{
Name: strings.ToLower(r.Name), Name: strings.ToLower(r.Name),
Spec: grovev1alpha1.PodCliqueSpec{ Spec: grovev1alpha1.PodCliqueSpec{
RoleName: strings.ToLower(r.Name), RoleName: strings.ToLower(r.Name),
Replicas: r.Replicas, Replicas: r.Replicas,
PodSpec: podSpec, MinAvailable: ptr.To(int32(1)),
PodSpec: *podSpec,
}, },
} }
labels, err := generateLabels(component, dynamoDeployment, r.Name) labels, err := generateLabels(component, dynamoDeployment, r.Name)
...@@ -935,9 +938,10 @@ func GenerateGrovePodGangSet( ...@@ -935,9 +938,10 @@ func GenerateGrovePodGangSet(
if isMultinode { if isMultinode {
scalingGroups = append(scalingGroups, grovev1alpha1.PodCliqueScalingGroupConfig{ scalingGroups = append(scalingGroups, grovev1alpha1.PodCliqueScalingGroupConfig{
Name: strings.ToLower(serviceName), Name: strings.ToLower(serviceName),
CliqueNames: cliqueNames, CliqueNames: cliqueNames,
Replicas: component.Replicas, Replicas: component.Replicas,
MinAvailable: ptr.To(int32(1)),
}) })
} }
} }
...@@ -945,7 +949,7 @@ func GenerateGrovePodGangSet( ...@@ -945,7 +949,7 @@ func GenerateGrovePodGangSet(
gangSet.Spec.Template.PodCliqueScalingGroupConfigs = scalingGroups gangSet.Spec.Template.PodCliqueScalingGroupConfigs = scalingGroups
} }
return gangSet, nil return controller_common.CanonicalizePodGangSet(gangSet), nil
} }
func generateLabels(component *v1alpha1.DynamoComponentDeploymentOverridesSpec, dynamoDeployment *v1alpha1.DynamoGraphDeployment, componentName string) (map[string]string, error) { func generateLabels(component *v1alpha1.DynamoComponentDeploymentOverridesSpec, dynamoDeployment *v1alpha1.DynamoGraphDeployment, componentName string) (map[string]string, error) {
...@@ -1141,7 +1145,7 @@ func GenerateBasePodSpecForController( ...@@ -1141,7 +1145,7 @@ func GenerateBasePodSpecForController(
controllerConfig controller_common.Config, controllerConfig controller_common.Config,
role Role, role Role,
multinodeDeploymentType commonconsts.MultinodeDeploymentType, multinodeDeploymentType commonconsts.MultinodeDeploymentType,
) (corev1.PodSpec, error) { ) (*corev1.PodSpec, error) {
// Convert to our interface // Convert to our interface
componentSpec := ConvertDynamoComponentDeploymentToSpec(dynComponent) componentSpec := ConvertDynamoComponentDeploymentToSpec(dynComponent)
...@@ -1150,7 +1154,7 @@ func GenerateBasePodSpecForController( ...@@ -1150,7 +1154,7 @@ func GenerateBasePodSpecForController(
// Determine backend framework using hybrid approach // Determine backend framework using hybrid approach
backendFramework, err := getBackendFrameworkFromDynamoComponent(dynComponent) backendFramework, err := getBackendFrameworkFromDynamoComponent(dynComponent)
if err != nil { if err != nil {
return corev1.PodSpec{}, fmt.Errorf("failed to determine backend framework: %w", err) return nil, fmt.Errorf("failed to determine backend framework: %w", err)
} }
// Generate base PodSpec with standard env vars using merged component envs // Generate base PodSpec with standard env vars using merged component envs
...@@ -1169,7 +1173,7 @@ func GenerateBasePodSpecForController( ...@@ -1169,7 +1173,7 @@ func GenerateBasePodSpecForController(
serviceName, serviceName,
) )
if err != nil { if err != nil {
return corev1.PodSpec{}, err return nil, err
} }
return podSpec, nil return podSpec, nil
......
...@@ -1205,6 +1205,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1205,6 +1205,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Spec: grovev1alpha1.PodGangSetSpec{ Spec: grovev1alpha1.PodGangSetSpec{
Replicas: 1, Replicas: 1,
Template: grovev1alpha1.PodGangSetTemplateSpec{ Template: grovev1alpha1.PodGangSetTemplateSpec{
StartupType: ptr.To(grovev1alpha1.CliqueStartupTypeAnyOrder),
HeadlessServiceConfig: &grovev1alpha1.HeadlessServiceConfig{ HeadlessServiceConfig: &grovev1alpha1.HeadlessServiceConfig{
PublishNotReadyAddresses: true, PublishNotReadyAddresses: true,
}, },
...@@ -1224,8 +1225,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1224,8 +1225,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"nvidia.com/annotation2": "annotation2", "nvidia.com/annotation2": "annotation2",
}, },
Spec: grovev1alpha1.PodCliqueSpec{ Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "frontend", RoleName: "frontend",
Replicas: 1, Replicas: 1,
MinAvailable: ptr.To(int32(1)),
PodSpec: corev1.PodSpec{ PodSpec: corev1.PodSpec{
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
...@@ -1244,6 +1246,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1244,6 +1246,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: "frontend-secret", Name: "frontend-secret",
}, },
}, },
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: "main",
...@@ -1353,8 +1356,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1353,8 +1356,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}, },
Annotations: map[string]string{}, Annotations: map[string]string{},
Spec: grovev1alpha1.PodCliqueSpec{ Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "planner", RoleName: "planner",
Replicas: 2, Replicas: 2,
MinAvailable: ptr.To(int32(1)),
PodSpec: corev1.PodSpec{ PodSpec: corev1.PodSpec{
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
...@@ -1375,6 +1379,8 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1375,6 +1379,8 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}, },
}, },
}, },
TerminationGracePeriodSeconds: ptr.To(int64(60)),
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: "main",
...@@ -1690,10 +1696,12 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1690,10 +1696,12 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"worker-ldr", "worker-ldr",
"worker-wkr", "worker-wkr",
}, },
Replicas: ptr.To(int32(5)), Replicas: ptr.To(int32(5)),
MinAvailable: ptr.To(int32(1)),
}, },
}, },
// StartupType: ptr.To(grovev1alpha1.CliqueStartupTypeExplicit), // StartupType: ptr.To(grovev1alpha1.CliqueStartupTypeExplicit),
StartupType: ptr.To(grovev1alpha1.CliqueStartupTypeAnyOrder),
Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{ Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{
{ {
Name: "worker-ldr", Name: "worker-ldr",
...@@ -1709,9 +1717,12 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1709,9 +1717,12 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"nvidia.com/annotation2": "annotation2", "nvidia.com/annotation2": "annotation2",
}, },
Spec: grovev1alpha1.PodCliqueSpec{ Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "worker-ldr", RoleName: "worker-ldr",
Replicas: 1, Replicas: 1,
MinAvailable: ptr.To(int32(1)),
PodSpec: corev1.PodSpec{ PodSpec: corev1.PodSpec{
RestartPolicy: corev1.RestartPolicyAlways,
TerminationGracePeriodSeconds: ptr.To(int64(60)),
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
Name: "shared-memory", Name: "shared-memory",
...@@ -1855,10 +1866,13 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1855,10 +1866,13 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"nvidia.com/annotation2": "annotation2", "nvidia.com/annotation2": "annotation2",
}, },
Spec: grovev1alpha1.PodCliqueSpec{ Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "worker-wkr", RoleName: "worker-wkr",
Replicas: 2, Replicas: 2,
MinAvailable: ptr.To(int32(1)),
// StartsAfter: []string{"worker-ldr"}, // StartsAfter: []string{"worker-ldr"},
PodSpec: corev1.PodSpec{ PodSpec: corev1.PodSpec{
RestartPolicy: corev1.RestartPolicyAlways,
TerminationGracePeriodSeconds: ptr.To(int64(60)),
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
Name: "shared-memory", Name: "shared-memory",
...@@ -1961,8 +1975,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1961,8 +1975,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}, },
Annotations: map[string]string{}, Annotations: map[string]string{},
Spec: grovev1alpha1.PodCliqueSpec{ Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "frontend", RoleName: "frontend",
Replicas: 1, Replicas: 1,
MinAvailable: ptr.To(int32(1)),
PodSpec: corev1.PodSpec{ PodSpec: corev1.PodSpec{
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
...@@ -1981,6 +1996,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -1981,6 +1996,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}, },
}, },
TerminationGracePeriodSeconds: ptr.To(int64(10)), TerminationGracePeriodSeconds: ptr.To(int64(10)),
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: "main",
...@@ -2090,9 +2106,12 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -2090,9 +2106,12 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}, },
Annotations: map[string]string{}, Annotations: map[string]string{},
Spec: grovev1alpha1.PodCliqueSpec{ Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "planner", RoleName: "planner",
Replicas: 2, Replicas: 2,
MinAvailable: ptr.To(int32(1)),
PodSpec: corev1.PodSpec{ PodSpec: corev1.PodSpec{
TerminationGracePeriodSeconds: ptr.To(int64(60)),
RestartPolicy: corev1.RestartPolicyAlways,
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
Name: "planner-pvc", Name: "planner-pvc",
...@@ -2440,6 +2459,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -2440,6 +2459,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Spec: grovev1alpha1.PodGangSetSpec{ Spec: grovev1alpha1.PodGangSetSpec{
Replicas: 1, Replicas: 1,
Template: grovev1alpha1.PodGangSetTemplateSpec{ Template: grovev1alpha1.PodGangSetTemplateSpec{
StartupType: ptr.To(grovev1alpha1.CliqueStartupTypeAnyOrder),
HeadlessServiceConfig: &grovev1alpha1.HeadlessServiceConfig{ HeadlessServiceConfig: &grovev1alpha1.HeadlessServiceConfig{
PublishNotReadyAddresses: true, PublishNotReadyAddresses: true,
}, },
...@@ -2451,7 +2471,8 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -2451,7 +2471,8 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"worker-ldr", "worker-ldr",
"worker-wkr", "worker-wkr",
}, },
Replicas: ptr.To(int32(5)), Replicas: ptr.To(int32(5)),
MinAvailable: ptr.To(int32(1)),
}, },
}, },
// StartupType: ptr.To(grovev1alpha1.CliqueStartupTypeExplicit), // StartupType: ptr.To(grovev1alpha1.CliqueStartupTypeExplicit),
...@@ -2470,8 +2491,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -2470,8 +2491,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"nvidia.com/annotation2": "annotation2", "nvidia.com/annotation2": "annotation2",
}, },
Spec: grovev1alpha1.PodCliqueSpec{ Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "worker-ldr", RoleName: "worker-ldr",
Replicas: 1, Replicas: 1,
MinAvailable: ptr.To(int32(1)),
PodSpec: corev1.PodSpec{ PodSpec: corev1.PodSpec{
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
...@@ -2484,6 +2506,8 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -2484,6 +2506,8 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}, },
}, },
}, },
TerminationGracePeriodSeconds: ptr.To(int64(60)),
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: "main",
...@@ -2604,10 +2628,12 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -2604,10 +2628,12 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"nvidia.com/annotation2": "annotation2", "nvidia.com/annotation2": "annotation2",
}, },
Spec: grovev1alpha1.PodCliqueSpec{ Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "worker-wkr", RoleName: "worker-wkr",
Replicas: 2, Replicas: 2,
MinAvailable: ptr.To(int32(1)),
// StartsAfter: []string{"worker-ldr"}, // StartsAfter: []string{"worker-ldr"},
PodSpec: corev1.PodSpec{ PodSpec: corev1.PodSpec{
TerminationGracePeriodSeconds: ptr.To(int64(60)),
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
Name: "shared-memory", Name: "shared-memory",
...@@ -2619,6 +2645,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -2619,6 +2645,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}, },
}, },
}, },
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: "main",
...@@ -2710,8 +2737,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -2710,8 +2737,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}, },
Annotations: map[string]string{}, Annotations: map[string]string{},
Spec: grovev1alpha1.PodCliqueSpec{ Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "frontend", RoleName: "frontend",
Replicas: 1, Replicas: 1,
MinAvailable: ptr.To(int32(1)),
PodSpec: corev1.PodSpec{ PodSpec: corev1.PodSpec{
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
...@@ -2730,6 +2758,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -2730,6 +2758,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}, },
}, },
TerminationGracePeriodSeconds: ptr.To(int64(10)), TerminationGracePeriodSeconds: ptr.To(int64(10)),
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: "main",
...@@ -2839,9 +2868,11 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -2839,9 +2868,11 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}, },
Annotations: map[string]string{}, Annotations: map[string]string{},
Spec: grovev1alpha1.PodCliqueSpec{ Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "planner", RoleName: "planner",
Replicas: 2, Replicas: 2,
MinAvailable: ptr.To(int32(1)),
PodSpec: corev1.PodSpec{ PodSpec: corev1.PodSpec{
TerminationGracePeriodSeconds: ptr.To(int64(60)),
Volumes: []corev1.Volume{ Volumes: []corev1.Volume{
{ {
Name: "planner-pvc", Name: "planner-pvc",
...@@ -2861,6 +2892,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) { ...@@ -2861,6 +2892,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}, },
}, },
}, },
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: "main",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment