Unverified commit 81c27803, authored by mohammedabdulwahhab, committed by GitHub
Browse files

fix: operator defaults (#2398)


Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
parent 9ddb3efd
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package dynamo
import (
"fmt"
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/intstr"
)
// FrontendDefaults implements ComponentDefaults for Frontend components.
//
// It embeds *BaseComponentDefaults, so methods it does not declare itself are
// promoted from the base; GetBaseContainer is declared on FrontendDefaults
// directly. Use NewFrontendDefaults to obtain a value whose embedded pointer
// is non-nil.
type FrontendDefaults struct {
*BaseComponentDefaults
}
// NewFrontendDefaults returns a FrontendDefaults whose embedded
// BaseComponentDefaults is a freshly allocated zero value.
func NewFrontendDefaults() *FrontendDefaults {
	base := &BaseComponentDefaults{}
	return &FrontendDefaults{BaseComponentDefaults: base}
}
// GetBaseContainer returns the default container for a Frontend component.
//
// Starting from the shared base container (getCommonContainer), it replaces
// the following fields:
//   - Ports: one TCP port named commonconsts.DynamoContainerPortName on
//     commonconsts.DynamoServicePort;
//   - LivenessProbe: HTTP GET /health against that named port;
//   - ReadinessProbe: an exec probe that curls /health on localhost and uses
//     jq to require .status == "healthy";
//   - Resources: requests and limits both set to 1 CPU and 2Gi memory;
//   - Env: a single variable named commonconsts.EnvDynamoServicePort holding
//     the service port.
//
// numberOfNodes is not used by this implementation, and the returned error is
// always nil.
func (f *FrontendDefaults) GetBaseContainer(numberOfNodes int32) (corev1.Container, error) {
	// Liveness and readiness use the same timing; only the handler differs.
	probeWith := func(handler corev1.ProbeHandler) *corev1.Probe {
		return &corev1.Probe{
			ProbeHandler:        handler,
			InitialDelaySeconds: 60,
			PeriodSeconds:       60,
			TimeoutSeconds:      30,
			FailureThreshold:    10,
		}
	}

	// Requests and limits carry the same quantities; each call parses a fresh set.
	cpuAndMemory := func() corev1.ResourceList {
		return corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("1"),
			corev1.ResourceMemory: resource.MustParse("2Gi"),
		}
	}

	httpHealth := corev1.ProbeHandler{
		HTTPGet: &corev1.HTTPGetAction{
			Path: "/health",
			Port: intstr.FromString(commonconsts.DynamoContainerPortName),
		},
	}

	// NOTE(review): the command reads ${DYNAMO_PORT}; presumably that is the
	// variable named by commonconsts.EnvDynamoServicePort set below — confirm.
	execHealth := corev1.ProbeHandler{
		Exec: &corev1.ExecAction{
			Command: []string{
				"/bin/sh",
				"-c",
				"curl -s http://localhost:${DYNAMO_PORT}/health | jq -e \".status == \\\"healthy\\\"\"",
			},
		},
	}

	httpPort := corev1.ContainerPort{
		Protocol:      corev1.ProtocolTCP,
		Name:          commonconsts.DynamoContainerPortName,
		ContainerPort: int32(commonconsts.DynamoServicePort),
	}

	servicePortEnv := corev1.EnvVar{
		Name:  commonconsts.EnvDynamoServicePort,
		Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
	}

	// Frontend doesn't need backend-specific config, so the common container
	// is only extended with the frontend defaults assembled above.
	c := f.getCommonContainer()
	c.Ports = []corev1.ContainerPort{httpPort}
	c.LivenessProbe = probeWith(httpHealth)
	c.ReadinessProbe = probeWith(execHealth)
	c.Resources = corev1.ResourceRequirements{
		Requests: cpuAndMemory(),
		Limits:   cpuAndMemory(),
	}
	c.Env = []corev1.EnvVar{servicePortEnv}

	return c, nil
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package dynamo
import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)
// PlannerDefaults implements ComponentDefaults for Planner components.
//
// It embeds *BaseComponentDefaults, so methods it does not declare itself are
// promoted from the base; GetBaseContainer and GetBasePodSpec are declared on
// PlannerDefaults directly. Use NewPlannerDefaults to obtain a value whose
// embedded pointer is non-nil.
type PlannerDefaults struct {
*BaseComponentDefaults
}
// NewPlannerDefaults returns a PlannerDefaults whose embedded
// BaseComponentDefaults is a freshly allocated zero value.
func NewPlannerDefaults() *PlannerDefaults {
	base := &BaseComponentDefaults{}
	return &PlannerDefaults{BaseComponentDefaults: base}
}
// GetBaseContainer returns the default container for a Planner component.
//
// It takes the shared base container (getCommonContainer) and replaces its
// Resources with requests and limits that are both 2 CPU and 2Gi memory.
// numberOfNodes is not used by this implementation, and the returned error is
// always nil.
func (p *PlannerDefaults) GetBaseContainer(numberOfNodes int32) (corev1.Container, error) {
	// Requests and limits carry the same quantities; each call parses a fresh set.
	cpuAndMemory := func() corev1.ResourceList {
		return corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("2"),
			corev1.ResourceMemory: resource.MustParse("2Gi"),
		}
	}

	c := p.getCommonContainer()
	c.Resources = corev1.ResourceRequirements{
		Requests: cpuAndMemory(),
		Limits:   cpuAndMemory(),
	}
	return c, nil
}
// GetBasePodSpec returns the default pod spec for a Planner component: an
// otherwise empty PodSpec whose ServiceAccountName is
// commonconsts.PlannerServiceAccountName. numberOfNodes is not used by this
// implementation, and the returned error is always nil.
func (p *PlannerDefaults) GetBasePodSpec(numberOfNodes int32) (corev1.PodSpec, error) {
	return corev1.PodSpec{ServiceAccountName: commonconsts.PlannerServiceAccountName}, nil
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package dynamo
import (
"fmt"
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/intstr"
)
// WorkerDefaults implements ComponentDefaults for Worker components.
//
// It embeds *BaseComponentDefaults, so methods it does not declare itself are
// promoted from the base; GetBaseContainer is declared on WorkerDefaults
// directly. Use NewWorkerDefaults to obtain a value whose embedded pointer is
// non-nil.
type WorkerDefaults struct {
*BaseComponentDefaults
}
// NewWorkerDefaults returns a WorkerDefaults whose embedded
// BaseComponentDefaults is a freshly allocated zero value.
func NewWorkerDefaults() *WorkerDefaults {
	base := &BaseComponentDefaults{}
	return &WorkerDefaults{BaseComponentDefaults: base}
}
// GetBaseContainer returns the default container for a Worker component.
//
// Starting from the shared base container (getCommonContainer), it replaces
// the following fields:
//   - Ports: one TCP port named commonconsts.DynamoSystemPortName on
//     commonconsts.DynamoSystemPort;
//   - Resources: requests of 10 CPU / 20Gi memory, and limits of the same
//     plus one "nvidia.com/gpu";
//   - LivenessProbe and StartupProbe: HTTP GET /live on the system port;
//   - ReadinessProbe: HTTP GET /health on the system port;
//   - Env: DYN_SYSTEM_ENABLED, DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS and
//     DYN_SYSTEM_PORT.
//
// numberOfNodes is not used by this implementation, and the returned error is
// always nil.
func (w *WorkerDefaults) GetBaseContainer(numberOfNodes int32) (corev1.Container, error) {
	// Every probe is an HTTP GET against the named system port; only the
	// path and the timing vary.
	httpGet := func(path string) corev1.ProbeHandler {
		return corev1.ProbeHandler{
			HTTPGet: &corev1.HTTPGetAction{
				Path: path,
				Port: intstr.FromString(commonconsts.DynamoSystemPortName),
			},
		}
	}

	// CPU and memory are the same for requests and limits; each call parses
	// a fresh set so the two lists do not share a map.
	cpuAndMemory := func() corev1.ResourceList {
		return corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("10"),
			corev1.ResourceMemory: resource.MustParse("20Gi"),
		}
	}

	// The GPU is declared on the limits only.
	limits := cpuAndMemory()
	limits["nvidia.com/gpu"] = resource.MustParse("1")

	systemPort := corev1.ContainerPort{
		Protocol:      corev1.ProtocolTCP,
		Name:          commonconsts.DynamoSystemPortName,
		ContainerPort: int32(commonconsts.DynamoSystemPort),
	}

	c := w.getCommonContainer()
	c.Ports = []corev1.ContainerPort{systemPort}
	c.Resources = corev1.ResourceRequirements{
		Requests: cpuAndMemory(),
		Limits:   limits,
	}

	// A single failed liveness check is enough to mark the container failed.
	c.LivenessProbe = &corev1.Probe{
		ProbeHandler:     httpGet("/live"),
		PeriodSeconds:    5,
		TimeoutSeconds:   30,
		FailureThreshold: 1,
	}
	c.ReadinessProbe = &corev1.Probe{
		ProbeHandler:     httpGet("/health"),
		PeriodSeconds:    10,
		TimeoutSeconds:   30,
		FailureThreshold: 60,
	}
	// Startup allows up to 60 failed checks, 10 seconds apart.
	c.StartupProbe = &corev1.Probe{
		ProbeHandler:     httpGet("/live"),
		PeriodSeconds:    10,
		TimeoutSeconds:   5,
		FailureThreshold: 60,
	}

	c.Env = []corev1.EnvVar{
		{Name: "DYN_SYSTEM_ENABLED", Value: "true"},
		{Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "[\"generate\"]"},
		{Name: "DYN_SYSTEM_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoSystemPort)},
	}

	return c, nil
}
......@@ -21,6 +21,7 @@ import (
"context"
"encoding/json"
"fmt"
"maps"
"regexp"
"sort"
"strconv"
......@@ -191,7 +192,7 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD
// finally set the service account name
deployment.Spec.ExtraPodSpec.PodSpec.ServiceAccountName = commonconsts.PlannerServiceAccountName
}
if deployment.IsMainComponent() && defaultIngressSpec != nil && deployment.Spec.Ingress == nil {
if deployment.IsFrontendComponent() && defaultIngressSpec != nil && deployment.Spec.Ingress == nil {
deployment.Spec.Ingress = defaultIngressSpec
}
// merge the envs from the parent deployment with the envs from the service
......@@ -219,7 +220,7 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD
// updateDynDeploymentConfig updates the runtime config object for the given dynamoDeploymentComponent
// It updates the port for the given service (if it is the main component)
func updateDynDeploymentConfig(dynamoDeploymentComponent *v1alpha1.DynamoComponentDeployment, newPort int) error {
if dynamoDeploymentComponent.IsMainComponent() {
if dynamoDeploymentComponent.IsFrontendComponent() {
dynamoDeploymentConfig := dynamoDeploymentComponent.GetDynamoDeploymentConfig()
if dynamoDeploymentConfig != nil {
var config map[string]any
......@@ -668,11 +669,6 @@ func isWorkerComponent(componentType string) bool {
// addStandardEnvVars adds the standard environment variables that are common to both Grove and Controller
func addStandardEnvVars(container *corev1.Container, controllerConfig controller_common.Config) {
container.Env = append(container.Env, corev1.EnvVar{
Name: commonconsts.EnvDynamoServicePort,
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
})
if controllerConfig.NatsAddress != "" {
container.Env = append(container.Env, corev1.EnvVar{
Name: "NATS_SERVER",
......@@ -702,47 +698,60 @@ func GenerateBasePodSpec(
multinodeDeploymentType commonconsts.MultinodeDeploymentType,
serviceName string,
) (corev1.PodSpec, error) {
container := corev1.Container{
Name: "main",
LivenessProbe: component.LivenessProbe,
ReadinessProbe: component.ReadinessProbe,
Env: component.Envs,
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
},
}
// Add system port for worker components
if component.ComponentType == commonconsts.ComponentTypeWorker {
container.Ports = append(container.Ports, corev1.ContainerPort{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoSystemPortName,
ContainerPort: int32(commonconsts.DynamoSystemPort),
})
// Start with base container generated per component type
componentDefaults := ComponentDefaultsFactory(component.ComponentType, numberOfNodes)
container, err := componentDefaults.GetBaseContainer(numberOfNodes)
if err != nil {
return corev1.PodSpec{}, fmt.Errorf("failed to get base container: %w", err)
}
// First merge the mainContainer from extraPodSpec to get the base command and args
if component.ExtraPodSpec != nil && component.ExtraPodSpec.MainContainer != nil {
main := component.ExtraPodSpec.MainContainer.DeepCopy()
if main != nil {
// merge the extraPodSpec from the parent deployment with the extraPodSpec from the service
err := mergo.Merge(&container, *main, mergo.WithOverride)
err = mergo.Merge(&container, *main, mergo.WithOverride)
if err != nil {
return corev1.PodSpec{}, fmt.Errorf("failed to merge extraPodSpec: %w", err)
}
// main container fields that require special handling
container.Env = MergeEnvs(component.Envs, container.Env)
// Note: startup probe does not have its own top level field so it must be passed in extraPodSpec.MainContainer
// We want to overwrite entirely if provided rather than merge
if main.StartupProbe != nil {
container.StartupProbe = main.StartupProbe
}
}
}
resourcesConfig, err := controller_common.GetResourcesConfig(component.Resources)
// Merge probes entirely if they are passed (no partial merge)
if component.LivenessProbe != nil {
container.LivenessProbe = component.LivenessProbe.DeepCopy()
}
if component.ReadinessProbe != nil {
container.ReadinessProbe = component.ReadinessProbe.DeepCopy()
}
overrideResources, err := controller_common.GetResourcesConfig(component.Resources)
if err != nil {
return corev1.PodSpec{}, fmt.Errorf("failed to get resources config: %w", err)
}
if resourcesConfig != nil {
container.Resources = *resourcesConfig
// Requests
if overrideResources != nil && len(overrideResources.Requests) > 0 {
if container.Resources.Requests == nil {
container.Resources.Requests = corev1.ResourceList{}
}
maps.Copy(container.Resources.Requests, overrideResources.Requests)
}
// Limits
if overrideResources != nil && len(overrideResources.Limits) > 0 {
if container.Resources.Limits == nil {
container.Resources.Limits = corev1.ResourceList{}
}
maps.Copy(container.Resources.Limits, overrideResources.Limits)
}
imagePullSecrets := []corev1.LocalObjectReference{}
if secretsRetriever != nil && component.ExtraPodSpec != nil && component.ExtraPodSpec.MainContainer != nil && component.ExtraPodSpec.MainContainer.Image != "" {
secretsName, err := secretsRetriever.GetSecrets(namespace, component.ExtraPodSpec.MainContainer.Image)
......@@ -780,15 +789,26 @@ func GenerateBasePodSpec(
shmVolume, shmVolumeMount := generateSharedMemoryVolumeAndMount(&container.Resources)
volumes = append(volumes, shmVolume)
container.VolumeMounts = append(container.VolumeMounts, shmVolumeMount)
// Apply backend-specific container modifications
backend := BackendFactory(backendFramework)
if backend == nil {
return corev1.PodSpec{}, fmt.Errorf("unsupported backend framework: %s", backendFramework)
}
backend.UpdateContainer(&container, numberOfNodes, role, component, multinodeDeploymentType, serviceName)
var podSpec corev1.PodSpec
// get base podspec from component
podSpec, err := componentDefaults.GetBasePodSpec(numberOfNodes)
if err != nil {
return corev1.PodSpec{}, fmt.Errorf("failed to get base podspec: %w", err)
}
if component.ExtraPodSpec != nil && component.ExtraPodSpec.PodSpec != nil {
podSpec = *component.ExtraPodSpec.PodSpec.DeepCopy()
// merge extraPodSpec PodSpec with base podspec
err := mergo.Merge(&podSpec, component.ExtraPodSpec.PodSpec.DeepCopy(), mergo.WithOverride)
if err != nil {
return corev1.PodSpec{}, fmt.Errorf("failed to merge extraPodSpec: %w", err)
}
}
podSpec.Containers = append(podSpec.Containers, container)
podSpec.Volumes = append(podSpec.Volumes, volumes...)
......
......@@ -63,7 +63,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
DynamoNamespace: &[]string{"default"}[0],
ComponentType: "main",
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
......@@ -107,7 +107,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service1",
DynamoNamespace: &[]string{"default"}[0],
ComponentType: "main",
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
......@@ -171,7 +171,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
DynamoNamespace: nil,
ComponentType: "main",
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
......@@ -215,7 +215,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service1",
DynamoNamespace: &[]string{"dynamo-test-dynamographdeployment"}[0],
ComponentType: "main",
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
......@@ -279,7 +279,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
DynamoNamespace: &[]string{"default"}[0],
ComponentType: "main",
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
......@@ -325,7 +325,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
DynamoNamespace: nil,
ComponentType: "main",
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
......@@ -373,7 +373,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service1",
DynamoNamespace: &[]string{"dynamo-test-dynamographdeployment"}[0],
ComponentType: "main",
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
......@@ -447,7 +447,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
DynamoNamespace: nil,
ComponentType: "main",
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
......@@ -491,7 +491,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service1",
DynamoNamespace: &[]string{"dynamo-test-dynamographdeployment"}[0],
ComponentType: "main",
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
......@@ -574,7 +574,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
DynamoNamespace: &[]string{"default"}[0],
ComponentType: "main",
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
......@@ -625,7 +625,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service1",
DynamoNamespace: &[]string{"default"}[0],
ComponentType: "main",
ComponentType: "frontend",
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
......@@ -1121,6 +1121,15 @@ func Test_mergeEnvs(t *testing.T) {
}
}
// sortEnvVars returns a copy of envs ordered by ascending Name. The input
// slice is left untouched, and the result is always a non-nil slice of the
// same length (so an empty or nil input yields an empty, non-nil slice).
func sortEnvVars(envs []corev1.EnvVar) []corev1.EnvVar {
	out := make([]corev1.EnvVar, len(envs))
	copy(out, envs)

	byName := func(a, b int) bool { return out[a].Name < out[b].Name }
	sort.Slice(out, byName)

	return out
}
func TestGenerateGrovePodGangSet(t *testing.T) {
type args struct {
ctx context.Context
......@@ -1159,7 +1168,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Services: map[string]*v1alpha1.DynamoComponentDeploymentOverridesSpec{
"Frontend": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: "main", // Frontend component
ComponentType: "frontend", // Frontend component
ExtraPodMetadata: &common.ExtraPodMetadata{
Annotations: map[string]string{
"nvidia.com/annotation1": "annotation1",
......@@ -1308,7 +1317,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Labels: map[string]string{
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeMain,
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeFrontend,
"nvidia.com/label1": "label1",
"nvidia.com/label2": "label2",
},
......@@ -1503,10 +1512,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: "PLANNER_ENV_1",
Value: "2",
},
{
Name: "DYNAMO_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
},
{
Name: "NATS_SERVER",
Value: "nats-address",
......@@ -1537,13 +1542,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
MountPath: "/dev/shm",
},
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
},
},
},
},
......@@ -1594,6 +1592,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
GPU: "1",
},
},
ComponentType: commonconsts.ComponentTypeFrontend,
Envs: []corev1.EnvVar{
{
Name: "FRONTEND_ENV_1",
......@@ -1813,11 +1812,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"python3 -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-worker-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 3 --node-rank 0 --custom-flag custom-value",
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoSystemPortName,
......@@ -1830,12 +1824,20 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Value: "1",
},
{
Name: "WORKER_ENV_1",
Value: "1",
Name: "DYN_SYSTEM_ENABLED",
Value: "true",
},
{
Name: "DYNAMO_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
Name: "DYN_SYSTEM_PORT",
Value: "9090",
},
{
Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS",
Value: `["generate"]`,
},
{
Name: "WORKER_ENV_1",
Value: "1",
},
{
Name: "NATS_SERVER",
......@@ -1909,11 +1911,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"python3 -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-worker-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 3 --node-rank $((GROVE_PCLQ_POD_INDEX + 1)) --custom-flag custom-value",
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoSystemPortName,
......@@ -1926,12 +1923,20 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Value: "1",
},
{
Name: "WORKER_ENV_1",
Value: "1",
Name: "DYN_SYSTEM_ENABLED",
Value: "true",
},
{
Name: "DYNAMO_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
Name: "DYN_SYSTEM_PORT",
Value: "9090",
},
{
Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS",
Value: `["generate"]`,
},
{
Name: "WORKER_ENV_1",
Value: "1",
},
{
Name: "NATS_SERVER",
......@@ -1967,8 +1972,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
{
Name: "frontend",
Labels: map[string]string{
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeFrontend,
},
Annotations: map[string]string{},
Spec: grovev1alpha1.PodCliqueSpec{
......@@ -2158,10 +2164,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: "PLANNER_ENV_1",
Value: "2",
},
{
Name: "DYNAMO_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
},
{
Name: "NATS_SERVER",
Value: "nats-address",
......@@ -2192,13 +2194,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
MountPath: "/dev/shm",
},
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
},
},
},
},
......@@ -2237,7 +2232,8 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Services: map[string]*v1alpha1.DynamoComponentDeploymentOverridesSpec{
"Frontend": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
Replicas: &[]int32{1}[0],
Replicas: &[]int32{1}[0],
ComponentType: commonconsts.ComponentTypeFrontend,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
......@@ -2492,11 +2488,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"ray start --head --port=6379 && python3 -m dynamo.vllm --custom-flag custom-value",
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoSystemPortName,
......@@ -2509,12 +2500,20 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Value: "1",
},
{
Name: "WORKER_ENV_1",
Value: "1",
Name: "DYN_SYSTEM_ENABLED",
Value: "true",
},
{
Name: "DYNAMO_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
Name: "DYN_SYSTEM_PORT",
Value: "9090",
},
{
Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS",
Value: `["generate"]`,
},
{
Name: "WORKER_ENV_1",
Value: "1",
},
{
Name: "NATS_SERVER",
......@@ -2591,11 +2590,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"ray start --address=${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-worker-ldr-0.${GROVE_HEADLESS_SERVICE}:6379 --block",
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoSystemPortName,
......@@ -2608,12 +2602,20 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Value: "1",
},
{
Name: "WORKER_ENV_1",
Value: "1",
Name: "DYN_SYSTEM_ENABLED",
Value: "true",
},
{
Name: "DYNAMO_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
Name: "DYN_SYSTEM_PORT",
Value: "9090",
},
{
Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS",
Value: `["generate"]`,
},
{
Name: "WORKER_ENV_1",
Value: "1",
},
{
Name: "NATS_SERVER",
......@@ -2649,8 +2651,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
{
Name: "frontend",
Labels: map[string]string{
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeFrontend,
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend",
},
Annotations: map[string]string{},
Spec: grovev1alpha1.PodCliqueSpec{
......@@ -2840,10 +2843,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: "PLANNER_ENV_1",
Value: "2",
},
{
Name: "DYNAMO_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
},
{
Name: "NATS_SERVER",
Value: "nats-address",
......@@ -2874,13 +2873,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
MountPath: "/dev/shm",
},
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
},
},
},
},
......@@ -2906,6 +2898,19 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
sort.Slice(tt.want.Spec.Template.Cliques, func(i, j int) bool {
return tt.want.Spec.Template.Cliques[i].Name < tt.want.Spec.Template.Cliques[j].Name
})
// Sort environment variables for all containers in all cliques
for _, clique := range got.Spec.Template.Cliques {
for i := range clique.Spec.PodSpec.Containers {
clique.Spec.PodSpec.Containers[i].Env = sortEnvVars(clique.Spec.PodSpec.Containers[i].Env)
}
}
for _, clique := range tt.want.Spec.Template.Cliques {
for i := range clique.Spec.PodSpec.Containers {
clique.Spec.PodSpec.Containers[i].Env = sortEnvVars(clique.Spec.PodSpec.Containers[i].Env)
}
}
if diff := cmp.Diff(got, tt.want); diff != "" {
t.Errorf("GenerateGrovePodGangSet() mismatch (-want +got):\n%s", diff)
}
......@@ -3018,31 +3023,6 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
expectError: false,
expectContains: []string{},
},
{
name: "SGLang with resources",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
ExtraPodSpec: &common.ExtraPodSpec{
MainContainer: &corev1.Container{
Args: []string{"python3", "-m", "dynamo.sglang.worker"},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("2Gi"),
},
},
},
},
},
},
backendFramework: BackendFrameworkSGLang,
role: RoleMain,
numberOfNodes: 1,
expectError: false,
expectContains: []string{"python3 -m dynamo.sglang.worker"},
},
}
for _, tt := range tests {
......@@ -3574,7 +3554,7 @@ func TestDetermineBackendFramework(t *testing.T) {
}{
{
name: "non-worker component returns noop",
componentType: "main",
componentType: "frontend",
command: []string{"/bin/sh", "-c"},
args: []string{"echo hello world"},
expected: BackendFrameworkNoop,
......@@ -3735,7 +3715,7 @@ func TestGetBackendFrameworkFromComponent(t *testing.T) {
name: "non-worker component returns noop",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: "main", // Frontend component
ComponentType: "frontend", // Frontend component
},
},
deployment: &v1alpha1.DynamoGraphDeployment{},
......@@ -4145,3 +4125,59 @@ func TestGenerateGrovePodGangSet_StartsAfterDependencies(t *testing.T) {
})
}
}
func TestGenerateBasePodSpec_PlannerServiceAccount(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{}
tests := []struct {
name string
component *v1alpha1.DynamoComponentDeploymentOverridesSpec
expectedServiceAcc string
}{
{
name: "Planner component should have planner service account",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypePlanner,
},
},
expectedServiceAcc: commonconsts.PlannerServiceAccountName,
},
{
name: "Planner service account should not be set for non-planner components",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
},
},
expectedServiceAcc: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
podSpec, err := GenerateBasePodSpec(
tt.component,
BackendFrameworkSGLang,
secretsRetriever,
"default",
RoleMain,
1,
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
)
if err != nil {
t.Errorf("GenerateBasePodSpec() error = %v", err)
return
}
if podSpec.ServiceAccountName != tt.expectedServiceAcc {
t.Errorf("GenerateBasePodSpec() serviceAccountName = %v, want %v",
podSpec.ServiceAccountName, tt.expectedServiceAcc)
}
})
}
}
......@@ -27,7 +27,7 @@ spec:
timeoutSeconds: 2
failureThreshold: 3
dynamoNamespace: hello-world
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment