Unverified commit e710533c, authored by Julien Mancuso, committed by GitHub
Browse files

fix: Additional containers not injected when using LWS (#3390)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent e1ed7009
...@@ -77,6 +77,8 @@ const ( ...@@ -77,6 +77,8 @@ const (
// Grove multinode role suffixes // Grove multinode role suffixes
GroveRoleSuffixLeader = "ldr" GroveRoleSuffixLeader = "ldr"
GroveRoleSuffixWorker = "wkr" GroveRoleSuffixWorker = "wkr"
MainContainerName = "main"
) )
type MultinodeDeploymentType string type MultinodeDeploymentType string
......
...@@ -479,17 +479,46 @@ func (r *DynamoComponentDeploymentReconciler) generateLeaderPodTemplateSpec(ctx ...@@ -479,17 +479,46 @@ func (r *DynamoComponentDeploymentReconciler) generateLeaderPodTemplateSpec(ctx
leaderPodTemplateSpec.Spec.SchedulerName = "volcano" leaderPodTemplateSpec.Spec.SchedulerName = "volcano"
if leaderPodTemplateSpec.Spec.Containers[0].Command == nil { err = checkMainContainer(&leaderPodTemplateSpec.Spec)
return nil, errors.New("generateLeaderPodTemplateSpec: container Command cannot be nil for LWS leader pod")
}
if len(leaderPodTemplateSpec.Spec.Containers[0].Args) == 0 { if err != nil {
return nil, errors.New("generateLeaderPodTemplateSpec: container Args cannot be empty for LWS leader pod") return nil, errors.Wrap(err, "generateLeaderPodTemplateSpec: failed to check main container")
} }
return leaderPodTemplateSpec, nil return leaderPodTemplateSpec, nil
} }
// checkMainContainer verifies that spec holds a container named
// commonconsts.MainContainerName and that this container has a non-empty
// Command and non-empty Args.
//
// Only the first container carrying that name is inspected; any other
// containers in the spec are ignored. An error is returned when the spec has
// no containers at all, when no container carries the main container name, or
// when the main container is missing its Command or Args.
func checkMainContainer(spec *corev1.PodSpec) error {
	// Kept as a separate guard so an empty spec reports its own message
	// rather than the generic "not found" one below.
	if len(spec.Containers) == 0 {
		return errors.New("No containers found in pod spec")
	}

	for i := range spec.Containers {
		candidate := &spec.Containers[i]
		if candidate.Name == commonconsts.MainContainerName {
			// Command is validated before Args, so a container missing both
			// reports the Command error.
			switch {
			case len(candidate.Command) == 0:
				return errors.New("container Command cannot be nil for LWS pod")
			case len(candidate.Args) == 0:
				return errors.New("container Args cannot be empty for LWS pod")
			}
			return nil
		}
	}

	return errors.New("main container not found in pod spec")
}
func (r *DynamoComponentDeploymentReconciler) generateWorkerPodTemplateSpec(ctx context.Context, opt generateResourceOption, kubeName string, labels map[string]string, instanceID int) (*corev1.PodTemplateSpec, error) { func (r *DynamoComponentDeploymentReconciler) generateWorkerPodTemplateSpec(ctx context.Context, opt generateResourceOption, kubeName string, labels map[string]string, instanceID int) (*corev1.PodTemplateSpec, error) {
workerPodTemplateSpec, err := r.generatePodTemplateSpec(ctx, opt, dynamo.RoleWorker) workerPodTemplateSpec, err := r.generatePodTemplateSpec(ctx, opt, dynamo.RoleWorker)
if err != nil { if err != nil {
...@@ -508,12 +537,10 @@ func (r *DynamoComponentDeploymentReconciler) generateWorkerPodTemplateSpec(ctx ...@@ -508,12 +537,10 @@ func (r *DynamoComponentDeploymentReconciler) generateWorkerPodTemplateSpec(ctx
} }
workerPodTemplateSpec.ObjectMeta.Annotations["scheduling.k8s.io/group-name"] = kubeName workerPodTemplateSpec.ObjectMeta.Annotations["scheduling.k8s.io/group-name"] = kubeName
if workerPodTemplateSpec.Spec.Containers[0].Command == nil { err = checkMainContainer(&workerPodTemplateSpec.Spec)
return nil, errors.New("generateWorkerPodTemplateSpec: container Command cannot be nil for LWS worker pod")
}
if len(workerPodTemplateSpec.Spec.Containers[0].Args) == 0 { if err != nil {
return nil, errors.New("generateWorkerPodTemplateSpec: container Args cannot be empty for LWS worker pod") return nil, errors.Wrap(err, "generateWorkerPodTemplateSpec: failed to check LWS worker main container")
} }
if opt.dynamoComponentDeployment.Spec.Resources == nil || opt.dynamoComponentDeployment.Spec.Resources.Limits == nil || opt.dynamoComponentDeployment.Spec.Resources.Limits.GPU == "" { if opt.dynamoComponentDeployment.Spec.Resources == nil || opt.dynamoComponentDeployment.Spec.Resources.Limits == nil || opt.dynamoComponentDeployment.Spec.Resources.Limits.GPU == "" {
...@@ -1127,24 +1154,17 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex ...@@ -1127,24 +1154,17 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
isDebugModeEnabled := checkIfIsDebugModeEnabled(resourceAnnotations) isDebugModeEnabled := checkIfIsDebugModeEnabled(resourceAnnotations)
basePodSpec, err := dynamo.GenerateBasePodSpecForController(opt.dynamoComponentDeployment, r.DockerSecretRetriever, r.Config, role, consts.MultinodeDeploymentTypeLWS) podSpec, err := dynamo.GenerateBasePodSpecForController(opt.dynamoComponentDeployment, r.DockerSecretRetriever, r.Config, role, consts.MultinodeDeploymentTypeLWS)
if err != nil { if err != nil {
err = errors.Wrap(err, "failed to generate base pod spec") err = errors.Wrap(err, "failed to generate base pod spec")
return nil, err return nil, err
} }
// Ensure we have at least one container (the main container should be there from GenerateBasePodSpec) // Ensure we have at least one container (the main container should be there from GenerateBasePodSpec)
if len(basePodSpec.Containers) == 0 { if len(podSpec.Containers) == 0 {
return nil, errors.New("no containers found in base pod spec") return nil, errors.New("no containers found in base pod spec")
} }
// Get the main container from the base spec
container := basePodSpec.Containers[0]
containers := make([]corev1.Container, 0, 2)
containers = append(containers, container)
debuggerImage := "python:3.12-slim" debuggerImage := "python:3.12-slim"
debuggerImage_ := os.Getenv("INTERNAL_IMAGES_DEBUGGER") debuggerImage_ := os.Getenv("INTERNAL_IMAGES_DEBUGGER")
if debuggerImage_ != "" { if debuggerImage_ != "" {
...@@ -1152,7 +1172,7 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex ...@@ -1152,7 +1172,7 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
} }
if opt.isStealingTrafficDebugModeEnabled || isDebugModeEnabled { if opt.isStealingTrafficDebugModeEnabled || isDebugModeEnabled {
containers = append(containers, corev1.Container{ podSpec.Containers = append(podSpec.Containers, corev1.Container{
Name: "debugger", Name: "debugger",
Image: debuggerImage, Image: debuggerImage,
Command: []string{ Command: []string{
...@@ -1181,9 +1201,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex ...@@ -1181,9 +1201,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
podLabels[commonconsts.KubeLabelDynamoSelector] = kubeName podLabels[commonconsts.KubeLabelDynamoSelector] = kubeName
podSpec := basePodSpec
podSpec.Containers = containers
extraPodMetadata := opt.dynamoComponentDeployment.Spec.ExtraPodMetadata extraPodMetadata := opt.dynamoComponentDeployment.Spec.ExtraPodMetadata
if extraPodMetadata != nil { if extraPodMetadata != nil {
......
...@@ -727,6 +727,11 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -727,6 +727,11 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
ExtraPodSpec: &dynamoCommon.ExtraPodSpec{ ExtraPodSpec: &dynamoCommon.ExtraPodSpec{
PodSpec: &corev1.PodSpec{ PodSpec: &corev1.PodSpec{
TerminationGracePeriodSeconds: ptr.To(int64(10)), TerminationGracePeriodSeconds: ptr.To(int64(10)),
Containers: []corev1.Container{
{
Image: "another-image:latest",
},
},
}, },
MainContainer: &corev1.Container{ MainContainer: &corev1.Container{
Image: "test-image:latest", Image: "test-image:latest",
...@@ -809,7 +814,10 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -809,7 +814,10 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
RestartPolicy: corev1.RestartPolicyAlways, RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Image: "another-image:latest",
},
{
Name: commonconsts.MainContainerName,
Image: "test-image:latest", Image: "test-image:latest",
Command: []string{"sh", "-c"}, Command: []string{"sh", "-c"},
Args: []string{"ray start --head --port=6379 && some dynamo command"}, Args: []string{"ray start --head --port=6379 && some dynamo command"},
...@@ -920,7 +928,10 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -920,7 +928,10 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
RestartPolicy: corev1.RestartPolicyAlways, RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Image: "another-image:latest",
},
{
Name: commonconsts.MainContainerName,
Image: "test-image:latest", Image: "test-image:latest",
Command: []string{"sh", "-c"}, Command: []string{"sh", "-c"},
Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"}, Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
......
...@@ -345,7 +345,7 @@ func TestTRTLLMBackend_UpdatePodSpec(t *testing.T) { ...@@ -345,7 +345,7 @@ func TestTRTLLMBackend_UpdatePodSpec(t *testing.T) {
Volumes: tt.initialVolumes, Volumes: tt.initialVolumes,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Env: []corev1.EnvVar{}, Env: []corev1.EnvVar{},
}, },
}, },
......
...@@ -63,7 +63,7 @@ func (b *BaseComponentDefaults) getCommonPodSpec() corev1.PodSpec { ...@@ -63,7 +63,7 @@ func (b *BaseComponentDefaults) getCommonPodSpec() corev1.PodSpec {
func (b *BaseComponentDefaults) getCommonContainer(context ComponentContext) corev1.Container { func (b *BaseComponentDefaults) getCommonContainer(context ComponentContext) corev1.Container {
container := corev1.Container{ container := corev1.Container{
Name: "main", Name: commonconsts.MainContainerName,
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
"-c", "-c",
......
...@@ -43,7 +43,7 @@ func TestPlannerDefaults_GetBaseContainer(t *testing.T) { ...@@ -43,7 +43,7 @@ func TestPlannerDefaults_GetBaseContainer(t *testing.T) {
dynamoNamespace: "dynamo-namespace", dynamoNamespace: "dynamo-namespace",
}, },
want: corev1.Container{ want: corev1.Container{
Name: "main", Name: commonconsts.MainContainerName,
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
"-c", "-c",
......
...@@ -1298,7 +1298,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1298,7 +1298,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways, RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Image: "frontend-image", Image: "frontend-image",
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
...@@ -1449,7 +1449,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1449,7 +1449,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Image: "planner-image", Image: "planner-image",
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
...@@ -1828,7 +1828,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1828,7 +1828,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}, },
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Image: "worker-image", Image: "worker-image",
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
...@@ -1981,7 +1981,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1981,7 +1981,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}, },
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Image: "worker-image", Image: "worker-image",
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
...@@ -2096,7 +2096,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2096,7 +2096,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways, RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Image: "frontend-image", Image: "frontend-image",
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
...@@ -2238,7 +2238,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2238,7 +2238,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}, },
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Image: "planner-image", Image: "planner-image",
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
...@@ -2631,7 +2631,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2631,7 +2631,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways, RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Image: "worker-image", Image: "worker-image",
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
...@@ -2771,7 +2771,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2771,7 +2771,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways, RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Image: "worker-image", Image: "worker-image",
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
...@@ -2886,7 +2886,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2886,7 +2886,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways, RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Image: "frontend-image", Image: "frontend-image",
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
...@@ -3028,7 +3028,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -3028,7 +3028,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways, RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Image: "planner-image", Image: "planner-image",
Command: []string{ Command: []string{
"/bin/sh", "/bin/sh",
...@@ -3343,7 +3343,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) { ...@@ -3343,7 +3343,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
} }
// Check that container name is set // Check that container name is set
if container.Name != "main" { if container.Name != commonconsts.MainContainerName {
t.Errorf("GeneratePodSpecForComponent() container name = %s, want main", container.Name) t.Errorf("GeneratePodSpecForComponent() container name = %s, want main", container.Name)
} }
}) })
...@@ -4671,7 +4671,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { ...@@ -4671,7 +4671,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
expectedPodSpec: &corev1.PodSpec{ expectedPodSpec: &corev1.PodSpec{
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "main", Name: commonconsts.MainContainerName,
Command: []string{"python3"}, Command: []string{"python3"},
Args: []string{"-m", "dynamo.worker"}, Args: []string{"-m", "dynamo.worker"},
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment