"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "d65798f78c76f03f068fc2f69a68cff430ee6b6f"
Unverified Commit e710533c authored by Julien Mancuso, committed by GitHub
Browse files

fix: Additional containers not injected when using LWS (#3390)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent e1ed7009
......@@ -77,6 +77,8 @@ const (
// Grove multinode role suffixes
GroveRoleSuffixLeader = "ldr"
GroveRoleSuffixWorker = "wkr"
MainContainerName = "main"
)
type MultinodeDeploymentType string
......
......@@ -479,17 +479,46 @@ func (r *DynamoComponentDeploymentReconciler) generateLeaderPodTemplateSpec(ctx
leaderPodTemplateSpec.Spec.SchedulerName = "volcano"
if leaderPodTemplateSpec.Spec.Containers[0].Command == nil {
return nil, errors.New("generateLeaderPodTemplateSpec: container Command cannot be nil for LWS leader pod")
}
err = checkMainContainer(&leaderPodTemplateSpec.Spec)
if len(leaderPodTemplateSpec.Spec.Containers[0].Args) == 0 {
return nil, errors.New("generateLeaderPodTemplateSpec: container Args cannot be empty for LWS leader pod")
if err != nil {
return nil, errors.Wrap(err, "generateLeaderPodTemplateSpec: failed to check main container")
}
return leaderPodTemplateSpec, nil
}
// checkMainContainer validates that spec contains a container named
// commonconsts.MainContainerName and that this container has both a
// non-empty Command and non-empty Args.
//
// It returns an error when spec has no containers, when no container
// carries the main name, or when the first container with that name is
// missing its Command or its Args. It returns nil otherwise.
func checkMainContainer(spec *corev1.PodSpec) error {
	if len(spec.Containers) == 0 {
		return errors.New("No containers found in pod spec")
	}
	for i := range spec.Containers {
		c := &spec.Containers[i]
		if c.Name != commonconsts.MainContainerName {
			continue
		}
		// Only the first container with the main name is inspected; the
		// outcome of the whole check is decided here.
		switch {
		case len(c.Command) == 0:
			return errors.New("container Command cannot be nil for LWS pod")
		case len(c.Args) == 0:
			return errors.New("container Args cannot be empty for LWS pod")
		}
		return nil
	}
	return errors.New("main container not found in pod spec")
}
func (r *DynamoComponentDeploymentReconciler) generateWorkerPodTemplateSpec(ctx context.Context, opt generateResourceOption, kubeName string, labels map[string]string, instanceID int) (*corev1.PodTemplateSpec, error) {
workerPodTemplateSpec, err := r.generatePodTemplateSpec(ctx, opt, dynamo.RoleWorker)
if err != nil {
......@@ -508,12 +537,10 @@ func (r *DynamoComponentDeploymentReconciler) generateWorkerPodTemplateSpec(ctx
}
workerPodTemplateSpec.ObjectMeta.Annotations["scheduling.k8s.io/group-name"] = kubeName
if workerPodTemplateSpec.Spec.Containers[0].Command == nil {
return nil, errors.New("generateWorkerPodTemplateSpec: container Command cannot be nil for LWS worker pod")
}
err = checkMainContainer(&workerPodTemplateSpec.Spec)
if len(workerPodTemplateSpec.Spec.Containers[0].Args) == 0 {
return nil, errors.New("generateWorkerPodTemplateSpec: container Args cannot be empty for LWS worker pod")
if err != nil {
return nil, errors.Wrap(err, "generateWorkerPodTemplateSpec: failed to check LWS worker main container")
}
if opt.dynamoComponentDeployment.Spec.Resources == nil || opt.dynamoComponentDeployment.Spec.Resources.Limits == nil || opt.dynamoComponentDeployment.Spec.Resources.Limits.GPU == "" {
......@@ -1127,24 +1154,17 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
isDebugModeEnabled := checkIfIsDebugModeEnabled(resourceAnnotations)
basePodSpec, err := dynamo.GenerateBasePodSpecForController(opt.dynamoComponentDeployment, r.DockerSecretRetriever, r.Config, role, consts.MultinodeDeploymentTypeLWS)
podSpec, err := dynamo.GenerateBasePodSpecForController(opt.dynamoComponentDeployment, r.DockerSecretRetriever, r.Config, role, consts.MultinodeDeploymentTypeLWS)
if err != nil {
err = errors.Wrap(err, "failed to generate base pod spec")
return nil, err
}
// Ensure we have at least one container (the main container should be there from GenerateBasePodSpec)
if len(basePodSpec.Containers) == 0 {
if len(podSpec.Containers) == 0 {
return nil, errors.New("no containers found in base pod spec")
}
// Get the main container from the base spec
container := basePodSpec.Containers[0]
containers := make([]corev1.Container, 0, 2)
containers = append(containers, container)
debuggerImage := "python:3.12-slim"
debuggerImage_ := os.Getenv("INTERNAL_IMAGES_DEBUGGER")
if debuggerImage_ != "" {
......@@ -1152,7 +1172,7 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
}
if opt.isStealingTrafficDebugModeEnabled || isDebugModeEnabled {
containers = append(containers, corev1.Container{
podSpec.Containers = append(podSpec.Containers, corev1.Container{
Name: "debugger",
Image: debuggerImage,
Command: []string{
......@@ -1181,9 +1201,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
podLabels[commonconsts.KubeLabelDynamoSelector] = kubeName
podSpec := basePodSpec
podSpec.Containers = containers
extraPodMetadata := opt.dynamoComponentDeployment.Spec.ExtraPodMetadata
if extraPodMetadata != nil {
......
......@@ -727,6 +727,11 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
ExtraPodSpec: &dynamoCommon.ExtraPodSpec{
PodSpec: &corev1.PodSpec{
TerminationGracePeriodSeconds: ptr.To(int64(10)),
Containers: []corev1.Container{
{
Image: "another-image:latest",
},
},
},
MainContainer: &corev1.Container{
Image: "test-image:latest",
......@@ -809,7 +814,10 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{
{
Name: "main",
Image: "another-image:latest",
},
{
Name: commonconsts.MainContainerName,
Image: "test-image:latest",
Command: []string{"sh", "-c"},
Args: []string{"ray start --head --port=6379 && some dynamo command"},
......@@ -920,7 +928,10 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{
{
Name: "main",
Image: "another-image:latest",
},
{
Name: commonconsts.MainContainerName,
Image: "test-image:latest",
Command: []string{"sh", "-c"},
Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
......
......@@ -345,7 +345,7 @@ func TestTRTLLMBackend_UpdatePodSpec(t *testing.T) {
Volumes: tt.initialVolumes,
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Env: []corev1.EnvVar{},
},
},
......
......@@ -63,7 +63,7 @@ func (b *BaseComponentDefaults) getCommonPodSpec() corev1.PodSpec {
func (b *BaseComponentDefaults) getCommonContainer(context ComponentContext) corev1.Container {
container := corev1.Container{
Name: "main",
Name: commonconsts.MainContainerName,
Command: []string{
"/bin/sh",
"-c",
......
......@@ -43,7 +43,7 @@ func TestPlannerDefaults_GetBaseContainer(t *testing.T) {
dynamoNamespace: "dynamo-namespace",
},
want: corev1.Container{
Name: "main",
Name: commonconsts.MainContainerName,
Command: []string{
"/bin/sh",
"-c",
......
......@@ -1298,7 +1298,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Image: "frontend-image",
Command: []string{
"/bin/sh",
......@@ -1449,7 +1449,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Image: "planner-image",
Command: []string{
"/bin/sh",
......@@ -1828,7 +1828,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Image: "worker-image",
Command: []string{
"/bin/sh",
......@@ -1981,7 +1981,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Image: "worker-image",
Command: []string{
"/bin/sh",
......@@ -2096,7 +2096,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Image: "frontend-image",
Command: []string{
"/bin/sh",
......@@ -2238,7 +2238,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Image: "planner-image",
Command: []string{
"/bin/sh",
......@@ -2631,7 +2631,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Image: "worker-image",
Command: []string{
"/bin/sh",
......@@ -2771,7 +2771,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Image: "worker-image",
Command: []string{
"/bin/sh",
......@@ -2886,7 +2886,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Image: "frontend-image",
Command: []string{
"/bin/sh",
......@@ -3028,7 +3028,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Image: "planner-image",
Command: []string{
"/bin/sh",
......@@ -3343,7 +3343,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
}
// Check that container name is set
if container.Name != "main" {
if container.Name != commonconsts.MainContainerName {
t.Errorf("GeneratePodSpecForComponent() container name = %s, want main", container.Name)
}
})
......@@ -4671,7 +4671,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
expectedPodSpec: &corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "main",
Name: commonconsts.MainContainerName,
Command: []string{"python3"},
Args: []string{"-m", "dynamo.worker"},
Env: []corev1.EnvVar{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment