Unverified Commit 6e213d90 authored by Julien Mancuso, committed by GitHub
Browse files

fix: fix operator tests (#3904)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent 59535682
...@@ -734,11 +734,15 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -734,11 +734,15 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
MainContainer: &corev1.Container{ MainContainer: &corev1.Container{
Image: "test-image:latest", Image: "test-image:latest",
Command: []string{ Command: []string{
"sh", "some",
"-c", "dynamo",
"command",
}, },
Args: []string{ Args: []string{
"some dynamo command", "--tensor-parallel-size",
"4",
"--pipeline-parallel-size",
"1",
}, },
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{ {
...@@ -817,8 +821,8 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -817,8 +821,8 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
{ {
Name: commonconsts.MainContainerName, Name: commonconsts.MainContainerName,
Image: "test-image:latest", Image: "test-image:latest",
Command: []string{"sh", "-c"}, Command: []string{"/bin/sh", "-c"},
Args: []string{"ray start --head --port=6379 && some dynamo command"}, Args: []string{"ray start --head --port=6379 && some dynamo command --tensor-parallel-size 4 --pipeline-parallel-size 1"},
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{Name: "DYN_NAMESPACE", Value: "default"}, {Name: "DYN_NAMESPACE", Value: "default"},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
...@@ -931,7 +935,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -931,7 +935,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
{ {
Name: commonconsts.MainContainerName, Name: commonconsts.MainContainerName,
Image: "test-image:latest", Image: "test-image:latest",
Command: []string{"sh", "-c"}, Command: []string{"/bin/sh", "-c"},
Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"}, Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{Name: "DYN_NAMESPACE", Value: "default"}, {Name: "DYN_NAMESPACE", Value: "default"},
......
...@@ -106,7 +106,7 @@ func injectRayDistributedLaunchFlags(container *corev1.Container, role Role, ser ...@@ -106,7 +106,7 @@ func injectRayDistributedLaunchFlags(container *corev1.Container, role Role, ser
leaderHostname := multinodeDeployer.GetLeaderHostname(serviceName) leaderHostname := multinodeDeployer.GetLeaderHostname(serviceName)
container.Args = []string{fmt.Sprintf("ray start --address=%s:%s --block", leaderHostname, VLLMPort)} container.Args = []string{fmt.Sprintf("ray start --address=%s:%s --block", leaderHostname, VLLMPort)}
} }
container.Command = []string{"sh", "-c"} // ensure cmd is a shell container.Command = []string{"/bin/sh", "-c"} // ensure cmd is a shell
} }
func injectDataParallelLaunchFlags(container *corev1.Container, role Role, serviceName string, multinodeDeployer MultinodeDeployer) { func injectDataParallelLaunchFlags(container *corev1.Container, role Role, serviceName string, multinodeDeployer MultinodeDeployer) {
......
...@@ -303,7 +303,7 @@ func TestUpdateVLLMMultinodeArgs(t *testing.T) { ...@@ -303,7 +303,7 @@ func TestUpdateVLLMMultinodeArgs(t *testing.T) {
name: "leader prepends ray start --head", name: "leader prepends ray start --head",
role: RoleLeader, role: RoleLeader,
multinodeDeployer: &GroveMultinodeDeployer{}, multinodeDeployer: &GroveMultinodeDeployer{},
initialContainer: &corev1.Container{Args: []string{"python3", "-m", "dynamo.vllm", tensorParallelSizeFlag, "16"}, Resources: corev1.ResourceRequirements{Limits: corev1.ResourceList{corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("8")}}}, initialContainer: &corev1.Container{Command: []string{"python3"}, Args: []string{"-m", "dynamo.vllm", tensorParallelSizeFlag, "16"}, Resources: corev1.ResourceRequirements{Limits: corev1.ResourceList{corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("8")}}},
expectedArgs: []string{fmt.Sprintf("ray start --head --port=%s && python3 -m dynamo.vllm %s 16", VLLMPort, tensorParallelSizeFlag)}, expectedArgs: []string{fmt.Sprintf("ray start --head --port=%s && python3 -m dynamo.vllm %s 16", VLLMPort, tensorParallelSizeFlag)},
}, },
{ {
......
...@@ -2417,11 +2417,17 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2417,11 +2417,17 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
MainContainer: &corev1.Container{ MainContainer: &corev1.Container{
Image: "worker-image", Image: "worker-image",
Command: []string{ Command: []string{
"/bin/sh", "python3",
"-c", "-m",
"dynamo.vllm",
}, },
Args: []string{ Args: []string{
"python3 -m dynamo.vllm --custom-flag custom-value", "--custom-flag",
"custom-value",
"--tensor-parallel-size",
"4",
"--pipeline-parallel-size",
"1",
}, },
StartupProbe: &corev1.Probe{ StartupProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{ ProbeHandler: corev1.ProbeHandler{
...@@ -2598,7 +2604,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2598,7 +2604,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
"-c", "-c",
}, },
Args: []string{ Args: []string{
"ray start --head --port=6379 && python3 -m dynamo.vllm --custom-flag custom-value", "ray start --head --port=6379 && python3 -m dynamo.vllm --custom-flag custom-value --tensor-parallel-size 4 --pipeline-parallel-size 1",
}, },
Ports: []corev1.ContainerPort{ Ports: []corev1.ContainerPort{
{ {
...@@ -3345,7 +3351,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) { ...@@ -3345,7 +3351,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
ComponentType: commonconsts.ComponentTypeWorker, ComponentType: commonconsts.ComponentTypeWorker,
ExtraPodSpec: &common.ExtraPodSpec{ ExtraPodSpec: &common.ExtraPodSpec{
MainContainer: &corev1.Container{ MainContainer: &corev1.Container{
Args: []string{"python3", "-m", "dynamo.vllm"}, Args: []string{"python3", "-m", "dynamo.vllm", "--tensor-parallel-size", "4", "--pipeline-parallel-size", "1"},
}, },
}, },
}, },
...@@ -3359,6 +3365,11 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) { ...@@ -3359,6 +3365,11 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
name: "VLLM multinode worker", name: "VLLM multinode worker",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{ component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker, ComponentType: commonconsts.ComponentTypeWorker,
ExtraPodSpec: &common.ExtraPodSpec{
MainContainer: &corev1.Container{
Args: []string{"python3", "-m", "dynamo.vllm", "--tensor-parallel-size", "4", "--pipeline-parallel-size", "1"},
},
},
}, },
backendFramework: BackendFrameworkVLLM, backendFramework: BackendFrameworkVLLM,
role: RoleWorker, role: RoleWorker,
...@@ -4757,7 +4768,7 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) { ...@@ -4757,7 +4768,7 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
component *v1alpha1.DynamoComponentDeploymentOverridesSpec component *v1alpha1.DynamoComponentDeploymentSharedSpec
expectError bool expectError bool
expectedResourceClaims []corev1.ResourceClaim expectedResourceClaims []corev1.ResourceClaim
expectedPodClaims []corev1.PodResourceClaim expectedPodClaims []corev1.PodResourceClaim
...@@ -4765,55 +4776,53 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) { ...@@ -4765,55 +4776,53 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
}{ }{
{ {
name: "component with resource claims", name: "component with resource claims",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{ component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{ ComponentType: commonconsts.ComponentTypeWorker,
ComponentType: commonconsts.ComponentTypeWorker, Resources: &common.Resources{
Resources: &common.Resources{ Requests: &common.ResourceItem{
Requests: &common.ResourceItem{ CPU: "130",
CPU: "130", Memory: "800Gi",
Memory: "800Gi", },
}, Limits: &common.ResourceItem{
Limits: &common.ResourceItem{ CPU: "130",
CPU: "130", Memory: "800Gi",
Memory: "800Gi", GPU: "4",
GPU: "4", },
Claims: []corev1.ResourceClaim{
{
Name: "compute-domain-channel",
}, },
Claims: []corev1.ResourceClaim{ },
},
ExtraPodSpec: &common.ExtraPodSpec{
PodSpec: &corev1.PodSpec{
ResourceClaims: []corev1.PodResourceClaim{
{ {
Name: "compute-domain-channel", Name: "compute-domain-channel",
ResourceClaimTemplateName: ptr.To("trtllm-test-compute-domain-channel"),
}, },
}, },
}, Volumes: []corev1.Volume{
ExtraPodSpec: &common.ExtraPodSpec{ {
PodSpec: &corev1.PodSpec{ Name: "model-storage",
ResourceClaims: []corev1.PodResourceClaim{ VolumeSource: corev1.VolumeSource{
{ PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
Name: "compute-domain-channel", ClaimName: "dynamo-pvc",
ResourceClaimTemplateName: ptr.To("trtllm-test-compute-domain-channel"),
},
},
Volumes: []corev1.Volume{
{
Name: "model-storage",
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "dynamo-pvc",
},
}, },
}, },
}, },
}, },
MainContainer: &corev1.Container{ },
Image: "rohanv672/dynamo:v0.5.1-trtllm", MainContainer: &corev1.Container{
Args: []string{ Image: "rohanv672/dynamo:v0.5.1-trtllm",
"python3 -m dynamo.trtllm --model-path /data/deepseek-r1 --served-model-name deepseek-ai/DeepSeek-R1 --extra-engine-args /data/engine_configs/wide_ep_agg.yaml", Args: []string{
}, "python3 -m dynamo.trtllm --model-path /data/deepseek-r1 --served-model-name deepseek-ai/DeepSeek-R1 --extra-engine-args /data/engine_configs/wide_ep_agg.yaml",
Command: []string{"/bin/sh", "-c"}, },
VolumeMounts: []corev1.VolumeMount{ Command: []string{"/bin/sh", "-c"},
{ VolumeMounts: []corev1.VolumeMount{
Name: "model-storage", {
MountPath: "/data", Name: "model-storage",
}, MountPath: "/data",
}, },
}, },
}, },
...@@ -4853,37 +4862,35 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) { ...@@ -4853,37 +4862,35 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
}, },
{ {
name: "component with multiple resource claims", name: "component with multiple resource claims",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{ component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{ ComponentType: commonconsts.ComponentTypeWorker,
ComponentType: commonconsts.ComponentTypeWorker, Resources: &common.Resources{
Resources: &common.Resources{ Claims: []corev1.ResourceClaim{
Claims: []corev1.ResourceClaim{ {
Name: "compute-domain-channel",
},
{
Name: "network-domain-channel",
},
},
},
ExtraPodSpec: &common.ExtraPodSpec{
PodSpec: &corev1.PodSpec{
ResourceClaims: []corev1.PodResourceClaim{
{ {
Name: "compute-domain-channel", Name: "compute-domain-channel",
ResourceClaimTemplateName: ptr.To("compute-template"),
}, },
{ {
Name: "network-domain-channel", Name: "network-domain-channel",
ResourceClaimTemplateName: ptr.To("network-template"),
}, },
}, },
}, },
ExtraPodSpec: &common.ExtraPodSpec{ MainContainer: &corev1.Container{
PodSpec: &corev1.PodSpec{ Image: "test-image",
ResourceClaims: []corev1.PodResourceClaim{ Command: []string{"python3"},
{ Args: []string{"-m", "dynamo.worker"},
Name: "compute-domain-channel",
ResourceClaimTemplateName: ptr.To("compute-template"),
},
{
Name: "network-domain-channel",
ResourceClaimTemplateName: ptr.To("network-template"),
},
},
},
MainContainer: &corev1.Container{
Image: "test-image",
Command: []string{"python3"},
Args: []string{"-m", "dynamo.worker"},
},
}, },
}, },
}, },
...@@ -4909,14 +4916,12 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) { ...@@ -4909,14 +4916,12 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
}, },
{ {
name: "component without resource claims", name: "component without resource claims",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{ component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{ ComponentType: commonconsts.ComponentTypeFrontend,
ComponentType: commonconsts.ComponentTypeFrontend, Resources: &common.Resources{
Resources: &common.Resources{ Requests: &common.ResourceItem{
Requests: &common.ResourceItem{ CPU: "1",
CPU: "1", Memory: "1Gi",
Memory: "1Gi",
},
}, },
}, },
}, },
......
...@@ -39,7 +39,8 @@ helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-$ ...@@ -39,7 +39,8 @@ helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-$
helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace
``` ```
For namespace-restricted installations (shared clusters): For namespace-restricted installations (shared clusters), you'll need to install the Dynamo platform in each namespace you want to deploy to.
Namespace restriction is enabled by setting the `dynamo-operator.namespaceRestriction.enabled` flag to `true`.
```bash ```bash
helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz \ helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz \
--namespace ${NAMESPACE} \ --namespace ${NAMESPACE} \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment