Unverified Commit 6e213d90 authored by Julien Mancuso, committed by GitHub
Browse files

fix: fix operator tests (#3904)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent 59535682
......@@ -734,11 +734,15 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
MainContainer: &corev1.Container{
Image: "test-image:latest",
Command: []string{
"sh",
"-c",
"some",
"dynamo",
"command",
},
Args: []string{
"some dynamo command",
"--tensor-parallel-size",
"4",
"--pipeline-parallel-size",
"1",
},
Env: []corev1.EnvVar{
{
......@@ -817,8 +821,8 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
{
Name: commonconsts.MainContainerName,
Image: "test-image:latest",
Command: []string{"sh", "-c"},
Args: []string{"ray start --head --port=6379 && some dynamo command"},
Command: []string{"/bin/sh", "-c"},
Args: []string{"ray start --head --port=6379 && some dynamo command --tensor-parallel-size 4 --pipeline-parallel-size 1"},
Env: []corev1.EnvVar{
{Name: "DYN_NAMESPACE", Value: "default"},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
......@@ -931,7 +935,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
{
Name: commonconsts.MainContainerName,
Image: "test-image:latest",
Command: []string{"sh", "-c"},
Command: []string{"/bin/sh", "-c"},
Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
Env: []corev1.EnvVar{
{Name: "DYN_NAMESPACE", Value: "default"},
......
......@@ -106,7 +106,7 @@ func injectRayDistributedLaunchFlags(container *corev1.Container, role Role, ser
leaderHostname := multinodeDeployer.GetLeaderHostname(serviceName)
container.Args = []string{fmt.Sprintf("ray start --address=%s:%s --block", leaderHostname, VLLMPort)}
}
container.Command = []string{"sh", "-c"} // ensure cmd is a shell
container.Command = []string{"/bin/sh", "-c"} // ensure cmd is a shell
}
func injectDataParallelLaunchFlags(container *corev1.Container, role Role, serviceName string, multinodeDeployer MultinodeDeployer) {
......
......@@ -303,7 +303,7 @@ func TestUpdateVLLMMultinodeArgs(t *testing.T) {
name: "leader prepends ray start --head",
role: RoleLeader,
multinodeDeployer: &GroveMultinodeDeployer{},
initialContainer: &corev1.Container{Args: []string{"python3", "-m", "dynamo.vllm", tensorParallelSizeFlag, "16"}, Resources: corev1.ResourceRequirements{Limits: corev1.ResourceList{corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("8")}}},
initialContainer: &corev1.Container{Command: []string{"python3"}, Args: []string{"-m", "dynamo.vllm", tensorParallelSizeFlag, "16"}, Resources: corev1.ResourceRequirements{Limits: corev1.ResourceList{corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("8")}}},
expectedArgs: []string{fmt.Sprintf("ray start --head --port=%s && python3 -m dynamo.vllm %s 16", VLLMPort, tensorParallelSizeFlag)},
},
{
......
......@@ -2417,11 +2417,17 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
MainContainer: &corev1.Container{
Image: "worker-image",
Command: []string{
"/bin/sh",
"-c",
"python3",
"-m",
"dynamo.vllm",
},
Args: []string{
"python3 -m dynamo.vllm --custom-flag custom-value",
"--custom-flag",
"custom-value",
"--tensor-parallel-size",
"4",
"--pipeline-parallel-size",
"1",
},
StartupProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
......@@ -2598,7 +2604,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
"-c",
},
Args: []string{
"ray start --head --port=6379 && python3 -m dynamo.vllm --custom-flag custom-value",
"ray start --head --port=6379 && python3 -m dynamo.vllm --custom-flag custom-value --tensor-parallel-size 4 --pipeline-parallel-size 1",
},
Ports: []corev1.ContainerPort{
{
......@@ -3345,7 +3351,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
ComponentType: commonconsts.ComponentTypeWorker,
ExtraPodSpec: &common.ExtraPodSpec{
MainContainer: &corev1.Container{
Args: []string{"python3", "-m", "dynamo.vllm"},
Args: []string{"python3", "-m", "dynamo.vllm", "--tensor-parallel-size", "4", "--pipeline-parallel-size", "1"},
},
},
},
......@@ -3359,6 +3365,11 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
name: "VLLM multinode worker",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
ExtraPodSpec: &common.ExtraPodSpec{
MainContainer: &corev1.Container{
Args: []string{"python3", "-m", "dynamo.vllm", "--tensor-parallel-size", "4", "--pipeline-parallel-size", "1"},
},
},
},
backendFramework: BackendFrameworkVLLM,
role: RoleWorker,
......@@ -4757,7 +4768,7 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
tests := []struct {
name string
component *v1alpha1.DynamoComponentDeploymentOverridesSpec
component *v1alpha1.DynamoComponentDeploymentSharedSpec
expectError bool
expectedResourceClaims []corev1.ResourceClaim
expectedPodClaims []corev1.PodResourceClaim
......@@ -4765,55 +4776,53 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
}{
{
name: "component with resource claims",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "130",
Memory: "800Gi",
},
Limits: &common.ResourceItem{
CPU: "130",
Memory: "800Gi",
GPU: "4",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "130",
Memory: "800Gi",
},
Limits: &common.ResourceItem{
CPU: "130",
Memory: "800Gi",
GPU: "4",
},
Claims: []corev1.ResourceClaim{
{
Name: "compute-domain-channel",
},
Claims: []corev1.ResourceClaim{
},
},
ExtraPodSpec: &common.ExtraPodSpec{
PodSpec: &corev1.PodSpec{
ResourceClaims: []corev1.PodResourceClaim{
{
Name: "compute-domain-channel",
Name: "compute-domain-channel",
ResourceClaimTemplateName: ptr.To("trtllm-test-compute-domain-channel"),
},
},
},
ExtraPodSpec: &common.ExtraPodSpec{
PodSpec: &corev1.PodSpec{
ResourceClaims: []corev1.PodResourceClaim{
{
Name: "compute-domain-channel",
ResourceClaimTemplateName: ptr.To("trtllm-test-compute-domain-channel"),
},
},
Volumes: []corev1.Volume{
{
Name: "model-storage",
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "dynamo-pvc",
},
Volumes: []corev1.Volume{
{
Name: "model-storage",
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "dynamo-pvc",
},
},
},
},
MainContainer: &corev1.Container{
Image: "rohanv672/dynamo:v0.5.1-trtllm",
Args: []string{
"python3 -m dynamo.trtllm --model-path /data/deepseek-r1 --served-model-name deepseek-ai/DeepSeek-R1 --extra-engine-args /data/engine_configs/wide_ep_agg.yaml",
},
Command: []string{"/bin/sh", "-c"},
VolumeMounts: []corev1.VolumeMount{
{
Name: "model-storage",
MountPath: "/data",
},
},
MainContainer: &corev1.Container{
Image: "rohanv672/dynamo:v0.5.1-trtllm",
Args: []string{
"python3 -m dynamo.trtllm --model-path /data/deepseek-r1 --served-model-name deepseek-ai/DeepSeek-R1 --extra-engine-args /data/engine_configs/wide_ep_agg.yaml",
},
Command: []string{"/bin/sh", "-c"},
VolumeMounts: []corev1.VolumeMount{
{
Name: "model-storage",
MountPath: "/data",
},
},
},
......@@ -4853,37 +4862,35 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
},
{
name: "component with multiple resource claims",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
Resources: &common.Resources{
Claims: []corev1.ResourceClaim{
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
Resources: &common.Resources{
Claims: []corev1.ResourceClaim{
{
Name: "compute-domain-channel",
},
{
Name: "network-domain-channel",
},
},
},
ExtraPodSpec: &common.ExtraPodSpec{
PodSpec: &corev1.PodSpec{
ResourceClaims: []corev1.PodResourceClaim{
{
Name: "compute-domain-channel",
Name: "compute-domain-channel",
ResourceClaimTemplateName: ptr.To("compute-template"),
},
{
Name: "network-domain-channel",
Name: "network-domain-channel",
ResourceClaimTemplateName: ptr.To("network-template"),
},
},
},
ExtraPodSpec: &common.ExtraPodSpec{
PodSpec: &corev1.PodSpec{
ResourceClaims: []corev1.PodResourceClaim{
{
Name: "compute-domain-channel",
ResourceClaimTemplateName: ptr.To("compute-template"),
},
{
Name: "network-domain-channel",
ResourceClaimTemplateName: ptr.To("network-template"),
},
},
},
MainContainer: &corev1.Container{
Image: "test-image",
Command: []string{"python3"},
Args: []string{"-m", "dynamo.worker"},
},
MainContainer: &corev1.Container{
Image: "test-image",
Command: []string{"python3"},
Args: []string{"-m", "dynamo.worker"},
},
},
},
......@@ -4909,14 +4916,12 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
},
{
name: "component without resource claims",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeFrontend,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
Memory: "1Gi",
},
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeFrontend,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
Memory: "1Gi",
},
},
},
......
......@@ -39,7 +39,8 @@ helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-$
helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace
```
For namespace-restricted installations (shared clusters):
For namespace-restricted installations (shared clusters), you'll need to install the Dynamo platform in each namespace you want to deploy to.
Namespace restriction is enabled by setting the `dynamo-operator.namespaceRestriction.enabled` flag to `true`.
```bash
helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz \
--namespace ${NAMESPACE} \
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment