Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
5d90e530
Unverified
Commit
5d90e530
Authored Oct 22, 2025 by Rohan Varma
Committed by GitHub on Oct 22, 2025
Browse files
fix: mpi flow and add resourceClaim (#3446)
Signed-off-by: Rohan Varma <rohanv@nvidia.com>
parent
2626126a
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing 11 changed files with 380 additions and 14 deletions
+380
-14
deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml
...crds/templates/nvidia.com_dynamocomponentdeployments.yaml
+20
-0
deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml
...elm/crds/templates/nvidia.com_dynamographdeployments.yaml
+20
-0
deploy/cloud/operator/api/dynamo/common/common.go
deploy/cloud/operator/api/dynamo/common/common.go
+3
-2
deploy/cloud/operator/api/dynamo/common/zz_generated.deepcopy.go
...cloud/operator/api/dynamo/common/zz_generated.deepcopy.go
+5
-0
deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml
...nfig/crd/bases/nvidia.com_dynamocomponentdeployments.yaml
+20
-0
deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml
...r/config/crd/bases/nvidia.com_dynamographdeployments.yaml
+20
-0
deploy/cloud/operator/internal/controller_common/resource.go
deploy/cloud/operator/internal/controller_common/resource.go
+6
-0
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
+2
-2
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
+10
-10
deploy/cloud/operator/internal/dynamo/graph.go
deploy/cloud/operator/internal/dynamo/graph.go
+9
-0
deploy/cloud/operator/internal/dynamo/graph_test.go
deploy/cloud/operator/internal/dynamo/graph_test.go
+265
-0
No files found.
deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml
View file @
5d90e530
...
@@ -10173,6 +10173,26 @@ spec:
...
@@ -10173,6 +10173,26 @@ spec:
Resources requested and limits for this component, including CPU, memory,
Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources.
GPUs/devices, and any runtime-specific resources.
properties:
properties:
claims:
items:
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
properties:
name:
description: |-
Name must match the name of one entry in pod.spec.resourceClaims of
the Pod where this field is used. It makes that resource available
inside a container.
type: string
request:
description: |-
Request is the name chosen for a request in the referenced claim.
If empty, everything from the claim is made available, otherwise
only the result of this request.
type: string
required:
- name
type: object
type: array
limits:
limits:
properties:
properties:
cpu:
cpu:
...
...
deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml
View file @
5d90e530
...
@@ -10307,6 +10307,26 @@ spec:
...
@@ -10307,6 +10307,26 @@ spec:
Resources requested and limits for this component, including CPU, memory,
Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources.
GPUs/devices, and any runtime-specific resources.
properties:
properties:
claims:
items:
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
properties:
name:
description: |-
Name must match the name of one entry in pod.spec.resourceClaims of
the Pod where this field is used. It makes that resource available
inside a container.
type: string
request:
description: |-
Request is the name chosen for a request in the referenced claim.
If empty, everything from the claim is made available, otherwise
only the result of this request.
type: string
required:
- name
type: object
type: array
limits:
limits:
properties:
properties:
cpu:
cpu:
...
...
deploy/cloud/operator/api/dynamo/common/common.go
View file @
5d90e530
...
@@ -32,8 +32,9 @@ type ResourceItem struct {
...
@@ -32,8 +32,9 @@ type ResourceItem struct {
}
}
type
Resources
struct
{
type
Resources
struct
{
Requests
*
ResourceItem
`json:"requests,omitempty"`
Requests
*
ResourceItem
`json:"requests,omitempty"`
Limits
*
ResourceItem
`json:"limits,omitempty"`
Limits
*
ResourceItem
`json:"limits,omitempty"`
Claims
[]
corev1
.
ResourceClaim
`json:"claims,omitempty"`
}
}
type
DeploymentTargetHPAConf
struct
{
type
DeploymentTargetHPAConf
struct
{
...
...
deploy/cloud/operator/api/dynamo/common/zz_generated.deepcopy.go
View file @
5d90e530
...
@@ -190,6 +190,11 @@ func (in *Resources) DeepCopyInto(out *Resources) {
...
@@ -190,6 +190,11 @@ func (in *Resources) DeepCopyInto(out *Resources) {
*
out
=
new
(
ResourceItem
)
*
out
=
new
(
ResourceItem
)
(
*
in
)
.
DeepCopyInto
(
*
out
)
(
*
in
)
.
DeepCopyInto
(
*
out
)
}
}
if
in
.
Claims
!=
nil
{
in
,
out
:=
&
in
.
Claims
,
&
out
.
Claims
*
out
=
make
([]
v1
.
ResourceClaim
,
len
(
*
in
))
copy
(
*
out
,
*
in
)
}
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resources.
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Resources.
...
...
deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml
View file @
5d90e530
...
@@ -10173,6 +10173,26 @@ spec:
...
@@ -10173,6 +10173,26 @@ spec:
Resources requested and limits for this component, including CPU, memory,
Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources.
GPUs/devices, and any runtime-specific resources.
properties:
properties:
claims:
items:
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
properties:
name:
description: |-
Name must match the name of one entry in pod.spec.resourceClaims of
the Pod where this field is used. It makes that resource available
inside a container.
type: string
request:
description: |-
Request is the name chosen for a request in the referenced claim.
If empty, everything from the claim is made available, otherwise
only the result of this request.
type: string
required:
- name
type: object
type: array
limits:
limits:
properties:
properties:
cpu:
cpu:
...
...
deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml
View file @
5d90e530
...
@@ -10307,6 +10307,26 @@ spec:
...
@@ -10307,6 +10307,26 @@ spec:
Resources requested and limits for this component, including CPU, memory,
Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources.
GPUs/devices, and any runtime-specific resources.
properties:
properties:
claims:
items:
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
properties:
name:
description: |-
Name must match the name of one entry in pod.spec.resourceClaims of
the Pod where this field is used. It makes that resource available
inside a container.
type: string
request:
description: |-
Request is the name chosen for a request in the referenced claim.
If empty, everything from the claim is made available, otherwise
only the result of this request.
type: string
required:
- name
type: object
type: array
limits:
limits:
properties:
properties:
cpu:
cpu:
...
...
deploy/cloud/operator/internal/controller_common/resource.go
View file @
5d90e530
...
@@ -468,6 +468,12 @@ func GetResourcesConfig(resources *common.Resources) (*corev1.ResourceRequiremen
...
@@ -468,6 +468,12 @@ func GetResourcesConfig(resources *common.Resources) (*corev1.ResourceRequiremen
currentResources
.
Requests
[
corev1
.
ResourceName
(
k
)]
=
q
currentResources
.
Requests
[
corev1
.
ResourceName
(
k
)]
=
q
}
}
}
}
if
resources
.
Claims
!=
nil
{
if
currentResources
.
Claims
==
nil
{
currentResources
.
Claims
=
make
([]
corev1
.
ResourceClaim
,
0
)
}
currentResources
.
Claims
=
append
(
currentResources
.
Claims
,
resources
.
Claims
...
)
}
return
currentResources
,
nil
return
currentResources
,
nil
}
}
...
...
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
View file @
5d90e530
...
@@ -143,12 +143,12 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
...
@@ -143,12 +143,12 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
// Build mpirun command with explicit SSH configuration and environment variables
// Build mpirun command with explicit SSH configuration and environment variables
// Wrap the entire command (trtllm-llmapi-launch + original command) in bash -c for proper shell interpretation
// Wrap the entire command (trtllm-llmapi-launch + original command) in bash -c for proper shell interpretation
wrappedCommand
:=
fmt
.
Sprintf
(
"bash -c '
source /opt/dynamo/venv/bin/activate &&
trtllm-llmapi-launch %s'"
,
originalCommand
)
wrappedCommand
:=
fmt
.
Sprintf
(
"bash -c 'trtllm-llmapi-launch %s'"
,
originalCommand
)
// Generate environment variable flags for mpirun
// Generate environment variable flags for mpirun
envVarsStr
:=
generateEnvVarFlags
(
container
.
Env
)
envVarsStr
:=
generateEnvVarFlags
(
container
.
Env
)
mpirunCmd
:=
fmt
.
Sprintf
(
"mpirun --oversubscribe -n %d -H %s --mca pml ob1 --mca plm_rsh_args
\"
-p %d -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
%s %s"
,
mpirunCmd
:=
fmt
.
Sprintf
(
"mpirun
--allow-run-as-root
--oversubscribe -n %d -H %s --mca pml ob1 --mca plm_rsh_args
\"
-p %d -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
%s %s"
,
totalGPUs
,
totalGPUs
,
workerHosts
,
workerHosts
,
commonconsts
.
MpiRunSshPort
,
commonconsts
.
MpiRunSshPort
,
...
...
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
View file @
5d90e530
...
@@ -62,7 +62,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
...
@@ -62,7 +62,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{
Name
:
mpiRunSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
{
Name
:
mpiRunSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
},
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c '
source /opt/dynamo/venv/bin/activate &&
trtllm-llmapi-launch python3 --model test'"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun
--allow-run-as-root
--oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
},
expectedEnv
:
[]
corev1
.
EnvVar
{
expectedEnv
:
[]
corev1
.
EnvVar
{
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
},
},
...
@@ -116,7 +116,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
...
@@ -116,7 +116,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{
Name
:
mpiRunSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
{
Name
:
mpiRunSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
},
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(LWS_LEADER_ADDRESS),$(LWS_WORKER_1_ADDRESS) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c '
source /opt/dynamo/venv/bin/activate &&
trtllm-llmapi-launch python3 --model test'"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun
--allow-run-as-root
--oversubscribe -n 2 -H $(LWS_LEADER_ADDRESS),$(LWS_WORKER_1_ADDRESS) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
},
expectedEnv
:
[]
corev1
.
EnvVar
{
expectedEnv
:
[]
corev1
.
EnvVar
{
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
},
},
...
@@ -565,7 +565,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -565,7 +565,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"python3"
,
"--model"
,
"test"
},
initialArgs
:
[]
string
{
"python3"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c '
source /opt/dynamo/venv/bin/activate &&
trtllm-llmapi-launch python3 --model test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun
--allow-run-as-root
--oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
,
},
},
{
{
name
:
"Leader with command and no GPU resources"
,
name
:
"Leader with command and no GPU resources"
,
...
@@ -575,7 +575,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -575,7 +575,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
component
:
&
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{},
component
:
&
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{},
initialArgs
:
[]
string
{},
initialArgs
:
[]
string
{},
initialCommand
:
[]
string
{
"python"
,
"-m"
,
"worker"
},
initialCommand
:
[]
string
{
"python"
,
"-m"
,
"worker"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 0 -H $(LWS_LEADER_ADDRESS),$(LWS_WORKER_1_ADDRESS) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c '
source /opt/dynamo/venv/bin/activate &&
trtllm-llmapi-launch python -m worker'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun
--allow-run-as-root
--oversubscribe -n 0 -H $(LWS_LEADER_ADDRESS),$(LWS_WORKER_1_ADDRESS) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python -m worker'"
,
},
},
{
{
name
:
"Leader with both command and args (shell command - args take precedence)"
,
name
:
"Leader with both command and args (shell command - args take precedence)"
,
...
@@ -591,7 +591,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -591,7 +591,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"launch"
,
"--config"
,
"test.yaml"
},
initialArgs
:
[]
string
{
"launch"
,
"--config"
,
"test.yaml"
},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c '
source /opt/dynamo/venv/bin/activate &&
trtllm-llmapi-launch launch --config test.yaml'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun
--allow-run-as-root
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch launch --config test.yaml'"
,
},
},
{
{
name
:
"Leader with python command and args (combined)"
,
name
:
"Leader with python command and args (combined)"
,
...
@@ -607,7 +607,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -607,7 +607,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialCommand
:
[]
string
{
"python3"
},
initialCommand
:
[]
string
{
"python3"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c '
source /opt/dynamo/venv/bin/activate &&
trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun
--allow-run-as-root
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path test'"
,
},
},
{
{
name
:
"Leader with python module command and separate args"
,
name
:
"Leader with python module command and separate args"
,
...
@@ -623,7 +623,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -623,7 +623,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"--model-path"
,
"Qwen/Qwen3-0.6B"
,
"--served-model-name"
,
"Qwen/Qwen3-0.6B"
,
"--disaggregation-mode"
,
"prefill"
},
initialArgs
:
[]
string
{
"--model-path"
,
"Qwen/Qwen3-0.6B"
,
"--served-model-name"
,
"Qwen/Qwen3-0.6B"
,
"--disaggregation-mode"
,
"prefill"
},
initialCommand
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.trtllm"
},
initialCommand
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.trtllm"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c '
source /opt/dynamo/venv/bin/activate &&
trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --disaggregation-mode prefill'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun
--allow-run-as-root
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --disaggregation-mode prefill'"
,
},
},
{
{
name
:
"Leader with absolute path python command"
,
name
:
"Leader with absolute path python command"
,
...
@@ -639,7 +639,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -639,7 +639,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialCommand
:
[]
string
{
"/usr/bin/python3.8"
},
initialCommand
:
[]
string
{
"/usr/bin/python3.8"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c '
source /opt/dynamo/venv/bin/activate &&
trtllm-llmapi-launch /usr/bin/python3.8 -m dynamo.trtllm --model-path test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun
--allow-run-as-root
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch /usr/bin/python3.8 -m dynamo.trtllm --model-path test'"
,
},
},
{
{
name
:
"Leader with all environment variables forwarded"
,
name
:
"Leader with all environment variables forwarded"
,
...
@@ -655,7 +655,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -655,7 +655,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c '
source /opt/dynamo/venv/bin/activate &&
trtllm-llmapi-launch serve --model test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun
--allow-run-as-root
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch serve --model test'"
,
},
},
{
{
name
:
"Leader with overlapping environment variables (deduplication test)"
,
name
:
"Leader with overlapping environment variables (deduplication test)"
,
...
@@ -671,7 +671,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -671,7 +671,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x CUSTOM_VAR -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c '
source /opt/dynamo/venv/bin/activate &&
trtllm-llmapi-launch serve --model test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun
--allow-run-as-root
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x CUSTOM_VAR -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch serve --model test'"
,
},
},
}
}
...
...
deploy/cloud/operator/internal/dynamo/graph.go
View file @
5d90e530
...
@@ -759,6 +759,14 @@ func GenerateBasePodSpec(
...
@@ -759,6 +759,14 @@ func GenerateBasePodSpec(
maps
.
Copy
(
container
.
Resources
.
Limits
,
overrideResources
.
Limits
)
maps
.
Copy
(
container
.
Resources
.
Limits
,
overrideResources
.
Limits
)
}
}
// Claims
if
overrideResources
!=
nil
&&
len
(
overrideResources
.
Claims
)
>
0
{
if
container
.
Resources
.
Claims
==
nil
{
container
.
Resources
.
Claims
=
[]
corev1
.
ResourceClaim
{}
}
container
.
Resources
.
Claims
=
append
(
container
.
Resources
.
Claims
,
overrideResources
.
Claims
...
)
}
shouldDisableImagePullSecret
:=
component
.
Annotations
[
commonconsts
.
KubeAnnotationDisableImagePullSecretDiscovery
]
==
commonconsts
.
KubeLabelValueTrue
shouldDisableImagePullSecret
:=
component
.
Annotations
[
commonconsts
.
KubeAnnotationDisableImagePullSecretDiscovery
]
==
commonconsts
.
KubeLabelValueTrue
imagePullSecrets
:=
[]
corev1
.
LocalObjectReference
{}
imagePullSecrets
:=
[]
corev1
.
LocalObjectReference
{}
...
@@ -846,6 +854,7 @@ func GenerateBasePodSpec(
...
@@ -846,6 +854,7 @@ func GenerateBasePodSpec(
podSpec
.
Containers
=
append
(
podSpec
.
Containers
,
container
)
podSpec
.
Containers
=
append
(
podSpec
.
Containers
,
container
)
podSpec
.
Volumes
=
append
(
podSpec
.
Volumes
,
volumes
...
)
podSpec
.
Volumes
=
append
(
podSpec
.
Volumes
,
volumes
...
)
podSpec
.
ImagePullSecrets
=
append
(
podSpec
.
ImagePullSecrets
,
imagePullSecrets
...
)
podSpec
.
ImagePullSecrets
=
append
(
podSpec
.
ImagePullSecrets
,
imagePullSecrets
...
)
backend
.
UpdatePodSpec
(
&
podSpec
,
numberOfNodes
,
role
,
component
,
serviceName
)
backend
.
UpdatePodSpec
(
&
podSpec
,
numberOfNodes
,
role
,
component
,
serviceName
)
return
controller_common
.
CanonicalizePodSpec
(
&
podSpec
),
nil
return
controller_common
.
CanonicalizePodSpec
(
&
podSpec
),
nil
}
}
...
...
deploy/cloud/operator/internal/dynamo/graph_test.go
View file @
5d90e530
...
@@ -4751,6 +4751,271 @@ func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) {
...
@@ -4751,6 +4751,271 @@ func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) {
}
}
}
}
func
TestGenerateBasePodSpec_ResourceClaims
(
t
*
testing
.
T
)
{
secretsRetriever
:=
&
mockSecretsRetriever
{}
controllerConfig
:=
controller_common
.
Config
{}
tests
:=
[]
struct
{
name
string
component
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
expectError
bool
expectedResourceClaims
[]
corev1
.
ResourceClaim
expectedPodClaims
[]
corev1
.
PodResourceClaim
expectedVolumes
[]
corev1
.
Volume
}{
{
name
:
"component with resource claims"
,
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ComponentType
:
commonconsts
.
ComponentTypeWorker
,
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
CPU
:
"130"
,
Memory
:
"800Gi"
,
},
Limits
:
&
common
.
ResourceItem
{
CPU
:
"130"
,
Memory
:
"800Gi"
,
GPU
:
"4"
,
},
Claims
:
[]
corev1
.
ResourceClaim
{
{
Name
:
"compute-domain-channel"
,
},
},
},
ExtraPodSpec
:
&
common
.
ExtraPodSpec
{
PodSpec
:
&
corev1
.
PodSpec
{
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"compute-domain-channel"
,
ResourceClaimTemplateName
:
ptr
.
To
(
"trtllm-test-compute-domain-channel"
),
},
},
Volumes
:
[]
corev1
.
Volume
{
{
Name
:
"model-storage"
,
VolumeSource
:
corev1
.
VolumeSource
{
PersistentVolumeClaim
:
&
corev1
.
PersistentVolumeClaimVolumeSource
{
ClaimName
:
"dynamo-pvc"
,
},
},
},
},
},
MainContainer
:
&
corev1
.
Container
{
Image
:
"rohanv672/dynamo:v0.5.1-trtllm"
,
Args
:
[]
string
{
"python3 -m dynamo.trtllm --model-path /data/deepseek-r1 --served-model-name deepseek-ai/DeepSeek-R1 --extra-engine-args /data/engine_configs/wide_ep_agg.yaml"
,
},
Command
:
[]
string
{
"/bin/sh"
,
"-c"
},
VolumeMounts
:
[]
corev1
.
VolumeMount
{
{
Name
:
"model-storage"
,
MountPath
:
"/data"
,
},
},
},
},
},
},
expectError
:
false
,
expectedResourceClaims
:
[]
corev1
.
ResourceClaim
{
{
Name
:
"compute-domain-channel"
,
},
},
expectedPodClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"compute-domain-channel"
,
ResourceClaimTemplateName
:
ptr
.
To
(
"trtllm-test-compute-domain-channel"
),
},
},
expectedVolumes
:
[]
corev1
.
Volume
{
{
Name
:
"model-storage"
,
VolumeSource
:
corev1
.
VolumeSource
{
PersistentVolumeClaim
:
&
corev1
.
PersistentVolumeClaimVolumeSource
{
ClaimName
:
"dynamo-pvc"
,
},
},
},
{
Name
:
"shared-memory"
,
VolumeSource
:
corev1
.
VolumeSource
{
EmptyDir
:
&
corev1
.
EmptyDirVolumeSource
{
Medium
:
corev1
.
StorageMediumMemory
,
SizeLimit
:
func
()
*
resource
.
Quantity
{
q
:=
resource
.
MustParse
(
commonconsts
.
DefaultSharedMemorySize
);
return
&
q
}(),
},
},
},
},
},
{
name
:
"component with multiple resource claims"
,
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ComponentType
:
commonconsts
.
ComponentTypeWorker
,
Resources
:
&
common
.
Resources
{
Claims
:
[]
corev1
.
ResourceClaim
{
{
Name
:
"compute-domain-channel"
,
},
{
Name
:
"network-domain-channel"
,
},
},
},
ExtraPodSpec
:
&
common
.
ExtraPodSpec
{
PodSpec
:
&
corev1
.
PodSpec
{
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"compute-domain-channel"
,
ResourceClaimTemplateName
:
ptr
.
To
(
"compute-template"
),
},
{
Name
:
"network-domain-channel"
,
ResourceClaimTemplateName
:
ptr
.
To
(
"network-template"
),
},
},
},
MainContainer
:
&
corev1
.
Container
{
Image
:
"test-image"
,
Command
:
[]
string
{
"python3"
},
Args
:
[]
string
{
"-m"
,
"dynamo.worker"
},
},
},
},
},
expectError
:
false
,
expectedResourceClaims
:
[]
corev1
.
ResourceClaim
{
{
Name
:
"compute-domain-channel"
,
},
{
Name
:
"network-domain-channel"
,
},
},
expectedPodClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"compute-domain-channel"
,
ResourceClaimTemplateName
:
ptr
.
To
(
"compute-template"
),
},
{
Name
:
"network-domain-channel"
,
ResourceClaimTemplateName
:
ptr
.
To
(
"network-template"
),
},
},
},
{
name
:
"component without resource claims"
,
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ComponentType
:
commonconsts
.
ComponentTypeFrontend
,
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
CPU
:
"1"
,
Memory
:
"1Gi"
,
},
},
},
},
expectError
:
false
,
expectedResourceClaims
:
nil
,
expectedPodClaims
:
nil
,
},
}
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
podSpec
,
err
:=
GenerateBasePodSpec
(
tt
.
component
,
BackendFrameworkTRTLLM
,
secretsRetriever
,
"test-deployment"
,
"default"
,
RoleMain
,
1
,
controllerConfig
,
commonconsts
.
MultinodeDeploymentTypeGrove
,
"test-service"
,
)
if
tt
.
expectError
{
if
err
==
nil
{
t
.
Errorf
(
"GenerateBasePodSpec() expected error, got nil"
)
}
return
}
if
err
!=
nil
{
t
.
Errorf
(
"GenerateBasePodSpec() unexpected error: %v"
,
err
)
return
}
// Check containers exist
if
len
(
podSpec
.
Containers
)
==
0
{
t
.
Errorf
(
"GenerateBasePodSpec() no containers found"
)
return
}
container
:=
podSpec
.
Containers
[
0
]
// Check resource claims in container resources using reflect.DeepEqual
if
!
reflect
.
DeepEqual
(
container
.
Resources
.
Claims
,
tt
.
expectedResourceClaims
)
{
t
.
Errorf
(
"GenerateBasePodSpec() resource claims mismatch:
\n
got: %+v
\n
want: %+v"
,
container
.
Resources
.
Claims
,
tt
.
expectedResourceClaims
)
}
// Check pod resource claims using reflect.DeepEqual
if
!
reflect
.
DeepEqual
(
podSpec
.
ResourceClaims
,
tt
.
expectedPodClaims
)
{
t
.
Errorf
(
"GenerateBasePodSpec() pod resource claims mismatch:
\n
got: %+v
\n
want: %+v"
,
podSpec
.
ResourceClaims
,
tt
.
expectedPodClaims
)
}
// Check expected volumes if specified using reflect.DeepEqual
if
tt
.
expectedVolumes
!=
nil
{
if
!
reflect
.
DeepEqual
(
podSpec
.
Volumes
,
tt
.
expectedVolumes
)
{
t
.
Errorf
(
"GenerateBasePodSpec() volumes mismatch:
\n
got: %+v
\n
want: %+v"
,
podSpec
.
Volumes
,
tt
.
expectedVolumes
)
}
}
// Verify resource requests and limits are properly set when claims are present
if
len
(
tt
.
expectedResourceClaims
)
>
0
{
// Check that standard resources are still processed correctly
if
tt
.
component
.
Resources
!=
nil
{
if
tt
.
component
.
Resources
.
Requests
!=
nil
{
if
tt
.
component
.
Resources
.
Requests
.
CPU
!=
""
{
if
container
.
Resources
.
Requests
==
nil
{
t
.
Errorf
(
"GenerateBasePodSpec() expected CPU request to be set"
)
}
else
if
cpu
,
exists
:=
container
.
Resources
.
Requests
[
corev1
.
ResourceCPU
];
!
exists
||
cpu
.
IsZero
()
{
t
.
Errorf
(
"GenerateBasePodSpec() expected CPU request to be set"
)
}
}
if
tt
.
component
.
Resources
.
Requests
.
Memory
!=
""
{
if
container
.
Resources
.
Requests
==
nil
{
t
.
Errorf
(
"GenerateBasePodSpec() expected Memory request to be set"
)
}
else
if
memory
,
exists
:=
container
.
Resources
.
Requests
[
corev1
.
ResourceMemory
];
!
exists
||
memory
.
IsZero
()
{
t
.
Errorf
(
"GenerateBasePodSpec() expected Memory request to be set"
)
}
}
}
if
tt
.
component
.
Resources
.
Limits
!=
nil
{
if
tt
.
component
.
Resources
.
Limits
.
GPU
!=
""
{
if
container
.
Resources
.
Limits
==
nil
{
t
.
Errorf
(
"GenerateBasePodSpec() expected GPU limit to be set"
)
}
else
if
gpu
,
exists
:=
container
.
Resources
.
Limits
[
corev1
.
ResourceName
(
"nvidia.com/gpu"
)];
!
exists
||
gpu
.
IsZero
()
{
t
.
Errorf
(
"GenerateBasePodSpec() expected GPU limit to be set"
)
}
}
}
}
}
})
}
}
func
TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport
(
t
*
testing
.
T
)
{
func
TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport
(
t
*
testing
.
T
)
{
secretsRetriever
:=
&
mockSecretsRetriever
{}
secretsRetriever
:=
&
mockSecretsRetriever
{}
controllerConfig
:=
controller_common
.
Config
{}
controllerConfig
:=
controller_common
.
Config
{}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment