Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
df973acd
Unverified
Commit
df973acd
authored
Oct 02, 2025
by
Julien Mancuso
Committed by
GitHub
Oct 02, 2025
Browse files
fix: fix trtllm multinode command (#3373)
Signed-off-by:
Julien Mancuso
<
jmancuso@nvidia.com
>
parent
d20a292b
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
116 additions
and
7 deletions
+116
-7
deploy/cloud/operator/internal/dynamo/backend_sglang.go
deploy/cloud/operator/internal/dynamo/backend_sglang.go
+2
-1
deploy/cloud/operator/internal/dynamo/backend_sglang_test.go
deploy/cloud/operator/internal/dynamo/backend_sglang_test.go
+46
-3
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
+12
-1
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
+56
-2
No files found.
deploy/cloud/operator/internal/dynamo/backend_sglang.go
View file @
df973acd
...
...
@@ -22,7 +22,8 @@ func isPythonCommand(cmd string) bool {
return
true
}
// Match python with version numbers like python3.11, python2.7, etc.
matched
,
_
:=
regexp
.
MatchString
(
`^python\d+(\.\d+)*$`
,
cmd
)
// Also support absolute paths like /usr/bin/python3.8, /opt/python/bin/python3.11
matched
,
_
:=
regexp
.
MatchString
(
`^(.*/)?(python\d*(\.\d+)*)$`
,
cmd
)
return
matched
}
...
...
deploy/cloud/operator/internal/dynamo/backend_sglang_test.go
View file @
df973acd
...
...
@@ -117,6 +117,28 @@ func TestSGLangBackend_PythonCommandInjection(t *testing.T) {
expectedArgs
:
[]
string
{
"-m"
,
"dynamo.sglang"
,
"--dist-init-addr"
,
"leader.example.com:29500"
,
"--nnodes"
,
"2"
,
"--node-rank"
,
"1"
},
description
:
"Python version variants should be recognized"
,
},
{
name
:
"absolute path python command supported"
,
numberOfNodes
:
2
,
role
:
RoleWorker
,
multinodeDeployer
:
&
MockSimpleDeployer
{},
initialCommand
:
[]
string
{
"/usr/bin/python3.8"
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.sglang"
,
"--model"
,
"llama"
},
expectedCommand
:
[]
string
{
"/usr/bin/python3.8"
},
expectedArgs
:
[]
string
{
"-m"
,
"dynamo.sglang"
,
"--model"
,
"llama"
,
"--dist-init-addr"
,
"leader.example.com:29500"
,
"--nnodes"
,
"2"
,
"--node-rank"
,
"1"
},
description
:
"Absolute path Python commands should be recognized"
,
},
{
name
:
"pyenv shims python command supported"
,
numberOfNodes
:
2
,
role
:
RoleWorker
,
multinodeDeployer
:
&
MockShellDeployer
{},
initialCommand
:
[]
string
{
"/home/user/.pyenv/shims/python3.9"
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.sglang"
},
expectedCommand
:
[]
string
{
"sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"exec /home/user/.pyenv/shims/python3.9 -m dynamo.sglang --dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank $(WORKER_INDEX)"
},
description
:
"Pyenv shims Python paths should be recognized and wrapped with shell"
,
},
{
name
:
"python command with module in command array - simple deployer"
,
numberOfNodes
:
2
,
...
...
@@ -314,19 +336,40 @@ func TestIsPythonCommand(t *testing.T) {
cmd
string
expected
bool
}{
// Base python commands
{
"python"
,
true
},
{
"python3"
,
true
},
{
"python2"
,
true
},
{
"python3.11"
,
true
},
{
"python2.7"
,
true
},
{
"python3.12.1"
,
true
},
// Absolute paths
{
"/usr/bin/python"
,
true
},
{
"/usr/bin/python3"
,
true
},
{
"/usr/bin/python3.8"
,
true
},
{
"/usr/bin/python2.7"
,
true
},
{
"/opt/python/bin/python3.11"
,
true
},
{
"/usr/local/bin/python3.12.1"
,
true
},
{
"/home/user/.pyenv/shims/python3.9"
,
true
},
{
"./python3"
,
true
},
{
"../bin/python"
,
true
},
{
"bin/python3.10"
,
true
},
// Invalid cases
{
"java"
,
false
},
{
"sh"
,
false
},
{
"node"
,
false
},
{
"python-config"
,
false
},
// hyphen makes it not a python interpreter
{
"python-config"
,
false
},
// hyphen makes it not a python interpreter
{
"/usr/bin/python-config"
,
false
},
// hyphen in absolute path
{
""
,
false
},
{
"python "
,
false
},
// space makes it invalid
{
"pythonx"
,
false
},
// extra characters
{
"python "
,
false
},
// space makes it invalid
{
"/usr/bin/python "
,
false
},
// space in absolute path
{
"pythonx"
,
false
},
// extra characters
{
"/usr/bin/pythonx"
,
false
},
// extra characters in absolute path
{
"/usr/bin/"
,
false
},
// empty filename
{
"python/"
,
false
},
// directory, not executable
{
"/usr/bin/python/"
,
false
},
// directory path
}
for
_
,
tt
:=
range
tests
{
...
...
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
View file @
df973acd
...
...
@@ -108,9 +108,20 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
// Store original command/args for later use
var
originalCommand
string
if
len
(
container
.
Args
)
>
0
{
if
len
(
container
.
Command
)
>
0
&&
isPythonCommand
(
container
.
Command
[
0
])
{
// Direct Python command: combine command + args
var
parts
[]
string
parts
=
append
(
parts
,
container
.
Command
...
)
if
len
(
container
.
Args
)
>
0
{
parts
=
append
(
parts
,
container
.
Args
...
)
}
originalCommand
=
strings
.
Join
(
parts
,
" "
)
}
else
if
len
(
container
.
Args
)
>
0
{
// Shell command (sh -c): args contains the full command
originalCommand
=
strings
.
Join
(
container
.
Args
,
" "
)
}
else
if
len
(
container
.
Command
)
>
0
{
// Fallback: just command
originalCommand
=
strings
.
Join
(
container
.
Command
,
" "
)
}
...
...
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
View file @
df973acd
...
...
@@ -584,7 +584,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 0 -H $(LWS_LEADER_ADDRESS),$(LWS_WORKER_1_ADDRESS) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python -m worker'"
,
},
{
name
:
"Leader with both command and args (args take precedence)"
,
name
:
"Leader with both command and args (
shell command -
args take precedence)"
,
numberOfNodes
:
2
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
serviceName
:
"test"
,
...
...
@@ -598,9 +598,63 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"launch"
,
"--config"
,
"test.yaml"
},
initialCommand
:
[]
string
{
"
ignored-command
"
},
initialCommand
:
[]
string
{
"
sh"
,
"-c
"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch launch --config test.yaml'"
,
},
{
name
:
"Leader with python command and args (combined)"
,
numberOfNodes
:
2
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
serviceName
:
"test"
,
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
Resources
:
&
common
.
Resources
{
Limits
:
&
common
.
ResourceItem
{
GPU
:
"1"
,
},
},
},
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialCommand
:
[]
string
{
"python3"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path test'"
,
},
{
name
:
"Leader with python module command and separate args"
,
numberOfNodes
:
2
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
serviceName
:
"test"
,
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
Resources
:
&
common
.
Resources
{
Limits
:
&
common
.
ResourceItem
{
GPU
:
"1"
,
},
},
},
},
initialArgs
:
[]
string
{
"--model-path"
,
"Qwen/Qwen3-0.6B"
,
"--served-model-name"
,
"Qwen/Qwen3-0.6B"
,
"--disaggregation-mode"
,
"prefill"
},
initialCommand
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.trtllm"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --disaggregation-mode prefill'"
,
},
{
name
:
"Leader with absolute path python command"
,
numberOfNodes
:
2
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
serviceName
:
"test"
,
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
Resources
:
&
common
.
Resources
{
Limits
:
&
common
.
ResourceItem
{
GPU
:
"1"
,
},
},
},
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialCommand
:
[]
string
{
"/usr/bin/python3.8"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch /usr/bin/python3.8 -m dynamo.trtllm --model-path test'"
,
},
{
name
:
"Leader with all environment variables forwarded"
,
numberOfNodes
:
2
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment