Unverified Commit df973acd authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

fix: fix trtllm multinode command (#3373)


Signed-off-by: default avatarJulien Mancuso <jmancuso@nvidia.com>
parent d20a292b
......@@ -22,7 +22,8 @@ func isPythonCommand(cmd string) bool {
return true
}
// Match python with version numbers like python3.11, python2.7, etc.
matched, _ := regexp.MatchString(`^python\d+(\.\d+)*$`, cmd)
// Also support absolute paths like /usr/bin/python3.8, /opt/python/bin/python3.11
matched, _ := regexp.MatchString(`^(.*/)?(python\d*(\.\d+)*)$`, cmd)
return matched
}
......
......@@ -117,6 +117,28 @@ func TestSGLangBackend_PythonCommandInjection(t *testing.T) {
expectedArgs: []string{"-m", "dynamo.sglang", "--dist-init-addr", "leader.example.com:29500", "--nnodes", "2", "--node-rank", "1"},
description: "Python version variants should be recognized",
},
{
name: "absolute path python command supported",
numberOfNodes: 2,
role: RoleWorker,
multinodeDeployer: &MockSimpleDeployer{},
initialCommand: []string{"/usr/bin/python3.8"},
initialArgs: []string{"-m", "dynamo.sglang", "--model", "llama"},
expectedCommand: []string{"/usr/bin/python3.8"},
expectedArgs: []string{"-m", "dynamo.sglang", "--model", "llama", "--dist-init-addr", "leader.example.com:29500", "--nnodes", "2", "--node-rank", "1"},
description: "Absolute path Python commands should be recognized",
},
{
name: "pyenv shims python command supported",
numberOfNodes: 2,
role: RoleWorker,
multinodeDeployer: &MockShellDeployer{},
initialCommand: []string{"/home/user/.pyenv/shims/python3.9"},
initialArgs: []string{"-m", "dynamo.sglang"},
expectedCommand: []string{"sh", "-c"},
expectedArgs: []string{"exec /home/user/.pyenv/shims/python3.9 -m dynamo.sglang --dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank $(WORKER_INDEX)"},
description: "Pyenv shims Python paths should be recognized and wrapped with shell",
},
{
name: "python command with module in command array - simple deployer",
numberOfNodes: 2,
......@@ -314,19 +336,40 @@ func TestIsPythonCommand(t *testing.T) {
cmd string
expected bool
}{
// Base python commands
{"python", true},
{"python3", true},
{"python2", true},
{"python3.11", true},
{"python2.7", true},
{"python3.12.1", true},
// Absolute paths
{"/usr/bin/python", true},
{"/usr/bin/python3", true},
{"/usr/bin/python3.8", true},
{"/usr/bin/python2.7", true},
{"/opt/python/bin/python3.11", true},
{"/usr/local/bin/python3.12.1", true},
{"/home/user/.pyenv/shims/python3.9", true},
{"./python3", true},
{"../bin/python", true},
{"bin/python3.10", true},
// Invalid cases
{"java", false},
{"sh", false},
{"node", false},
{"python-config", false}, // hyphen makes it not a python interpreter
{"python-config", false}, // hyphen makes it not a python interpreter
{"/usr/bin/python-config", false}, // hyphen in absolute path
{"", false},
{"python ", false}, // space makes it invalid
{"pythonx", false}, // extra characters
{"python ", false}, // space makes it invalid
{"/usr/bin/python ", false}, // space in absolute path
{"pythonx", false}, // extra characters
{"/usr/bin/pythonx", false}, // extra characters in absolute path
{"/usr/bin/", false}, // empty filename
{"python/", false}, // directory, not executable
{"/usr/bin/python/", false}, // directory path
}
for _, tt := range tests {
......
......@@ -108,9 +108,20 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
// Store original command/args for later use
var originalCommand string
if len(container.Args) > 0 {
if len(container.Command) > 0 && isPythonCommand(container.Command[0]) {
// Direct Python command: combine command + args
var parts []string
parts = append(parts, container.Command...)
if len(container.Args) > 0 {
parts = append(parts, container.Args...)
}
originalCommand = strings.Join(parts, " ")
} else if len(container.Args) > 0 {
// Shell command (sh -c): args contains the full command
originalCommand = strings.Join(container.Args, " ")
} else if len(container.Command) > 0 {
// Fallback: just command
originalCommand = strings.Join(container.Command, " ")
}
......
......@@ -584,7 +584,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 0 -H $(LWS_LEADER_ADDRESS),$(LWS_WORKER_1_ADDRESS) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python -m worker'",
},
{
name: "Leader with both command and args (args take precedence)",
name: "Leader with both command and args (shell command - args take precedence)",
numberOfNodes: 2,
multinodeDeployer: &GroveMultinodeDeployer{},
serviceName: "test",
......@@ -598,9 +598,63 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs: []string{"launch", "--config", "test.yaml"},
initialCommand: []string{"ignored-command"},
initialCommand: []string{"sh", "-c"},
expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch launch --config test.yaml'",
},
{
name: "Leader with python command and args (combined)",
numberOfNodes: 2,
multinodeDeployer: &GroveMultinodeDeployer{},
serviceName: "test",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
Resources: &common.Resources{
Limits: &common.ResourceItem{
GPU: "1",
},
},
},
},
initialArgs: []string{"-m", "dynamo.trtllm", "--model-path", "test"},
initialCommand: []string{"python3"},
expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path test'",
},
{
name: "Leader with python module command and separate args",
numberOfNodes: 2,
multinodeDeployer: &GroveMultinodeDeployer{},
serviceName: "test",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
Resources: &common.Resources{
Limits: &common.ResourceItem{
GPU: "1",
},
},
},
},
initialArgs: []string{"--model-path", "Qwen/Qwen3-0.6B", "--served-model-name", "Qwen/Qwen3-0.6B", "--disaggregation-mode", "prefill"},
initialCommand: []string{"python3", "-m", "dynamo.trtllm"},
expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --disaggregation-mode prefill'",
},
{
name: "Leader with absolute path python command",
numberOfNodes: 2,
multinodeDeployer: &GroveMultinodeDeployer{},
serviceName: "test",
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
Resources: &common.Resources{
Limits: &common.ResourceItem{
GPU: "1",
},
},
},
},
initialArgs: []string{"-m", "dynamo.trtllm", "--model-path", "test"},
initialCommand: []string{"/usr/bin/python3.8"},
expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch /usr/bin/python3.8 -m dynamo.trtllm --model-path test'",
},
{
name: "Leader with all environment variables forwarded",
numberOfNodes: 2,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment