Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
6ef20625
Unverified
Commit
6ef20625
authored
Feb 13, 2026
by
MatejKosec
Committed by
GitHub
Feb 13, 2026
Browse files
fix(operator): fix SSH setup bugs in TRT-LLM multinode workers (#6225)
Signed-off-by:
Matej Kosec
<
mkosec@nvidia.com
>
parent
67883ec6
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
47 additions
and
34 deletions
+47
-34
deploy/operator/internal/dynamo/backend_trtllm.go
deploy/operator/internal/dynamo/backend_trtllm.go
+34
-21
deploy/operator/internal/dynamo/backend_trtllm_test.go
deploy/operator/internal/dynamo/backend_trtllm_test.go
+13
-13
No files found.
deploy/operator/internal/dynamo/backend_trtllm.go
View file @
6ef20625
...
...
@@ -126,15 +126,18 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
}
// Setup SSH and run mpirun command
// Use $HOME instead of ~ because the container may set HOME=/home/dynamo via Dockerfile
// while the shell user is root (from securityContext.runAsUser: 0). ~ follows /etc/passwd
// but $HOME follows the environment, and SSH/mpirun need to find keys where we put them.
sshSetupCommands
:=
[]
string
{
"mkdir -p
~
/.ssh"
,
"mkdir -p
$HOME
/.ssh"
,
"ls -la /ssh-pk/"
,
// Debug: list files in ssh-pk directory
"cp /ssh-pk/private.key
~
/.ssh/id_rsa"
,
"cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub"
,
"cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys"
,
"chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys"
,
"chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
"
,
fmt
.
Sprintf
(
"printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort %d
\\
n' >
~
/.ssh/config"
,
commonconsts
.
MpiRunSshPort
),
"cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa"
,
"cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub"
,
"cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys"
,
"chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys"
,
"chmod 644
$HOME
/.ssh/id_rsa.pub"
,
fmt
.
Sprintf
(
"printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort %d
\\
n' >
$HOME
/.ssh/config"
,
commonconsts
.
MpiRunSshPort
),
}
// Calculate total number of GPUs across all nodes
...
...
@@ -148,7 +151,10 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
// Generate environment variable flags for mpirun
envVarsStr
:=
generateEnvVarFlags
(
container
.
Env
)
mpirunCmd
:=
fmt
.
Sprintf
(
"mpirun --allow-run-as-root --oversubscribe -n %d -H %s --mca pml ob1 --mca plm_rsh_args
\"
-p %d -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
%s %s"
,
// Use --allow-run-as-root only when the container is running as root (UID 0).
// When running as a non-root user, mpirun works without this flag and omitting
// it avoids masking accidental root execution.
mpirunCmd
:=
fmt
.
Sprintf
(
"mpirun $([
\"
$(id -u)
\"
=
\"
0
\"
] && echo --allow-run-as-root) --oversubscribe -n %d -H %s --mca pml ob1 --mca plm_rsh_args
\"
-p %d -o StrictHostKeyChecking=no -i $HOME/.ssh/id_rsa
\"
%s %s"
,
totalGPUs
,
allHostnames
,
commonconsts
.
MpiRunSshPort
,
...
...
@@ -176,22 +182,29 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
// setupWorkerContainer configures worker nodes with SSH setup and daemon
func
(
b
*
TRTLLMBackend
)
setupWorkerContainer
(
container
*
corev1
.
Container
)
{
// Setup SSH for worker nodes
// Use $HOME instead of ~ for the same reasons as setupLeaderContainer (see comment above).
sshSetupCommands
:=
[]
string
{
"mkdir -p
~
/.ssh
~
/.ssh/host_keys
~
/.ssh/run"
,
"mkdir -p
$HOME
/.ssh
$HOME
/.ssh/host_keys
$HOME
/.ssh/run
/run/sshd
"
,
"ls -la /ssh-pk/"
,
// Debug: list files in ssh-pk directory
"cp /ssh-pk/private.key
~
/.ssh/id_rsa"
,
"cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub"
,
"cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys"
,
"chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys"
,
"chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
"
,
fmt
.
Sprintf
(
"printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort %d
\\
n' >
~
/.ssh/config"
,
commonconsts
.
MpiRunSshPort
),
"cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa"
,
"cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub"
,
"cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys"
,
"chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys"
,
"chmod 644
$HOME
/.ssh/id_rsa.pub"
,
fmt
.
Sprintf
(
"printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort %d
\\
n' >
$HOME
/.ssh/config"
,
commonconsts
.
MpiRunSshPort
),
// Generate host keys in user writable directory
"ssh-keygen -t rsa -f ~/.ssh/host_keys/ssh_host_rsa_key -N ''"
,
"ssh-keygen -t ecdsa -f ~/.ssh/host_keys/ssh_host_ecdsa_key -N ''"
,
"ssh-keygen -t ed25519 -f ~/.ssh/host_keys/ssh_host_ed25519_key -N ''"
,
// Create SSH daemon config to use custom host keys location and non-privileged port
fmt
.
Sprintf
(
"printf 'Port %d
\\
nHostKey ~/.ssh/host_keys/ssh_host_rsa_key
\\
nHostKey ~/.ssh/host_keys/ssh_host_ecdsa_key
\\
nHostKey ~/.ssh/host_keys/ssh_host_ed25519_key
\\
nPidFile ~/.ssh/run/sshd.pid
\\
nPermitRootLogin yes
\\
nPasswordAuthentication no
\\
nPubkeyAuthentication yes
\\
nAuthorizedKeysFile ~/.ssh/authorized_keys
\\
n' > ~/.ssh/sshd_config"
,
commonconsts
.
MpiRunSshPort
),
"/usr/sbin/sshd -D -f ~/.ssh/sshd_config"
,
"ssh-keygen -t rsa -f $HOME/.ssh/host_keys/ssh_host_rsa_key -N ''"
,
"ssh-keygen -t ecdsa -f $HOME/.ssh/host_keys/ssh_host_ecdsa_key -N ''"
,
"ssh-keygen -t ed25519 -f $HOME/.ssh/host_keys/ssh_host_ed25519_key -N ''"
,
// Create SSH daemon config using $HOME for absolute paths.
// sshd expands ~ via /etc/passwd (-> /root/) not the HOME env var,
// so we break out of single quotes to let the shell expand $HOME.
// AuthorizedKeysFile also needs absolute $HOME path because sshd resolves
// relative paths from the connecting user's /etc/passwd home (-> /root/).
// StrictModes disabled because /home/dynamo may be owned by a non-root UID
// while sshd runs as root, causing permission check failures.
fmt
.
Sprintf
(
"printf 'Port %d
\\
nHostKey '$HOME'/.ssh/host_keys/ssh_host_rsa_key
\\
nHostKey '$HOME'/.ssh/host_keys/ssh_host_ecdsa_key
\\
nHostKey '$HOME'/.ssh/host_keys/ssh_host_ed25519_key
\\
nPidFile '$HOME'/.ssh/run/sshd.pid
\\
nStrictModes no
\\
nPermitRootLogin yes
\\
nPasswordAuthentication no
\\
nPubkeyAuthentication yes
\\
nAuthorizedKeysFile '$HOME'/.ssh/authorized_keys
\\
n' > $HOME/.ssh/sshd_config"
,
commonconsts
.
MpiRunSshPort
),
"/usr/sbin/sshd -D -f $HOME/.ssh/sshd_config"
,
}
fullCommand
:=
strings
.
Join
(
sshSetupCommands
,
" && "
)
...
...
deploy/operator/internal/dynamo/backend_trtllm_test.go
View file @
6ef20625
...
...
@@ -61,7 +61,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{
Name
:
mpiRunSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"mkdir -p
~
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
~
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
},
expectedArgs
:
[]
string
{
"mkdir -p
$HOME
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && mpirun
$([
\"
$(id -u)
\"
=
\"
0
\"
] && echo
--allow-run-as-root
)
--oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
$HOME
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
},
expectedEnv
:
[]
corev1
.
EnvVar
{
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
},
...
...
@@ -80,7 +80,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{
Name
:
mpiRunSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"mkdir -p
~
/.ssh
~
/.ssh/host_keys
~
/.ssh/run && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && ssh-keygen -t rsa -f
~
/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f
~
/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f
~
/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222
\\
nHostKey
~
/.ssh/host_keys/ssh_host_rsa_key
\\
nHostKey
~
/.ssh/host_keys/ssh_host_ecdsa_key
\\
nHostKey
~
/.ssh/host_keys/ssh_host_ed25519_key
\\
nPidFile
~
/.ssh/run/sshd.pid
\\
nPermitRootLogin yes
\\
nPasswordAuthentication no
\\
nPubkeyAuthentication yes
\\
nAuthorizedKeysFile
~
/.ssh/authorized_keys
\\
n' >
~
/.ssh/sshd_config && /usr/sbin/sshd -D -f
~
/.ssh/sshd_config"
},
expectedArgs
:
[]
string
{
"mkdir -p
$HOME
/.ssh
$HOME
/.ssh/host_keys
$HOME
/.ssh/run
/run/sshd
&& ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && ssh-keygen -t rsa -f
$HOME
/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f
$HOME
/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f
$HOME
/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222
\\
nHostKey
'$HOME'
/.ssh/host_keys/ssh_host_rsa_key
\\
nHostKey
'$HOME'
/.ssh/host_keys/ssh_host_ecdsa_key
\\
nHostKey
'$HOME'
/.ssh/host_keys/ssh_host_ed25519_key
\\
nPidFile
'$HOME'
/.ssh/run/sshd.pid
\\
nStrictModes no
\\
nPermitRootLogin yes
\\
nPasswordAuthentication no
\\
nPubkeyAuthentication yes
\\
nAuthorizedKeysFile
'$HOME'
/.ssh/authorized_keys
\\
n' >
$HOME
/.ssh/sshd_config && /usr/sbin/sshd -D -f
$HOME
/.ssh/sshd_config"
},
expectedEnv
:
[]
corev1
.
EnvVar
{
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
},
...
...
@@ -115,7 +115,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{
Name
:
mpiRunSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"mkdir -p
~
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && TIMEOUT=300; START_TIME=$(date +%s); for worker in $(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./'); do echo
\"
Waiting for DNS: $worker
\"
; until getent hosts $worker >/dev/null 2>&1; do CURRENT_TIME=$(date +%s); if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then echo
\"
ERROR: Timeout waiting for DNS: $worker
\"
; exit 1; fi; echo
\"
DNS not ready for $worker, retrying...
\"
; sleep 2; done; echo
\"
✓ DNS resolved: $worker
\"
; done; echo
\"
All workers DNS ready
\"
&& mpirun --allow-run-as-root --oversubscribe -n 2 -H $LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./') --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
~
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
},
expectedArgs
:
[]
string
{
"mkdir -p
$HOME
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && TIMEOUT=300; START_TIME=$(date +%s); for worker in $(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./'); do echo
\"
Waiting for DNS: $worker
\"
; until getent hosts $worker >/dev/null 2>&1; do CURRENT_TIME=$(date +%s); if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then echo
\"
ERROR: Timeout waiting for DNS: $worker
\"
; exit 1; fi; echo
\"
DNS not ready for $worker, retrying...
\"
; sleep 2; done; echo
\"
✓ DNS resolved: $worker
\"
; done; echo
\"
All workers DNS ready
\"
&& mpirun
$([
\"
$(id -u)
\"
=
\"
0
\"
] && echo
--allow-run-as-root
)
--oversubscribe -n 2 -H $LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./') --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
$HOME
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
},
expectedEnv
:
[]
corev1
.
EnvVar
{
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
},
...
...
@@ -542,7 +542,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
initialArgs
:
[]
string
{
"python3"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p
~
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
~
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
,
expected
:
"mkdir -p
$HOME
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && mpirun
$([
\"
$(id -u)
\"
=
\"
0
\"
] && echo
--allow-run-as-root
)
--oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
$HOME
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
,
},
{
name
:
"Leader with command and no GPU resources"
,
...
...
@@ -552,7 +552,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
component
:
&
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{},
initialArgs
:
[]
string
{},
initialCommand
:
[]
string
{
"python"
,
"-m"
,
"worker"
},
expected
:
"mkdir -p
~
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && TIMEOUT=300; START_TIME=$(date +%s); for worker in $(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./'); do echo
\"
Waiting for DNS: $worker
\"
; until getent hosts $worker >/dev/null 2>&1; do CURRENT_TIME=$(date +%s); if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then echo
\"
ERROR: Timeout waiting for DNS: $worker
\"
; exit 1; fi; echo
\"
DNS not ready for $worker, retrying...
\"
; sleep 2; done; echo
\"
✓ DNS resolved: $worker
\"
; done; echo
\"
All workers DNS ready
\"
&& mpirun --allow-run-as-root --oversubscribe -n 0 -H $LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./') --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
~
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python -m worker'"
,
expected
:
"mkdir -p
$HOME
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && TIMEOUT=300; START_TIME=$(date +%s); for worker in $(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./'); do echo
\"
Waiting for DNS: $worker
\"
; until getent hosts $worker >/dev/null 2>&1; do CURRENT_TIME=$(date +%s); if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then echo
\"
ERROR: Timeout waiting for DNS: $worker
\"
; exit 1; fi; echo
\"
DNS not ready for $worker, retrying...
\"
; sleep 2; done; echo
\"
✓ DNS resolved: $worker
\"
; done; echo
\"
All workers DNS ready
\"
&& mpirun
$([
\"
$(id -u)
\"
=
\"
0
\"
] && echo
--allow-run-as-root
)
--oversubscribe -n 0 -H $LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./') --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
$HOME
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python -m worker'"
,
},
{
name
:
"Leader with both command and args (shell command - args take precedence)"
,
...
...
@@ -568,7 +568,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
initialArgs
:
[]
string
{
"launch"
,
"--config"
,
"test.yaml"
},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
expected
:
"mkdir -p
~
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
~
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch launch --config test.yaml'"
,
expected
:
"mkdir -p
$HOME
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && mpirun
$([
\"
$(id -u)
\"
=
\"
0
\"
] && echo
--allow-run-as-root
)
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
$HOME
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch launch --config test.yaml'"
,
},
{
name
:
"Leader with python command and args (combined)"
,
...
...
@@ -584,7 +584,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialCommand
:
[]
string
{
"python3"
},
expected
:
"mkdir -p
~
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
~
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path test'"
,
expected
:
"mkdir -p
$HOME
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && mpirun
$([
\"
$(id -u)
\"
=
\"
0
\"
] && echo
--allow-run-as-root
)
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
$HOME
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path test'"
,
},
{
name
:
"Leader with python module command and separate args"
,
...
...
@@ -600,7 +600,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
initialArgs
:
[]
string
{
"--model-path"
,
"Qwen/Qwen3-0.6B"
,
"--served-model-name"
,
"Qwen/Qwen3-0.6B"
,
"--disaggregation-mode"
,
"prefill"
},
initialCommand
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.trtllm"
},
expected
:
"mkdir -p
~
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
~
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --disaggregation-mode prefill'"
,
expected
:
"mkdir -p
$HOME
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && mpirun
$([
\"
$(id -u)
\"
=
\"
0
\"
] && echo
--allow-run-as-root
)
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
$HOME
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --disaggregation-mode prefill'"
,
},
{
name
:
"Leader with absolute path python command"
,
...
...
@@ -616,7 +616,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialCommand
:
[]
string
{
"/usr/bin/python3.8"
},
expected
:
"mkdir -p
~
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
~
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch /usr/bin/python3.8 -m dynamo.trtllm --model-path test'"
,
expected
:
"mkdir -p
$HOME
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && mpirun
$([
\"
$(id -u)
\"
=
\"
0
\"
] && echo
--allow-run-as-root
)
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
$HOME
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch /usr/bin/python3.8 -m dynamo.trtllm --model-path test'"
,
},
{
name
:
"Leader with all environment variables forwarded"
,
...
...
@@ -632,7 +632,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p
~
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
~
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch serve --model test'"
,
expected
:
"mkdir -p
$HOME
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && mpirun
$([
\"
$(id -u)
\"
=
\"
0
\"
] && echo
--allow-run-as-root
)
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
$HOME
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch serve --model test'"
,
},
{
name
:
"Leader with overlapping environment variables (deduplication test)"
,
...
...
@@ -648,7 +648,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p
~
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
~
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x CUSTOM_VAR -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch serve --model test'"
,
expected
:
"mkdir -p
$HOME
/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && mpirun
$([
\"
$(id -u)
\"
=
\"
0
\"
] && echo
--allow-run-as-root
)
--oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i
$HOME
/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x CUSTOM_VAR -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x TRTLLM_USE_UCX_KVCACHE -x USER bash -c 'trtllm-llmapi-launch serve --model test'"
,
},
}
...
...
@@ -707,13 +707,13 @@ func TestTRTLLMBackend_setupWorkerContainer(t *testing.T) {
name
:
"Worker setup with initial args"
,
initialArgs
:
[]
string
{
"some"
,
"args"
},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p
~
/.ssh
~
/.ssh/host_keys
~
/.ssh/run && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && ssh-keygen -t rsa -f
~
/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f
~
/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f
~
/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222
\\
nHostKey
~
/.ssh/host_keys/ssh_host_rsa_key
\\
nHostKey
~
/.ssh/host_keys/ssh_host_ecdsa_key
\\
nHostKey
~
/.ssh/host_keys/ssh_host_ed25519_key
\\
nPidFile
~
/.ssh/run/sshd.pid
\\
nPermitRootLogin yes
\\
nPasswordAuthentication no
\\
nPubkeyAuthentication yes
\\
nAuthorizedKeysFile
~
/.ssh/authorized_keys
\\
n' >
~
/.ssh/sshd_config && /usr/sbin/sshd -D -f
~
/.ssh/sshd_config"
,
expected
:
"mkdir -p
$HOME
/.ssh
$HOME
/.ssh/host_keys
$HOME
/.ssh/run
/run/sshd
&& ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && ssh-keygen -t rsa -f
$HOME
/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f
$HOME
/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f
$HOME
/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222
\\
nHostKey
'$HOME'
/.ssh/host_keys/ssh_host_rsa_key
\\
nHostKey
'$HOME'
/.ssh/host_keys/ssh_host_ecdsa_key
\\
nHostKey
'$HOME'
/.ssh/host_keys/ssh_host_ed25519_key
\\
nPidFile
'$HOME'
/.ssh/run/sshd.pid
\\
nStrictModes no
\\
nPermitRootLogin yes
\\
nPasswordAuthentication no
\\
nPubkeyAuthentication yes
\\
nAuthorizedKeysFile
'$HOME'
/.ssh/authorized_keys
\\
n' >
$HOME
/.ssh/sshd_config && /usr/sbin/sshd -D -f
$HOME
/.ssh/sshd_config"
,
},
{
name
:
"Worker setup with initial command"
,
initialArgs
:
[]
string
{},
initialCommand
:
[]
string
{
"original"
,
"command"
},
expected
:
"mkdir -p
~
/.ssh
~
/.ssh/host_keys
~
/.ssh/run && ls -la /ssh-pk/ && cp /ssh-pk/private.key
~
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
~
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
~
/.ssh/authorized_keys && chmod 600
~
/.ssh/id_rsa
~
/.ssh/authorized_keys && chmod 644
~
/.ssh/id_rsa.pub
~/.ssh/authorized_keys
&& printf 'Host *
\\
nIdentityFile
~
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
~
/.ssh/config && ssh-keygen -t rsa -f
~
/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f
~
/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f
~
/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222
\\
nHostKey
~
/.ssh/host_keys/ssh_host_rsa_key
\\
nHostKey
~
/.ssh/host_keys/ssh_host_ecdsa_key
\\
nHostKey
~
/.ssh/host_keys/ssh_host_ed25519_key
\\
nPidFile
~
/.ssh/run/sshd.pid
\\
nPermitRootLogin yes
\\
nPasswordAuthentication no
\\
nPubkeyAuthentication yes
\\
nAuthorizedKeysFile
~
/.ssh/authorized_keys
\\
n' >
~
/.ssh/sshd_config && /usr/sbin/sshd -D -f
~
/.ssh/sshd_config"
,
expected
:
"mkdir -p
$HOME
/.ssh
$HOME
/.ssh/host_keys
$HOME
/.ssh/run
/run/sshd
&& ls -la /ssh-pk/ && cp /ssh-pk/private.key
$HOME
/.ssh/id_rsa && cp /ssh-pk/private.key.pub
$HOME
/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub
$HOME
/.ssh/authorized_keys && chmod 600
$HOME
/.ssh/id_rsa
$HOME
/.ssh/authorized_keys && chmod 644
$HOME
/.ssh/id_rsa.pub && printf 'Host *
\\
nIdentityFile
'$HOME'
/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' >
$HOME
/.ssh/config && ssh-keygen -t rsa -f
$HOME
/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f
$HOME
/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f
$HOME
/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222
\\
nHostKey
'$HOME'
/.ssh/host_keys/ssh_host_rsa_key
\\
nHostKey
'$HOME'
/.ssh/host_keys/ssh_host_ecdsa_key
\\
nHostKey
'$HOME'
/.ssh/host_keys/ssh_host_ed25519_key
\\
nPidFile
'$HOME'
/.ssh/run/sshd.pid
\\
nStrictModes no
\\
nPermitRootLogin yes
\\
nPasswordAuthentication no
\\
nPubkeyAuthentication yes
\\
nAuthorizedKeysFile
'$HOME'
/.ssh/authorized_keys
\\
n' >
$HOME
/.ssh/sshd_config && /usr/sbin/sshd -D -f
$HOME
/.ssh/sshd_config"
,
},
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment