Unverified commit 6ef20625, authored by MatejKosec and committed by GitHub
Browse files

fix(operator): fix SSH setup bugs in TRT-LLM multinode workers (#6225)


Signed-off-by: Matej Kosec <mkosec@nvidia.com>
parent 67883ec6
...@@ -126,15 +126,18 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number ...@@ -126,15 +126,18 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
} }
// Setup SSH and run mpirun command // Setup SSH and run mpirun command
// Use $HOME instead of ~ because the container may set HOME=/home/dynamo via Dockerfile
// while the shell user is root (from securityContext.runAsUser: 0). ~ follows /etc/passwd
// but $HOME follows the environment, and SSH/mpirun need to find keys where we put them.
sshSetupCommands := []string{ sshSetupCommands := []string{
"mkdir -p ~/.ssh", "mkdir -p $HOME/.ssh",
"ls -la /ssh-pk/", // Debug: list files in ssh-pk directory "ls -la /ssh-pk/", // Debug: list files in ssh-pk directory
"cp /ssh-pk/private.key ~/.ssh/id_rsa", "cp /ssh-pk/private.key $HOME/.ssh/id_rsa",
"cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub", "cp /ssh-pk/private.key.pub $HOME/.ssh/id_rsa.pub",
"cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys", "cp /ssh-pk/private.key.pub $HOME/.ssh/authorized_keys",
"chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys", "chmod 600 $HOME/.ssh/id_rsa $HOME/.ssh/authorized_keys",
"chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys", "chmod 644 $HOME/.ssh/id_rsa.pub",
fmt.Sprintf("printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort %d\\n' > ~/.ssh/config", commonconsts.MpiRunSshPort), fmt.Sprintf("printf 'Host *\\nIdentityFile '$HOME'/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort %d\\n' > $HOME/.ssh/config", commonconsts.MpiRunSshPort),
} }
// Calculate total number of GPUs across all nodes // Calculate total number of GPUs across all nodes
...@@ -148,7 +151,10 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number ...@@ -148,7 +151,10 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
// Generate environment variable flags for mpirun // Generate environment variable flags for mpirun
envVarsStr := generateEnvVarFlags(container.Env) envVarsStr := generateEnvVarFlags(container.Env)
mpirunCmd := fmt.Sprintf("mpirun --allow-run-as-root --oversubscribe -n %d -H %s --mca pml ob1 --mca plm_rsh_args \"-p %d -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" %s %s", // Use --allow-run-as-root only when the container is running as root (UID 0).
// When running as a non-root user, mpirun works without this flag and omitting
// it avoids masking accidental root execution.
mpirunCmd := fmt.Sprintf("mpirun $([ \"$(id -u)\" = \"0\" ] && echo --allow-run-as-root) --oversubscribe -n %d -H %s --mca pml ob1 --mca plm_rsh_args \"-p %d -o StrictHostKeyChecking=no -i $HOME/.ssh/id_rsa\" %s %s",
totalGPUs, totalGPUs,
allHostnames, allHostnames,
commonconsts.MpiRunSshPort, commonconsts.MpiRunSshPort,
...@@ -176,22 +182,29 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number ...@@ -176,22 +182,29 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
// setupWorkerContainer configures worker nodes with SSH setup and daemon // setupWorkerContainer configures worker nodes with SSH setup and daemon
func (b *TRTLLMBackend) setupWorkerContainer(container *corev1.Container) { func (b *TRTLLMBackend) setupWorkerContainer(container *corev1.Container) {
// Setup SSH for worker nodes // Setup SSH for worker nodes
// Use $HOME instead of ~ for the same reasons as setupLeaderContainer (see comment above).
sshSetupCommands := []string{ sshSetupCommands := []string{
"mkdir -p ~/.ssh ~/.ssh/host_keys ~/.ssh/run", "mkdir -p $HOME/.ssh $HOME/.ssh/host_keys $HOME/.ssh/run /run/sshd",
"ls -la /ssh-pk/", // Debug: list files in ssh-pk directory "ls -la /ssh-pk/", // Debug: list files in ssh-pk directory
"cp /ssh-pk/private.key ~/.ssh/id_rsa", "cp /ssh-pk/private.key $HOME/.ssh/id_rsa",
"cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub", "cp /ssh-pk/private.key.pub $HOME/.ssh/id_rsa.pub",
"cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys", "cp /ssh-pk/private.key.pub $HOME/.ssh/authorized_keys",
"chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys", "chmod 600 $HOME/.ssh/id_rsa $HOME/.ssh/authorized_keys",
"chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys", "chmod 644 $HOME/.ssh/id_rsa.pub",
fmt.Sprintf("printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort %d\\n' > ~/.ssh/config", commonconsts.MpiRunSshPort), fmt.Sprintf("printf 'Host *\\nIdentityFile '$HOME'/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort %d\\n' > $HOME/.ssh/config", commonconsts.MpiRunSshPort),
// Generate host keys in user writable directory // Generate host keys in user writable directory
"ssh-keygen -t rsa -f ~/.ssh/host_keys/ssh_host_rsa_key -N ''", "ssh-keygen -t rsa -f $HOME/.ssh/host_keys/ssh_host_rsa_key -N ''",
"ssh-keygen -t ecdsa -f ~/.ssh/host_keys/ssh_host_ecdsa_key -N ''", "ssh-keygen -t ecdsa -f $HOME/.ssh/host_keys/ssh_host_ecdsa_key -N ''",
"ssh-keygen -t ed25519 -f ~/.ssh/host_keys/ssh_host_ed25519_key -N ''", "ssh-keygen -t ed25519 -f $HOME/.ssh/host_keys/ssh_host_ed25519_key -N ''",
// Create SSH daemon config to use custom host keys location and non-privileged port // Create SSH daemon config using $HOME for absolute paths.
fmt.Sprintf("printf 'Port %d\\nHostKey ~/.ssh/host_keys/ssh_host_rsa_key\\nHostKey ~/.ssh/host_keys/ssh_host_ecdsa_key\\nHostKey ~/.ssh/host_keys/ssh_host_ed25519_key\\nPidFile ~/.ssh/run/sshd.pid\\nPermitRootLogin yes\\nPasswordAuthentication no\\nPubkeyAuthentication yes\\nAuthorizedKeysFile ~/.ssh/authorized_keys\\n' > ~/.ssh/sshd_config", commonconsts.MpiRunSshPort), // sshd expands ~ via /etc/passwd (-> /root/) not the HOME env var,
"/usr/sbin/sshd -D -f ~/.ssh/sshd_config", // so we break out of single quotes to let the shell expand $HOME.
// AuthorizedKeysFile also needs absolute $HOME path because sshd resolves
// relative paths from the connecting user's /etc/passwd home (-> /root/).
// StrictModes disabled because /home/dynamo may be owned by a non-root UID
// while sshd runs as root, causing permission check failures.
fmt.Sprintf("printf 'Port %d\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_rsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ecdsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ed25519_key\\nPidFile '$HOME'/.ssh/run/sshd.pid\\nStrictModes no\\nPermitRootLogin yes\\nPasswordAuthentication no\\nPubkeyAuthentication yes\\nAuthorizedKeysFile '$HOME'/.ssh/authorized_keys\\n' > $HOME/.ssh/sshd_config", commonconsts.MpiRunSshPort),
"/usr/sbin/sshd -D -f $HOME/.ssh/sshd_config",
} }
fullCommand := strings.Join(sshSetupCommands, " && ") fullCommand := strings.Join(sshSetupCommands, " && ")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment