Unverified Commit d152596a authored by MatejKosec's avatar MatejKosec Committed by GitHub
Browse files

fix(operator): fix TRT-LLM worker SSH crash in non-root containers (#6694)


Signed-off-by: default avatarMatej Kosec <mkosec@nvidia.com>
parent c68052ff
......@@ -17,6 +17,10 @@ type TRTLLMBackend struct {
MpiRunSecretName string
}
// UpdateContainer configures the container for TRT-LLM multinode deployments.
// For single-node deployments it is a no-op. For multinode, it mounts the SSH
// keypair secret and injects the appropriate SSH setup and launch commands for
// leader (mpirun) and worker (sshd) roles.
func (b *TRTLLMBackend) UpdateContainer(container *corev1.Container, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentSharedSpec, serviceName string, multinodeDeployer MultinodeDeployer) {
// Check for volumeMounts with useAsCompilationCache=true
for _, volumeMount := range component.VolumeMounts {
......@@ -74,6 +78,8 @@ func (b *TRTLLMBackend) UpdateContainer(container *corev1.Container, numberOfNod
}
}
// UpdatePodSpec injects the SSH keypair volume into the pod spec for TRT-LLM
// multinode deployments so that leader and worker containers can mount it.
func (b *TRTLLMBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentSharedSpec, serviceName string, multinodeDeployer MultinodeDeployer) {
// Add SSH keypair volume for TRTLLM multinode deployments
if numberOfNodes > 1 {
......@@ -187,7 +193,7 @@ func (b *TRTLLMBackend) setupWorkerContainer(container *corev1.Container) {
// Setup SSH for worker nodes
// Use $HOME instead of ~ for the same reasons as setupLeaderContainer (see comment above).
sshSetupCommands := []string{
"mkdir -p $HOME/.ssh $HOME/.ssh/host_keys $HOME/.ssh/run /run/sshd",
"mkdir -p $HOME/.ssh $HOME/.ssh/host_keys $HOME/.ssh/run",
"ls -la /ssh-pk/", // Debug: list files in ssh-pk directory
"cp /ssh-pk/private.key $HOME/.ssh/id_rsa",
"cp /ssh-pk/private.key.pub $HOME/.ssh/id_rsa.pub",
......@@ -206,6 +212,10 @@ func (b *TRTLLMBackend) setupWorkerContainer(container *corev1.Container) {
// relative paths from the connecting user's /etc/passwd home (-> /root/).
// StrictModes disabled because /home/dynamo may be owned by a non-root UID
// while sshd runs as root, causing permission check failures.
// Note: /run/sshd (the privilege separation directory) is not needed here
// because sshd started as a non-root user skips the privsep directory check
// entirely — privsep requires forking a privileged monitor process, which is
// only possible when sshd starts as UID 0.
fmt.Sprintf("printf 'Port %d\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_rsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ecdsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ed25519_key\\nPidFile '$HOME'/.ssh/run/sshd.pid\\nStrictModes no\\nPermitRootLogin yes\\nPasswordAuthentication no\\nPubkeyAuthentication yes\\nAuthorizedKeysFile '$HOME'/.ssh/authorized_keys\\n' > $HOME/.ssh/sshd_config", commonconsts.MpiRunSshPort),
"/usr/sbin/sshd -D -f $HOME/.ssh/sshd_config",
}
......
......@@ -180,7 +180,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{Name: mpiRunSecretName, MountPath: "/ssh-pk", ReadOnly: true},
},
expectedCommand: []string{"/bin/sh", "-c"},
expectedArgs: []string{"mkdir -p $HOME/.ssh $HOME/.ssh/host_keys $HOME/.ssh/run /run/sshd && ls -la /ssh-pk/ && cp /ssh-pk/private.key $HOME/.ssh/id_rsa && cp /ssh-pk/private.key.pub $HOME/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub $HOME/.ssh/authorized_keys && chmod 600 $HOME/.ssh/id_rsa $HOME/.ssh/authorized_keys && chmod 644 $HOME/.ssh/id_rsa.pub && printf 'Host *\\nIdentityFile '$HOME'/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > $HOME/.ssh/config && ssh-keygen -t rsa -f $HOME/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f $HOME/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f $HOME/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_rsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ecdsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ed25519_key\\nPidFile '$HOME'/.ssh/run/sshd.pid\\nStrictModes no\\nPermitRootLogin yes\\nPasswordAuthentication no\\nPubkeyAuthentication yes\\nAuthorizedKeysFile '$HOME'/.ssh/authorized_keys\\n' > $HOME/.ssh/sshd_config && /usr/sbin/sshd -D -f $HOME/.ssh/sshd_config"},
expectedArgs: []string{"mkdir -p $HOME/.ssh $HOME/.ssh/host_keys $HOME/.ssh/run && ls -la /ssh-pk/ && cp /ssh-pk/private.key $HOME/.ssh/id_rsa && cp /ssh-pk/private.key.pub $HOME/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub $HOME/.ssh/authorized_keys && chmod 600 $HOME/.ssh/id_rsa $HOME/.ssh/authorized_keys && chmod 644 $HOME/.ssh/id_rsa.pub && printf 'Host *\\nIdentityFile '$HOME'/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > $HOME/.ssh/config && ssh-keygen -t rsa -f $HOME/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f $HOME/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f $HOME/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_rsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ecdsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ed25519_key\\nPidFile '$HOME'/.ssh/run/sshd.pid\\nStrictModes no\\nPermitRootLogin yes\\nPasswordAuthentication no\\nPubkeyAuthentication yes\\nAuthorizedKeysFile '$HOME'/.ssh/authorized_keys\\n' > $HOME/.ssh/sshd_config && /usr/sbin/sshd -D -f $HOME/.ssh/sshd_config"},
expectedEnv: []corev1.EnvVar{
{Name: "OMPI_MCA_orte_keep_fqdn_hostnames", Value: "1"},
},
......@@ -807,13 +807,13 @@ func TestTRTLLMBackend_setupWorkerContainer(t *testing.T) {
name: "Worker setup with initial args",
initialArgs: []string{"some", "args"},
initialCommand: []string{},
expected: "mkdir -p $HOME/.ssh $HOME/.ssh/host_keys $HOME/.ssh/run /run/sshd && ls -la /ssh-pk/ && cp /ssh-pk/private.key $HOME/.ssh/id_rsa && cp /ssh-pk/private.key.pub $HOME/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub $HOME/.ssh/authorized_keys && chmod 600 $HOME/.ssh/id_rsa $HOME/.ssh/authorized_keys && chmod 644 $HOME/.ssh/id_rsa.pub && printf 'Host *\\nIdentityFile '$HOME'/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > $HOME/.ssh/config && ssh-keygen -t rsa -f $HOME/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f $HOME/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f $HOME/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_rsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ecdsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ed25519_key\\nPidFile '$HOME'/.ssh/run/sshd.pid\\nStrictModes no\\nPermitRootLogin yes\\nPasswordAuthentication no\\nPubkeyAuthentication yes\\nAuthorizedKeysFile '$HOME'/.ssh/authorized_keys\\n' > $HOME/.ssh/sshd_config && /usr/sbin/sshd -D -f $HOME/.ssh/sshd_config",
expected: "mkdir -p $HOME/.ssh $HOME/.ssh/host_keys $HOME/.ssh/run && ls -la /ssh-pk/ && cp /ssh-pk/private.key $HOME/.ssh/id_rsa && cp /ssh-pk/private.key.pub $HOME/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub $HOME/.ssh/authorized_keys && chmod 600 $HOME/.ssh/id_rsa $HOME/.ssh/authorized_keys && chmod 644 $HOME/.ssh/id_rsa.pub && printf 'Host *\\nIdentityFile '$HOME'/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > $HOME/.ssh/config && ssh-keygen -t rsa -f $HOME/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f $HOME/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f $HOME/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_rsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ecdsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ed25519_key\\nPidFile '$HOME'/.ssh/run/sshd.pid\\nStrictModes no\\nPermitRootLogin yes\\nPasswordAuthentication no\\nPubkeyAuthentication yes\\nAuthorizedKeysFile '$HOME'/.ssh/authorized_keys\\n' > $HOME/.ssh/sshd_config && /usr/sbin/sshd -D -f $HOME/.ssh/sshd_config",
},
{
name: "Worker setup with initial command",
initialArgs: []string{},
initialCommand: []string{"original", "command"},
expected: "mkdir -p $HOME/.ssh $HOME/.ssh/host_keys $HOME/.ssh/run /run/sshd && ls -la /ssh-pk/ && cp /ssh-pk/private.key $HOME/.ssh/id_rsa && cp /ssh-pk/private.key.pub $HOME/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub $HOME/.ssh/authorized_keys && chmod 600 $HOME/.ssh/id_rsa $HOME/.ssh/authorized_keys && chmod 644 $HOME/.ssh/id_rsa.pub && printf 'Host *\\nIdentityFile '$HOME'/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > $HOME/.ssh/config && ssh-keygen -t rsa -f $HOME/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f $HOME/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f $HOME/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_rsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ecdsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ed25519_key\\nPidFile '$HOME'/.ssh/run/sshd.pid\\nStrictModes no\\nPermitRootLogin yes\\nPasswordAuthentication no\\nPubkeyAuthentication yes\\nAuthorizedKeysFile '$HOME'/.ssh/authorized_keys\\n' > $HOME/.ssh/sshd_config && /usr/sbin/sshd -D -f $HOME/.ssh/sshd_config",
expected: "mkdir -p $HOME/.ssh $HOME/.ssh/host_keys $HOME/.ssh/run && ls -la /ssh-pk/ && cp /ssh-pk/private.key $HOME/.ssh/id_rsa && cp /ssh-pk/private.key.pub $HOME/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub $HOME/.ssh/authorized_keys && chmod 600 $HOME/.ssh/id_rsa $HOME/.ssh/authorized_keys && chmod 644 $HOME/.ssh/id_rsa.pub && printf 'Host *\\nIdentityFile '$HOME'/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > $HOME/.ssh/config && ssh-keygen -t rsa -f $HOME/.ssh/host_keys/ssh_host_rsa_key -N '' && ssh-keygen -t ecdsa -f $HOME/.ssh/host_keys/ssh_host_ecdsa_key -N '' && ssh-keygen -t ed25519 -f $HOME/.ssh/host_keys/ssh_host_ed25519_key -N '' && printf 'Port 2222\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_rsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ecdsa_key\\nHostKey '$HOME'/.ssh/host_keys/ssh_host_ed25519_key\\nPidFile '$HOME'/.ssh/run/sshd.pid\\nStrictModes no\\nPermitRootLogin yes\\nPasswordAuthentication no\\nPubkeyAuthentication yes\\nAuthorizedKeysFile '$HOME'/.ssh/authorized_keys\\n' > $HOME/.ssh/sshd_config && /usr/sbin/sshd -D -f $HOME/.ssh/sshd_config",
},
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment