Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
a5049f0f
Unverified
Commit a5049f0f authored Nov 25, 2025 by Rohan Varma
Committed by GitHub Nov 26, 2025
Browse files
fix: inject TRTLLM_USE_UCX_KVCACHE env var in mpirun (#4609)
parent
fcb91e4b
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing 2 changed files with 11 additions and 11 deletions
+11
-11
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
+1
-1
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
+10
-10
No files found.
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
View file @
a5049f0f
...
@@ -224,7 +224,7 @@ func getCommonTRTLLMEnvVars() map[string]bool {
...
@@ -224,7 +224,7 @@ func getCommonTRTLLMEnvVars() map[string]bool {
"CUDA_VISIBLE_DEVICES"
:
true
,
"MODEL_PATH"
:
true
,
"HF_TOKEN"
:
true
,
"HUGGING_FACE_HUB_TOKEN"
:
true
,
"HF_ENDPOINT"
:
true
,
"CUDA_VISIBLE_DEVICES"
:
true
,
"MODEL_PATH"
:
true
,
"HF_TOKEN"
:
true
,
"HUGGING_FACE_HUB_TOKEN"
:
true
,
"HF_ENDPOINT"
:
true
,
"TOKENIZERS_PARALLELISM"
:
true
,
"NCCL_DEBUG"
:
true
,
"NCCL_IB_DISABLE"
:
true
,
"NCCL_P2P_DISABLE"
:
true
,
"TOKENIZERS_PARALLELISM"
:
true
,
"NCCL_DEBUG"
:
true
,
"NCCL_IB_DISABLE"
:
true
,
"NCCL_P2P_DISABLE"
:
true
,
"TENSORRT_LLM_CACHE_DIR"
:
true
,
"HF_HOME"
:
true
,
"TRANSFORMERS_CACHE"
:
true
,
"HF_DATASETS_CACHE"
:
true
,
"TENSORRT_LLM_CACHE_DIR"
:
true
,
"HF_HOME"
:
true
,
"TRANSFORMERS_CACHE"
:
true
,
"HF_DATASETS_CACHE"
:
true
,
"PATH"
:
true
,
"LD_LIBRARY_PATH"
:
true
,
"PYTHONPATH"
:
true
,
"HOME"
:
true
,
"USER"
:
true
,
"PATH"
:
true
,
"LD_LIBRARY_PATH"
:
true
,
"PYTHONPATH"
:
true
,
"HOME"
:
true
,
"USER"
:
true
,
"TRTLLM_USE_UCX_KVCACHE"
:
true
,
}
}
}
}
...
...
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
View file @
a5049f0f
...
@@ -61,7 +61,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
...
@@ -61,7 +61,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{
Name
:
mpiRunSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
{
Name
:
mpiRunSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
},
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x
TRTLLM_USE_UCX_KVCACHE -x
USER bash -c 'trtllm-llmapi-launch python3 --model test'"
},
expectedEnv
:
[]
corev1
.
EnvVar
{
expectedEnv
:
[]
corev1
.
EnvVar
{
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
},
},
...
@@ -115,7 +115,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
...
@@ -115,7 +115,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{
Name
:
mpiRunSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
{
Name
:
mpiRunSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
},
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && TIMEOUT=300; START_TIME=$(date +%s); for worker in $(echo
\"
$LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./')
\"
| tr ',' ' '); do echo
\"
Waiting for DNS: $worker
\"
; until getent hosts $worker >/dev/null 2>&1; do CURRENT_TIME=$(date +%s); if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then echo
\"
ERROR: Timeout waiting for DNS: $worker
\"
; exit 1; fi; echo
\"
DNS not ready for $worker, retrying...
\"
; sleep 2; done; echo
\"
✓ DNS resolved: $worker
\"
; done; echo
\"
All workers DNS ready
\"
&& mpirun --allow-run-as-root --oversubscribe -n 2 -H $LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./') --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && TIMEOUT=300; START_TIME=$(date +%s); for worker in $(echo
\"
$LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./')
\"
| tr ',' ' '); do echo
\"
Waiting for DNS: $worker
\"
; until getent hosts $worker >/dev/null 2>&1; do CURRENT_TIME=$(date +%s); if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then echo
\"
ERROR: Timeout waiting for DNS: $worker
\"
; exit 1; fi; echo
\"
DNS not ready for $worker, retrying...
\"
; sleep 2; done; echo
\"
✓ DNS resolved: $worker
\"
; done; echo
\"
All workers DNS ready
\"
&& mpirun --allow-run-as-root --oversubscribe -n 2 -H $LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./') --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x
TRTLLM_USE_UCX_KVCACHE -x
USER bash -c 'trtllm-llmapi-launch python3 --model test'"
},
expectedEnv
:
[]
corev1
.
EnvVar
{
expectedEnv
:
[]
corev1
.
EnvVar
{
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
},
},
...
@@ -564,7 +564,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -564,7 +564,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"python3"
,
"--model"
,
"test"
},
initialArgs
:
[]
string
{
"python3"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python3 --model test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x
TRTLLM_USE_UCX_KVCACHE -x
USER bash -c 'trtllm-llmapi-launch python3 --model test'"
,
},
},
{
{
name
:
"Leader with command and no GPU resources"
,
name
:
"Leader with command and no GPU resources"
,
...
@@ -574,7 +574,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -574,7 +574,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
component
:
&
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{},
component
:
&
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{},
initialArgs
:
[]
string
{},
initialArgs
:
[]
string
{},
initialCommand
:
[]
string
{
"python"
,
"-m"
,
"worker"
},
initialCommand
:
[]
string
{
"python"
,
"-m"
,
"worker"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && TIMEOUT=300; START_TIME=$(date +%s); for worker in $(echo
\"
$LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./')
\"
| tr ',' ' '); do echo
\"
Waiting for DNS: $worker
\"
; until getent hosts $worker >/dev/null 2>&1; do CURRENT_TIME=$(date +%s); if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then echo
\"
ERROR: Timeout waiting for DNS: $worker
\"
; exit 1; fi; echo
\"
DNS not ready for $worker, retrying...
\"
; sleep 2; done; echo
\"
✓ DNS resolved: $worker
\"
; done; echo
\"
All workers DNS ready
\"
&& mpirun --allow-run-as-root --oversubscribe -n 0 -H $LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./') --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python -m worker'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && TIMEOUT=300; START_TIME=$(date +%s); for worker in $(echo
\"
$LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./')
\"
| tr ',' ' '); do echo
\"
Waiting for DNS: $worker
\"
; until getent hosts $worker >/dev/null 2>&1; do CURRENT_TIME=$(date +%s); if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then echo
\"
ERROR: Timeout waiting for DNS: $worker
\"
; exit 1; fi; echo
\"
DNS not ready for $worker, retrying...
\"
; sleep 2; done; echo
\"
✓ DNS resolved: $worker
\"
; done; echo
\"
All workers DNS ready
\"
&& mpirun --allow-run-as-root --oversubscribe -n 0 -H $LWS_LEADER_ADDRESS,$(echo
\"
$LWS_LEADER_ADDRESS
\"
| sed 's/
\\
./-1
\\
./') --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x
TRTLLM_USE_UCX_KVCACHE -x
USER bash -c 'trtllm-llmapi-launch python -m worker'"
,
},
},
{
{
name
:
"Leader with both command and args (shell command - args take precedence)"
,
name
:
"Leader with both command and args (shell command - args take precedence)"
,
...
@@ -590,7 +590,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -590,7 +590,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"launch"
,
"--config"
,
"test.yaml"
},
initialArgs
:
[]
string
{
"launch"
,
"--config"
,
"test.yaml"
},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch launch --config test.yaml'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x
TRTLLM_USE_UCX_KVCACHE -x
USER bash -c 'trtllm-llmapi-launch launch --config test.yaml'"
,
},
},
{
{
name
:
"Leader with python command and args (combined)"
,
name
:
"Leader with python command and args (combined)"
,
...
@@ -606,7 +606,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -606,7 +606,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialCommand
:
[]
string
{
"python3"
},
initialCommand
:
[]
string
{
"python3"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x
TRTLLM_USE_UCX_KVCACHE -x
USER bash -c 'trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path test'"
,
},
},
{
{
name
:
"Leader with python module command and separate args"
,
name
:
"Leader with python module command and separate args"
,
...
@@ -622,7 +622,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -622,7 +622,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"--model-path"
,
"Qwen/Qwen3-0.6B"
,
"--served-model-name"
,
"Qwen/Qwen3-0.6B"
,
"--disaggregation-mode"
,
"prefill"
},
initialArgs
:
[]
string
{
"--model-path"
,
"Qwen/Qwen3-0.6B"
,
"--served-model-name"
,
"Qwen/Qwen3-0.6B"
,
"--disaggregation-mode"
,
"prefill"
},
initialCommand
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.trtllm"
},
initialCommand
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.trtllm"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --disaggregation-mode prefill'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x
TRTLLM_USE_UCX_KVCACHE -x
USER bash -c 'trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --disaggregation-mode prefill'"
,
},
},
{
{
name
:
"Leader with absolute path python command"
,
name
:
"Leader with absolute path python command"
,
...
@@ -638,7 +638,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -638,7 +638,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.trtllm"
,
"--model-path"
,
"test"
},
initialCommand
:
[]
string
{
"/usr/bin/python3.8"
},
initialCommand
:
[]
string
{
"/usr/bin/python3.8"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch /usr/bin/python3.8 -m dynamo.trtllm --model-path test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x
TRTLLM_USE_UCX_KVCACHE -x
USER bash -c 'trtllm-llmapi-launch /usr/bin/python3.8 -m dynamo.trtllm --model-path test'"
,
},
},
{
{
name
:
"Leader with all environment variables forwarded"
,
name
:
"Leader with all environment variables forwarded"
,
...
@@ -654,7 +654,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -654,7 +654,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch serve --model test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x
TRTLLM_USE_UCX_KVCACHE -x
USER bash -c 'trtllm-llmapi-launch serve --model test'"
,
},
},
{
{
name
:
"Leader with overlapping environment variables (deduplication test)"
,
name
:
"Leader with overlapping environment variables (deduplication test)"
,
...
@@ -670,7 +670,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -670,7 +670,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x CUSTOM_VAR -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'trtllm-llmapi-launch serve --model test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --allow-run-as-root --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x CUSTOM_VAR -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x
TRTLLM_USE_UCX_KVCACHE -x
USER bash -c 'trtllm-llmapi-launch serve --model test'"
,
},
},
}
}
...
...
Write
Preview
Markdown is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment