Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
9b893c93
Unverified
Commit
9b893c93
authored
Sep 23, 2025
by
Julien Mancuso
Committed by
GitHub
Sep 23, 2025
Browse files
fix: improve sglang multinode handling in operator (#3151)
Signed-off-by:
Julien Mancuso
<
jmancuso@nvidia.com
>
parent
c1907d12
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
416 additions
and
76 deletions
+416
-76
deploy/cloud/operator/internal/dynamo/backend_sglang.go
deploy/cloud/operator/internal/dynamo/backend_sglang.go
+72
-11
deploy/cloud/operator/internal/dynamo/backend_sglang_test.go
deploy/cloud/operator/internal/dynamo/backend_sglang_test.go
+308
-31
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
+13
-13
deploy/cloud/operator/internal/dynamo/backend_vllm_test.go
deploy/cloud/operator/internal/dynamo/backend_vllm_test.go
+4
-4
deploy/cloud/operator/internal/dynamo/graph.go
deploy/cloud/operator/internal/dynamo/graph.go
+1
-1
deploy/cloud/operator/internal/dynamo/graph_test.go
deploy/cloud/operator/internal/dynamo/graph_test.go
+8
-8
deploy/cloud/operator/internal/dynamo/grove.go
deploy/cloud/operator/internal/dynamo/grove.go
+5
-4
deploy/cloud/operator/internal/dynamo/lws.go
deploy/cloud/operator/internal/dynamo/lws.go
+5
-4
No files found.
deploy/cloud/operator/internal/dynamo/backend_sglang.go
View file @
9b893c93
...
@@ -15,6 +15,16 @@ const (
...
@@ -15,6 +15,16 @@ const (
type
SGLangBackend
struct
{}
type
SGLangBackend
struct
{}
// pythonCommandPattern matches the bare interpreter name "python", optionally
// followed by a dotted numeric version (python3, python3.11, python2.7, ...).
// It is compiled once at package initialisation rather than on every call.
var pythonCommandPattern = regexp.MustCompile(`^python(\d+(\.\d+)*)?$`)

// isPythonCommand reports whether cmd is the name of a Python interpreter.
//
// Accepted forms are exactly "python", "pythonN" and "pythonN.M[.P...]".
// The pattern is anchored at both ends, so names with extra characters such
// as "python-config", "pythonx" or "python " are rejected, as is the empty
// string. Path-qualified commands (for example "/usr/bin/python3") do not
// match either, because the match must start at "python".
func isPythonCommand(cmd string) bool {
	return pythonCommandPattern.MatchString(cmd)
}
func
(
b
*
SGLangBackend
)
UpdateContainer
(
container
*
corev1
.
Container
,
numberOfNodes
int32
,
role
Role
,
component
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
,
serviceName
string
,
multinodeDeployer
MultinodeDeployer
)
{
func
(
b
*
SGLangBackend
)
UpdateContainer
(
container
*
corev1
.
Container
,
numberOfNodes
int32
,
role
Role
,
component
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
,
serviceName
string
,
multinodeDeployer
MultinodeDeployer
)
{
// For single node, nothing to do
// For single node, nothing to do
if
numberOfNodes
<=
1
{
if
numberOfNodes
<=
1
{
...
@@ -29,16 +39,60 @@ func (b *SGLangBackend) UpdateContainer(container *corev1.Container, numberOfNod
...
@@ -29,16 +39,60 @@ func (b *SGLangBackend) UpdateContainer(container *corev1.Container, numberOfNod
}
}
// Generate the flags to add
// Generate the flags to add
flags
:=
b
.
getMultinodeFlags
(
numberOfNodes
,
role
,
serviceName
,
multinodeDeployer
)
flags
,
needsShell
:=
b
.
getMultinodeFlags
(
numberOfNodes
,
role
,
serviceName
,
multinodeDeployer
)
if
flags
==
""
{
if
flags
==
""
{
return
return
}
}
// Flatten all args into a single command and inject flags
/*
if
len
(
container
.
Args
)
>
0
{
* Flag Injection Strategy for Multinode SGLang Deployments
fullCommand
:=
strings
.
Join
(
container
.
Args
,
" "
)
*
modifiedCommand
:=
b
.
injectFlagsIntoPythonCommand
(
fullCommand
,
flags
)
* This code handles the injection of distributed training flags (--dist-init-addr, --nnodes, --node-rank)
container
.
Args
=
[]
string
{
modifiedCommand
}
* into container commands for multinode SGLang deployments. The complexity arises from supporting multiple
* container command patterns and ensuring proper environment variable interpretation.
*
* Two main scenarios are handled:
*
* 1. Direct Python Command (e.g., Command: ["python3"], Args: ["-m", "sglang", "..."])
* - If shell interpretation is needed (for env vars): Wrap in "sh -c" with exec
* - If no shell needed: Simply append flags to the Args array
*
* 2. Non-Python Command (e.g., Command: ["sh"], Args: ["-c", "python3 -m sglang ..."])
* - Use regex-based injection to find embedded Python+SGLang commands within args
* - Insert flags after the Python command but before any shell operators (|, &, ;)
*
* The needsShell flag indicates when environment variables require shell interpretation
*/
if
len
(
container
.
Command
)
>
0
&&
isPythonCommand
(
container
.
Command
[
0
])
{
// Direct python command case
if
needsShell
{
// Transform to shell wrapper for env var interpretation
fullCommand
:=
strings
.
Join
(
container
.
Command
,
" "
)
originalArgs
:=
strings
.
Join
(
container
.
Args
,
" "
)
var
shellCommand
string
if
len
(
container
.
Args
)
>
0
{
// Use exec to ensure PID 1 is given to the python command
shellCommand
=
fmt
.
Sprintf
(
"exec %s %s %s"
,
fullCommand
,
originalArgs
,
flags
)
}
else
{
// Use exec to ensure PID 1 is given to the python command
shellCommand
=
fmt
.
Sprintf
(
"exec %s %s"
,
fullCommand
,
flags
)
}
container
.
Command
=
[]
string
{
"sh"
,
"-c"
}
container
.
Args
=
[]
string
{
shellCommand
}
}
else
{
// Simple append to args
flagsSlice
:=
strings
.
Fields
(
flags
)
container
.
Args
=
append
(
container
.
Args
,
flagsSlice
...
)
}
}
else
{
// Non-python command case - try injection on each arg individually
for
i
,
arg
:=
range
container
.
Args
{
modifiedArg
:=
b
.
injectFlagsIntoPythonCommand
(
arg
,
flags
)
if
modifiedArg
!=
arg
{
// flags were successfully injected
container
.
Args
[
i
]
=
modifiedArg
break
// stop after first successful injection
}
}
}
}
}
}
...
@@ -46,15 +100,22 @@ func (b *SGLangBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int
...
@@ -46,15 +100,22 @@ func (b *SGLangBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int
// do nothing
// do nothing
}
}
// getMultinodeFlags returns the multinode flags a
s a single string
// getMultinodeFlags returns the multinode flags a
nd whether shell interpretation is needed
func
(
b
*
SGLangBackend
)
getMultinodeFlags
(
numberOfNodes
int32
,
role
Role
,
serviceName
string
,
multinodeDeployer
MultinodeDeployer
)
string
{
func
(
b
*
SGLangBackend
)
getMultinodeFlags
(
numberOfNodes
int32
,
role
Role
,
serviceName
string
,
multinodeDeployer
MultinodeDeployer
)
(
string
,
bool
)
{
distInitAddr
:=
fmt
.
Sprintf
(
"%s:%s"
,
multinodeDeployer
.
GetLeaderHostname
(
serviceName
),
SglangPort
)
distInitAddr
:=
fmt
.
Sprintf
(
"%s:%s"
,
multinodeDeployer
.
GetLeaderHostname
(
serviceName
),
SglangPort
)
nodeRank
:=
multinodeDeployer
.
GetNodeRank
()
// Determine node-rank
var
nodeRank
string
var
needsShell
bool
if
role
==
RoleLeader
{
if
role
==
RoleLeader
{
nodeRank
=
"0"
nodeRank
=
"0"
needsShell
=
false
}
else
{
nodeRank
,
needsShell
=
multinodeDeployer
.
GetNodeRank
()
}
}
return
fmt
.
Sprintf
(
"--dist-init-addr %s --nnodes %d --node-rank %s"
,
distInitAddr
,
numberOfNodes
,
nodeRank
)
flags
:=
fmt
.
Sprintf
(
"--dist-init-addr %s --nnodes %d --node-rank %s"
,
distInitAddr
,
numberOfNodes
,
nodeRank
)
return
flags
,
needsShell
}
}
// injectFlagsIntoPythonCommand finds python sglang commands and adds flags after them
// injectFlagsIntoPythonCommand finds python sglang commands and adds flags after them
...
...
deploy/cloud/operator/internal/dynamo/backend_sglang_test.go
View file @
9b893c93
...
@@ -8,7 +8,182 @@ import (
...
@@ -8,7 +8,182 @@ import (
corev1
"k8s.io/api/core/v1"
corev1
"k8s.io/api/core/v1"
)
)
func
TestSGLangBackend_DirectFlagInjection
(
t
*
testing
.
T
)
{
// MockSimpleDeployer is a mock MultinodeDeployer for tests in which every
// returned value is a plain literal, so no shell interpretation is needed.
type MockSimpleDeployer struct{}

// GetLeaderHostname returns a fixed leader hostname; serviceName is ignored.
func (m *MockSimpleDeployer) GetLeaderHostname(serviceName string) string {
	return "leader.example.com"
}

// GetHostNames returns numberOfNodes hostnames: the leader hostname first,
// followed by "worker<i>.example.com" for i = 1..numberOfNodes-1.
//
// It returns nil when numberOfNodes is not positive instead of panicking while
// filling the leader slot.
//
// NOTE: the worker index is rendered as a single rune offset from '0', so only
// indices 1-9 produce a decimal digit.
func (m *MockSimpleDeployer) GetHostNames(serviceName string, numberOfNodes int32) []string {
	if numberOfNodes <= 0 {
		return nil
	}
	hostnames := make([]string, numberOfNodes)
	hostnames[0] = m.GetLeaderHostname(serviceName)
	for i := int32(1); i < numberOfNodes; i++ {
		hostnames[i] = "worker" + string(rune('0'+i)) + ".example.com"
	}
	return hostnames
}

// GetNodeRank returns a literal rank of "1" together with false, signalling
// that the rank needs no shell interpretation.
func (m *MockSimpleDeployer) GetNodeRank() (string, bool) {
	return "1", false
}
// MockShellDeployer is a mock MultinodeDeployer for tests in which the
// returned values are "$(NAME)" placeholders and the node rank is reported as
// needing shell interpretation.
type MockShellDeployer struct{}

// GetLeaderHostname returns the placeholder "$(LEADER_HOST)"; serviceName is
// ignored.
func (m *MockShellDeployer) GetLeaderHostname(serviceName string) string {
	return "$(LEADER_HOST)"
}

// GetHostNames returns numberOfNodes placeholders: the leader placeholder
// first, followed by "$(WORKER_<i>_HOST)" for i = 1..numberOfNodes-1.
//
// It returns nil when numberOfNodes is not positive instead of panicking while
// filling the leader slot.
//
// NOTE: the worker index is rendered as a single rune offset from '0', so only
// indices 1-9 produce a decimal digit.
func (m *MockShellDeployer) GetHostNames(serviceName string, numberOfNodes int32) []string {
	if numberOfNodes <= 0 {
		return nil
	}
	hostnames := make([]string, numberOfNodes)
	hostnames[0] = m.GetLeaderHostname(serviceName)
	for i := int32(1); i < numberOfNodes; i++ {
		hostnames[i] = "$(WORKER_" + string(rune('0'+i)) + "_HOST)"
	}
	return hostnames
}

// GetNodeRank returns the placeholder "$(WORKER_INDEX)" together with true,
// signalling that the rank needs shell interpretation.
func (m *MockShellDeployer) GetNodeRank() (string, bool) {
	return "$(WORKER_INDEX)", true
}
func
TestSGLangBackend_PythonCommandInjection
(
t
*
testing
.
T
)
{
backend
:=
&
SGLangBackend
{}
tests
:=
[]
struct
{
name
string
numberOfNodes
int32
role
Role
multinodeDeployer
MultinodeDeployer
initialCommand
[]
string
initialArgs
[]
string
expectedCommand
[]
string
expectedArgs
[]
string
description
string
}{
{
name
:
"single node python command no changes"
,
numberOfNodes
:
1
,
role
:
RoleMain
,
multinodeDeployer
:
&
MockSimpleDeployer
{},
initialCommand
:
[]
string
{
"python3"
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.sglang.worker"
},
expectedCommand
:
[]
string
{
"python3"
},
expectedArgs
:
[]
string
{
"-m"
,
"dynamo.sglang.worker"
},
description
:
"Single node should not modify python commands"
,
},
{
name
:
"python command simple deployer - direct append"
,
numberOfNodes
:
2
,
role
:
RoleWorker
,
multinodeDeployer
:
&
MockSimpleDeployer
{},
initialCommand
:
[]
string
{
"python3"
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.sglang.worker"
,
"--model"
,
"llama"
},
expectedCommand
:
[]
string
{
"python3"
},
expectedArgs
:
[]
string
{
"-m"
,
"dynamo.sglang.worker"
,
"--model"
,
"llama"
,
"--dist-init-addr"
,
"leader.example.com:29500"
,
"--nnodes"
,
"2"
,
"--node-rank"
,
"1"
},
description
:
"Direct python command with simple deployer should append flags"
,
},
{
name
:
"python command shell deployer - shell wrapping"
,
numberOfNodes
:
2
,
role
:
RoleWorker
,
multinodeDeployer
:
&
MockShellDeployer
{},
initialCommand
:
[]
string
{
"python3"
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.sglang.worker"
,
"--model"
,
"llama"
},
expectedCommand
:
[]
string
{
"sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"exec python3 -m dynamo.sglang.worker --model llama --dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank $(WORKER_INDEX)"
},
description
:
"Direct python command with shell deployer should wrap with sh -c exec"
,
},
{
name
:
"python command leader role - always simple"
,
numberOfNodes
:
3
,
role
:
RoleLeader
,
multinodeDeployer
:
&
MockShellDeployer
{},
initialCommand
:
[]
string
{
"python"
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.sglang.worker"
},
expectedCommand
:
[]
string
{
"python"
},
expectedArgs
:
[]
string
{
"-m"
,
"dynamo.sglang.worker"
,
"--dist-init-addr"
,
"$(LEADER_HOST):29500"
,
"--nnodes"
,
"3"
,
"--node-rank"
,
"0"
},
description
:
"Leader role should never use shell wrapping"
,
},
{
name
:
"python3.11 variant supported"
,
numberOfNodes
:
2
,
role
:
RoleWorker
,
multinodeDeployer
:
&
MockSimpleDeployer
{},
initialCommand
:
[]
string
{
"python3.11"
},
initialArgs
:
[]
string
{
"-m"
,
"dynamo.sglang.worker"
},
expectedCommand
:
[]
string
{
"python3.11"
},
expectedArgs
:
[]
string
{
"-m"
,
"dynamo.sglang.worker"
,
"--dist-init-addr"
,
"leader.example.com:29500"
,
"--nnodes"
,
"2"
,
"--node-rank"
,
"1"
},
description
:
"Python version variants should be recognized"
,
},
{
name
:
"python command with module in command array - simple deployer"
,
numberOfNodes
:
2
,
role
:
RoleWorker
,
multinodeDeployer
:
&
MockSimpleDeployer
{},
initialCommand
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.sglang"
},
initialArgs
:
[]
string
{
"--model-path"
,
"Qwen/Qwen3-0.6B"
,
"--tp-size"
,
"8"
},
expectedCommand
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.sglang"
},
expectedArgs
:
[]
string
{
"--model-path"
,
"Qwen/Qwen3-0.6B"
,
"--tp-size"
,
"8"
,
"--dist-init-addr"
,
"leader.example.com:29500"
,
"--nnodes"
,
"2"
,
"--node-rank"
,
"1"
},
description
:
"Multi-element python command should have flags appended to args"
,
},
{
name
:
"python command with module in command array - shell deployer"
,
numberOfNodes
:
2
,
role
:
RoleWorker
,
multinodeDeployer
:
&
MockShellDeployer
{},
initialCommand
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.sglang"
},
initialArgs
:
[]
string
{
"--model-path"
,
"Qwen/Qwen3-0.6B"
},
expectedCommand
:
[]
string
{
"sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"exec python3 -m dynamo.sglang --model-path Qwen/Qwen3-0.6B --dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank $(WORKER_INDEX)"
},
description
:
"Multi-element python command with shell deployer should wrap entire command"
,
},
{
name
:
"python command with no args - shell deployer"
,
numberOfNodes
:
2
,
role
:
RoleWorker
,
multinodeDeployer
:
&
MockShellDeployer
{},
initialCommand
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.sglang"
},
initialArgs
:
[]
string
{},
expectedCommand
:
[]
string
{
"sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"exec python3 -m dynamo.sglang --dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank $(WORKER_INDEX)"
},
description
:
"Multi-element python command with no args should still work with shell wrapper"
,
},
{
name
:
"non-python command multinode unchanged"
,
numberOfNodes
:
2
,
role
:
RoleWorker
,
multinodeDeployer
:
&
MockShellDeployer
{},
initialCommand
:
[]
string
{
"java"
},
initialArgs
:
[]
string
{
"-jar"
,
"app.jar"
},
expectedCommand
:
[]
string
{
"java"
},
expectedArgs
:
[]
string
{
"-jar"
,
"app.jar"
},
// Args remain separate, no python found, no changes
description
:
"Non-python commands should remain unchanged (no flattening)"
,
},
}
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
container
:=
&
corev1
.
Container
{
Command
:
append
([]
string
{},
tt
.
initialCommand
...
),
Args
:
append
([]
string
{},
tt
.
initialArgs
...
),
}
backend
.
UpdateContainer
(
container
,
tt
.
numberOfNodes
,
tt
.
role
,
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{},
"test-service"
,
tt
.
multinodeDeployer
)
if
!
reflect
.
DeepEqual
(
container
.
Command
,
tt
.
expectedCommand
)
{
t
.
Errorf
(
"UpdateContainer() command = %v, want %v"
,
container
.
Command
,
tt
.
expectedCommand
)
}
if
!
reflect
.
DeepEqual
(
container
.
Args
,
tt
.
expectedArgs
)
{
t
.
Errorf
(
"UpdateContainer() args = %v, want %v"
,
container
.
Args
,
tt
.
expectedArgs
)
}
})
}
}
func
TestSGLangBackend_ShellCommandInjection
(
t
*
testing
.
T
)
{
backend
:=
&
SGLangBackend
{}
backend
:=
&
SGLangBackend
{}
tests
:=
[]
struct
{
tests
:=
[]
struct
{
...
@@ -16,88 +191,108 @@ func TestSGLangBackend_DirectFlagInjection(t *testing.T) {
...
@@ -16,88 +191,108 @@ func TestSGLangBackend_DirectFlagInjection(t *testing.T) {
numberOfNodes
int32
numberOfNodes
int32
role
Role
role
Role
multinodeDeployer
MultinodeDeployer
multinodeDeployer
MultinodeDeployer
initialCommand
[]
string
initialArgs
[]
string
initialArgs
[]
string
expectedArgs
[]
string
expectedArgs
[]
string
description
string
description
string
}{
}{
{
{
name
:
"single node
does
not modif
y args
"
,
name
:
"single node
shell command
not modif
ied
"
,
numberOfNodes
:
1
,
numberOfNodes
:
1
,
role
:
RoleMain
,
role
:
RoleMain
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
description
:
"Single node should not modify
anything
"
,
description
:
"Single node should not modify
shell commands
"
,
},
},
{
{
name
:
"multinode
adds flags to simple python command
"
,
name
:
"multinode
shell command with regex injection
"
,
numberOfNodes
:
2
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
role
:
RoleLeader
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
:29500 --nnodes 2 --node-rank 0"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
:29500 --nnodes 2 --node-rank 0"
},
description
:
"Sh
ould add multinode flags directly to
python command"
,
description
:
"Sh
ell commands should use regex injection for
python command
s
"
,
},
},
{
{
name
:
"multinode with complex
command
"
,
name
:
"multinode
shell command
with complex
pipeline
"
,
numberOfNodes
:
2
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
role
:
RoleLeader
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
initialArgs
:
[]
string
{
"echo blah | wc -l && python -m dynamo.sglang.worker && ls -al"
},
initialArgs
:
[]
string
{
"echo blah | wc -l && python -m dynamo.sglang.worker && ls -al"
},
expectedArgs
:
[]
string
{
"echo blah | wc -l && python -m dynamo.sglang.worker --dist-init-addr $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
:29500 --nnodes 2 --node-rank 0 && ls -al"
},
expectedArgs
:
[]
string
{
"echo blah | wc -l && python -m dynamo.sglang.worker --dist-init-addr $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
:29500 --nnodes 2 --node-rank 0 && ls -al"
},
description
:
"
Should add
flags only to python
command, not other commands
"
,
description
:
"
Complex shell commands should inject
flags only
in
to python
part
"
,
},
},
{
{
name
:
"
multinode
worker with
G
rove
deployment
"
,
name
:
"
shell command
worker with
g
rove
env vars
"
,
numberOfNodes
:
3
,
numberOfNodes
:
3
,
role
:
RoleWorker
,
role
:
RoleWorker
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
:29500 --nnodes 3 --node-rank $((GROVE_PCLQ_POD_INDEX + 1))"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
:29500 --nnodes 3 --node-rank $((GROVE_PCLQ_POD_INDEX + 1))"
},
description
:
"
W
orker should get
correct
node rank"
,
description
:
"
Shell command w
orker should get
grove env vars in
node rank"
,
},
},
{
{
name
:
"
LWS deployment uses correct address
"
,
name
:
"
shell command with LWS deployer
"
,
numberOfNodes
:
2
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
role
:
RoleLeader
,
multinodeDeployer
:
&
LWSMultinodeDeployer
{},
multinodeDeployer
:
&
LWSMultinodeDeployer
{},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr $
{
LWS_LEADER_ADDRESS
}
:29500 --nnodes 2 --node-rank 0"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr $
(
LWS_LEADER_ADDRESS
)
:29500 --nnodes 2 --node-rank 0"
},
description
:
"LWS
deployment
should use LWS
_LEADER_ADDRESS
"
,
description
:
"LWS
shell commands
should use LWS
variables
"
,
},
},
{
{
name
:
"command with pipes
gets flags before pipe
"
,
name
:
"
shell
command with pipes"
,
numberOfNodes
:
2
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
role
:
RoleLeader
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker | tee /tmp/log"
},
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker | tee /tmp/log"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
:29500 --nnodes 2 --node-rank 0 | tee /tmp/log"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
:29500 --nnodes 2 --node-rank 0 | tee /tmp/log"
},
description
:
"Should in
ser
t flags before pipe
operator
"
,
description
:
"Sh
ell commands with pipes sh
ould in
jec
t flags before pipe"
,
},
},
{
{
name
:
"
multiple args are flattened and processed together
"
,
name
:
"
shell command multiple args individual processing
"
,
numberOfNodes
:
2
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
role
:
RoleLeader
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
initialArgs
:
[]
string
{
"echo start"
,
"python -m dynamo.sglang.worker"
,
"echo done"
},
initialArgs
:
[]
string
{
"echo start"
,
"python -m dynamo.sglang.worker"
,
"echo done"
},
expectedArgs
:
[]
string
{
"echo start
python -m dynamo.sglang.worker --dist-init-addr $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
:29500 --nnodes 2 --node-rank 0
echo done"
},
expectedArgs
:
[]
string
{
"echo start
"
,
"
python -m dynamo.sglang.worker --dist-init-addr $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
:29500 --nnodes 2 --node-rank 0
"
,
"
echo done"
},
description
:
"
Multiple args should be flattened and python command gets flags
"
,
description
:
"
Shell commands with multiple args should process each individually, modify only the python arg
"
,
},
},
{
{
name
:
"
no sglang command means flattened but no
change
s
"
,
name
:
"
shell command no sglang modules un
change
d
"
,
numberOfNodes
:
2
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
role
:
RoleLeader
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
initialArgs
:
[]
string
{
"echo hello"
,
"python -m some.other.module"
},
initialArgs
:
[]
string
{
"echo hello"
,
"python -m some.other.module"
},
expectedArgs
:
[]
string
{
"echo hello python -m some.other.module"
},
expectedArgs
:
[]
string
{
"echo hello"
,
"python -m some.other.module"
},
description
:
"Non-sglang commands should be flattened but not modified"
,
description
:
"Shell commands without sglang modules should remain unchanged (args stay separate)"
,
},
{
name
:
"shell command stops after first python injection"
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
initialCommand
:
[]
string
{
"sh"
,
"-c"
},
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
,
"python -m dynamo.sglang.worker --other-flags"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE):29500 --nnodes 2 --node-rank 0"
,
"python -m dynamo.sglang.worker --other-flags"
},
description
:
"Should stop processing after first successful python flag injection"
,
},
},
}
}
for
_
,
tt
:=
range
tests
{
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
container
:=
&
corev1
.
Container
{
container
:=
&
corev1
.
Container
{
Args
:
append
([]
string
{},
tt
.
initialArgs
...
),
Command
:
append
([]
string
{},
tt
.
initialCommand
...
),
Args
:
append
([]
string
{},
tt
.
initialArgs
...
),
}
}
backend
.
UpdateContainer
(
container
,
tt
.
numberOfNodes
,
tt
.
role
,
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{},
"test-service"
,
tt
.
multinodeDeployer
)
backend
.
UpdateContainer
(
container
,
tt
.
numberOfNodes
,
tt
.
role
,
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{},
"test-service"
,
tt
.
multinodeDeployer
)
...
@@ -106,14 +301,96 @@ func TestSGLangBackend_DirectFlagInjection(t *testing.T) {
...
@@ -106,14 +301,96 @@ func TestSGLangBackend_DirectFlagInjection(t *testing.T) {
t
.
Errorf
(
"UpdateContainer() args = %v, want %v"
,
container
.
Args
,
tt
.
expectedArgs
)
t
.
Errorf
(
"UpdateContainer() args = %v, want %v"
,
container
.
Args
,
tt
.
expectedArgs
)
}
}
// Verify no environment variables were added
// Verify command is still sh -c for shell commands
if
len
(
container
.
Env
)
>
0
{
expectedCommand
:=
tt
.
initialCommand
t
.
Errorf
(
"UpdateContainer() should not add environment variables, but added: %v"
,
container
.
Env
)
if
!
reflect
.
DeepEqual
(
container
.
Command
,
expectedCommand
)
{
t
.
Errorf
(
"UpdateContainer() should preserve shell command, got: %v, want: %v"
,
container
.
Command
,
expectedCommand
)
}
})
}
}
func
TestIsPythonCommand
(
t
*
testing
.
T
)
{
tests
:=
[]
struct
{
cmd
string
expected
bool
}{
{
"python"
,
true
},
{
"python3"
,
true
},
{
"python2"
,
true
},
{
"python3.11"
,
true
},
{
"python2.7"
,
true
},
{
"python3.12.1"
,
true
},
{
"java"
,
false
},
{
"sh"
,
false
},
{
"node"
,
false
},
{
"python-config"
,
false
},
// hyphen makes it not a python interpreter
{
""
,
false
},
{
"python "
,
false
},
// space makes it invalid
{
"pythonx"
,
false
},
// extra characters
}
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
cmd
,
func
(
t
*
testing
.
T
)
{
result
:=
isPythonCommand
(
tt
.
cmd
)
if
result
!=
tt
.
expected
{
t
.
Errorf
(
"isPythonCommand(%q) = %v, want %v"
,
tt
.
cmd
,
result
,
tt
.
expected
)
}
})
}
}
func
TestSGLangBackend_GetMultinodeFlags
(
t
*
testing
.
T
)
{
backend
:=
&
SGLangBackend
{}
tests
:=
[]
struct
{
name
string
numberOfNodes
int32
role
Role
multinodeDeployer
MultinodeDeployer
expectedFlags
string
expectedNeedsShell
bool
description
string
}{
{
name
:
"leader role never needs shell"
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
multinodeDeployer
:
&
MockShellDeployer
{},
expectedFlags
:
"--dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank 0"
,
expectedNeedsShell
:
false
,
description
:
"Leader should always use rank 0 and no shell interpretation"
,
},
{
name
:
"worker with simple deployer"
,
numberOfNodes
:
3
,
role
:
RoleWorker
,
multinodeDeployer
:
&
MockSimpleDeployer
{},
expectedFlags
:
"--dist-init-addr leader.example.com:29500 --nnodes 3 --node-rank 1"
,
expectedNeedsShell
:
false
,
description
:
"Simple deployer should not need shell interpretation"
,
},
{
name
:
"worker with shell deployer"
,
numberOfNodes
:
2
,
role
:
RoleWorker
,
multinodeDeployer
:
&
MockShellDeployer
{},
expectedFlags
:
"--dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank $(WORKER_INDEX)"
,
expectedNeedsShell
:
true
,
description
:
"Shell deployer should need shell interpretation for workers"
,
},
}
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
flags
,
needsShell
:=
backend
.
getMultinodeFlags
(
tt
.
numberOfNodes
,
tt
.
role
,
"test-service"
,
tt
.
multinodeDeployer
)
if
flags
!=
tt
.
expectedFlags
{
t
.
Errorf
(
"getMultinodeFlags() flags = %q, want %q"
,
flags
,
tt
.
expectedFlags
)
}
}
// Verify command was not changed
if
needsShell
!=
tt
.
expectedNeedsShell
{
if
len
(
container
.
Command
)
>
0
{
t
.
Errorf
(
"getMultinodeFlags() needsShell = %v, want %v"
,
needsShell
,
tt
.
expectedNeedsShell
)
t
.
Errorf
(
"UpdateContainer() should not modify command, but set: %v"
,
container
.
Command
)
}
}
})
})
}
}
...
...
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
View file @
9b893c93
...
@@ -60,7 +60,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
...
@@ -60,7 +60,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{
Name
:
commonconsts
.
MpiRunSshSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
{
Name
:
commonconsts
.
MpiRunSshSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
},
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 6 -H $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
,$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-wkr-0.$
{
GROVE_HEADLESS_SERVICE
}
,$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-wkr-1.$
{
GROVE_HEADLESS_SERVICE
}
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 --model test'"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 6 -H $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
,$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-wkr-0.$
(
GROVE_HEADLESS_SERVICE
)
,$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-wkr-1.$
(
GROVE_HEADLESS_SERVICE
)
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 --model test'"
},
expectedEnv
:
[]
corev1
.
EnvVar
{
expectedEnv
:
[]
corev1
.
EnvVar
{
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
},
},
...
@@ -116,7 +116,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
...
@@ -116,7 +116,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{
Name
:
commonconsts
.
MpiRunSshSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
{
Name
:
commonconsts
.
MpiRunSshSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
},
},
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedCommand
:
[]
string
{
"/bin/sh"
,
"-c"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $
{
LWS_LEADER_ADDRESS
}
,$
{
LWS_WORKER_1_ADDRESS
}
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 --model test'"
},
expectedArgs
:
[]
string
{
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $
(
LWS_LEADER_ADDRESS
)
,$
(
LWS_WORKER_1_ADDRESS
)
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 --model test'"
},
expectedEnv
:
[]
corev1
.
EnvVar
{
expectedEnv
:
[]
corev1
.
EnvVar
{
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
},
},
},
...
@@ -415,8 +415,8 @@ func TestTRTLLMBackend_generateWorkerHostnames(t *testing.T) {
...
@@ -415,8 +415,8 @@ func TestTRTLLMBackend_generateWorkerHostnames(t *testing.T) {
multinodeDeployer
:
&
LWSMultinodeDeployer
{},
multinodeDeployer
:
&
LWSMultinodeDeployer
{},
serviceName
:
"test-service"
,
serviceName
:
"test-service"
,
expectedContains
:
[]
string
{
expectedContains
:
[]
string
{
"$
{
LWS_LEADER_ADDRESS
}
"
,
"$
(
LWS_LEADER_ADDRESS
)
"
,
"$
{
LWS_WORKER_1_ADDRESS
}
"
,
"$
(
LWS_WORKER_1_ADDRESS
)
"
,
},
},
expectedNodeCount
:
2
,
expectedNodeCount
:
2
,
},
},
...
@@ -440,10 +440,10 @@ func TestTRTLLMBackend_generateWorkerHostnames(t *testing.T) {
...
@@ -440,10 +440,10 @@ func TestTRTLLMBackend_generateWorkerHostnames(t *testing.T) {
multinodeDeployer
:
&
LWSMultinodeDeployer
{},
multinodeDeployer
:
&
LWSMultinodeDeployer
{},
serviceName
:
"worker"
,
serviceName
:
"worker"
,
expectedContains
:
[]
string
{
expectedContains
:
[]
string
{
"$
{
LWS_LEADER_ADDRESS
}
"
,
"$
(
LWS_LEADER_ADDRESS
)
"
,
"$
{
LWS_WORKER_1_ADDRESS
}
"
,
"$
(
LWS_WORKER_1_ADDRESS
)
"
,
"$
{
LWS_WORKER_2_ADDRESS
}
"
,
"$
(
LWS_WORKER_2_ADDRESS
)
"
,
"$
{
LWS_WORKER_3_ADDRESS
}
"
,
"$
(
LWS_WORKER_3_ADDRESS
)
"
,
},
},
expectedNodeCount
:
4
,
expectedNodeCount
:
4
,
},
},
...
@@ -563,7 +563,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -563,7 +563,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"python3"
,
"--model"
,
"test"
},
initialArgs
:
[]
string
{
"python3"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 6 -H $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
,$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-wkr-0.$
{
GROVE_HEADLESS_SERVICE
}
,$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-wkr-1.$
{
GROVE_HEADLESS_SERVICE
}
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 --model test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 6 -H $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
,$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-wkr-0.$
(
GROVE_HEADLESS_SERVICE
)
,$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-wkr-1.$
(
GROVE_HEADLESS_SERVICE
)
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 --model test'"
,
},
},
{
{
name
:
"Leader with command and no GPU resources"
,
name
:
"Leader with command and no GPU resources"
,
...
@@ -573,7 +573,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -573,7 +573,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{},
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{},
initialArgs
:
[]
string
{},
initialArgs
:
[]
string
{},
initialCommand
:
[]
string
{
"python"
,
"-m"
,
"worker"
},
initialCommand
:
[]
string
{
"python"
,
"-m"
,
"worker"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 0 -H $
{
LWS_LEADER_ADDRESS
}
,$
{
LWS_WORKER_1_ADDRESS
}
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python -m worker'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 0 -H $
(
LWS_LEADER_ADDRESS
)
,$
(
LWS_WORKER_1_ADDRESS
)
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python -m worker'"
,
},
},
{
{
name
:
"Leader with both command and args (args take precedence)"
,
name
:
"Leader with both command and args (args take precedence)"
,
...
@@ -591,7 +591,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -591,7 +591,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"launch"
,
"--config"
,
"test.yaml"
},
initialArgs
:
[]
string
{
"launch"
,
"--config"
,
"test.yaml"
},
initialCommand
:
[]
string
{
"ignored-command"
},
initialCommand
:
[]
string
{
"ignored-command"
},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
,$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-wkr-0.$
{
GROVE_HEADLESS_SERVICE
}
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch launch --config test.yaml'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
,$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-wkr-0.$
(
GROVE_HEADLESS_SERVICE
)
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch launch --config test.yaml'"
,
},
},
{
{
name
:
"Leader with all environment variables forwarded"
,
name
:
"Leader with all environment variables forwarded"
,
...
@@ -609,7 +609,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -609,7 +609,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
,$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-wkr-0.$
{
GROVE_HEADLESS_SERVICE
}
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch serve --model test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
,$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-wkr-0.$
(
GROVE_HEADLESS_SERVICE
)
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch serve --model test'"
,
},
},
{
{
name
:
"Leader with overlapping environment variables (deduplication test)"
,
name
:
"Leader with overlapping environment variables (deduplication test)"
,
...
@@ -627,7 +627,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
...
@@ -627,7 +627,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
},
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialArgs
:
[]
string
{
"serve"
,
"--model"
,
"test"
},
initialCommand
:
[]
string
{},
initialCommand
:
[]
string
{},
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
,$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-wkr-0.$
{
GROVE_HEADLESS_SERVICE
}
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x CUSTOM_VAR -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch serve --model test'"
,
expected
:
"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort 2222
\\
n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
,$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-wkr-0.$
(
GROVE_HEADLESS_SERVICE
)
--mca pml ob1 --mca plm_rsh_args
\"
-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
-x CUDA_VISIBLE_DEVICES -x CUSTOM_VAR -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch serve --model test'"
,
},
},
}
}
...
...
deploy/cloud/operator/internal/dynamo/backend_vllm_test.go
View file @
9b893c93
...
@@ -53,7 +53,7 @@ func TestVLLMBackend_UpdateContainer(t *testing.T) {
...
@@ -53,7 +53,7 @@ func TestVLLMBackend_UpdateContainer(t *testing.T) {
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{},
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{},
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
initialArgs
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.vllm"
,
"--model"
,
"test"
},
initialArgs
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.vllm"
,
"--model"
,
"test"
},
expectedArgs
:
[]
string
{
"ray start --address=$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
:6379 --block"
},
expectedArgs
:
[]
string
{
"ray start --address=$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
:6379 --block"
},
expectProbesRemoved
:
true
,
expectProbesRemoved
:
true
,
},
},
{
{
...
@@ -63,7 +63,7 @@ func TestVLLMBackend_UpdateContainer(t *testing.T) {
...
@@ -63,7 +63,7 @@ func TestVLLMBackend_UpdateContainer(t *testing.T) {
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{},
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{},
multinodeDeployer
:
&
LWSMultinodeDeployer
{},
multinodeDeployer
:
&
LWSMultinodeDeployer
{},
initialArgs
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.vllm"
},
initialArgs
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.vllm"
},
expectedArgs
:
[]
string
{
"ray start --address=$
{
LWS_LEADER_ADDRESS
}
:6379 --block"
},
expectedArgs
:
[]
string
{
"ray start --address=$
(
LWS_LEADER_ADDRESS
)
:6379 --block"
},
expectProbesRemoved
:
true
,
expectProbesRemoved
:
true
,
},
},
{
{
...
@@ -155,14 +155,14 @@ func TestUpdateVLLMMultinodeArgs(t *testing.T) {
...
@@ -155,14 +155,14 @@ func TestUpdateVLLMMultinodeArgs(t *testing.T) {
role
:
RoleWorker
,
role
:
RoleWorker
,
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
multinodeDeployer
:
&
GroveMultinodeDeployer
{},
initialArgs
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.vllm"
},
initialArgs
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.vllm"
},
expectedArgs
:
[]
string
{
"ray start --address=$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-test-service-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
:6379 --block"
},
expectedArgs
:
[]
string
{
"ray start --address=$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-test-service-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
:6379 --block"
},
},
},
{
{
name
:
"worker with LWS deployment"
,
name
:
"worker with LWS deployment"
,
role
:
RoleWorker
,
role
:
RoleWorker
,
multinodeDeployer
:
&
LWSMultinodeDeployer
{},
multinodeDeployer
:
&
LWSMultinodeDeployer
{},
initialArgs
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.vllm"
},
initialArgs
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.vllm"
},
expectedArgs
:
[]
string
{
"ray start --address=$
{
LWS_LEADER_ADDRESS
}
:6379 --block"
},
expectedArgs
:
[]
string
{
"ray start --address=$
(
LWS_LEADER_ADDRESS
)
:6379 --block"
},
},
},
{
{
name
:
"main role does not modify args"
,
name
:
"main role does not modify args"
,
...
...
deploy/cloud/operator/internal/dynamo/graph.go
View file @
9b893c93
...
@@ -623,7 +623,7 @@ func (b *NoopBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int32
...
@@ -623,7 +623,7 @@ func (b *NoopBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int32
type
MultinodeDeployer
interface
{
type
MultinodeDeployer
interface
{
GetLeaderHostname
(
serviceName
string
)
string
GetLeaderHostname
(
serviceName
string
)
string
GetHostNames
(
serviceName
string
,
numberOfNodes
int32
)
[]
string
GetHostNames
(
serviceName
string
,
numberOfNodes
int32
)
[]
string
GetNodeRank
()
string
GetNodeRank
()
(
string
,
bool
)
// returns (rank, needsShellInterpretation)
}
}
// BackendFactory creates backend instances based on the framework type
// BackendFactory creates backend instances based on the framework type
...
...
deploy/cloud/operator/internal/dynamo/graph_test.go
View file @
9b893c93
...
@@ -1804,7 +1804,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
...
@@ -1804,7 +1804,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
"-c"
,
"-c"
,
},
},
Args
:
[]
string
{
Args
:
[]
string
{
"python3 -m dynamo.sglang.worker --dist-init-addr $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-worker-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
:29500 --nnodes 3 --node-rank 0 --custom-flag custom-value"
,
"python3 -m dynamo.sglang.worker --dist-init-addr $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-worker-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
:29500 --nnodes 3 --node-rank 0 --custom-flag custom-value"
,
},
},
Ports
:
[]
corev1
.
ContainerPort
{
Ports
:
[]
corev1
.
ContainerPort
{
{
{
...
@@ -1955,7 +1955,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
...
@@ -1955,7 +1955,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
"-c"
,
"-c"
,
},
},
Args
:
[]
string
{
Args
:
[]
string
{
"python3 -m dynamo.sglang.worker --dist-init-addr $
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-worker-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
:29500 --nnodes 3 --node-rank $((GROVE_PCLQ_POD_INDEX + 1)) --custom-flag custom-value"
,
"python3 -m dynamo.sglang.worker --dist-init-addr $
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-worker-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
:29500 --nnodes 3 --node-rank $((GROVE_PCLQ_POD_INDEX + 1)) --custom-flag custom-value"
,
},
},
Ports
:
[]
corev1
.
ContainerPort
{
Ports
:
[]
corev1
.
ContainerPort
{
{
{
...
@@ -2739,7 +2739,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
...
@@ -2739,7 +2739,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
"-c"
,
"-c"
,
},
},
Args
:
[]
string
{
Args
:
[]
string
{
"ray start --address=$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-worker-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
:6379 --block"
,
"ray start --address=$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-worker-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
:6379 --block"
,
},
},
Ports
:
[]
corev1
.
ContainerPort
{
Ports
:
[]
corev1
.
ContainerPort
{
{
{
...
@@ -3182,7 +3182,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
...
@@ -3182,7 +3182,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
ComponentType
:
commonconsts
.
ComponentTypeWorker
,
ComponentType
:
commonconsts
.
ComponentTypeWorker
,
ExtraPodSpec
:
&
common
.
ExtraPodSpec
{
ExtraPodSpec
:
&
common
.
ExtraPodSpec
{
MainContainer
:
&
corev1
.
Container
{
MainContainer
:
&
corev1
.
Container
{
Args
:
[]
string
{
"python3
"
,
"-m"
,
"
dynamo.sglang.worker"
},
Args
:
[]
string
{
"python3
-m
dynamo.sglang.worker"
},
},
},
},
},
},
},
...
@@ -3201,7 +3201,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
...
@@ -3201,7 +3201,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
ComponentType
:
commonconsts
.
ComponentTypeWorker
,
ComponentType
:
commonconsts
.
ComponentTypeWorker
,
ExtraPodSpec
:
&
common
.
ExtraPodSpec
{
ExtraPodSpec
:
&
common
.
ExtraPodSpec
{
MainContainer
:
&
corev1
.
Container
{
MainContainer
:
&
corev1
.
Container
{
Args
:
[]
string
{
"python3
"
,
"-m"
,
"
dynamo.sglang.worker"
},
Args
:
[]
string
{
"python3
-m
dynamo.sglang.worker"
},
},
},
},
},
},
},
...
@@ -3219,7 +3219,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
...
@@ -3219,7 +3219,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
ComponentType
:
commonconsts
.
ComponentTypeWorker
,
ComponentType
:
commonconsts
.
ComponentTypeWorker
,
ExtraPodSpec
:
&
common
.
ExtraPodSpec
{
ExtraPodSpec
:
&
common
.
ExtraPodSpec
{
MainContainer
:
&
corev1
.
Container
{
MainContainer
:
&
corev1
.
Container
{
Args
:
[]
string
{
"python3
"
,
"-m"
,
"
dynamo.sglang.worker"
},
Args
:
[]
string
{
"python3
-m
dynamo.sglang.worker"
},
},
},
},
},
},
},
...
@@ -3377,7 +3377,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
...
@@ -3377,7 +3377,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
role
:
RoleWorker
,
role
:
RoleWorker
,
numberOfNodes
:
3
,
numberOfNodes
:
3
,
expectError
:
false
,
expectError
:
false
,
expectContains
:
[]
string
{
"ray start --address=$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-worker-ldr-0.$
{
GROVE_HEADLESS_SERVICE
}
:6379 --block"
},
expectContains
:
[]
string
{
"ray start --address=$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-worker-ldr-0.$
(
GROVE_HEADLESS_SERVICE
)
:6379 --block"
},
expectNotContains
:
[]
string
{
"python3 -m dynamo.vllm"
},
expectNotContains
:
[]
string
{
"python3 -m dynamo.vllm"
},
},
},
{
{
...
@@ -4671,8 +4671,8 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
...
@@ -4671,8 +4671,8 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
{
Name
:
"DYN_PARENT_DGD_K8S_NAME"
,
Value
:
"test-deployment"
},
{
Name
:
"DYN_PARENT_DGD_K8S_NAME"
,
Value
:
"test-deployment"
},
{
Name
:
"DYN_PARENT_DGD_K8S_NAMESPACE"
,
Value
:
"default"
},
{
Name
:
"DYN_PARENT_DGD_K8S_NAMESPACE"
,
Value
:
"default"
},
{
Name
:
"DYN_SYSTEM_ENABLED"
,
Value
:
"true"
},
{
Name
:
"DYN_SYSTEM_ENABLED"
,
Value
:
"true"
},
{
Name
:
"DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"
,
Value
:
"[
\"
generate
\"
]"
},
{
Name
:
"DYN_SYSTEM_PORT"
,
Value
:
"9090"
},
{
Name
:
"DYN_SYSTEM_PORT"
,
Value
:
"9090"
},
{
Name
:
"DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"
,
Value
:
"[
\"
generate
\"
]"
},
},
},
VolumeMounts
:
[]
corev1
.
VolumeMount
{
VolumeMounts
:
[]
corev1
.
VolumeMount
{
{
{
...
...
deploy/cloud/operator/internal/dynamo/grove.go
View file @
9b893c93
...
@@ -25,11 +25,12 @@ type GroveMultinodeDeployer struct {
...
@@ -25,11 +25,12 @@ type GroveMultinodeDeployer struct {
}
}
func
(
d
*
GroveMultinodeDeployer
)
GetLeaderHostname
(
serviceName
string
)
string
{
func
(
d
*
GroveMultinodeDeployer
)
GetLeaderHostname
(
serviceName
string
)
string
{
return
fmt
.
Sprintf
(
"$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-%s-%s-0.$
{
GROVE_HEADLESS_SERVICE
}
"
,
serviceName
,
commonconsts
.
GroveRoleSuffixLeader
)
return
fmt
.
Sprintf
(
"$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-%s-%s-0.$
(
GROVE_HEADLESS_SERVICE
)
"
,
serviceName
,
commonconsts
.
GroveRoleSuffixLeader
)
}
}
func
(
d
*
GroveMultinodeDeployer
)
GetNodeRank
()
string
{
func
(
d
*
GroveMultinodeDeployer
)
GetNodeRank
()
(
string
,
bool
)
{
return
"$((GROVE_PCLQ_POD_INDEX + 1))"
// This requires shell expansion for arithmetic expression
return
"$((GROVE_PCLQ_POD_INDEX + 1))"
,
true
}
}
func
(
d
*
GroveMultinodeDeployer
)
GetHostNames
(
serviceName
string
,
numberOfNodes
int32
)
[]
string
{
func
(
d
*
GroveMultinodeDeployer
)
GetHostNames
(
serviceName
string
,
numberOfNodes
int32
)
[]
string
{
...
@@ -38,7 +39,7 @@ func (d *GroveMultinodeDeployer) GetHostNames(serviceName string, numberOfNodes
...
@@ -38,7 +39,7 @@ func (d *GroveMultinodeDeployer) GetHostNames(serviceName string, numberOfNodes
hostnames
=
append
(
hostnames
,
leaderHostname
)
hostnames
=
append
(
hostnames
,
leaderHostname
)
// Add worker hostnames
// Add worker hostnames
for
i
:=
int32
(
0
);
i
<
numberOfNodes
-
1
;
i
++
{
for
i
:=
int32
(
0
);
i
<
numberOfNodes
-
1
;
i
++
{
workerHostname
:=
fmt
.
Sprintf
(
"$
{
GROVE_PCSG_NAME
}
-$
{
GROVE_PCSG_INDEX
}
-%s-%s-%d.$
{
GROVE_HEADLESS_SERVICE
}
"
,
workerHostname
:=
fmt
.
Sprintf
(
"$
(
GROVE_PCSG_NAME
)
-$
(
GROVE_PCSG_INDEX
)
-%s-%s-%d.$
(
GROVE_HEADLESS_SERVICE
)
"
,
serviceName
,
commonconsts
.
GroveRoleSuffixWorker
,
i
)
serviceName
,
commonconsts
.
GroveRoleSuffixWorker
,
i
)
hostnames
=
append
(
hostnames
,
workerHostname
)
hostnames
=
append
(
hostnames
,
workerHostname
)
}
}
...
...
deploy/cloud/operator/internal/dynamo/lws.go
View file @
9b893c93
...
@@ -7,18 +7,19 @@ type LWSMultinodeDeployer struct {
...
@@ -7,18 +7,19 @@ type LWSMultinodeDeployer struct {
}
}
func
(
d
*
LWSMultinodeDeployer
)
GetLeaderHostname
(
serviceName
string
)
string
{
func
(
d
*
LWSMultinodeDeployer
)
GetLeaderHostname
(
serviceName
string
)
string
{
return
"$
{
LWS_LEADER_ADDRESS
}
"
return
"$
(
LWS_LEADER_ADDRESS
)
"
}
}
func
(
d
*
LWSMultinodeDeployer
)
GetNodeRank
()
string
{
func
(
d
*
LWSMultinodeDeployer
)
GetNodeRank
()
(
string
,
bool
)
{
return
"${LWS_WORKER_INDEX}"
// This requires shell expansion for variable substitution
return
"$(LWS_WORKER_INDEX)"
,
true
}
}
func
(
d
*
LWSMultinodeDeployer
)
GetHostNames
(
serviceName
string
,
numberOfNodes
int32
)
[]
string
{
func
(
d
*
LWSMultinodeDeployer
)
GetHostNames
(
serviceName
string
,
numberOfNodes
int32
)
[]
string
{
hostnames
:=
make
([]
string
,
numberOfNodes
)
hostnames
:=
make
([]
string
,
numberOfNodes
)
hostnames
[
0
]
=
d
.
GetLeaderHostname
(
serviceName
)
hostnames
[
0
]
=
d
.
GetLeaderHostname
(
serviceName
)
for
i
:=
int32
(
1
);
i
<
numberOfNodes
;
i
++
{
for
i
:=
int32
(
1
);
i
<
numberOfNodes
;
i
++
{
hostnames
[
i
]
=
fmt
.
Sprintf
(
"$
{
LWS_WORKER_%d_ADDRESS
}
"
,
i
)
hostnames
[
i
]
=
fmt
.
Sprintf
(
"$
(
LWS_WORKER_%d_ADDRESS
)
"
,
i
)
}
}
return
hostnames
return
hostnames
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment