Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
55de76b6
Unverified
Commit
55de76b6
authored
Oct 07, 2025
by
Julien Mancuso
Committed by
GitHub
Oct 07, 2025
Browse files
feat: add startup order for multinode TRTLLM (#3462)
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent
96a6dcda
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
31 additions
and
97 deletions
+31
-97
deploy/cloud/operator/internal/dynamo/graph.go
deploy/cloud/operator/internal/dynamo/graph.go
+4
-4
deploy/cloud/operator/internal/dynamo/graph_test.go
deploy/cloud/operator/internal/dynamo/graph_test.go
+27
-93
No files found.
deploy/cloud/operator/internal/dynamo/graph.go
View file @
55de76b6
...
@@ -331,11 +331,11 @@ func applyCliqueStartupDependencies(
...
@@ -331,11 +331,11 @@ func applyCliqueStartupDependencies(
backendFramework
BackendFramework
,
backendFramework
BackendFramework
,
numberOfNodes
int32
,
numberOfNodes
int32
,
)
{
)
{
// deactivated for now.
// enabled for TRTLLM multinode deployments only
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
// TODO: reactivate for all backends when we have a better way to handle the readiness probe for the leader.
deactivated
:=
true
enabled
:=
backendFramework
==
BackendFrameworkTRTLLM
&&
numberOfNodes
>
1
if
deactivated
||
numberOfNodes
<=
1
{
if
!
enabled
{
return
// No dependencies for single-node deployments
return
// No dependencies for single-node deployments
}
}
...
...
deploy/cloud/operator/internal/dynamo/graph_test.go
View file @
55de76b6
...
@@ -3995,9 +3995,7 @@ func TestGetBackendFrameworkFromComponent(t *testing.T) {
...
@@ -3995,9 +3995,7 @@ func TestGetBackendFrameworkFromComponent(t *testing.T) {
}
}
}
}
// deactivated for now.
func
TestApplyCliqueStartupDependencies
(
t
*
testing
.
T
)
{
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func
XTestApplyCliqueStartupDependencies
(
t
*
testing
.
T
)
{
tests
:=
[]
struct
{
tests
:=
[]
struct
{
name
string
name
string
roles
[]
ServiceRole
roles
[]
ServiceRole
...
@@ -4016,9 +4014,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
...
@@ -4016,9 +4014,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
numberOfNodes
:
3
,
numberOfNodes
:
3
,
expectedDeps
:
map
[
string
][]
string
{
expectedDeps
:
map
[
string
][]
string
{
"service-ldr"
:
nil
,
"service-ldr"
:
nil
,
"service-wkr"
:
{
"service-ldr"
}
,
"service-wkr"
:
nil
,
},
},
expectStartupType
:
tru
e
,
expectStartupType
:
fals
e
,
},
},
{
{
name
:
"sglang_multinode_applies_dependencies"
,
name
:
"sglang_multinode_applies_dependencies"
,
...
@@ -4030,9 +4028,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
...
@@ -4030,9 +4028,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
numberOfNodes
:
3
,
numberOfNodes
:
3
,
expectedDeps
:
map
[
string
][]
string
{
expectedDeps
:
map
[
string
][]
string
{
"service-ldr"
:
nil
,
"service-ldr"
:
nil
,
"service-wkr"
:
{
"service-ldr"
}
,
"service-wkr"
:
nil
,
},
},
expectStartupType
:
tru
e
,
expectStartupType
:
fals
e
,
},
},
{
{
name
:
"trtllm_multinode_applies_dependencies"
,
name
:
"trtllm_multinode_applies_dependencies"
,
...
@@ -4053,27 +4051,13 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
...
@@ -4053,27 +4051,13 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
roles
:
[]
ServiceRole
{
roles
:
[]
ServiceRole
{
{
Name
:
"service"
,
Role
:
RoleMain
,
Replicas
:
1
},
{
Name
:
"service"
,
Role
:
RoleMain
,
Replicas
:
1
},
},
},
backendFramework
:
BackendFramework
V
LLM
,
backendFramework
:
BackendFramework
TRT
LLM
,
numberOfNodes
:
1
,
numberOfNodes
:
1
,
expectedDeps
:
map
[
string
][]
string
{
expectedDeps
:
map
[
string
][]
string
{
"service"
:
nil
,
"service"
:
nil
,
},
},
expectStartupType
:
false
,
expectStartupType
:
false
,
},
},
{
name
:
"noop_backend_no_dependencies"
,
roles
:
[]
ServiceRole
{
{
Name
:
"service-ldr"
,
Role
:
RoleLeader
,
Replicas
:
1
},
{
Name
:
"service-wkr"
,
Role
:
RoleWorker
,
Replicas
:
2
},
},
backendFramework
:
BackendFrameworkNoop
,
numberOfNodes
:
3
,
expectedDeps
:
map
[
string
][]
string
{
"service-ldr"
:
nil
,
"service-wkr"
:
nil
,
},
expectStartupType
:
false
,
},
}
}
for
_
,
tt
:=
range
tests
{
for
_
,
tt
:=
range
tests
{
...
@@ -4129,9 +4113,7 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
...
@@ -4129,9 +4113,7 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
}
}
}
}
// deactivated for now.
func
TestGetCliqueStartupDependencies
(
t
*
testing
.
T
)
{
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func
XTestGetCliqueStartupDependencies
(
t
*
testing
.
T
)
{
tests
:=
[]
struct
{
tests
:=
[]
struct
{
name
string
name
string
role
Role
role
Role
...
@@ -4140,38 +4122,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
...
@@ -4140,38 +4122,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
workerCliqueNames
[]
string
workerCliqueNames
[]
string
expected
[]
string
expected
[]
string
}{
}{
{
name
:
"vllm_worker_depends_on_leader"
,
role
:
RoleWorker
,
backendFramework
:
BackendFrameworkVLLM
,
leaderCliqueName
:
"service-ldr"
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
[]
string
{
"service-ldr"
},
},
{
name
:
"vllm_leader_has_no_dependencies"
,
role
:
RoleLeader
,
backendFramework
:
BackendFrameworkVLLM
,
leaderCliqueName
:
"service-ldr"
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
nil
,
},
{
name
:
"sglang_worker_depends_on_leader"
,
role
:
RoleWorker
,
backendFramework
:
BackendFrameworkSGLang
,
leaderCliqueName
:
"service-ldr"
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
[]
string
{
"service-ldr"
},
},
{
name
:
"sglang_leader_has_no_dependencies"
,
role
:
RoleLeader
,
backendFramework
:
BackendFrameworkSGLang
,
leaderCliqueName
:
"service-ldr"
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
nil
,
},
{
{
name
:
"trtllm_leader_depends_on_workers"
,
name
:
"trtllm_leader_depends_on_workers"
,
role
:
RoleLeader
,
role
:
RoleLeader
,
...
@@ -4188,30 +4138,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
...
@@ -4188,30 +4138,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
workerCliqueNames
:
[]
string
{
"service-wkr"
},
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
nil
,
expected
:
nil
,
},
},
{
name
:
"noop_backend_has_no_dependencies"
,
role
:
RoleWorker
,
backendFramework
:
BackendFrameworkNoop
,
leaderCliqueName
:
"service-ldr"
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
nil
,
},
{
name
:
"main_role_has_no_dependencies"
,
role
:
RoleMain
,
backendFramework
:
BackendFrameworkVLLM
,
leaderCliqueName
:
""
,
workerCliqueNames
:
nil
,
expected
:
nil
,
},
{
name
:
"worker_with_empty_leader_name"
,
role
:
RoleWorker
,
backendFramework
:
BackendFrameworkVLLM
,
leaderCliqueName
:
""
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
nil
,
},
{
{
name
:
"leader_with_empty_worker_names"
,
name
:
"leader_with_empty_worker_names"
,
role
:
RoleLeader
,
role
:
RoleLeader
,
...
@@ -4238,31 +4164,32 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
...
@@ -4238,31 +4164,32 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
}
}
}
}
// deactivated for now.
func
TestGenerateGrovePodCliqueSet_StartsAfterDependencies
(
t
*
testing
.
T
)
{
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func
XTestGenerateGrovePodCliqueSet_StartsAfterDependencies
(
t
*
testing
.
T
)
{
secretsRetriever
:=
&
mockSecretsRetriever
{}
secretsRetriever
:=
&
mockSecretsRetriever
{}
tests
:=
[]
struct
{
tests
:=
[]
struct
{
name
string
name
string
backendFramework
string
backendFramework
string
expectedDeps
map
[
string
][]
string
// clique name -> expected StartsAfter dependencies
expectedDeps
map
[
string
][]
string
// clique name -> expected StartsAfter dependencies
expectStartupType
bool
}{
}{
{
{
name
:
"vllm_worker_starts_after_leader"
,
name
:
"vllm_worker_starts_after_leader"
,
backendFramework
:
string
(
BackendFrameworkVLLM
),
backendFramework
:
string
(
BackendFrameworkVLLM
),
expectedDeps
:
map
[
string
][]
string
{
expectedDeps
:
map
[
string
][]
string
{
"main-wkr"
:
{
"main-ldr"
}
,
// worker starts after leader
"main-wkr"
:
nil
,
// worker starts after leader
"main-ldr"
:
nil
,
// leader has no dependencies
"main-ldr"
:
nil
,
// leader has no dependencies
},
},
expectStartupType
:
false
,
},
},
{
{
name
:
"sglang_worker_starts_after_leader"
,
name
:
"sglang_worker_starts_after_leader"
,
backendFramework
:
string
(
BackendFrameworkSGLang
),
backendFramework
:
string
(
BackendFrameworkSGLang
),
expectedDeps
:
map
[
string
][]
string
{
expectedDeps
:
map
[
string
][]
string
{
"main-wkr"
:
{
"main-ldr"
}
,
// worker starts after leader
"main-wkr"
:
nil
,
// worker starts after leader
"main-ldr"
:
nil
,
// leader has no dependencies
"main-ldr"
:
nil
,
// leader has no dependencies
},
},
expectStartupType
:
false
,
},
},
{
{
name
:
"trtllm_leader_starts_after_worker"
,
name
:
"trtllm_leader_starts_after_worker"
,
...
@@ -4271,6 +4198,7 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
...
@@ -4271,6 +4198,7 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
"main-ldr"
:
{
"main-wkr"
},
// leader starts after worker
"main-ldr"
:
{
"main-wkr"
},
// leader starts after worker
"main-wkr"
:
nil
,
// worker has no dependencies
"main-wkr"
:
nil
,
// worker has no dependencies
},
},
expectStartupType
:
true
,
},
},
}
}
...
@@ -4314,9 +4242,15 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
...
@@ -4314,9 +4242,15 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
}
}
// Verify that StartupType is set to Explicit
// Verify that StartupType is set to Explicit
if
tt
.
expectStartupType
{
if
got
.
Spec
.
Template
.
StartupType
==
nil
||
*
got
.
Spec
.
Template
.
StartupType
!=
grovev1alpha1
.
CliqueStartupTypeExplicit
{
if
got
.
Spec
.
Template
.
StartupType
==
nil
||
*
got
.
Spec
.
Template
.
StartupType
!=
grovev1alpha1
.
CliqueStartupTypeExplicit
{
t
.
Errorf
(
"Expected StartupType to be CliqueStartupTypeExplicit, got %v"
,
got
.
Spec
.
Template
.
StartupType
)
t
.
Errorf
(
"Expected StartupType to be CliqueStartupTypeExplicit, got %v"
,
got
.
Spec
.
Template
.
StartupType
)
}
}
}
else
{
if
got
.
Spec
.
Template
.
StartupType
==
nil
||
*
got
.
Spec
.
Template
.
StartupType
!=
grovev1alpha1
.
CliqueStartupTypeAnyOrder
{
t
.
Errorf
(
"Expected StartupType to be CliqueStartupTypeAnyOrder, got %v"
,
got
.
Spec
.
Template
.
StartupType
)
}
}
// Verify StartsAfter dependencies for each clique
// Verify StartsAfter dependencies for each clique
cliqueMap
:=
make
(
map
[
string
]
*
grovev1alpha1
.
PodCliqueTemplateSpec
)
cliqueMap
:=
make
(
map
[
string
]
*
grovev1alpha1
.
PodCliqueTemplateSpec
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment