Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
55de76b6
Unverified
Commit
55de76b6
authored
Oct 07, 2025
by
Julien Mancuso
Committed by
GitHub
Oct 07, 2025
Browse files
feat: add startup order for multinode TRTLLM (#3462)
Signed-off-by:
Julien Mancuso
<
jmancuso@nvidia.com
>
parent
96a6dcda
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
31 additions
and
97 deletions
+31
-97
deploy/cloud/operator/internal/dynamo/graph.go
deploy/cloud/operator/internal/dynamo/graph.go
+4
-4
deploy/cloud/operator/internal/dynamo/graph_test.go
deploy/cloud/operator/internal/dynamo/graph_test.go
+27
-93
No files found.
deploy/cloud/operator/internal/dynamo/graph.go
View file @
55de76b6
...
...
@@ -331,11 +331,11 @@ func applyCliqueStartupDependencies(
backendFramework
BackendFramework
,
numberOfNodes
int32
,
)
{
// deactivated for now.
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
deactivated
:=
true
// enabled for TRTLLM multinode deployments only
// TODO: reactivate for all backends when we have a better way to handle the readiness probe for the leader.
enabled
:=
backendFramework
==
BackendFrameworkTRTLLM
&&
numberOfNodes
>
1
if
deactivated
||
numberOfNodes
<=
1
{
if
!
enabled
{
return
// No dependencies for single-node deployments
}
...
...
deploy/cloud/operator/internal/dynamo/graph_test.go
View file @
55de76b6
...
...
@@ -3995,9 +3995,7 @@ func TestGetBackendFrameworkFromComponent(t *testing.T) {
}
}
// deactivated for now.
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func
XTestApplyCliqueStartupDependencies
(
t
*
testing
.
T
)
{
func
TestApplyCliqueStartupDependencies
(
t
*
testing
.
T
)
{
tests
:=
[]
struct
{
name
string
roles
[]
ServiceRole
...
...
@@ -4016,9 +4014,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
numberOfNodes
:
3
,
expectedDeps
:
map
[
string
][]
string
{
"service-ldr"
:
nil
,
"service-wkr"
:
{
"service-ldr"
}
,
"service-wkr"
:
nil
,
},
expectStartupType
:
tru
e
,
expectStartupType
:
fals
e
,
},
{
name
:
"sglang_multinode_applies_dependencies"
,
...
...
@@ -4030,9 +4028,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
numberOfNodes
:
3
,
expectedDeps
:
map
[
string
][]
string
{
"service-ldr"
:
nil
,
"service-wkr"
:
{
"service-ldr"
}
,
"service-wkr"
:
nil
,
},
expectStartupType
:
tru
e
,
expectStartupType
:
fals
e
,
},
{
name
:
"trtllm_multinode_applies_dependencies"
,
...
...
@@ -4053,27 +4051,13 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
roles
:
[]
ServiceRole
{
{
Name
:
"service"
,
Role
:
RoleMain
,
Replicas
:
1
},
},
backendFramework
:
BackendFramework
V
LLM
,
backendFramework
:
BackendFramework
TRT
LLM
,
numberOfNodes
:
1
,
expectedDeps
:
map
[
string
][]
string
{
"service"
:
nil
,
},
expectStartupType
:
false
,
},
{
name
:
"noop_backend_no_dependencies"
,
roles
:
[]
ServiceRole
{
{
Name
:
"service-ldr"
,
Role
:
RoleLeader
,
Replicas
:
1
},
{
Name
:
"service-wkr"
,
Role
:
RoleWorker
,
Replicas
:
2
},
},
backendFramework
:
BackendFrameworkNoop
,
numberOfNodes
:
3
,
expectedDeps
:
map
[
string
][]
string
{
"service-ldr"
:
nil
,
"service-wkr"
:
nil
,
},
expectStartupType
:
false
,
},
}
for
_
,
tt
:=
range
tests
{
...
...
@@ -4129,9 +4113,7 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
}
}
// deactivated for now.
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func
XTestGetCliqueStartupDependencies
(
t
*
testing
.
T
)
{
func
TestGetCliqueStartupDependencies
(
t
*
testing
.
T
)
{
tests
:=
[]
struct
{
name
string
role
Role
...
...
@@ -4140,38 +4122,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
workerCliqueNames
[]
string
expected
[]
string
}{
{
name
:
"vllm_worker_depends_on_leader"
,
role
:
RoleWorker
,
backendFramework
:
BackendFrameworkVLLM
,
leaderCliqueName
:
"service-ldr"
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
[]
string
{
"service-ldr"
},
},
{
name
:
"vllm_leader_has_no_dependencies"
,
role
:
RoleLeader
,
backendFramework
:
BackendFrameworkVLLM
,
leaderCliqueName
:
"service-ldr"
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
nil
,
},
{
name
:
"sglang_worker_depends_on_leader"
,
role
:
RoleWorker
,
backendFramework
:
BackendFrameworkSGLang
,
leaderCliqueName
:
"service-ldr"
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
[]
string
{
"service-ldr"
},
},
{
name
:
"sglang_leader_has_no_dependencies"
,
role
:
RoleLeader
,
backendFramework
:
BackendFrameworkSGLang
,
leaderCliqueName
:
"service-ldr"
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
nil
,
},
{
name
:
"trtllm_leader_depends_on_workers"
,
role
:
RoleLeader
,
...
...
@@ -4188,30 +4138,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
nil
,
},
{
name
:
"noop_backend_has_no_dependencies"
,
role
:
RoleWorker
,
backendFramework
:
BackendFrameworkNoop
,
leaderCliqueName
:
"service-ldr"
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
nil
,
},
{
name
:
"main_role_has_no_dependencies"
,
role
:
RoleMain
,
backendFramework
:
BackendFrameworkVLLM
,
leaderCliqueName
:
""
,
workerCliqueNames
:
nil
,
expected
:
nil
,
},
{
name
:
"worker_with_empty_leader_name"
,
role
:
RoleWorker
,
backendFramework
:
BackendFrameworkVLLM
,
leaderCliqueName
:
""
,
workerCliqueNames
:
[]
string
{
"service-wkr"
},
expected
:
nil
,
},
{
name
:
"leader_with_empty_worker_names"
,
role
:
RoleLeader
,
...
...
@@ -4238,31 +4164,32 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
}
}
// deactivated for now.
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func
XTestGenerateGrovePodCliqueSet_StartsAfterDependencies
(
t
*
testing
.
T
)
{
func
TestGenerateGrovePodCliqueSet_StartsAfterDependencies
(
t
*
testing
.
T
)
{
secretsRetriever
:=
&
mockSecretsRetriever
{}
tests
:=
[]
struct
{
name
string
backendFramework
string
expectedDeps
map
[
string
][]
string
// clique name -> expected StartsAfter dependencies
name
string
backendFramework
string
expectedDeps
map
[
string
][]
string
// clique name -> expected StartsAfter dependencies
expectStartupType
bool
}{
{
name
:
"vllm_worker_starts_after_leader"
,
backendFramework
:
string
(
BackendFrameworkVLLM
),
expectedDeps
:
map
[
string
][]
string
{
"main-wkr"
:
{
"main-ldr"
}
,
// worker starts after leader
"main-ldr"
:
nil
,
// leader has no dependencies
"main-wkr"
:
nil
,
// worker starts after leader
"main-ldr"
:
nil
,
// leader has no dependencies
},
expectStartupType
:
false
,
},
{
name
:
"sglang_worker_starts_after_leader"
,
backendFramework
:
string
(
BackendFrameworkSGLang
),
expectedDeps
:
map
[
string
][]
string
{
"main-wkr"
:
{
"main-ldr"
}
,
// worker starts after leader
"main-ldr"
:
nil
,
// leader has no dependencies
"main-wkr"
:
nil
,
// worker starts after leader
"main-ldr"
:
nil
,
// leader has no dependencies
},
expectStartupType
:
false
,
},
{
name
:
"trtllm_leader_starts_after_worker"
,
...
...
@@ -4271,6 +4198,7 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
"main-ldr"
:
{
"main-wkr"
},
// leader starts after worker
"main-wkr"
:
nil
,
// worker has no dependencies
},
expectStartupType
:
true
,
},
}
...
...
@@ -4314,8 +4242,14 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
}
// Verify that StartupType is set to Explicit
if
got
.
Spec
.
Template
.
StartupType
==
nil
||
*
got
.
Spec
.
Template
.
StartupType
!=
grovev1alpha1
.
CliqueStartupTypeExplicit
{
t
.
Errorf
(
"Expected StartupType to be CliqueStartupTypeExplicit, got %v"
,
got
.
Spec
.
Template
.
StartupType
)
if
tt
.
expectStartupType
{
if
got
.
Spec
.
Template
.
StartupType
==
nil
||
*
got
.
Spec
.
Template
.
StartupType
!=
grovev1alpha1
.
CliqueStartupTypeExplicit
{
t
.
Errorf
(
"Expected StartupType to be CliqueStartupTypeExplicit, got %v"
,
got
.
Spec
.
Template
.
StartupType
)
}
}
else
{
if
got
.
Spec
.
Template
.
StartupType
==
nil
||
*
got
.
Spec
.
Template
.
StartupType
!=
grovev1alpha1
.
CliqueStartupTypeAnyOrder
{
t
.
Errorf
(
"Expected StartupType to be CliqueStartupTypeAnyOrder, got %v"
,
got
.
Spec
.
Template
.
StartupType
)
}
}
// Verify StartsAfter dependencies for each clique
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment