"vscode:/vscode.git/clone" did not exist on "7ba057541d80e040ebaed03e942ebe8c57308844"
Unverified Commit 55de76b6, authored by Julien Mancuso, committed by GitHub
Browse files

feat: add startup order for multinode TRTLLM (#3462)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent 96a6dcda
...@@ -331,11 +331,11 @@ func applyCliqueStartupDependencies( ...@@ -331,11 +331,11 @@ func applyCliqueStartupDependencies(
backendFramework BackendFramework, backendFramework BackendFramework,
numberOfNodes int32, numberOfNodes int32,
) { ) {
// deactivated for now. // enabled for TRTLLM multinode deployments only
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader. // TODO: reactivate for all backends when we have a better way to handle the readiness probe for the leader.
deactivated := true enabled := backendFramework == BackendFrameworkTRTLLM && numberOfNodes > 1
if deactivated || numberOfNodes <= 1 { if !enabled {
return // No dependencies for single-node deployments return // No dependencies for single-node deployments
} }
......
...@@ -3995,9 +3995,7 @@ func TestGetBackendFrameworkFromComponent(t *testing.T) { ...@@ -3995,9 +3995,7 @@ func TestGetBackendFrameworkFromComponent(t *testing.T) {
} }
} }
// deactivated for now. func TestApplyCliqueStartupDependencies(t *testing.T) {
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func XTestApplyCliqueStartupDependencies(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
roles []ServiceRole roles []ServiceRole
...@@ -4016,9 +4014,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) { ...@@ -4016,9 +4014,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
numberOfNodes: 3, numberOfNodes: 3,
expectedDeps: map[string][]string{ expectedDeps: map[string][]string{
"service-ldr": nil, "service-ldr": nil,
"service-wkr": {"service-ldr"}, "service-wkr": nil,
}, },
expectStartupType: true, expectStartupType: false,
}, },
{ {
name: "sglang_multinode_applies_dependencies", name: "sglang_multinode_applies_dependencies",
...@@ -4030,9 +4028,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) { ...@@ -4030,9 +4028,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
numberOfNodes: 3, numberOfNodes: 3,
expectedDeps: map[string][]string{ expectedDeps: map[string][]string{
"service-ldr": nil, "service-ldr": nil,
"service-wkr": {"service-ldr"}, "service-wkr": nil,
}, },
expectStartupType: true, expectStartupType: false,
}, },
{ {
name: "trtllm_multinode_applies_dependencies", name: "trtllm_multinode_applies_dependencies",
...@@ -4053,27 +4051,13 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) { ...@@ -4053,27 +4051,13 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
roles: []ServiceRole{ roles: []ServiceRole{
{Name: "service", Role: RoleMain, Replicas: 1}, {Name: "service", Role: RoleMain, Replicas: 1},
}, },
backendFramework: BackendFrameworkVLLM, backendFramework: BackendFrameworkTRTLLM,
numberOfNodes: 1, numberOfNodes: 1,
expectedDeps: map[string][]string{ expectedDeps: map[string][]string{
"service": nil, "service": nil,
}, },
expectStartupType: false, expectStartupType: false,
}, },
{
name: "noop_backend_no_dependencies",
roles: []ServiceRole{
{Name: "service-ldr", Role: RoleLeader, Replicas: 1},
{Name: "service-wkr", Role: RoleWorker, Replicas: 2},
},
backendFramework: BackendFrameworkNoop,
numberOfNodes: 3,
expectedDeps: map[string][]string{
"service-ldr": nil,
"service-wkr": nil,
},
expectStartupType: false,
},
} }
for _, tt := range tests { for _, tt := range tests {
...@@ -4129,9 +4113,7 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) { ...@@ -4129,9 +4113,7 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
} }
} }
// deactivated for now. func TestGetCliqueStartupDependencies(t *testing.T) {
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func XTestGetCliqueStartupDependencies(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
role Role role Role
...@@ -4140,38 +4122,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) { ...@@ -4140,38 +4122,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
workerCliqueNames []string workerCliqueNames []string
expected []string expected []string
}{ }{
{
name: "vllm_worker_depends_on_leader",
role: RoleWorker,
backendFramework: BackendFrameworkVLLM,
leaderCliqueName: "service-ldr",
workerCliqueNames: []string{"service-wkr"},
expected: []string{"service-ldr"},
},
{
name: "vllm_leader_has_no_dependencies",
role: RoleLeader,
backendFramework: BackendFrameworkVLLM,
leaderCliqueName: "service-ldr",
workerCliqueNames: []string{"service-wkr"},
expected: nil,
},
{
name: "sglang_worker_depends_on_leader",
role: RoleWorker,
backendFramework: BackendFrameworkSGLang,
leaderCliqueName: "service-ldr",
workerCliqueNames: []string{"service-wkr"},
expected: []string{"service-ldr"},
},
{
name: "sglang_leader_has_no_dependencies",
role: RoleLeader,
backendFramework: BackendFrameworkSGLang,
leaderCliqueName: "service-ldr",
workerCliqueNames: []string{"service-wkr"},
expected: nil,
},
{ {
name: "trtllm_leader_depends_on_workers", name: "trtllm_leader_depends_on_workers",
role: RoleLeader, role: RoleLeader,
...@@ -4188,30 +4138,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) { ...@@ -4188,30 +4138,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
workerCliqueNames: []string{"service-wkr"}, workerCliqueNames: []string{"service-wkr"},
expected: nil, expected: nil,
}, },
{
name: "noop_backend_has_no_dependencies",
role: RoleWorker,
backendFramework: BackendFrameworkNoop,
leaderCliqueName: "service-ldr",
workerCliqueNames: []string{"service-wkr"},
expected: nil,
},
{
name: "main_role_has_no_dependencies",
role: RoleMain,
backendFramework: BackendFrameworkVLLM,
leaderCliqueName: "",
workerCliqueNames: nil,
expected: nil,
},
{
name: "worker_with_empty_leader_name",
role: RoleWorker,
backendFramework: BackendFrameworkVLLM,
leaderCliqueName: "",
workerCliqueNames: []string{"service-wkr"},
expected: nil,
},
{ {
name: "leader_with_empty_worker_names", name: "leader_with_empty_worker_names",
role: RoleLeader, role: RoleLeader,
...@@ -4238,31 +4164,32 @@ func XTestGetCliqueStartupDependencies(t *testing.T) { ...@@ -4238,31 +4164,32 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
} }
} }
// deactivated for now. func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{} secretsRetriever := &mockSecretsRetriever{}
tests := []struct { tests := []struct {
name string name string
backendFramework string backendFramework string
expectedDeps map[string][]string // clique name -> expected StartsAfter dependencies expectedDeps map[string][]string // clique name -> expected StartsAfter dependencies
expectStartupType bool
}{ }{
{ {
name: "vllm_worker_starts_after_leader", name: "vllm_worker_starts_after_leader",
backendFramework: string(BackendFrameworkVLLM), backendFramework: string(BackendFrameworkVLLM),
expectedDeps: map[string][]string{ expectedDeps: map[string][]string{
"main-wkr": {"main-ldr"}, // worker starts after leader "main-wkr": nil, // worker starts after leader
"main-ldr": nil, // leader has no dependencies "main-ldr": nil, // leader has no dependencies
}, },
expectStartupType: false,
}, },
{ {
name: "sglang_worker_starts_after_leader", name: "sglang_worker_starts_after_leader",
backendFramework: string(BackendFrameworkSGLang), backendFramework: string(BackendFrameworkSGLang),
expectedDeps: map[string][]string{ expectedDeps: map[string][]string{
"main-wkr": {"main-ldr"}, // worker starts after leader "main-wkr": nil, // worker starts after leader
"main-ldr": nil, // leader has no dependencies "main-ldr": nil, // leader has no dependencies
}, },
expectStartupType: false,
}, },
{ {
name: "trtllm_leader_starts_after_worker", name: "trtllm_leader_starts_after_worker",
...@@ -4271,6 +4198,7 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) { ...@@ -4271,6 +4198,7 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
"main-ldr": {"main-wkr"}, // leader starts after worker "main-ldr": {"main-wkr"}, // leader starts after worker
"main-wkr": nil, // worker has no dependencies "main-wkr": nil, // worker has no dependencies
}, },
expectStartupType: true,
}, },
} }
...@@ -4314,8 +4242,14 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) { ...@@ -4314,8 +4242,14 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
} }
// Verify that StartupType is set to Explicit // Verify that StartupType is set to Explicit
if got.Spec.Template.StartupType == nil || *got.Spec.Template.StartupType != grovev1alpha1.CliqueStartupTypeExplicit { if tt.expectStartupType {
t.Errorf("Expected StartupType to be CliqueStartupTypeExplicit, got %v", got.Spec.Template.StartupType) if got.Spec.Template.StartupType == nil || *got.Spec.Template.StartupType != grovev1alpha1.CliqueStartupTypeExplicit {
t.Errorf("Expected StartupType to be CliqueStartupTypeExplicit, got %v", got.Spec.Template.StartupType)
}
} else {
if got.Spec.Template.StartupType == nil || *got.Spec.Template.StartupType != grovev1alpha1.CliqueStartupTypeAnyOrder {
t.Errorf("Expected StartupType to be CliqueStartupTypeAnyOrder, got %v", got.Spec.Template.StartupType)
}
} }
// Verify StartsAfter dependencies for each clique // Verify StartsAfter dependencies for each clique
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment