Unverified Commit 55de76b6 authored by Julien Mancuso, committed by GitHub
Browse files

feat: add startup order for multinode TRTLLM (#3462)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent 96a6dcda
......@@ -331,11 +331,11 @@ func applyCliqueStartupDependencies(
backendFramework BackendFramework,
numberOfNodes int32,
) {
// deactivated for now.
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
deactivated := true
// enabled for TRTLLM multinode deployments only
// TODO: reactivate for all backends when we have a better way to handle the readiness probe for the leader.
enabled := backendFramework == BackendFrameworkTRTLLM && numberOfNodes > 1
if deactivated || numberOfNodes <= 1 {
if !enabled {
return // No dependencies unless this is a multinode TRTLLM deployment
}
......
......@@ -3995,9 +3995,7 @@ func TestGetBackendFrameworkFromComponent(t *testing.T) {
}
}
// deactivated for now.
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func XTestApplyCliqueStartupDependencies(t *testing.T) {
func TestApplyCliqueStartupDependencies(t *testing.T) {
tests := []struct {
name string
roles []ServiceRole
......@@ -4016,9 +4014,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
numberOfNodes: 3,
expectedDeps: map[string][]string{
"service-ldr": nil,
"service-wkr": {"service-ldr"},
"service-wkr": nil,
},
expectStartupType: true,
expectStartupType: false,
},
{
name: "sglang_multinode_applies_dependencies",
......@@ -4030,9 +4028,9 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
numberOfNodes: 3,
expectedDeps: map[string][]string{
"service-ldr": nil,
"service-wkr": {"service-ldr"},
"service-wkr": nil,
},
expectStartupType: true,
expectStartupType: false,
},
{
name: "trtllm_multinode_applies_dependencies",
......@@ -4053,27 +4051,13 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
roles: []ServiceRole{
{Name: "service", Role: RoleMain, Replicas: 1},
},
backendFramework: BackendFrameworkVLLM,
backendFramework: BackendFrameworkTRTLLM,
numberOfNodes: 1,
expectedDeps: map[string][]string{
"service": nil,
},
expectStartupType: false,
},
{
name: "noop_backend_no_dependencies",
roles: []ServiceRole{
{Name: "service-ldr", Role: RoleLeader, Replicas: 1},
{Name: "service-wkr", Role: RoleWorker, Replicas: 2},
},
backendFramework: BackendFrameworkNoop,
numberOfNodes: 3,
expectedDeps: map[string][]string{
"service-ldr": nil,
"service-wkr": nil,
},
expectStartupType: false,
},
}
for _, tt := range tests {
......@@ -4129,9 +4113,7 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
}
}
// deactivated for now.
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func XTestGetCliqueStartupDependencies(t *testing.T) {
func TestGetCliqueStartupDependencies(t *testing.T) {
tests := []struct {
name string
role Role
......@@ -4140,38 +4122,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
workerCliqueNames []string
expected []string
}{
{
name: "vllm_worker_depends_on_leader",
role: RoleWorker,
backendFramework: BackendFrameworkVLLM,
leaderCliqueName: "service-ldr",
workerCliqueNames: []string{"service-wkr"},
expected: []string{"service-ldr"},
},
{
name: "vllm_leader_has_no_dependencies",
role: RoleLeader,
backendFramework: BackendFrameworkVLLM,
leaderCliqueName: "service-ldr",
workerCliqueNames: []string{"service-wkr"},
expected: nil,
},
{
name: "sglang_worker_depends_on_leader",
role: RoleWorker,
backendFramework: BackendFrameworkSGLang,
leaderCliqueName: "service-ldr",
workerCliqueNames: []string{"service-wkr"},
expected: []string{"service-ldr"},
},
{
name: "sglang_leader_has_no_dependencies",
role: RoleLeader,
backendFramework: BackendFrameworkSGLang,
leaderCliqueName: "service-ldr",
workerCliqueNames: []string{"service-wkr"},
expected: nil,
},
{
name: "trtllm_leader_depends_on_workers",
role: RoleLeader,
......@@ -4188,30 +4138,6 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
workerCliqueNames: []string{"service-wkr"},
expected: nil,
},
{
name: "noop_backend_has_no_dependencies",
role: RoleWorker,
backendFramework: BackendFrameworkNoop,
leaderCliqueName: "service-ldr",
workerCliqueNames: []string{"service-wkr"},
expected: nil,
},
{
name: "main_role_has_no_dependencies",
role: RoleMain,
backendFramework: BackendFrameworkVLLM,
leaderCliqueName: "",
workerCliqueNames: nil,
expected: nil,
},
{
name: "worker_with_empty_leader_name",
role: RoleWorker,
backendFramework: BackendFrameworkVLLM,
leaderCliqueName: "",
workerCliqueNames: []string{"service-wkr"},
expected: nil,
},
{
name: "leader_with_empty_worker_names",
role: RoleLeader,
......@@ -4238,31 +4164,32 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
}
}
// deactivated for now.
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{}
tests := []struct {
name string
backendFramework string
expectedDeps map[string][]string // clique name -> expected StartsAfter dependencies
name string
backendFramework string
expectedDeps map[string][]string // clique name -> expected StartsAfter dependencies
expectStartupType bool
}{
{
name: "vllm_worker_starts_after_leader",
backendFramework: string(BackendFrameworkVLLM),
expectedDeps: map[string][]string{
"main-wkr": {"main-ldr"}, // worker starts after leader
"main-ldr": nil, // leader has no dependencies
"main-wkr": nil, // worker has no dependencies (startup ordering is TRTLLM-only)
"main-ldr": nil, // leader has no dependencies
},
expectStartupType: false,
},
{
name: "sglang_worker_starts_after_leader",
backendFramework: string(BackendFrameworkSGLang),
expectedDeps: map[string][]string{
"main-wkr": {"main-ldr"}, // worker starts after leader
"main-ldr": nil, // leader has no dependencies
"main-wkr": nil, // worker has no dependencies (startup ordering is TRTLLM-only)
"main-ldr": nil, // leader has no dependencies
},
expectStartupType: false,
},
{
name: "trtllm_leader_starts_after_worker",
......@@ -4271,6 +4198,7 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
"main-ldr": {"main-wkr"}, // leader starts after worker
"main-wkr": nil, // worker has no dependencies
},
expectStartupType: true,
},
}
......@@ -4314,8 +4242,14 @@ func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
}
// Verify that StartupType is Explicit when startup ordering is expected, and AnyOrder otherwise
if got.Spec.Template.StartupType == nil || *got.Spec.Template.StartupType != grovev1alpha1.CliqueStartupTypeExplicit {
t.Errorf("Expected StartupType to be CliqueStartupTypeExplicit, got %v", got.Spec.Template.StartupType)
if tt.expectStartupType {
if got.Spec.Template.StartupType == nil || *got.Spec.Template.StartupType != grovev1alpha1.CliqueStartupTypeExplicit {
t.Errorf("Expected StartupType to be CliqueStartupTypeExplicit, got %v", got.Spec.Template.StartupType)
}
} else {
if got.Spec.Template.StartupType == nil || *got.Spec.Template.StartupType != grovev1alpha1.CliqueStartupTypeAnyOrder {
t.Errorf("Expected StartupType to be CliqueStartupTypeAnyOrder, got %v", got.Spec.Template.StartupType)
}
}
// Verify StartsAfter dependencies for each clique
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment