Unverified Commit 9b893c93 authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

fix: improve sglang multinode handling in operator (#3151)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent c1907d12
...@@ -15,6 +15,16 @@ const ( ...@@ -15,6 +15,16 @@ const (
type SGLangBackend struct{} type SGLangBackend struct{}
// pythonCommandPattern matches a bare Python interpreter name: "python",
// optionally followed by a dotted numeric version such as "3", "3.11" or
// "3.12.1". It is compiled once at package scope so callers do not pay for a
// regex compilation on every check.
var pythonCommandPattern = regexp.MustCompile(`^python(\d+(\.\d+)*)?$`)

// isPythonCommand reports whether cmd is a bare Python interpreter name
// (python, python3, python3.11, python2.7, ...).
//
// The match is anchored on both ends, so names with any prefix or suffix
// (for example "python-config", "pythonx", "python " or a path such as
// "/usr/bin/python3") are not treated as Python interpreters.
func isPythonCommand(cmd string) bool {
	return pythonCommandPattern.MatchString(cmd)
}
func (b *SGLangBackend) UpdateContainer(container *corev1.Container, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentOverridesSpec, serviceName string, multinodeDeployer MultinodeDeployer) { func (b *SGLangBackend) UpdateContainer(container *corev1.Container, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentOverridesSpec, serviceName string, multinodeDeployer MultinodeDeployer) {
// For single node, nothing to do // For single node, nothing to do
if numberOfNodes <= 1 { if numberOfNodes <= 1 {
...@@ -29,16 +39,60 @@ func (b *SGLangBackend) UpdateContainer(container *corev1.Container, numberOfNod ...@@ -29,16 +39,60 @@ func (b *SGLangBackend) UpdateContainer(container *corev1.Container, numberOfNod
} }
// Generate the flags to add // Generate the flags to add
flags := b.getMultinodeFlags(numberOfNodes, role, serviceName, multinodeDeployer) flags, needsShell := b.getMultinodeFlags(numberOfNodes, role, serviceName, multinodeDeployer)
if flags == "" { if flags == "" {
return return
} }
// Flatten all args into a single command and inject flags /*
if len(container.Args) > 0 { * Flag Injection Strategy for Multinode SGLang Deployments
fullCommand := strings.Join(container.Args, " ") *
modifiedCommand := b.injectFlagsIntoPythonCommand(fullCommand, flags) * This code handles the injection of distributed training flags (--dist-init-addr, --nnodes, --node-rank)
container.Args = []string{modifiedCommand} * into container commands for multinode SGLang deployments. The complexity arises from supporting multiple
* container command patterns and ensuring proper environment variable interpretation.
*
* Two main scenarios are handled:
*
* 1. Direct Python Command (e.g., Command: ["python3"], Args: ["-m", "sglang", "..."])
* - If shell interpretation is needed (for env vars): Wrap in "sh -c" with exec
* - If no shell needed: Simply append flags to the Args array
*
* 2. Non-Python Command (e.g., Command: ["sh"], Args: ["-c", "python3 -m sglang ..."])
* - Use regex-based injection to find embedded Python+SGLang commands within args
* - Insert flags after the Python command but before any shell operators (|, &, ;)
*
* The needsShell flag indicates when environment variables require shell interpretation
*/
if len(container.Command) > 0 && isPythonCommand(container.Command[0]) {
// Direct python command case
if needsShell {
// Transform to shell wrapper for env var interpretation
fullCommand := strings.Join(container.Command, " ")
originalArgs := strings.Join(container.Args, " ")
var shellCommand string
if len(container.Args) > 0 {
// Use exec to ensure PID 1 is given to the python command
shellCommand = fmt.Sprintf("exec %s %s %s", fullCommand, originalArgs, flags)
} else {
// Use exec to ensure PID 1 is given to the python command
shellCommand = fmt.Sprintf("exec %s %s", fullCommand, flags)
}
container.Command = []string{"sh", "-c"}
container.Args = []string{shellCommand}
} else {
// Simple append to args
flagsSlice := strings.Fields(flags)
container.Args = append(container.Args, flagsSlice...)
}
} else {
// Non-python command case - try injection on each arg individually
for i, arg := range container.Args {
modifiedArg := b.injectFlagsIntoPythonCommand(arg, flags)
if modifiedArg != arg { // flags were successfully injected
container.Args[i] = modifiedArg
break // stop after first successful injection
}
}
} }
} }
...@@ -46,15 +100,22 @@ func (b *SGLangBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int ...@@ -46,15 +100,22 @@ func (b *SGLangBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int
// do nothing // do nothing
} }
// getMultinodeFlags returns the multinode flags as a single string // getMultinodeFlags returns the multinode flags and whether shell interpretation is needed
func (b *SGLangBackend) getMultinodeFlags(numberOfNodes int32, role Role, serviceName string, multinodeDeployer MultinodeDeployer) string { func (b *SGLangBackend) getMultinodeFlags(numberOfNodes int32, role Role, serviceName string, multinodeDeployer MultinodeDeployer) (string, bool) {
distInitAddr := fmt.Sprintf("%s:%s", multinodeDeployer.GetLeaderHostname(serviceName), SglangPort) distInitAddr := fmt.Sprintf("%s:%s", multinodeDeployer.GetLeaderHostname(serviceName), SglangPort)
nodeRank := multinodeDeployer.GetNodeRank()
// Determine node-rank var nodeRank string
var needsShell bool
if role == RoleLeader { if role == RoleLeader {
nodeRank = "0" nodeRank = "0"
needsShell = false
} else {
nodeRank, needsShell = multinodeDeployer.GetNodeRank()
} }
return fmt.Sprintf("--dist-init-addr %s --nnodes %d --node-rank %s", distInitAddr, numberOfNodes, nodeRank)
flags := fmt.Sprintf("--dist-init-addr %s --nnodes %d --node-rank %s", distInitAddr, numberOfNodes, nodeRank)
return flags, needsShell
} }
// injectFlagsIntoPythonCommand finds python sglang commands and adds flags after them // injectFlagsIntoPythonCommand finds python sglang commands and adds flags after them
......
...@@ -8,7 +8,182 @@ import ( ...@@ -8,7 +8,182 @@ import (
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
) )
func TestSGLangBackend_DirectFlagInjection(t *testing.T) { // Mock MultinodeDeployer for testing with no shell interpretation needed
// MockSimpleDeployer is a test double whose methods return plain literal
// values, so nothing it produces requires shell interpretation.
type MockSimpleDeployer struct{}

// GetLeaderHostname returns a fixed leader hostname; serviceName is ignored.
func (m *MockSimpleDeployer) GetLeaderHostname(serviceName string) string {
	return "leader.example.com"
}

// GetHostNames returns numberOfNodes hostnames: the leader hostname first,
// followed by one "worker<i>.example.com" entry per remaining node.
//
// A non-positive numberOfNodes yields nil instead of panicking on the leader
// slot (or inside make for negative counts).
//
// NOTE: the worker index is rendered as the single character '0'+i, which is a
// decimal digit only for worker indices 1 through 9.
func (m *MockSimpleDeployer) GetHostNames(serviceName string, numberOfNodes int32) []string {
	if numberOfNodes <= 0 {
		return nil
	}
	hostnames := make([]string, numberOfNodes)
	hostnames[0] = m.GetLeaderHostname(serviceName)
	for i := int32(1); i < numberOfNodes; i++ {
		hostnames[i] = "worker" + string(rune('0'+i)) + ".example.com"
	}
	return hostnames
}

// GetNodeRank returns a literal rank of "1" and false, meaning the rank does
// not need shell interpretation.
func (m *MockSimpleDeployer) GetNodeRank() (string, bool) {
	return "1", false
}
// MockShellDeployer is a test double whose hostnames and node rank are
// "$(VAR)"-style placeholders, and whose GetNodeRank reports that shell
// interpretation is needed.
type MockShellDeployer struct{}

// GetLeaderHostname returns the placeholder "$(LEADER_HOST)"; serviceName is
// ignored.
func (m *MockShellDeployer) GetLeaderHostname(serviceName string) string {
	return "$(LEADER_HOST)"
}

// GetHostNames returns numberOfNodes hostnames: the leader placeholder first,
// followed by one "$(WORKER_<i>_HOST)" placeholder per remaining node.
//
// A non-positive numberOfNodes yields nil instead of panicking on the leader
// slot (or inside make for negative counts).
//
// NOTE: the worker index is rendered as the single character '0'+i, which is a
// decimal digit only for worker indices 1 through 9.
func (m *MockShellDeployer) GetHostNames(serviceName string, numberOfNodes int32) []string {
	if numberOfNodes <= 0 {
		return nil
	}
	hostnames := make([]string, numberOfNodes)
	hostnames[0] = m.GetLeaderHostname(serviceName)
	for i := int32(1); i < numberOfNodes; i++ {
		hostnames[i] = "$(WORKER_" + string(rune('0'+i)) + "_HOST)"
	}
	return hostnames
}

// GetNodeRank returns the placeholder "$(WORKER_INDEX)" and true, meaning the
// rank needs shell interpretation.
func (m *MockShellDeployer) GetNodeRank() (string, bool) {
	return "$(WORKER_INDEX)", true
}
func TestSGLangBackend_PythonCommandInjection(t *testing.T) {
backend := &SGLangBackend{}
tests := []struct {
name string
numberOfNodes int32
role Role
multinodeDeployer MultinodeDeployer
initialCommand []string
initialArgs []string
expectedCommand []string
expectedArgs []string
description string
}{
{
name: "single node python command no changes",
numberOfNodes: 1,
role: RoleMain,
multinodeDeployer: &MockSimpleDeployer{},
initialCommand: []string{"python3"},
initialArgs: []string{"-m", "dynamo.sglang.worker"},
expectedCommand: []string{"python3"},
expectedArgs: []string{"-m", "dynamo.sglang.worker"},
description: "Single node should not modify python commands",
},
{
name: "python command simple deployer - direct append",
numberOfNodes: 2,
role: RoleWorker,
multinodeDeployer: &MockSimpleDeployer{},
initialCommand: []string{"python3"},
initialArgs: []string{"-m", "dynamo.sglang.worker", "--model", "llama"},
expectedCommand: []string{"python3"},
expectedArgs: []string{"-m", "dynamo.sglang.worker", "--model", "llama", "--dist-init-addr", "leader.example.com:29500", "--nnodes", "2", "--node-rank", "1"},
description: "Direct python command with simple deployer should append flags",
},
{
name: "python command shell deployer - shell wrapping",
numberOfNodes: 2,
role: RoleWorker,
multinodeDeployer: &MockShellDeployer{},
initialCommand: []string{"python3"},
initialArgs: []string{"-m", "dynamo.sglang.worker", "--model", "llama"},
expectedCommand: []string{"sh", "-c"},
expectedArgs: []string{"exec python3 -m dynamo.sglang.worker --model llama --dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank $(WORKER_INDEX)"},
description: "Direct python command with shell deployer should wrap with sh -c exec",
},
{
name: "python command leader role - always simple",
numberOfNodes: 3,
role: RoleLeader,
multinodeDeployer: &MockShellDeployer{},
initialCommand: []string{"python"},
initialArgs: []string{"-m", "dynamo.sglang.worker"},
expectedCommand: []string{"python"},
expectedArgs: []string{"-m", "dynamo.sglang.worker", "--dist-init-addr", "$(LEADER_HOST):29500", "--nnodes", "3", "--node-rank", "0"},
description: "Leader role should never use shell wrapping",
},
{
name: "python3.11 variant supported",
numberOfNodes: 2,
role: RoleWorker,
multinodeDeployer: &MockSimpleDeployer{},
initialCommand: []string{"python3.11"},
initialArgs: []string{"-m", "dynamo.sglang.worker"},
expectedCommand: []string{"python3.11"},
expectedArgs: []string{"-m", "dynamo.sglang.worker", "--dist-init-addr", "leader.example.com:29500", "--nnodes", "2", "--node-rank", "1"},
description: "Python version variants should be recognized",
},
{
name: "python command with module in command array - simple deployer",
numberOfNodes: 2,
role: RoleWorker,
multinodeDeployer: &MockSimpleDeployer{},
initialCommand: []string{"python3", "-m", "dynamo.sglang"},
initialArgs: []string{"--model-path", "Qwen/Qwen3-0.6B", "--tp-size", "8"},
expectedCommand: []string{"python3", "-m", "dynamo.sglang"},
expectedArgs: []string{"--model-path", "Qwen/Qwen3-0.6B", "--tp-size", "8", "--dist-init-addr", "leader.example.com:29500", "--nnodes", "2", "--node-rank", "1"},
description: "Multi-element python command should have flags appended to args",
},
{
name: "python command with module in command array - shell deployer",
numberOfNodes: 2,
role: RoleWorker,
multinodeDeployer: &MockShellDeployer{},
initialCommand: []string{"python3", "-m", "dynamo.sglang"},
initialArgs: []string{"--model-path", "Qwen/Qwen3-0.6B"},
expectedCommand: []string{"sh", "-c"},
expectedArgs: []string{"exec python3 -m dynamo.sglang --model-path Qwen/Qwen3-0.6B --dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank $(WORKER_INDEX)"},
description: "Multi-element python command with shell deployer should wrap entire command",
},
{
name: "python command with no args - shell deployer",
numberOfNodes: 2,
role: RoleWorker,
multinodeDeployer: &MockShellDeployer{},
initialCommand: []string{"python3", "-m", "dynamo.sglang"},
initialArgs: []string{},
expectedCommand: []string{"sh", "-c"},
expectedArgs: []string{"exec python3 -m dynamo.sglang --dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank $(WORKER_INDEX)"},
description: "Multi-element python command with no args should still work with shell wrapper",
},
{
name: "non-python command multinode unchanged",
numberOfNodes: 2,
role: RoleWorker,
multinodeDeployer: &MockShellDeployer{},
initialCommand: []string{"java"},
initialArgs: []string{"-jar", "app.jar"},
expectedCommand: []string{"java"},
expectedArgs: []string{"-jar", "app.jar"}, // Args remain separate, no python found, no changes
description: "Non-python commands should remain unchanged (no flattening)",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
container := &corev1.Container{
Command: append([]string{}, tt.initialCommand...),
Args: append([]string{}, tt.initialArgs...),
}
backend.UpdateContainer(container, tt.numberOfNodes, tt.role, &v1alpha1.DynamoComponentDeploymentOverridesSpec{}, "test-service", tt.multinodeDeployer)
if !reflect.DeepEqual(container.Command, tt.expectedCommand) {
t.Errorf("UpdateContainer() command = %v, want %v", container.Command, tt.expectedCommand)
}
if !reflect.DeepEqual(container.Args, tt.expectedArgs) {
t.Errorf("UpdateContainer() args = %v, want %v", container.Args, tt.expectedArgs)
}
})
}
}
func TestSGLangBackend_ShellCommandInjection(t *testing.T) {
backend := &SGLangBackend{} backend := &SGLangBackend{}
tests := []struct { tests := []struct {
...@@ -16,88 +191,108 @@ func TestSGLangBackend_DirectFlagInjection(t *testing.T) { ...@@ -16,88 +191,108 @@ func TestSGLangBackend_DirectFlagInjection(t *testing.T) {
numberOfNodes int32 numberOfNodes int32
role Role role Role
multinodeDeployer MultinodeDeployer multinodeDeployer MultinodeDeployer
initialCommand []string
initialArgs []string initialArgs []string
expectedArgs []string expectedArgs []string
description string description string
}{ }{
{ {
name: "single node does not modify args", name: "single node shell command not modified",
numberOfNodes: 1, numberOfNodes: 1,
role: RoleMain, role: RoleMain,
multinodeDeployer: &GroveMultinodeDeployer{}, multinodeDeployer: &GroveMultinodeDeployer{},
initialCommand: []string{"sh", "-c"},
initialArgs: []string{"python -m dynamo.sglang.worker"}, initialArgs: []string{"python -m dynamo.sglang.worker"},
expectedArgs: []string{"python -m dynamo.sglang.worker"}, expectedArgs: []string{"python -m dynamo.sglang.worker"},
description: "Single node should not modify anything", description: "Single node should not modify shell commands",
}, },
{ {
name: "multinode adds flags to simple python command", name: "multinode shell command with regex injection",
numberOfNodes: 2, numberOfNodes: 2,
role: RoleLeader, role: RoleLeader,
multinodeDeployer: &GroveMultinodeDeployer{}, multinodeDeployer: &GroveMultinodeDeployer{},
initialCommand: []string{"sh", "-c"},
initialArgs: []string{"python -m dynamo.sglang.worker"}, initialArgs: []string{"python -m dynamo.sglang.worker"},
expectedArgs: []string{"python -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 2 --node-rank 0"}, expectedArgs: []string{"python -m dynamo.sglang.worker --dist-init-addr $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE):29500 --nnodes 2 --node-rank 0"},
description: "Should add multinode flags directly to python command", description: "Shell commands should use regex injection for python commands",
}, },
{ {
name: "multinode with complex command", name: "multinode shell command with complex pipeline",
numberOfNodes: 2, numberOfNodes: 2,
role: RoleLeader, role: RoleLeader,
multinodeDeployer: &GroveMultinodeDeployer{}, multinodeDeployer: &GroveMultinodeDeployer{},
initialCommand: []string{"sh", "-c"},
initialArgs: []string{"echo blah | wc -l && python -m dynamo.sglang.worker && ls -al"}, initialArgs: []string{"echo blah | wc -l && python -m dynamo.sglang.worker && ls -al"},
expectedArgs: []string{"echo blah | wc -l && python -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 2 --node-rank 0 && ls -al"}, expectedArgs: []string{"echo blah | wc -l && python -m dynamo.sglang.worker --dist-init-addr $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE):29500 --nnodes 2 --node-rank 0 && ls -al"},
description: "Should add flags only to python command, not other commands", description: "Complex shell commands should inject flags only into python part",
}, },
{ {
name: "multinode worker with Grove deployment", name: "shell command worker with grove env vars",
numberOfNodes: 3, numberOfNodes: 3,
role: RoleWorker, role: RoleWorker,
multinodeDeployer: &GroveMultinodeDeployer{}, multinodeDeployer: &GroveMultinodeDeployer{},
initialCommand: []string{"sh", "-c"},
initialArgs: []string{"python -m dynamo.sglang.worker"}, initialArgs: []string{"python -m dynamo.sglang.worker"},
expectedArgs: []string{"python -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 3 --node-rank $((GROVE_PCLQ_POD_INDEX + 1))"}, expectedArgs: []string{"python -m dynamo.sglang.worker --dist-init-addr $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE):29500 --nnodes 3 --node-rank $((GROVE_PCLQ_POD_INDEX + 1))"},
description: "Worker should get correct node rank", description: "Shell command worker should get grove env vars in node rank",
}, },
{ {
name: "LWS deployment uses correct address", name: "shell command with LWS deployer",
numberOfNodes: 2, numberOfNodes: 2,
role: RoleLeader, role: RoleLeader,
multinodeDeployer: &LWSMultinodeDeployer{}, multinodeDeployer: &LWSMultinodeDeployer{},
initialCommand: []string{"sh", "-c"},
initialArgs: []string{"python -m dynamo.sglang.worker"}, initialArgs: []string{"python -m dynamo.sglang.worker"},
expectedArgs: []string{"python -m dynamo.sglang.worker --dist-init-addr ${LWS_LEADER_ADDRESS}:29500 --nnodes 2 --node-rank 0"}, expectedArgs: []string{"python -m dynamo.sglang.worker --dist-init-addr $(LWS_LEADER_ADDRESS):29500 --nnodes 2 --node-rank 0"},
description: "LWS deployment should use LWS_LEADER_ADDRESS", description: "LWS shell commands should use LWS variables",
}, },
{ {
name: "command with pipes gets flags before pipe", name: "shell command with pipes",
numberOfNodes: 2, numberOfNodes: 2,
role: RoleLeader, role: RoleLeader,
multinodeDeployer: &GroveMultinodeDeployer{}, multinodeDeployer: &GroveMultinodeDeployer{},
initialCommand: []string{"sh", "-c"},
initialArgs: []string{"python -m dynamo.sglang.worker | tee /tmp/log"}, initialArgs: []string{"python -m dynamo.sglang.worker | tee /tmp/log"},
expectedArgs: []string{"python -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 2 --node-rank 0 | tee /tmp/log"}, expectedArgs: []string{"python -m dynamo.sglang.worker --dist-init-addr $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE):29500 --nnodes 2 --node-rank 0 | tee /tmp/log"},
description: "Should insert flags before pipe operator", description: "Shell commands with pipes should inject flags before pipe",
}, },
{ {
name: "multiple args are flattened and processed together", name: "shell command multiple args individual processing",
numberOfNodes: 2, numberOfNodes: 2,
role: RoleLeader, role: RoleLeader,
multinodeDeployer: &GroveMultinodeDeployer{}, multinodeDeployer: &GroveMultinodeDeployer{},
initialCommand: []string{"sh", "-c"},
initialArgs: []string{"echo start", "python -m dynamo.sglang.worker", "echo done"}, initialArgs: []string{"echo start", "python -m dynamo.sglang.worker", "echo done"},
expectedArgs: []string{"echo start python -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 2 --node-rank 0 echo done"}, expectedArgs: []string{"echo start", "python -m dynamo.sglang.worker --dist-init-addr $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE):29500 --nnodes 2 --node-rank 0", "echo done"},
description: "Multiple args should be flattened and python command gets flags", description: "Shell commands with multiple args should process each individually, modify only the python arg",
}, },
{ {
name: "no sglang command means flattened but no changes", name: "shell command no sglang modules unchanged",
numberOfNodes: 2, numberOfNodes: 2,
role: RoleLeader, role: RoleLeader,
multinodeDeployer: &GroveMultinodeDeployer{}, multinodeDeployer: &GroveMultinodeDeployer{},
initialCommand: []string{"sh", "-c"},
initialArgs: []string{"echo hello", "python -m some.other.module"}, initialArgs: []string{"echo hello", "python -m some.other.module"},
expectedArgs: []string{"echo hello python -m some.other.module"}, expectedArgs: []string{"echo hello", "python -m some.other.module"},
description: "Non-sglang commands should be flattened but not modified", description: "Shell commands without sglang modules should remain unchanged (args stay separate)",
},
{
name: "shell command stops after first python injection",
numberOfNodes: 2,
role: RoleLeader,
multinodeDeployer: &GroveMultinodeDeployer{},
initialCommand: []string{"sh", "-c"},
initialArgs: []string{"python -m dynamo.sglang.worker", "python -m dynamo.sglang.worker --other-flags"},
expectedArgs: []string{"python -m dynamo.sglang.worker --dist-init-addr $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE):29500 --nnodes 2 --node-rank 0", "python -m dynamo.sglang.worker --other-flags"},
description: "Should stop processing after first successful python flag injection",
}, },
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
container := &corev1.Container{ container := &corev1.Container{
Args: append([]string{}, tt.initialArgs...), Command: append([]string{}, tt.initialCommand...),
Args: append([]string{}, tt.initialArgs...),
} }
backend.UpdateContainer(container, tt.numberOfNodes, tt.role, &v1alpha1.DynamoComponentDeploymentOverridesSpec{}, "test-service", tt.multinodeDeployer) backend.UpdateContainer(container, tt.numberOfNodes, tt.role, &v1alpha1.DynamoComponentDeploymentOverridesSpec{}, "test-service", tt.multinodeDeployer)
...@@ -106,14 +301,96 @@ func TestSGLangBackend_DirectFlagInjection(t *testing.T) { ...@@ -106,14 +301,96 @@ func TestSGLangBackend_DirectFlagInjection(t *testing.T) {
t.Errorf("UpdateContainer() args = %v, want %v", container.Args, tt.expectedArgs) t.Errorf("UpdateContainer() args = %v, want %v", container.Args, tt.expectedArgs)
} }
// Verify no environment variables were added // Verify command is still sh -c for shell commands
if len(container.Env) > 0 { expectedCommand := tt.initialCommand
t.Errorf("UpdateContainer() should not add environment variables, but added: %v", container.Env) if !reflect.DeepEqual(container.Command, expectedCommand) {
t.Errorf("UpdateContainer() should preserve shell command, got: %v, want: %v", container.Command, expectedCommand)
}
})
}
}
// TestIsPythonCommand checks isPythonCommand against names it must accept as
// Python interpreters and names it must reject. Each name runs as its own
// subtest, accepted names first and rejected names after.
func TestIsPythonCommand(t *testing.T) {
	accepted := []string{
		"python",
		"python3",
		"python2",
		"python3.11",
		"python2.7",
		"python3.12.1",
	}
	rejected := []string{
		"java",
		"sh",
		"node",
		"python-config", // a hyphenated suffix is not an interpreter name
		"",
		"python ", // trailing space invalidates the name
		"pythonx", // trailing non-version characters
	}

	// verify runs one subtest named after the command being checked.
	verify := func(cmd string, want bool) {
		t.Run(cmd, func(t *testing.T) {
			if got := isPythonCommand(cmd); got != want {
				t.Errorf("isPythonCommand(%q) = %v, want %v", cmd, got, want)
			}
		})
	}

	for _, cmd := range accepted {
		verify(cmd, true)
	}
	for _, cmd := range rejected {
		verify(cmd, false)
	}
}
func TestSGLangBackend_GetMultinodeFlags(t *testing.T) {
backend := &SGLangBackend{}
tests := []struct {
name string
numberOfNodes int32
role Role
multinodeDeployer MultinodeDeployer
expectedFlags string
expectedNeedsShell bool
description string
}{
{
name: "leader role never needs shell",
numberOfNodes: 2,
role: RoleLeader,
multinodeDeployer: &MockShellDeployer{},
expectedFlags: "--dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank 0",
expectedNeedsShell: false,
description: "Leader should always use rank 0 and no shell interpretation",
},
{
name: "worker with simple deployer",
numberOfNodes: 3,
role: RoleWorker,
multinodeDeployer: &MockSimpleDeployer{},
expectedFlags: "--dist-init-addr leader.example.com:29500 --nnodes 3 --node-rank 1",
expectedNeedsShell: false,
description: "Simple deployer should not need shell interpretation",
},
{
name: "worker with shell deployer",
numberOfNodes: 2,
role: RoleWorker,
multinodeDeployer: &MockShellDeployer{},
expectedFlags: "--dist-init-addr $(LEADER_HOST):29500 --nnodes 2 --node-rank $(WORKER_INDEX)",
expectedNeedsShell: true,
description: "Shell deployer should need shell interpretation for workers",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
flags, needsShell := backend.getMultinodeFlags(tt.numberOfNodes, tt.role, "test-service", tt.multinodeDeployer)
if flags != tt.expectedFlags {
t.Errorf("getMultinodeFlags() flags = %q, want %q", flags, tt.expectedFlags)
} }
// Verify command was not changed if needsShell != tt.expectedNeedsShell {
if len(container.Command) > 0 { t.Errorf("getMultinodeFlags() needsShell = %v, want %v", needsShell, tt.expectedNeedsShell)
t.Errorf("UpdateContainer() should not modify command, but set: %v", container.Command)
} }
}) })
} }
......
...@@ -60,7 +60,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) { ...@@ -60,7 +60,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{Name: commonconsts.MpiRunSshSecretName, MountPath: "/ssh-pk", ReadOnly: true}, {Name: commonconsts.MpiRunSshSecretName, MountPath: "/ssh-pk", ReadOnly: true},
}, },
expectedCommand: []string{"/bin/sh", "-c"}, expectedCommand: []string{"/bin/sh", "-c"},
expectedArgs: []string{"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 6 -H ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE},${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-wkr-0.${GROVE_HEADLESS_SERVICE},${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-wkr-1.${GROVE_HEADLESS_SERVICE} --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 --model test'"}, expectedArgs: []string{"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i 
~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 --model test'"},
expectedEnv: []corev1.EnvVar{ expectedEnv: []corev1.EnvVar{
{Name: "OMPI_MCA_orte_keep_fqdn_hostnames", Value: "1"}, {Name: "OMPI_MCA_orte_keep_fqdn_hostnames", Value: "1"},
}, },
...@@ -116,7 +116,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) { ...@@ -116,7 +116,7 @@ func TestTRTLLMBackend_UpdateContainer(t *testing.T) {
{Name: commonconsts.MpiRunSshSecretName, MountPath: "/ssh-pk", ReadOnly: true}, {Name: commonconsts.MpiRunSshSecretName, MountPath: "/ssh-pk", ReadOnly: true},
}, },
expectedCommand: []string{"/bin/sh", "-c"}, expectedCommand: []string{"/bin/sh", "-c"},
expectedArgs: []string{"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H ${LWS_LEADER_ADDRESS},${LWS_WORKER_1_ADDRESS} --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 --model test'"}, expectedArgs: []string{"mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(LWS_LEADER_ADDRESS),$(LWS_WORKER_1_ADDRESS) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x OMPI_MCA_orte_keep_fqdn_hostnames -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && 
trtllm-llmapi-launch python3 --model test'"},
expectedEnv: []corev1.EnvVar{ expectedEnv: []corev1.EnvVar{
{Name: "OMPI_MCA_orte_keep_fqdn_hostnames", Value: "1"}, {Name: "OMPI_MCA_orte_keep_fqdn_hostnames", Value: "1"},
}, },
...@@ -415,8 +415,8 @@ func TestTRTLLMBackend_generateWorkerHostnames(t *testing.T) { ...@@ -415,8 +415,8 @@ func TestTRTLLMBackend_generateWorkerHostnames(t *testing.T) {
multinodeDeployer: &LWSMultinodeDeployer{}, multinodeDeployer: &LWSMultinodeDeployer{},
serviceName: "test-service", serviceName: "test-service",
expectedContains: []string{ expectedContains: []string{
"${LWS_LEADER_ADDRESS}", "$(LWS_LEADER_ADDRESS)",
"${LWS_WORKER_1_ADDRESS}", "$(LWS_WORKER_1_ADDRESS)",
}, },
expectedNodeCount: 2, expectedNodeCount: 2,
}, },
...@@ -440,10 +440,10 @@ func TestTRTLLMBackend_generateWorkerHostnames(t *testing.T) { ...@@ -440,10 +440,10 @@ func TestTRTLLMBackend_generateWorkerHostnames(t *testing.T) {
multinodeDeployer: &LWSMultinodeDeployer{}, multinodeDeployer: &LWSMultinodeDeployer{},
serviceName: "worker", serviceName: "worker",
expectedContains: []string{ expectedContains: []string{
"${LWS_LEADER_ADDRESS}", "$(LWS_LEADER_ADDRESS)",
"${LWS_WORKER_1_ADDRESS}", "$(LWS_WORKER_1_ADDRESS)",
"${LWS_WORKER_2_ADDRESS}", "$(LWS_WORKER_2_ADDRESS)",
"${LWS_WORKER_3_ADDRESS}", "$(LWS_WORKER_3_ADDRESS)",
}, },
expectedNodeCount: 4, expectedNodeCount: 4,
}, },
...@@ -563,7 +563,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) { ...@@ -563,7 +563,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
}, },
initialArgs: []string{"python3", "--model", "test"}, initialArgs: []string{"python3", "--model", "test"},
initialCommand: []string{}, initialCommand: []string{},
expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 6 -H ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE},${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-wkr-0.${GROVE_HEADLESS_SERVICE},${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-wkr-1.${GROVE_HEADLESS_SERVICE} --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 --model test'", expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 6 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-wkr-1.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x 
HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python3 --model test'",
}, },
{ {
name: "Leader with command and no GPU resources", name: "Leader with command and no GPU resources",
...@@ -573,7 +573,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) { ...@@ -573,7 +573,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{}, component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{},
initialArgs: []string{}, initialArgs: []string{},
initialCommand: []string{"python", "-m", "worker"}, initialCommand: []string{"python", "-m", "worker"},
expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 0 -H ${LWS_LEADER_ADDRESS},${LWS_WORKER_1_ADDRESS} --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python -m worker'", expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 0 -H $(LWS_LEADER_ADDRESS),$(LWS_WORKER_1_ADDRESS) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch python -m worker'",
}, },
{ {
name: "Leader with both command and args (args take precedence)", name: "Leader with both command and args (args take precedence)",
...@@ -591,7 +591,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) { ...@@ -591,7 +591,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
}, },
initialArgs: []string{"launch", "--config", "test.yaml"}, initialArgs: []string{"launch", "--config", "test.yaml"},
initialCommand: []string{"ignored-command"}, initialCommand: []string{"ignored-command"},
expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-ldr-0.${GROVE_HEADLESS_SERVICE},${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-wkr-0.${GROVE_HEADLESS_SERVICE} --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch launch --config test.yaml'", expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x 
TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch launch --config test.yaml'",
}, },
{ {
name: "Leader with all environment variables forwarded", name: "Leader with all environment variables forwarded",
...@@ -609,7 +609,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) { ...@@ -609,7 +609,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
}, },
initialArgs: []string{"serve", "--model", "test"}, initialArgs: []string{"serve", "--model", "test"},
initialCommand: []string{}, initialCommand: []string{},
expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-ldr-0.${GROVE_HEADLESS_SERVICE},${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-wkr-0.${GROVE_HEADLESS_SERVICE} --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch serve --model test'", expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x 
TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch serve --model test'",
}, },
{ {
name: "Leader with overlapping environment variables (deduplication test)", name: "Leader with overlapping environment variables (deduplication test)",
...@@ -627,7 +627,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) { ...@@ -627,7 +627,7 @@ func TestTRTLLMBackend_setupLeaderContainer(t *testing.T) {
}, },
initialArgs: []string{"serve", "--model", "test"}, initialArgs: []string{"serve", "--model", "test"},
initialCommand: []string{}, initialCommand: []string{},
expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-ldr-0.${GROVE_HEADLESS_SERVICE},${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-wkr-0.${GROVE_HEADLESS_SERVICE} --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x CUSTOM_VAR -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH -x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch serve --model test'", expected: "mkdir -p ~/.ssh && ls -la /ssh-pk/ && cp /ssh-pk/private.key ~/.ssh/id_rsa && cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub && cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys && chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys && chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys && printf 'Host *\\nIdentityFile ~/.ssh/id_rsa\\nStrictHostKeyChecking no\\nPort 2222\\n' > ~/.ssh/config && mpirun --oversubscribe -n 2 -H $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-ldr-0.$(GROVE_HEADLESS_SERVICE),$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-wkr-0.$(GROVE_HEADLESS_SERVICE) --mca pml ob1 --mca plm_rsh_args \"-p 2222 -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" -x CUDA_VISIBLE_DEVICES -x CUSTOM_VAR -x HF_DATASETS_CACHE -x HF_ENDPOINT -x HF_HOME -x HF_TOKEN -x HOME -x HUGGING_FACE_HUB_TOKEN -x LD_LIBRARY_PATH -x MODEL_PATH -x NCCL_DEBUG -x NCCL_IB_DISABLE -x NCCL_P2P_DISABLE -x PATH -x PYTHONPATH 
-x TENSORRT_LLM_CACHE_DIR -x TOKENIZERS_PARALLELISM -x TRANSFORMERS_CACHE -x USER bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch serve --model test'",
}, },
} }
......
...@@ -53,7 +53,7 @@ func TestVLLMBackend_UpdateContainer(t *testing.T) { ...@@ -53,7 +53,7 @@ func TestVLLMBackend_UpdateContainer(t *testing.T) {
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{}, component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{},
multinodeDeployer: &GroveMultinodeDeployer{}, multinodeDeployer: &GroveMultinodeDeployer{},
initialArgs: []string{"python3", "-m", "dynamo.vllm", "--model", "test"}, initialArgs: []string{"python3", "-m", "dynamo.vllm", "--model", "test"},
expectedArgs: []string{"ray start --address=${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:6379 --block"}, expectedArgs: []string{"ray start --address=$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE):6379 --block"},
expectProbesRemoved: true, expectProbesRemoved: true,
}, },
{ {
...@@ -63,7 +63,7 @@ func TestVLLMBackend_UpdateContainer(t *testing.T) { ...@@ -63,7 +63,7 @@ func TestVLLMBackend_UpdateContainer(t *testing.T) {
component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{}, component: &v1alpha1.DynamoComponentDeploymentOverridesSpec{},
multinodeDeployer: &LWSMultinodeDeployer{}, multinodeDeployer: &LWSMultinodeDeployer{},
initialArgs: []string{"python3", "-m", "dynamo.vllm"}, initialArgs: []string{"python3", "-m", "dynamo.vllm"},
expectedArgs: []string{"ray start --address=${LWS_LEADER_ADDRESS}:6379 --block"}, expectedArgs: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
expectProbesRemoved: true, expectProbesRemoved: true,
}, },
{ {
...@@ -155,14 +155,14 @@ func TestUpdateVLLMMultinodeArgs(t *testing.T) { ...@@ -155,14 +155,14 @@ func TestUpdateVLLMMultinodeArgs(t *testing.T) {
role: RoleWorker, role: RoleWorker,
multinodeDeployer: &GroveMultinodeDeployer{}, multinodeDeployer: &GroveMultinodeDeployer{},
initialArgs: []string{"python3", "-m", "dynamo.vllm"}, initialArgs: []string{"python3", "-m", "dynamo.vllm"},
expectedArgs: []string{"ray start --address=${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:6379 --block"}, expectedArgs: []string{"ray start --address=$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE):6379 --block"},
}, },
{ {
name: "worker with LWS deployment", name: "worker with LWS deployment",
role: RoleWorker, role: RoleWorker,
multinodeDeployer: &LWSMultinodeDeployer{}, multinodeDeployer: &LWSMultinodeDeployer{},
initialArgs: []string{"python3", "-m", "dynamo.vllm"}, initialArgs: []string{"python3", "-m", "dynamo.vllm"},
expectedArgs: []string{"ray start --address=${LWS_LEADER_ADDRESS}:6379 --block"}, expectedArgs: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
}, },
{ {
name: "main role does not modify args", name: "main role does not modify args",
......
...@@ -623,7 +623,7 @@ func (b *NoopBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int32 ...@@ -623,7 +623,7 @@ func (b *NoopBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int32
// MultinodeDeployer is implemented by each multinode deployment mechanism to
// supply the hostnames and node rank that backends inject into container
// commands.
type MultinodeDeployer interface {
	// GetLeaderHostname returns the leader hostname to use for serviceName.
	GetLeaderHostname(serviceName string) string
	// GetHostNames returns the hostnames for numberOfNodes nodes of serviceName.
	GetHostNames(serviceName string, numberOfNodes int32) []string
	// GetNodeRank returns (rank, needsShellInterpretation): the rank
	// expression and whether it has to be evaluated by a shell.
	GetNodeRank() (string, bool)
}
// BackendFactory creates backend instances based on the framework type // BackendFactory creates backend instances based on the framework type
......
...@@ -1804,7 +1804,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1804,7 +1804,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
"-c", "-c",
}, },
Args: []string{ Args: []string{
"python3 -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-worker-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 3 --node-rank 0 --custom-flag custom-value", "python3 -m dynamo.sglang.worker --dist-init-addr $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-worker-ldr-0.$(GROVE_HEADLESS_SERVICE):29500 --nnodes 3 --node-rank 0 --custom-flag custom-value",
}, },
Ports: []corev1.ContainerPort{ Ports: []corev1.ContainerPort{
{ {
...@@ -1955,7 +1955,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1955,7 +1955,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
"-c", "-c",
}, },
Args: []string{ Args: []string{
"python3 -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-worker-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 3 --node-rank $((GROVE_PCLQ_POD_INDEX + 1)) --custom-flag custom-value", "python3 -m dynamo.sglang.worker --dist-init-addr $(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-worker-ldr-0.$(GROVE_HEADLESS_SERVICE):29500 --nnodes 3 --node-rank $((GROVE_PCLQ_POD_INDEX + 1)) --custom-flag custom-value",
}, },
Ports: []corev1.ContainerPort{ Ports: []corev1.ContainerPort{
{ {
...@@ -2739,7 +2739,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2739,7 +2739,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
"-c", "-c",
}, },
Args: []string{ Args: []string{
"ray start --address=${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-worker-ldr-0.${GROVE_HEADLESS_SERVICE}:6379 --block", "ray start --address=$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-worker-ldr-0.$(GROVE_HEADLESS_SERVICE):6379 --block",
}, },
Ports: []corev1.ContainerPort{ Ports: []corev1.ContainerPort{
{ {
...@@ -3182,7 +3182,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) { ...@@ -3182,7 +3182,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
ComponentType: commonconsts.ComponentTypeWorker, ComponentType: commonconsts.ComponentTypeWorker,
ExtraPodSpec: &common.ExtraPodSpec{ ExtraPodSpec: &common.ExtraPodSpec{
MainContainer: &corev1.Container{ MainContainer: &corev1.Container{
Args: []string{"python3", "-m", "dynamo.sglang.worker"}, Args: []string{"python3 -m dynamo.sglang.worker"},
}, },
}, },
}, },
...@@ -3201,7 +3201,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) { ...@@ -3201,7 +3201,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
ComponentType: commonconsts.ComponentTypeWorker, ComponentType: commonconsts.ComponentTypeWorker,
ExtraPodSpec: &common.ExtraPodSpec{ ExtraPodSpec: &common.ExtraPodSpec{
MainContainer: &corev1.Container{ MainContainer: &corev1.Container{
Args: []string{"python3", "-m", "dynamo.sglang.worker"}, Args: []string{"python3 -m dynamo.sglang.worker"},
}, },
}, },
}, },
...@@ -3219,7 +3219,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) { ...@@ -3219,7 +3219,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
ComponentType: commonconsts.ComponentTypeWorker, ComponentType: commonconsts.ComponentTypeWorker,
ExtraPodSpec: &common.ExtraPodSpec{ ExtraPodSpec: &common.ExtraPodSpec{
MainContainer: &corev1.Container{ MainContainer: &corev1.Container{
Args: []string{"python3", "-m", "dynamo.sglang.worker"}, Args: []string{"python3 -m dynamo.sglang.worker"},
}, },
}, },
}, },
...@@ -3377,7 +3377,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) { ...@@ -3377,7 +3377,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
role: RoleWorker, role: RoleWorker,
numberOfNodes: 3, numberOfNodes: 3,
expectError: false, expectError: false,
expectContains: []string{"ray start --address=${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-worker-ldr-0.${GROVE_HEADLESS_SERVICE}:6379 --block"}, expectContains: []string{"ray start --address=$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-worker-ldr-0.$(GROVE_HEADLESS_SERVICE):6379 --block"},
expectNotContains: []string{"python3 -m dynamo.vllm"}, expectNotContains: []string{"python3 -m dynamo.vllm"},
}, },
{ {
...@@ -4671,8 +4671,8 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { ...@@ -4671,8 +4671,8 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"},
{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"}, {Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"},
{Name: "DYN_SYSTEM_ENABLED", Value: "true"}, {Name: "DYN_SYSTEM_ENABLED", Value: "true"},
{Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "[\"generate\"]"},
{Name: "DYN_SYSTEM_PORT", Value: "9090"}, {Name: "DYN_SYSTEM_PORT", Value: "9090"},
{Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "[\"generate\"]"},
}, },
VolumeMounts: []corev1.VolumeMount{ VolumeMounts: []corev1.VolumeMount{
{ {
......
...@@ -25,11 +25,12 @@ type GroveMultinodeDeployer struct { ...@@ -25,11 +25,12 @@ type GroveMultinodeDeployer struct {
} }
func (d *GroveMultinodeDeployer) GetLeaderHostname(serviceName string) string { func (d *GroveMultinodeDeployer) GetLeaderHostname(serviceName string) string {
return fmt.Sprintf("${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-%s-%s-0.${GROVE_HEADLESS_SERVICE}", serviceName, commonconsts.GroveRoleSuffixLeader) return fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-0.$(GROVE_HEADLESS_SERVICE)", serviceName, commonconsts.GroveRoleSuffixLeader)
} }
func (d *GroveMultinodeDeployer) GetNodeRank() string { func (d *GroveMultinodeDeployer) GetNodeRank() (string, bool) {
return "$((GROVE_PCLQ_POD_INDEX + 1))" // This requires shell expansion for arithmetic expression
return "$((GROVE_PCLQ_POD_INDEX + 1))", true
} }
func (d *GroveMultinodeDeployer) GetHostNames(serviceName string, numberOfNodes int32) []string { func (d *GroveMultinodeDeployer) GetHostNames(serviceName string, numberOfNodes int32) []string {
...@@ -38,7 +39,7 @@ func (d *GroveMultinodeDeployer) GetHostNames(serviceName string, numberOfNodes ...@@ -38,7 +39,7 @@ func (d *GroveMultinodeDeployer) GetHostNames(serviceName string, numberOfNodes
hostnames = append(hostnames, leaderHostname) hostnames = append(hostnames, leaderHostname)
// Add worker hostnames // Add worker hostnames
for i := int32(0); i < numberOfNodes-1; i++ { for i := int32(0); i < numberOfNodes-1; i++ {
workerHostname := fmt.Sprintf("${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-%s-%s-%d.${GROVE_HEADLESS_SERVICE}", workerHostname := fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-%d.$(GROVE_HEADLESS_SERVICE)",
serviceName, commonconsts.GroveRoleSuffixWorker, i) serviceName, commonconsts.GroveRoleSuffixWorker, i)
hostnames = append(hostnames, workerHostname) hostnames = append(hostnames, workerHostname)
} }
......
...@@ -7,18 +7,19 @@ type LWSMultinodeDeployer struct { ...@@ -7,18 +7,19 @@ type LWSMultinodeDeployer struct {
} }
func (d *LWSMultinodeDeployer) GetLeaderHostname(serviceName string) string { func (d *LWSMultinodeDeployer) GetLeaderHostname(serviceName string) string {
return "${LWS_LEADER_ADDRESS}" return "$(LWS_LEADER_ADDRESS)"
} }
func (d *LWSMultinodeDeployer) GetNodeRank() string { func (d *LWSMultinodeDeployer) GetNodeRank() (string, bool) {
return "${LWS_WORKER_INDEX}" // This requires shell expansion for variable substitution
return "$(LWS_WORKER_INDEX)", true
} }
func (d *LWSMultinodeDeployer) GetHostNames(serviceName string, numberOfNodes int32) []string { func (d *LWSMultinodeDeployer) GetHostNames(serviceName string, numberOfNodes int32) []string {
hostnames := make([]string, numberOfNodes) hostnames := make([]string, numberOfNodes)
hostnames[0] = d.GetLeaderHostname(serviceName) hostnames[0] = d.GetLeaderHostname(serviceName)
for i := int32(1); i < numberOfNodes; i++ { for i := int32(1); i < numberOfNodes; i++ {
hostnames[i] = fmt.Sprintf("${LWS_WORKER_%d_ADDRESS}", i) hostnames[i] = fmt.Sprintf("$(LWS_WORKER_%d_ADDRESS)", i)
} }
return hostnames return hostnames
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment