Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
dabd2267
Unverified
Commit
dabd2267
authored
Aug 11, 2025
by
julienmancuso
Committed by
GitHub
Aug 11, 2025
Browse files
feat: add grove multinode support (#2269)
parent
d51580a4
Changes
23
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1599 additions
and
324 deletions
+1599
-324
deploy/cloud/helm/crds/Chart.yaml
deploy/cloud/helm/crds/Chart.yaml
+1
-1
deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml
...crds/templates/nvidia.com_dynamocomponentdeployments.yaml
+10
-0
deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml
...elm/crds/templates/nvidia.com_dynamographdeployments.yaml
+10
-0
deploy/cloud/operator/api/dynamo/common/common.go
deploy/cloud/operator/api/dynamo/common/common.go
+1
-0
deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go
.../operator/api/v1alpha1/dynamocomponentdeployment_types.go
+11
-8
deploy/cloud/operator/api/v1alpha1/dynamographdeployment_types.go
...loud/operator/api/v1alpha1/dynamographdeployment_types.go
+3
-0
deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml
...nfig/crd/bases/nvidia.com_dynamocomponentdeployments.yaml
+10
-0
deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml
...r/config/crd/bases/nvidia.com_dynamographdeployments.yaml
+10
-0
deploy/cloud/operator/go.mod
deploy/cloud/operator/go.mod
+1
-1
deploy/cloud/operator/go.sum
deploy/cloud/operator/go.sum
+2
-2
deploy/cloud/operator/internal/consts/consts.go
deploy/cloud/operator/internal/consts/consts.go
+16
-0
deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go
...ternal/controller/dynamocomponentdeployment_controller.go
+17
-264
deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller_test.go
...l/controller/dynamocomponentdeployment_controller_test.go
+69
-30
deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go
...r/internal/controller/dynamographdeployment_controller.go
+19
-18
deploy/cloud/operator/internal/dynamo/backend_common.go
deploy/cloud/operator/internal/dynamo/backend_common.go
+13
-0
deploy/cloud/operator/internal/dynamo/backend_sglang.go
deploy/cloud/operator/internal/dynamo/backend_sglang.go
+93
-0
deploy/cloud/operator/internal/dynamo/backend_sglang_test.go
deploy/cloud/operator/internal/dynamo/backend_sglang_test.go
+202
-0
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
+252
-0
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
+808
-0
deploy/cloud/operator/internal/dynamo/backend_vllm.go
deploy/cloud/operator/internal/dynamo/backend_vllm.go
+51
-0
No files found.
deploy/cloud/helm/crds/Chart.yaml
View file @
dabd2267
...
@@ -16,5 +16,5 @@ apiVersion: v2
...
@@ -16,5 +16,5 @@ apiVersion: v2
name
:
dynamo-crds
name
:
dynamo-crds
description
:
A Helm chart for dynamo CRDs
description
:
A Helm chart for dynamo CRDs
type
:
application
type
:
application
version
:
0.4.
0
version
:
0.4.
1
dependencies
:
[]
dependencies
:
[]
\ No newline at end of file
deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml
View file @
dabd2267
...
@@ -404,6 +404,12 @@ spec:
...
@@ -404,6 +404,12 @@ spec:
minReplicas
:
minReplicas
:
type
:
integer
type
:
integer
type
:
object
type
:
object
backendFramework
:
enum
:
-
sglang
-
vllm
-
trtllm
type
:
string
componentType
:
componentType
:
type
:
string
type
:
string
dynamoComponent
:
dynamoComponent
:
...
@@ -5039,6 +5045,8 @@ spec:
...
@@ -5039,6 +5045,8 @@ spec:
type
:
string
type
:
string
memory
:
memory
:
type
:
string
type
:
string
nodes
:
type
:
string
type
:
object
type
:
object
requests
:
requests
:
properties
:
properties
:
...
@@ -5052,6 +5060,8 @@ spec:
...
@@ -5052,6 +5060,8 @@ spec:
type
:
string
type
:
string
memory
:
memory
:
type
:
string
type
:
string
nodes
:
type
:
string
type
:
object
type
:
object
type
:
object
type
:
object
runMode
:
runMode
:
...
...
deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml
View file @
dabd2267
...
@@ -44,6 +44,12 @@ spec:
...
@@ -44,6 +44,12 @@ spec:
type
:
object
type
:
object
spec
:
spec
:
properties
:
properties
:
backendFramework
:
enum
:
-
sglang
-
vllm
-
trtllm
type
:
string
dynamoGraph
:
dynamoGraph
:
type
:
string
type
:
string
envs
:
envs
:
...
@@ -5094,6 +5100,8 @@ spec:
...
@@ -5094,6 +5100,8 @@ spec:
type
:
string
type
:
string
memory
:
memory
:
type
:
string
type
:
string
nodes
:
type
:
string
type
:
object
type
:
object
requests
:
requests
:
properties
:
properties
:
...
@@ -5107,6 +5115,8 @@ spec:
...
@@ -5107,6 +5115,8 @@ spec:
type
:
string
type
:
string
memory
:
memory
:
type
:
string
type
:
string
nodes
:
type
:
string
type
:
object
type
:
object
type
:
object
type
:
object
runMode
:
runMode
:
...
...
deploy/cloud/operator/api/dynamo/common/common.go
View file @
dabd2267
...
@@ -26,6 +26,7 @@ type ResourceItem struct {
...
@@ -26,6 +26,7 @@ type ResourceItem struct {
CPU
string
`json:"cpu,omitempty"`
CPU
string
`json:"cpu,omitempty"`
Memory
string
`json:"memory,omitempty"`
Memory
string
`json:"memory,omitempty"`
GPU
string
`json:"gpu,omitempty"`
GPU
string
`json:"gpu,omitempty"`
Nodes
string
`json:"nodes,omitempty"`
Custom
map
[
string
]
string
`json:"custom,omitempty"`
Custom
map
[
string
]
string
`json:"custom,omitempty"`
}
}
...
...
deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go
View file @
dabd2267
...
@@ -42,6 +42,10 @@ type DynamoComponentDeploymentSpec struct {
...
@@ -42,6 +42,10 @@ type DynamoComponentDeploymentSpec struct {
// contains the tag of the DynamoComponent: for example, "my_package:MyService"
// contains the tag of the DynamoComponent: for example, "my_package:MyService"
DynamoTag
string
`json:"dynamoTag,omitempty"`
DynamoTag
string
`json:"dynamoTag,omitempty"`
// BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm")
// +kubebuilder:validation:Enum=sglang;vllm;trtllm
BackendFramework
string
`json:"backendFramework,omitempty"`
DynamoComponentDeploymentSharedSpec
`json:",inline"`
DynamoComponentDeploymentSharedSpec
`json:",inline"`
}
}
...
@@ -110,6 +114,13 @@ type IngressSpec struct {
...
@@ -110,6 +114,13 @@ type IngressSpec struct {
IngressControllerClassName
*
string
`json:"ingressControllerClassName,omitempty"`
IngressControllerClassName
*
string
`json:"ingressControllerClassName,omitempty"`
}
}
func
(
i
*
IngressSpec
)
IsVirtualServiceEnabled
()
bool
{
if
i
==
nil
{
return
false
}
return
i
.
Enabled
&&
i
.
UseVirtualService
&&
i
.
VirtualServiceGateway
!=
nil
}
// DynamoComponentDeploymentStatus defines the observed state of DynamoComponentDeployment
// DynamoComponentDeploymentStatus defines the observed state of DynamoComponentDeployment
type
DynamoComponentDeploymentStatus
struct
{
type
DynamoComponentDeploymentStatus
struct
{
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
...
@@ -195,11 +206,3 @@ func (s *DynamoComponentDeployment) SetDynamoDeploymentConfig(config []byte) {
...
@@ -195,11 +206,3 @@ func (s *DynamoComponentDeployment) SetDynamoDeploymentConfig(config []byte) {
Value
:
string
(
config
),
Value
:
string
(
config
),
})
})
}
}
// GetImage returns the docker image of the DynamoComponent
func
(
s
*
DynamoComponentDeployment
)
GetImage
()
string
{
if
s
.
Spec
.
ExtraPodSpec
!=
nil
&&
s
.
Spec
.
ExtraPodSpec
.
MainContainer
!=
nil
{
return
s
.
Spec
.
ExtraPodSpec
.
MainContainer
.
Image
}
return
""
}
deploy/cloud/operator/api/v1alpha1/dynamographdeployment_types.go
View file @
dabd2267
...
@@ -40,6 +40,9 @@ type DynamoGraphDeploymentSpec struct {
...
@@ -40,6 +40,9 @@ type DynamoGraphDeploymentSpec struct {
// Environment variables to be set in the deployment
// Environment variables to be set in the deployment
// +kubebuilder:validation:Optional
// +kubebuilder:validation:Optional
Envs
[]
corev1
.
EnvVar
`json:"envs,omitempty"`
Envs
[]
corev1
.
EnvVar
`json:"envs,omitempty"`
// BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm")
// +kubebuilder:validation:Enum=sglang;vllm;trtllm
BackendFramework
string
`json:"backendFramework,omitempty"`
}
}
// DynamoGraphDeploymentStatus defines the observed state of DynamoGraphDeployment.
// DynamoGraphDeploymentStatus defines the observed state of DynamoGraphDeployment.
...
...
deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml
View file @
dabd2267
...
@@ -404,6 +404,12 @@ spec:
...
@@ -404,6 +404,12 @@ spec:
minReplicas
:
minReplicas
:
type
:
integer
type
:
integer
type
:
object
type
:
object
backendFramework
:
enum
:
-
sglang
-
vllm
-
trtllm
type
:
string
componentType
:
componentType
:
type
:
string
type
:
string
dynamoComponent
:
dynamoComponent
:
...
@@ -5039,6 +5045,8 @@ spec:
...
@@ -5039,6 +5045,8 @@ spec:
type
:
string
type
:
string
memory
:
memory
:
type
:
string
type
:
string
nodes
:
type
:
string
type
:
object
type
:
object
requests
:
requests
:
properties
:
properties
:
...
@@ -5052,6 +5060,8 @@ spec:
...
@@ -5052,6 +5060,8 @@ spec:
type
:
string
type
:
string
memory
:
memory
:
type
:
string
type
:
string
nodes
:
type
:
string
type
:
object
type
:
object
type
:
object
type
:
object
runMode
:
runMode
:
...
...
deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml
View file @
dabd2267
...
@@ -44,6 +44,12 @@ spec:
...
@@ -44,6 +44,12 @@ spec:
type
:
object
type
:
object
spec
:
spec
:
properties
:
properties
:
backendFramework
:
enum
:
-
sglang
-
vllm
-
trtllm
type
:
string
dynamoGraph
:
dynamoGraph
:
type
:
string
type
:
string
envs
:
envs
:
...
@@ -5094,6 +5100,8 @@ spec:
...
@@ -5094,6 +5100,8 @@ spec:
type
:
string
type
:
string
memory
:
memory
:
type
:
string
type
:
string
nodes
:
type
:
string
type
:
object
type
:
object
requests
:
requests
:
properties
:
properties
:
...
@@ -5107,6 +5115,8 @@ spec:
...
@@ -5107,6 +5115,8 @@ spec:
type
:
string
type
:
string
memory
:
memory
:
type
:
string
type
:
string
nodes
:
type
:
string
type
:
object
type
:
object
type
:
object
type
:
object
runMode
:
runMode
:
...
...
deploy/cloud/operator/go.mod
View file @
dabd2267
...
@@ -6,7 +6,7 @@ toolchain go1.24.3
...
@@ -6,7 +6,7 @@ toolchain go1.24.3
require (
require (
emperror.dev/errors v0.8.1
emperror.dev/errors v0.8.1
github.com/NVIDIA/grove/operator/api v0.0.0-20250
717114148-daac6e53774f
github.com/NVIDIA/grove/operator/api v0.0.0-20250
801123021-8b42bac59ef2
github.com/bsm/gomega v1.27.10
github.com/bsm/gomega v1.27.10
github.com/google/go-cmp v0.7.0
github.com/google/go-cmp v0.7.0
github.com/imdario/mergo v0.3.6
github.com/imdario/mergo v0.3.6
...
...
deploy/cloud/operator/go.sum
View file @
dabd2267
emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE=
emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE=
github.com/NVIDIA/grove/operator/api v0.0.0-20250
717114148-daac6e53774f h1:2ePSNDm7/Tep8F99yCQVH8/vmn86L1cUzTbVlyNopmQ
=
github.com/NVIDIA/grove/operator/api v0.0.0-20250
801123021-8b42bac59ef2 h1:JLOj0GiubP3VlR0okIbuqljvl+e2Vccnu6LX6wL34G0
=
github.com/NVIDIA/grove/operator/api v0.0.0-20250
717114148-daac6e53774f/go.mod h1:nJL33lsBe+9xCcZLYkNYg1wucE4hJfa4ZfHm1zamuG0
=
github.com/NVIDIA/grove/operator/api v0.0.0-20250
801123021-8b42bac59ef2/go.mod h1:QlsR2wQLj9m/zVEqv5SsCPzyjN2ykYZ0r/NEnDf4WB4
=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
...
...
deploy/cloud/operator/internal/consts/consts.go
View file @
dabd2267
...
@@ -15,6 +15,8 @@ const (
...
@@ -15,6 +15,8 @@ const (
DynamoSystemPort
=
9090
DynamoSystemPort
=
9090
DynamoSystemPortName
=
"system"
DynamoSystemPortName
=
"system"
MpiRunSshPort
=
2222
EnvDynamoServicePort
=
"DYNAMO_PORT"
EnvDynamoServicePort
=
"DYNAMO_PORT"
KubeLabelDynamoSelector
=
"nvidia.com/selector"
KubeLabelDynamoSelector
=
"nvidia.com/selector"
...
@@ -47,4 +49,18 @@ const (
...
@@ -47,4 +49,18 @@ const (
// Metrics related constants
// Metrics related constants
KubeAnnotationEnableMetrics
=
"nvidia.com/enable-metrics"
// User-provided annotation to control metrics
KubeAnnotationEnableMetrics
=
"nvidia.com/enable-metrics"
// User-provided annotation to control metrics
KubeLabelMetricsEnabled
=
"nvidia.com/metrics-enabled"
// Controller-managed label for pod selection
KubeLabelMetricsEnabled
=
"nvidia.com/metrics-enabled"
// Controller-managed label for pod selection
KubeValueNameSharedMemory
=
"shared-memory"
// Grove multinode role suffixes
GroveRoleSuffixLeader
=
"ldr"
GroveRoleSuffixWorker
=
"wkr"
MpiRunSshSecretName
=
"mpi-run-ssh-secret"
)
type
MultinodeDeploymentType
string
const
(
MultinodeDeploymentTypeGrove
MultinodeDeploymentType
=
"grove"
MultinodeDeploymentTypeLWS
MultinodeDeploymentType
=
"lws"
)
)
deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go
View file @
dabd2267
...
@@ -22,11 +22,11 @@ package controller
...
@@ -22,11 +22,11 @@ package controller
import
(
import
(
"context"
"context"
"fmt"
"fmt"
"maps"
"os"
"os"
"strconv"
"strconv"
"time"
"time"
"github.com/imdario/mergo"
appsv1
"k8s.io/api/apps/v1"
appsv1
"k8s.io/api/apps/v1"
autoscalingv2
"k8s.io/api/autoscaling/v2"
autoscalingv2
"k8s.io/api/autoscaling/v2"
corev1
"k8s.io/api/core/v1"
corev1
"k8s.io/api/core/v1"
...
@@ -34,9 +34,9 @@ import (
...
@@ -34,9 +34,9 @@ import (
metav1
"k8s.io/apimachinery/pkg/apis/meta/v1"
metav1
"k8s.io/apimachinery/pkg/apis/meta/v1"
"emperror.dev/errors"
"emperror.dev/errors"
dynamoCommon
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/schemas"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/schemas"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
commonconsts
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
commonconsts
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
commonController
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
commonController
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
...
@@ -48,7 +48,6 @@ import (
...
@@ -48,7 +48,6 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/tools/record"
"k8s.io/utils/ptr"
ctrl
"sigs.k8s.io/controller-runtime"
ctrl
"sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client"
...
@@ -64,7 +63,6 @@ import (
...
@@ -64,7 +63,6 @@ import (
const
(
const
(
DefaultClusterName
=
"default"
DefaultClusterName
=
"default"
DefaultServiceAccountName
=
"default"
DefaultServiceAccountName
=
"default"
KubeValueNameSharedMemory
=
"shared-memory"
KubeAnnotationDeploymentStrategy
=
"nvidia.com/deployment-strategy"
KubeAnnotationDeploymentStrategy
=
"nvidia.com/deployment-strategy"
KubeAnnotationEnableStealingTrafficDebugMode
=
"nvidia.com/enable-stealing-traffic-debug-mode"
KubeAnnotationEnableStealingTrafficDebugMode
=
"nvidia.com/enable-stealing-traffic-debug-mode"
KubeAnnotationEnableDebugMode
=
"nvidia.com/enable-debug-mode"
KubeAnnotationEnableDebugMode
=
"nvidia.com/enable-debug-mode"
...
@@ -78,6 +76,7 @@ const (
...
@@ -78,6 +76,7 @@ const (
KubeAnnotationLWSSize
=
"nvidia.com/lws-size"
KubeAnnotationLWSSize
=
"nvidia.com/lws-size"
DeploymentTypeStandard
=
"standard"
DeploymentTypeStandard
=
"standard"
DeploymentTypeLeaderWorker
=
"leader-worker"
DeploymentTypeLeaderWorker
=
"leader-worker"
DeploymentTypeMultinodeGrove
=
"multinode-grove"
ComponentTypePlanner
=
"Planner"
ComponentTypePlanner
=
"Planner"
)
)
...
@@ -510,11 +509,7 @@ func (r *DynamoComponentDeploymentReconciler) generateLeaderPodTemplateSpec(ctx
...
@@ -510,11 +509,7 @@ func (r *DynamoComponentDeploymentReconciler) generateLeaderPodTemplateSpec(ctx
return
nil
,
errors
.
Wrap
(
err
,
"failed to generate leader pod template"
)
return
nil
,
errors
.
Wrap
(
err
,
"failed to generate leader pod template"
)
}
}
if
labels
!=
nil
{
maps
.
Copy
(
leaderPodTemplateSpec
.
ObjectMeta
.
Labels
,
labels
)
leaderPodTemplateSpec
.
ObjectMeta
.
Labels
=
labels
}
else
{
leaderPodTemplateSpec
.
ObjectMeta
.
Labels
=
make
(
map
[
string
]
string
)
}
leaderPodTemplateSpec
.
ObjectMeta
.
Labels
[
"role"
]
=
"leader"
leaderPodTemplateSpec
.
ObjectMeta
.
Labels
[
"role"
]
=
"leader"
leaderPodTemplateSpec
.
ObjectMeta
.
Labels
[
"instance-id"
]
=
fmt
.
Sprintf
(
"%d"
,
instanceID
)
leaderPodTemplateSpec
.
ObjectMeta
.
Labels
[
"instance-id"
]
=
fmt
.
Sprintf
(
"%d"
,
instanceID
)
delete
(
leaderPodTemplateSpec
.
ObjectMeta
.
Labels
,
commonconsts
.
KubeLabelDynamoSelector
)
delete
(
leaderPodTemplateSpec
.
ObjectMeta
.
Labels
,
commonconsts
.
KubeLabelDynamoSelector
)
...
@@ -556,11 +551,7 @@ func (r *DynamoComponentDeploymentReconciler) generateWorkerPodTemplateSpec(ctx
...
@@ -556,11 +551,7 @@ func (r *DynamoComponentDeploymentReconciler) generateWorkerPodTemplateSpec(ctx
return
nil
,
errors
.
Wrap
(
err
,
"failed to generate worker pod template"
)
return
nil
,
errors
.
Wrap
(
err
,
"failed to generate worker pod template"
)
}
}
if
labels
!=
nil
{
maps
.
Copy
(
workerPodTemplateSpec
.
ObjectMeta
.
Labels
,
labels
)
workerPodTemplateSpec
.
ObjectMeta
.
Labels
=
labels
}
else
{
workerPodTemplateSpec
.
ObjectMeta
.
Labels
=
make
(
map
[
string
]
string
)
}
workerPodTemplateSpec
.
ObjectMeta
.
Labels
[
"role"
]
=
"worker"
workerPodTemplateSpec
.
ObjectMeta
.
Labels
[
"role"
]
=
"worker"
workerPodTemplateSpec
.
ObjectMeta
.
Labels
[
"instance-id"
]
=
fmt
.
Sprintf
(
"%d"
,
instanceID
)
workerPodTemplateSpec
.
ObjectMeta
.
Labels
[
"instance-id"
]
=
fmt
.
Sprintf
(
"%d"
,
instanceID
)
delete
(
workerPodTemplateSpec
.
ObjectMeta
.
Labels
,
commonconsts
.
KubeLabelDynamoSelector
)
delete
(
workerPodTemplateSpec
.
ObjectMeta
.
Labels
,
commonconsts
.
KubeLabelDynamoSelector
)
...
@@ -988,8 +979,7 @@ func (r *DynamoComponentDeploymentReconciler) generateVirtualService(ctx context
...
@@ -988,8 +979,7 @@ func (r *DynamoComponentDeploymentReconciler) generateVirtualService(ctx context
},
},
}
}
vsEnabled
:=
opt
.
dynamoComponentDeployment
.
Spec
.
Ingress
!=
nil
&&
opt
.
dynamoComponentDeployment
.
Spec
.
Ingress
.
Enabled
&&
opt
.
dynamoComponentDeployment
.
Spec
.
Ingress
.
UseVirtualService
&&
opt
.
dynamoComponentDeployment
.
Spec
.
Ingress
.
VirtualServiceGateway
!=
nil
if
!
opt
.
dynamoComponentDeployment
.
Spec
.
Ingress
.
IsVirtualServiceEnabled
()
{
if
!
vsEnabled
{
log
.
Info
(
"VirtualService is not enabled"
)
log
.
Info
(
"VirtualService is not enabled"
)
return
vs
,
true
,
nil
return
vs
,
true
,
nil
}
}
...
@@ -1231,8 +1221,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
...
@@ -1231,8 +1221,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
kubeName
:=
r
.
getKubeName
(
opt
.
dynamoComponentDeployment
,
opt
.
isStealingTrafficDebugModeEnabled
)
kubeName
:=
r
.
getKubeName
(
opt
.
dynamoComponentDeployment
,
opt
.
isStealingTrafficDebugModeEnabled
)
containerPort
:=
commonconsts
.
DynamoServicePort
resourceAnnotations
:=
opt
.
dynamoComponentDeployment
.
Spec
.
Annotations
resourceAnnotations
:=
opt
.
dynamoComponentDeployment
.
Spec
.
Annotations
if
resourceAnnotations
==
nil
{
if
resourceAnnotations
==
nil
{
...
@@ -1241,192 +1229,22 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
...
@@ -1241,192 +1229,22 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
isDebugModeEnabled
:=
checkIfIsDebugModeEnabled
(
resourceAnnotations
)
isDebugModeEnabled
:=
checkIfIsDebugModeEnabled
(
resourceAnnotations
)
defaultEnvs
:=
[]
corev1
.
EnvVar
{
basePodSpec
,
err
:=
dynamo
.
GenerateBasePodSpecForController
(
opt
.
dynamoComponentDeployment
,
r
.
DockerSecretRetriever
,
r
.
Config
,
dynamo
.
RoleMain
,
consts
.
MultinodeDeploymentTypeLWS
)
{
Name
:
commonconsts
.
EnvDynamoServicePort
,
Value
:
fmt
.
Sprintf
(
"%d"
,
containerPort
),
},
}
if
r
.
Config
.
NatsAddress
!=
""
{
defaultEnvs
=
append
(
defaultEnvs
,
corev1
.
EnvVar
{
Name
:
"NATS_SERVER"
,
Value
:
r
.
Config
.
NatsAddress
,
})
}
if
r
.
Config
.
EtcdAddress
!=
""
{
defaultEnvs
=
append
(
defaultEnvs
,
corev1
.
EnvVar
{
Name
:
"ETCD_ENDPOINTS"
,
Value
:
r
.
Config
.
EtcdAddress
,
})
}
envs
:=
dynamo
.
MergeEnvs
(
opt
.
dynamoComponentDeployment
.
Spec
.
Envs
,
defaultEnvs
)
var
livenessProbe
*
corev1
.
Probe
if
opt
.
dynamoComponentDeployment
.
Spec
.
LivenessProbe
!=
nil
{
livenessProbe
=
opt
.
dynamoComponentDeployment
.
Spec
.
LivenessProbe
}
var
readinessProbe
*
corev1
.
Probe
if
opt
.
dynamoComponentDeployment
.
Spec
.
ReadinessProbe
!=
nil
{
readinessProbe
=
opt
.
dynamoComponentDeployment
.
Spec
.
ReadinessProbe
}
volumes
:=
make
([]
corev1
.
Volume
,
0
)
volumeMounts
:=
make
([]
corev1
.
VolumeMount
,
0
)
dynamoResources
:=
opt
.
dynamoComponentDeployment
.
Spec
.
Resources
resources
,
err
:=
getResourcesConfig
(
dynamoResources
)
if
err
!=
nil
{
if
err
!=
nil
{
err
=
errors
.
Wrap
(
err
,
"failed to ge
t resources config
"
)
err
=
errors
.
Wrap
(
err
,
"failed to ge
nerate base pod spec
"
)
return
nil
,
err
return
nil
,
err
}
}
sharedMemorySizeLimit
:=
resource
.
MustParse
(
"64Mi"
)
// Ensure we have at least one container (the main container should be there from GenerateBasePodSpec)
memoryLimit
:=
resources
.
Limits
[
corev1
.
ResourceMemory
]
if
len
(
basePodSpec
.
Containers
)
==
0
{
if
!
memoryLimit
.
IsZero
()
{
return
nil
,
errors
.
New
(
"no containers found in base pod spec"
)
sharedMemorySizeLimit
.
SetMilli
(
memoryLimit
.
MilliValue
()
/
2
)
}
volumes
=
append
(
volumes
,
corev1
.
Volume
{
Name
:
KubeValueNameSharedMemory
,
VolumeSource
:
corev1
.
VolumeSource
{
EmptyDir
:
&
corev1
.
EmptyDirVolumeSource
{
Medium
:
corev1
.
StorageMediumMemory
,
SizeLimit
:
&
sharedMemorySizeLimit
,
},
},
})
volumeMounts
=
append
(
volumeMounts
,
corev1
.
VolumeMount
{
Name
:
KubeValueNameSharedMemory
,
MountPath
:
"/dev/shm"
,
})
if
opt
.
dynamoComponentDeployment
.
Spec
.
PVC
!=
nil
{
volumes
=
append
(
volumes
,
corev1
.
Volume
{
Name
:
getPvcName
(
opt
.
dynamoComponentDeployment
,
opt
.
dynamoComponentDeployment
.
Spec
.
PVC
.
Name
),
VolumeSource
:
corev1
.
VolumeSource
{
PersistentVolumeClaim
:
&
corev1
.
PersistentVolumeClaimVolumeSource
{
ClaimName
:
getPvcName
(
opt
.
dynamoComponentDeployment
,
opt
.
dynamoComponentDeployment
.
Spec
.
PVC
.
Name
),
},
},
})
volumeMounts
=
append
(
volumeMounts
,
corev1
.
VolumeMount
{
Name
:
getPvcName
(
opt
.
dynamoComponentDeployment
,
opt
.
dynamoComponentDeployment
.
Spec
.
PVC
.
Name
),
MountPath
:
*
opt
.
dynamoComponentDeployment
.
Spec
.
PVC
.
MountPoint
,
})
}
imageName
:=
opt
.
dynamoComponentDeployment
.
GetImage
()
if
imageName
==
""
{
return
nil
,
errors
.
Errorf
(
"image is not set for component %s"
,
opt
.
dynamoComponentDeployment
.
Name
)
}
}
var
securityContext
*
corev1
.
SecurityContext
// Get the main container from the base spec
var
mainContainerSecurityContext
*
corev1
.
SecurityContext
container
:=
basePodSpec
.
Containers
[
0
]
enableRestrictedSecurityContext
:=
os
.
Getenv
(
"ENABLE_RESTRICTED_SECURITY_CONTEXT"
)
==
"true"
if
enableRestrictedSecurityContext
{
securityContext
=
&
corev1
.
SecurityContext
{
AllowPrivilegeEscalation
:
ptr
.
To
(
false
),
RunAsNonRoot
:
ptr
.
To
(
true
),
RunAsUser
:
ptr
.
To
(
int64
(
1000
)),
RunAsGroup
:
ptr
.
To
(
int64
(
1000
)),
SeccompProfile
:
&
corev1
.
SeccompProfile
{
Type
:
corev1
.
SeccompProfileTypeRuntimeDefault
,
},
Capabilities
:
&
corev1
.
Capabilities
{
Drop
:
[]
corev1
.
Capability
{
"ALL"
},
},
}
mainContainerSecurityContext
=
securityContext
.
DeepCopy
()
mainContainerSecurityContext
.
RunAsUser
=
ptr
.
To
(
int64
(
1034
))
}
containers
:=
make
([]
corev1
.
Container
,
0
,
2
)
containers
:=
make
([]
corev1
.
Container
,
0
,
2
)
// TODO: Temporarily disabling probes
container
:=
corev1
.
Container
{
Name
:
"main"
,
Image
:
imageName
,
LivenessProbe
:
livenessProbe
,
ReadinessProbe
:
readinessProbe
,
Resources
:
resources
,
Env
:
envs
,
TTY
:
true
,
Stdin
:
true
,
VolumeMounts
:
volumeMounts
,
Ports
:
[]
corev1
.
ContainerPort
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoContainerPortName
,
ContainerPort
:
int32
(
containerPort
),
// nolint: gosec
},
},
SecurityContext
:
mainContainerSecurityContext
,
}
// Add system port for worker components
if
opt
.
dynamoComponentDeployment
.
Spec
.
ComponentType
==
commonconsts
.
ComponentTypeWorker
{
container
.
Ports
=
append
(
container
.
Ports
,
corev1
.
ContainerPort
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoSystemPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoSystemPort
),
})
}
if
opt
.
dynamoComponentDeployment
.
Spec
.
EnvFromSecret
!=
nil
{
container
.
EnvFrom
=
[]
corev1
.
EnvFromSource
{
{
SecretRef
:
&
corev1
.
SecretEnvSource
{
LocalObjectReference
:
corev1
.
LocalObjectReference
{
Name
:
*
opt
.
dynamoComponentDeployment
.
Spec
.
EnvFromSecret
,
},
},
},
}
}
if
resourceAnnotations
[
"nvidia.com/enable-container-privileged"
]
==
commonconsts
.
KubeLabelValueTrue
{
if
container
.
SecurityContext
==
nil
{
container
.
SecurityContext
=
&
corev1
.
SecurityContext
{}
}
container
.
SecurityContext
.
Privileged
=
&
[]
bool
{
true
}[
0
]
}
if
resourceAnnotations
[
"nvidia.com/enable-container-ptrace"
]
==
commonconsts
.
KubeLabelValueTrue
{
if
container
.
SecurityContext
==
nil
{
container
.
SecurityContext
=
&
corev1
.
SecurityContext
{}
}
container
.
SecurityContext
.
Capabilities
=
&
corev1
.
Capabilities
{
Add
:
[]
corev1
.
Capability
{
"SYS_PTRACE"
},
}
}
if
resourceAnnotations
[
"nvidia.com/run-container-as-root"
]
==
commonconsts
.
KubeLabelValueTrue
{
if
container
.
SecurityContext
==
nil
{
container
.
SecurityContext
=
&
corev1
.
SecurityContext
{}
}
container
.
SecurityContext
.
RunAsUser
=
&
[]
int64
{
0
}[
0
]
}
// Merge extraPodSpecMainContainer into container, only overriding empty fields
if
opt
.
dynamoComponentDeployment
.
Spec
.
ExtraPodSpec
!=
nil
{
extraPodSpecMainContainer
:=
opt
.
dynamoComponentDeployment
.
Spec
.
ExtraPodSpec
.
MainContainer
if
extraPodSpecMainContainer
!=
nil
{
// Merge non empty fields from extraPodSpecMainContainer into container, only overriding empty fields
err
:=
mergo
.
Merge
(
&
container
,
extraPodSpecMainContainer
.
DeepCopy
())
if
err
!=
nil
{
err
=
errors
.
Wrapf
(
err
,
"failed to merge extraPodSpecMainContainer into container"
)
return
nil
,
err
}
// finally merge the envs from extraPodSpecMainContainer into container
container
.
Env
=
dynamo
.
MergeEnvs
(
container
.
Env
,
extraPodSpecMainContainer
.
Env
)
}
}
containers
=
append
(
containers
,
container
)
containers
=
append
(
containers
,
container
)
debuggerImage
:=
"python:3.12-slim"
debuggerImage
:=
"python:3.12-slim"
...
@@ -1465,42 +1283,14 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
...
@@ -1465,42 +1283,14 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
podLabels
[
commonconsts
.
KubeLabelDynamoSelector
]
=
kubeName
podLabels
[
commonconsts
.
KubeLabelDynamoSelector
]
=
kubeName
imagePullSecrets
:=
[]
corev1
.
LocalObjectReference
{}
podSpec
:=
&
basePodSpec
podSpec
.
Containers
=
containers
if
r
.
DockerSecretRetriever
==
nil
{
err
=
errors
.
New
(
"DockerSecretRetriever is not initialized"
)
return
}
secretsName
,
err
:=
r
.
DockerSecretRetriever
.
GetSecrets
(
opt
.
dynamoComponentDeployment
.
Namespace
,
imageName
)
if
err
!=
nil
{
err
=
errors
.
Wrapf
(
err
,
"failed to get secrets for component %s and image %s"
,
opt
.
dynamoComponentDeployment
.
Name
,
imageName
)
return
}
for
_
,
secretName
:=
range
secretsName
{
imagePullSecrets
=
append
(
imagePullSecrets
,
corev1
.
LocalObjectReference
{
Name
:
secretName
,
})
}
podSpec
:=
&
corev1
.
PodSpec
{}
if
opt
.
dynamoComponentDeployment
.
Spec
.
ExtraPodSpec
!=
nil
&&
opt
.
dynamoComponentDeployment
.
Spec
.
ExtraPodSpec
.
PodSpec
!=
nil
{
podSpec
=
opt
.
dynamoComponentDeployment
.
Spec
.
ExtraPodSpec
.
PodSpec
.
DeepCopy
()
}
podSpec
.
Containers
=
append
(
podSpec
.
Containers
,
containers
...
)
podSpec
.
Volumes
=
append
(
podSpec
.
Volumes
,
volumes
...
)
podSpec
.
ImagePullSecrets
=
append
(
podSpec
.
ImagePullSecrets
,
imagePullSecrets
...
)
extraPodMetadata
:=
opt
.
dynamoComponentDeployment
.
Spec
.
ExtraPodMetadata
extraPodMetadata
:=
opt
.
dynamoComponentDeployment
.
Spec
.
ExtraPodMetadata
if
extraPodMetadata
!=
nil
{
if
extraPodMetadata
!=
nil
{
for
k
,
v
:=
range
extraPodMetadata
.
Annotations
{
maps
.
Copy
(
podAnnotations
,
extraPodMetadata
.
Annotations
)
podAnnotations
[
k
]
=
v
maps
.
Copy
(
podLabels
,
extraPodMetadata
.
Labels
)
}
for
k
,
v
:=
range
extraPodMetadata
.
Labels
{
podLabels
[
k
]
=
v
}
}
}
if
podSpec
.
ServiceAccountName
==
""
{
if
podSpec
.
ServiceAccountName
==
""
{
...
@@ -1519,18 +1309,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
...
@@ -1519,18 +1309,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
}
}
}
}
if
resourceAnnotations
[
"nvidia.com/enable-host-ipc"
]
==
commonconsts
.
KubeLabelValueTrue
{
podSpec
.
HostIPC
=
true
}
if
resourceAnnotations
[
"nvidia.com/enable-host-network"
]
==
commonconsts
.
KubeLabelValueTrue
{
podSpec
.
HostNetwork
=
true
}
if
resourceAnnotations
[
"nvidia.com/enable-host-pid"
]
==
commonconsts
.
KubeLabelValueTrue
{
podSpec
.
HostPID
=
true
}
if
opt
.
isStealingTrafficDebugModeEnabled
||
isDebugModeEnabled
{
if
opt
.
isStealingTrafficDebugModeEnabled
||
isDebugModeEnabled
{
podSpec
.
ShareProcessNamespace
=
&
[]
bool
{
true
}[
0
]
podSpec
.
ShareProcessNamespace
=
&
[]
bool
{
true
}[
0
]
}
}
...
@@ -1546,31 +1324,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
...
@@ -1546,31 +1324,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
return
return
}
}
func
getResourcesConfig
(
resources
*
dynamoCommon
.
Resources
)
(
corev1
.
ResourceRequirements
,
error
)
{
defaultResources
:=
corev1
.
ResourceRequirements
{
Requests
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"300m"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"500Mi"
),
},
Limits
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"500m"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"1Gi"
),
},
}
if
resources
==
nil
{
return
defaultResources
,
nil
}
resourcesConfig
,
err
:=
controller_common
.
GetResourcesConfig
(
resources
)
if
err
!=
nil
{
return
corev1
.
ResourceRequirements
{},
errors
.
Wrapf
(
err
,
"failed to get resources config"
)
}
err
=
mergo
.
Merge
(
resourcesConfig
,
defaultResources
.
DeepCopy
())
if
err
!=
nil
{
return
corev1
.
ResourceRequirements
{},
errors
.
Wrapf
(
err
,
"failed to merge resources config"
)
}
return
*
resourcesConfig
,
nil
}
func
(
r
*
DynamoComponentDeploymentReconciler
)
generateService
(
opt
generateResourceOption
)
(
*
corev1
.
Service
,
bool
,
error
)
{
func
(
r
*
DynamoComponentDeploymentReconciler
)
generateService
(
opt
generateResourceOption
)
(
*
corev1
.
Service
,
bool
,
error
)
{
var
kubeName
string
var
kubeName
string
if
opt
.
isGenericService
{
if
opt
.
isGenericService
{
...
...
deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller_test.go
View file @
dabd2267
...
@@ -29,6 +29,7 @@ import (
...
@@ -29,6 +29,7 @@ import (
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
commonconsts
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
commonconsts
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/dynamo"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp"
"github.com/onsi/gomega"
"github.com/onsi/gomega"
"github.com/onsi/gomega/format"
"github.com/onsi/gomega/format"
...
@@ -823,6 +824,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
...
@@ -823,6 +824,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Spec
:
v1alpha1
.
DynamoComponentDeploymentSpec
{
Spec
:
v1alpha1
.
DynamoComponentDeploymentSpec
{
DynamoComponent
:
"test-lws-component"
,
DynamoComponent
:
"test-lws-component"
,
DynamoTag
:
"test-tag"
,
DynamoTag
:
"test-tag"
,
BackendFramework
:
string
(
dynamo
.
BackendFrameworkVLLM
),
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
Envs
:
[]
corev1
.
EnvVar
{
Envs
:
[]
corev1
.
EnvVar
{
{
{
...
@@ -837,10 +839,22 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
...
@@ -837,10 +839,22 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
"nvidia.com/lws-size"
:
"2"
,
"nvidia.com/lws-size"
:
"2"
,
},
},
Resources
:
&
common
.
Resources
{
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
CPU
:
"300m"
,
Memory
:
"500Mi"
,
},
Limits
:
&
common
.
ResourceItem
{
Limits
:
&
common
.
ResourceItem
{
GPU
:
"1"
,
GPU
:
"1"
,
},
},
},
},
ExtraPodMetadata
:
&
common
.
ExtraPodMetadata
{
Annotations
:
map
[
string
]
string
{
"nvidia.com/annotation1"
:
"annotation1"
,
},
Labels
:
map
[
string
]
string
{
"nvidia.com/label1"
:
"label1"
,
},
},
ExtraPodSpec
:
&
dynamoCommon
.
ExtraPodSpec
{
ExtraPodSpec
:
&
dynamoCommon
.
ExtraPodSpec
{
PodSpec
:
&
corev1
.
PodSpec
{
PodSpec
:
&
corev1
.
PodSpec
{
TerminationGracePeriodSeconds
:
ptr
.
To
(
int64
(
10
)),
TerminationGracePeriodSeconds
:
ptr
.
To
(
int64
(
10
)),
...
@@ -897,48 +911,58 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
...
@@ -897,48 +911,58 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
ObjectMeta
:
metav1
.
ObjectMeta
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Labels
:
map
[
string
]
string
{
Labels
:
map
[
string
]
string
{
"instance-id"
:
"0"
,
"instance-id"
:
"0"
,
commonconsts
.
KubeLabelMetricsEnabled
:
commonconsts
.
KubeLabelValueTrue
,
"role"
:
"leader"
,
"role"
:
"leader"
,
"nvidia.com/label1"
:
"label1"
,
},
},
Annotations
:
map
[
string
]
string
{
Annotations
:
map
[
string
]
string
{
"scheduling.k8s.io/group-name"
:
"test-lws-deploy-0"
,
"scheduling.k8s.io/group-name"
:
"test-lws-deploy-0"
,
"nvidia.com/annotation1"
:
"annotation1"
,
},
},
},
},
Spec
:
corev1
.
PodSpec
{
Spec
:
corev1
.
PodSpec
{
SchedulerName
:
"volcano"
,
SchedulerName
:
"volcano"
,
TerminationGracePeriodSeconds
:
ptr
.
To
(
int64
(
10
)),
TerminationGracePeriodSeconds
:
ptr
.
To
(
int64
(
10
)),
Volumes
:
[]
corev1
.
Volume
{
{
Name
:
"shared-memory"
,
VolumeSource
:
corev1
.
VolumeSource
{
EmptyDir
:
&
corev1
.
EmptyDirVolumeSource
{
Medium
:
corev1
.
StorageMediumMemory
,
SizeLimit
:
resource
.
NewQuantity
(
512
*
1024
*
1024
,
resource
.
BinarySI
),
// 512Mi default (calculated from memory limit)
},
},
},
},
Containers
:
[]
corev1
.
Container
{
Containers
:
[]
corev1
.
Container
{
{
{
Name
:
"main"
,
Name
:
"main"
,
Image
:
"test-image:latest"
,
Image
:
"test-image:latest"
,
Command
:
[]
string
{
"sh"
,
"-c"
},
Command
:
[]
string
{
"sh"
,
"-c"
},
Args
:
[]
string
{
"ray start --head --port=6379 && some dynamo command"
},
Args
:
[]
string
{
"ray start --head --port=6379 && some dynamo command"
},
Env
:
[]
corev1
.
EnvVar
{{
Name
:
"DYNAMO_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
)},
{
Name
:
"TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC"
,
Value
:
"test_value_from_dynamo_component_deployment_spec"
},
{
Name
:
"TEST_ENV_FROM_EXTRA_POD_SPEC"
,
Value
:
"test_value_from_extra_pod_spec"
}},
Env
:
[]
corev1
.
EnvVar
{{
Name
:
"TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC"
,
Value
:
"test_value_from_dynamo_component_deployment_spec"
},
{
Name
:
"TEST_ENV_FROM_EXTRA_POD_SPEC"
,
Value
:
"test_value_from_extra_pod_spec"
}
,
{
Name
:
"DYNAMO_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
)}
},
VolumeMoun
ts
:
[]
corev1
.
VolumeMoun
t
{
Por
ts
:
[]
corev1
.
ContainerPor
t
{
{
{
Name
:
"shared-memory"
,
MountPath
:
"/dev/shm"
,
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoServicePortName
,
ContainerPort
:
commonconsts
.
DynamoServicePort
,
},
},
},
},
Por
ts
:
[]
corev1
.
ContainerPor
t
{
VolumeMoun
ts
:
[]
corev1
.
VolumeMoun
t
{
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoServicePortName
,
ContainerPort
:
commonconsts
.
DynamoServicePort
,
Name
:
"shared-memory"
,
MountPath
:
"/dev/shm"
,
},
},
},
},
TTY
:
true
,
Stdin
:
true
,
Resources
:
corev1
.
ResourceRequirements
{
Resources
:
corev1
.
ResourceRequirements
{
Requests
:
corev1
.
ResourceList
{
Requests
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"300m"
),
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"300m"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"500Mi"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"500Mi"
),
},
},
Limits
:
corev1
.
ResourceList
{
Limits
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"500m"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"1Gi"
),
"nvidia.com/gpu"
:
resource
.
MustParse
(
"1"
),
"nvidia.com/gpu"
:
resource
.
MustParse
(
"1"
),
},
},
},
},
},
},
},
},
Volumes
:
[]
corev1
.
Volume
{{
Name
:
"shared-memory"
,
VolumeSource
:
corev1
.
VolumeSource
{
EmptyDir
:
&
corev1
.
EmptyDirVolumeSource
{
Medium
:
corev1
.
StorageMediumMemory
,
SizeLimit
:
limit
}}}},
ImagePullSecrets
:
nil
,
// Assuming default config gives empty secret name
ImagePullSecrets
:
nil
,
// Assuming default config gives empty secret name
ServiceAccountName
:
"default-test-sa"
,
// Updated to reflect mocked SA
ServiceAccountName
:
"default-test-sa"
,
// Updated to reflect mocked SA
},
},
...
@@ -947,37 +971,52 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
...
@@ -947,37 +971,52 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
ObjectMeta
:
metav1
.
ObjectMeta
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Labels
:
map
[
string
]
string
{
Labels
:
map
[
string
]
string
{
"instance-id"
:
"0"
,
"instance-id"
:
"0"
,
commonconsts
.
KubeLabelMetricsEnabled
:
commonconsts
.
KubeLabelValueTrue
,
"role"
:
"worker"
,
"role"
:
"worker"
,
"nvidia.com/label1"
:
"label1"
,
},
},
Annotations
:
map
[
string
]
string
{
Annotations
:
map
[
string
]
string
{
"scheduling.k8s.io/group-name"
:
"test-lws-deploy-0"
,
"scheduling.k8s.io/group-name"
:
"test-lws-deploy-0"
,
"nvidia.com/annotation1"
:
"annotation1"
,
},
},
},
},
Spec
:
corev1
.
PodSpec
{
Spec
:
corev1
.
PodSpec
{
TerminationGracePeriodSeconds
:
ptr
.
To
(
int64
(
10
)),
TerminationGracePeriodSeconds
:
ptr
.
To
(
int64
(
10
)),
SchedulerName
:
"volcano"
,
SchedulerName
:
"volcano"
,
Volumes
:
[]
corev1
.
Volume
{
{
Name
:
"shared-memory"
,
VolumeSource
:
corev1
.
VolumeSource
{
EmptyDir
:
&
corev1
.
EmptyDirVolumeSource
{
Medium
:
corev1
.
StorageMediumMemory
,
SizeLimit
:
resource
.
NewQuantity
(
512
*
1024
*
1024
,
resource
.
BinarySI
),
// 512Mi default (calculated from memory limit)
},
},
},
},
Containers
:
[]
corev1
.
Container
{
Containers
:
[]
corev1
.
Container
{
{
{
Name
:
"main"
,
Name
:
"main"
,
Image
:
"test-image:latest"
,
Image
:
"test-image:latest"
,
Command
:
[]
string
{
"sh"
,
"-c"
},
Command
:
[]
string
{
"sh"
,
"-c"
},
Args
:
[]
string
{
"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"
},
Args
:
[]
string
{
"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"
},
Env
:
[]
corev1
.
EnvVar
{{
Name
:
"DYNAMO_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
)},
{
Name
:
"TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC"
,
Value
:
"test_value_from_dynamo_component_deployment_spec"
},
{
Name
:
"TEST_ENV_FROM_EXTRA_POD_SPEC"
,
Value
:
"test_value_from_extra_pod_spec"
}},
Env
:
[]
corev1
.
EnvVar
{{
Name
:
"TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC"
,
Value
:
"test_value_from_dynamo_component_deployment_spec"
},
{
Name
:
"TEST_ENV_FROM_EXTRA_POD_SPEC"
,
Value
:
"test_value_from_extra_pod_spec"
}
,
{
Name
:
"DYNAMO_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
)}
},
VolumeMoun
ts
:
[]
corev1
.
VolumeMount
{{
Name
:
"shared-memory"
,
MountPath
:
"/dev/shm"
}},
Por
ts
:
[]
corev1
.
ContainerPort
{{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoServicePortName
,
ContainerPort
:
commonconsts
.
DynamoServicePort
}},
Por
ts
:
[]
corev1
.
ContainerPor
t
{
VolumeMoun
ts
:
[]
corev1
.
VolumeMoun
t
{
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoServicePortName
,
ContainerPort
:
commonconsts
.
DynamoServicePort
,
Name
:
"shared-memory"
,
MountPath
:
"/dev/shm"
,
},
},
},
},
TTY
:
true
,
Stdin
:
true
,
Resources
:
corev1
.
ResourceRequirements
{
Resources
:
corev1
.
ResourceRequirements
{
Requests
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"300m"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"500Mi"
)},
Limits
:
corev1
.
ResourceList
{
"nvidia.com/gpu"
:
resource
.
MustParse
(
"1"
)},
Limits
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"500m"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"1Gi"
),
"nvidia.com/gpu"
:
resource
.
MustParse
(
"1"
)},
Requests
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"300m"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"500Mi"
),
},
},
},
},
},
},
},
Volumes
:
[]
corev1
.
Volume
{{
Name
:
"shared-memory"
,
VolumeSource
:
corev1
.
VolumeSource
{
EmptyDir
:
&
corev1
.
EmptyDirVolumeSource
{
Medium
:
corev1
.
StorageMediumMemory
,
SizeLimit
:
limit
}}}},
ImagePullSecrets
:
nil
,
ImagePullSecrets
:
nil
,
ServiceAccountName
:
"default-test-sa"
,
// Updated to reflect mocked SA
ServiceAccountName
:
"default-test-sa"
,
// Updated to reflect mocked SA
},
},
...
...
deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go
View file @
dabd2267
...
@@ -222,14 +222,14 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
...
@@ -222,14 +222,14 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
return
true
return
true
}))
}))
// generate the main component virtual service
// generate the main component virtual service
if
r
.
Config
.
IngressConfig
.
UseVirtualService
()
{
mainComponentVirtualService
:=
dynamo
.
GenerateComponentVirtualService
(
ctx
,
dynamo
.
GetDynamoComponentName
(
dynamoDeployment
,
componentName
),
dynamoDeployment
.
Namespace
,
ingressSpec
)
mainComponentVirtualService
:=
dynamo
.
GenerateComponentVirtualService
(
ctx
,
dynamo
.
GetDynamoComponentName
(
dynamoDeployment
,
componentName
),
dynamoDeployment
.
Namespace
,
ingressSpec
)
if
err
!=
nil
{
if
err
!=
nil
{
logger
.
Error
(
err
,
"failed to generate the main component virtual service"
)
logger
.
Error
(
err
,
"failed to generate the main component virtual service"
)
return
""
,
""
,
""
,
fmt
.
Errorf
(
"failed to generate the main component virtual service: %w"
,
err
)
return
""
,
""
,
""
,
fmt
.
Errorf
(
"failed to generate the main component virtual service: %w"
,
err
)
}
}
_
,
syncedMainComponentVirtualService
,
err
:=
commonController
.
SyncResource
(
ctx
,
r
,
dynamoDeployment
,
func
(
ctx
context
.
Context
)
(
*
networkingv1beta1
.
VirtualService
,
bool
,
error
)
{
_
,
syncedMainComponentVirtualService
,
err
:=
commonController
.
SyncResource
(
ctx
,
r
,
dynamoDeployment
,
func
(
ctx
context
.
Context
)
(
*
networkingv1beta1
.
VirtualService
,
bool
,
error
)
{
vsEnabled
:=
ingressSpec
.
Enabled
&&
ingressSpec
.
UseVirtualService
&&
ingressSpec
.
VirtualServiceGateway
!=
nil
if
!
ingressSpec
.
IsVirtualServiceEnabled
()
{
if
!
vsEnabled
{
logger
.
Info
(
"VirtualService is not enabled"
)
logger
.
Info
(
"VirtualService is not enabled"
)
return
mainComponentVirtualService
,
true
,
nil
return
mainComponentVirtualService
,
true
,
nil
}
}
...
@@ -244,6 +244,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
...
@@ -244,6 +244,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
}))
}))
}
}
}
}
}
return
r
.
checkResourcesReadiness
(
resources
)
return
r
.
checkResourcesReadiness
(
resources
)
}
}
...
...
deploy/cloud/operator/internal/dynamo/backend_common.go
0 → 100644
View file @
dabd2267
package
dynamo
import
(
"fmt"
commonconsts
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
)
// generateGroveLeaderHostname generates the hostname for the leader pod in Grove multinode deployments
// The leader hostname follows the pattern: {GROVE_PCSG_NAME}-{GROVE_PCSG_INDEX}-serviceName-{GroveRoleSuffixLeader}-0.{GROVE_HEADLESS_SERVICE}
func
generateGroveLeaderHostname
(
serviceName
string
)
string
{
return
fmt
.
Sprintf
(
"${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-%s-%s-0.${GROVE_HEADLESS_SERVICE}"
,
serviceName
,
commonconsts
.
GroveRoleSuffixLeader
)
}
deploy/cloud/operator/internal/dynamo/backend_sglang.go
0 → 100644
View file @
dabd2267
package
dynamo
import
(
"fmt"
"regexp"
"strings"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
commonconsts
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1
"k8s.io/api/core/v1"
)
// SGLangBackend groups the SGLang-specific container and pod spec adjustments
// (see its UpdateContainer and UpdatePodSpec methods). It has no fields.
type
SGLangBackend
struct
{}
func
(
b
*
SGLangBackend
)
UpdateContainer
(
container
*
corev1
.
Container
,
numberOfNodes
int32
,
role
Role
,
component
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
,
multinodeDeploymentType
commonconsts
.
MultinodeDeploymentType
,
serviceName
string
)
{
// For single node, nothing to do
if
numberOfNodes
<=
1
{
return
}
// Remove probes for multinode leader and worker
if
role
==
RoleLeader
||
role
==
RoleWorker
{
container
.
LivenessProbe
=
nil
container
.
ReadinessProbe
=
nil
container
.
StartupProbe
=
nil
}
// Generate the flags to add
flags
:=
b
.
getMultinodeFlags
(
numberOfNodes
,
role
,
multinodeDeploymentType
,
serviceName
)
if
flags
==
""
{
return
}
// Flatten all args into a single command and inject flags
if
len
(
container
.
Args
)
>
0
{
fullCommand
:=
strings
.
Join
(
container
.
Args
,
" "
)
modifiedCommand
:=
b
.
injectFlagsIntoPythonCommand
(
fullCommand
,
flags
)
container
.
Args
=
[]
string
{
modifiedCommand
}
}
}
func
(
b
*
SGLangBackend
)
UpdatePodSpec
(
podSpec
*
corev1
.
PodSpec
,
numberOfNodes
int32
,
role
Role
,
component
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
,
multinodeDeploymentType
commonconsts
.
MultinodeDeploymentType
,
serviceName
string
)
{
// do nothing
}
// getMultinodeFlags returns the multinode flags as a single string
func
(
b
*
SGLangBackend
)
getMultinodeFlags
(
numberOfNodes
int32
,
role
Role
,
multinodeDeploymentType
commonconsts
.
MultinodeDeploymentType
,
serviceName
string
)
string
{
var
distInitAddr
,
nodeRank
string
// Determine dist-init-addr
if
multinodeDeploymentType
==
commonconsts
.
MultinodeDeploymentTypeGrove
{
leaderHostname
:=
generateGroveLeaderHostname
(
serviceName
)
distInitAddr
=
fmt
.
Sprintf
(
"%s:29500"
,
leaderHostname
)
}
else
{
distInitAddr
=
"${LWS_LEADER_ADDRESS}:29500"
}
// Determine node-rank
if
role
==
RoleLeader
{
nodeRank
=
"0"
}
else
{
if
multinodeDeploymentType
==
commonconsts
.
MultinodeDeploymentTypeGrove
{
nodeRank
=
"$((GROVE_PCLQ_POD_INDEX + 1))"
}
else
{
nodeRank
=
"${LWS_WORKER_INDEX}"
}
}
return
fmt
.
Sprintf
(
"--dist-init-addr %s --nnodes %d --node-rank %s"
,
distInitAddr
,
numberOfNodes
,
nodeRank
)
}
// injectFlagsIntoPythonCommand finds python sglang commands and adds flags after them
func
(
b
*
SGLangBackend
)
injectFlagsIntoPythonCommand
(
arg
,
flags
string
)
string
{
// Regex to match python commands that contain sglang
// Matches: python, python3, python3.11, etc. followed by sglang-related modules
pattern
:=
`(python[0-9.]*\s+[^|&;]*sglang[^|&;]*?)(\s|$|[|&;])`
re
:=
regexp
.
MustCompile
(
pattern
)
// Replace with the command + flags + whatever comes after
result
:=
re
.
ReplaceAllStringFunc
(
arg
,
func
(
match
string
)
string
{
// Extract the python command part and the delimiter
submatches
:=
re
.
FindStringSubmatch
(
match
)
if
len
(
submatches
)
>=
3
{
pythonCmd
:=
submatches
[
1
]
delimiter
:=
submatches
[
2
]
return
pythonCmd
+
" "
+
flags
+
delimiter
}
return
match
})
return
result
}
deploy/cloud/operator/internal/dynamo/backend_sglang_test.go
0 → 100644
View file @
dabd2267
package
dynamo
import
(
"reflect"
"testing"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1
"k8s.io/api/core/v1"
)
func
TestSGLangBackend_DirectFlagInjection
(
t
*
testing
.
T
)
{
backend
:=
&
SGLangBackend
{}
tests
:=
[]
struct
{
name
string
numberOfNodes
int32
role
Role
multinodeDeploymentType
consts
.
MultinodeDeploymentType
initialArgs
[]
string
expectedArgs
[]
string
description
string
}{
{
name
:
"single node does not modify args"
,
numberOfNodes
:
1
,
role
:
RoleMain
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeGrove
,
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
description
:
"Single node should not modify anything"
,
},
{
name
:
"multinode adds flags to simple python command"
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeGrove
,
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 2 --node-rank 0"
},
description
:
"Should add multinode flags directly to python command"
,
},
{
name
:
"multinode with complex command"
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeGrove
,
initialArgs
:
[]
string
{
"echo blah | wc -l && python -m dynamo.sglang.worker && ls -al"
},
expectedArgs
:
[]
string
{
"echo blah | wc -l && python -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 2 --node-rank 0 && ls -al"
},
description
:
"Should add flags only to python command, not other commands"
,
},
{
name
:
"multinode worker with Grove deployment"
,
numberOfNodes
:
3
,
role
:
RoleWorker
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeGrove
,
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 3 --node-rank $((GROVE_PCLQ_POD_INDEX + 1))"
},
description
:
"Worker should get correct node rank"
,
},
{
name
:
"LWS deployment uses correct address"
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeLWS
,
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr ${LWS_LEADER_ADDRESS}:29500 --nnodes 2 --node-rank 0"
},
description
:
"LWS deployment should use LWS_LEADER_ADDRESS"
,
},
{
name
:
"command with pipes gets flags before pipe"
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeGrove
,
initialArgs
:
[]
string
{
"python -m dynamo.sglang.worker | tee /tmp/log"
},
expectedArgs
:
[]
string
{
"python -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 2 --node-rank 0 | tee /tmp/log"
},
description
:
"Should insert flags before pipe operator"
,
},
{
name
:
"multiple args are flattened and processed together"
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeGrove
,
initialArgs
:
[]
string
{
"echo start"
,
"python -m dynamo.sglang.worker"
,
"echo done"
},
expectedArgs
:
[]
string
{
"echo start python -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 2 --node-rank 0 echo done"
},
description
:
"Multiple args should be flattened and python command gets flags"
,
},
{
name
:
"no sglang command means flattened but no changes"
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeGrove
,
initialArgs
:
[]
string
{
"echo hello"
,
"python -m some.other.module"
},
expectedArgs
:
[]
string
{
"echo hello python -m some.other.module"
},
description
:
"Non-sglang commands should be flattened but not modified"
,
},
}
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
container
:=
&
corev1
.
Container
{
Args
:
append
([]
string
{},
tt
.
initialArgs
...
),
}
backend
.
UpdateContainer
(
container
,
tt
.
numberOfNodes
,
tt
.
role
,
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{},
tt
.
multinodeDeploymentType
,
"test-service"
)
if
!
reflect
.
DeepEqual
(
container
.
Args
,
tt
.
expectedArgs
)
{
t
.
Errorf
(
"UpdateContainer() args = %v, want %v"
,
container
.
Args
,
tt
.
expectedArgs
)
}
// Verify no environment variables were added
if
len
(
container
.
Env
)
>
0
{
t
.
Errorf
(
"UpdateContainer() should not add environment variables, but added: %v"
,
container
.
Env
)
}
// Verify command was not changed
if
len
(
container
.
Command
)
>
0
{
t
.
Errorf
(
"UpdateContainer() should not modify command, but set: %v"
,
container
.
Command
)
}
})
}
}
func
TestSGLangBackend_ProbeRemoval
(
t
*
testing
.
T
)
{
backend
:=
&
SGLangBackend
{}
tests
:=
[]
struct
{
name
string
numberOfNodes
int32
role
Role
multinodeDeploymentType
consts
.
MultinodeDeploymentType
expectProbesRemoved
bool
}{
{
name
:
"single node does not remove probes"
,
numberOfNodes
:
1
,
role
:
RoleMain
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeGrove
,
expectProbesRemoved
:
false
,
},
{
name
:
"multinode leader removes probes"
,
numberOfNodes
:
2
,
role
:
RoleLeader
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeGrove
,
expectProbesRemoved
:
true
,
},
{
name
:
"multinode worker removes probes"
,
numberOfNodes
:
2
,
role
:
RoleWorker
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeGrove
,
expectProbesRemoved
:
true
,
},
{
name
:
"multinode main role does not remove probes"
,
numberOfNodes
:
2
,
role
:
RoleMain
,
multinodeDeploymentType
:
consts
.
MultinodeDeploymentTypeGrove
,
expectProbesRemoved
:
false
,
},
}
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
// Create initial probes
livenessProbe
:=
&
corev1
.
Probe
{
InitialDelaySeconds
:
30
}
readinessProbe
:=
&
corev1
.
Probe
{
InitialDelaySeconds
:
10
}
startupProbe
:=
&
corev1
.
Probe
{
InitialDelaySeconds
:
5
}
container
:=
&
corev1
.
Container
{
Args
:
[]
string
{
"python -m dynamo.sglang.worker"
},
LivenessProbe
:
livenessProbe
,
ReadinessProbe
:
readinessProbe
,
StartupProbe
:
startupProbe
,
}
backend
.
UpdateContainer
(
container
,
tt
.
numberOfNodes
,
tt
.
role
,
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{},
tt
.
multinodeDeploymentType
,
"test-service"
)
if
tt
.
expectProbesRemoved
{
if
container
.
LivenessProbe
!=
nil
{
t
.
Errorf
(
"Expected LivenessProbe to be removed, but it was not"
)
}
if
container
.
ReadinessProbe
!=
nil
{
t
.
Errorf
(
"Expected ReadinessProbe to be removed, but it was not"
)
}
if
container
.
StartupProbe
!=
nil
{
t
.
Errorf
(
"Expected StartupProbe to be removed, but it was not"
)
}
}
else
{
if
container
.
LivenessProbe
==
nil
{
t
.
Errorf
(
"Expected LivenessProbe to be preserved, but it was removed"
)
}
if
container
.
ReadinessProbe
==
nil
{
t
.
Errorf
(
"Expected ReadinessProbe to be preserved, but it was removed"
)
}
if
container
.
StartupProbe
==
nil
{
t
.
Errorf
(
"Expected StartupProbe to be preserved, but it was removed"
)
}
}
})
}
}
deploy/cloud/operator/internal/dynamo/backend_trtllm.go
0 → 100644
View file @
dabd2267
package
dynamo
import
(
"fmt"
"sort"
"strconv"
"strings"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
commonconsts
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)
// TRTLLMBackend groups the TRT-LLM-specific container and pod spec adjustments
// (see its UpdateContainer and UpdatePodSpec methods). It has no fields.
type
TRTLLMBackend
struct
{}
func
(
b
*
TRTLLMBackend
)
UpdateContainer
(
container
*
corev1
.
Container
,
numberOfNodes
int32
,
role
Role
,
component
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
,
multinodeDeploymentType
commonconsts
.
MultinodeDeploymentType
,
serviceName
string
)
{
// For single node, nothing to do
if
numberOfNodes
<=
1
{
return
}
// Configure probes for multinode deployments
if
role
==
RoleWorker
{
// For workers: remove liveness and startup probes, set readiness to check SSH port
container
.
LivenessProbe
=
nil
container
.
StartupProbe
=
nil
container
.
ReadinessProbe
=
&
corev1
.
Probe
{
ProbeHandler
:
corev1
.
ProbeHandler
{
TCPSocket
:
&
corev1
.
TCPSocketAction
{
Port
:
intstr
.
FromInt
(
commonconsts
.
MpiRunSshPort
),
},
},
InitialDelaySeconds
:
20
,
PeriodSeconds
:
20
,
TimeoutSeconds
:
5
,
FailureThreshold
:
10
,
}
}
// For leaders: leave all probes untouched
// Add SSH keypair volume mount for multinode deployments
b
.
addSSHVolumeMount
(
container
)
// Add OpenMPI environment variable to keep FQDN hostnames
envVar
:=
corev1
.
EnvVar
{
Name
:
"OMPI_MCA_orte_keep_fqdn_hostnames"
,
Value
:
"1"
,
}
container
.
Env
=
append
(
container
.
Env
,
envVar
)
// Update container command based on role
switch
role
{
case
RoleLeader
:
b
.
setupLeaderContainer
(
container
,
numberOfNodes
,
multinodeDeploymentType
,
serviceName
,
component
)
case
RoleWorker
:
b
.
setupWorkerContainer
(
container
)
}
}
func
(
b
*
TRTLLMBackend
)
UpdatePodSpec
(
podSpec
*
corev1
.
PodSpec
,
numberOfNodes
int32
,
role
Role
,
component
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
,
multinodeDeploymentType
commonconsts
.
MultinodeDeploymentType
,
serviceName
string
)
{
// Add SSH keypair volume for TRTLLM multinode deployments
if
numberOfNodes
>
1
{
sshVolume
:=
corev1
.
Volume
{
Name
:
commonconsts
.
MpiRunSshSecretName
,
VolumeSource
:
corev1
.
VolumeSource
{
Secret
:
&
corev1
.
SecretVolumeSource
{
SecretName
:
commonconsts
.
MpiRunSshSecretName
,
DefaultMode
:
func
()
*
int32
{
mode
:=
int32
(
0644
);
return
&
mode
}(),
},
},
}
podSpec
.
Volumes
=
append
(
podSpec
.
Volumes
,
sshVolume
)
}
}
// addSSHVolumeMount adds the SSH keypair secret volume mount to the container
func
(
b
*
TRTLLMBackend
)
addSSHVolumeMount
(
container
*
corev1
.
Container
)
{
sshVolumeMount
:=
corev1
.
VolumeMount
{
Name
:
commonconsts
.
MpiRunSshSecretName
,
MountPath
:
"/ssh-pk"
,
ReadOnly
:
true
,
}
container
.
VolumeMounts
=
append
(
container
.
VolumeMounts
,
sshVolumeMount
)
}
// setupLeaderContainer rewrites the leader container so that it runs, through
// "/bin/sh -c", an SSH client setup sequence followed by a single mpirun
// invocation. All steps are chained with " && ".
//
// The original workload is taken from container.Args (joined with spaces), or
// from container.Command when Args is empty. It is wrapped in a
// bash -c '... trtllm-llmapi-launch <original>' string and handed to mpirun.
//
// mpirun receives:
//   - -n: numberOfNodes multiplied by getGPUsPerNode(component.Resources)
//   - -H: the host list from generateWorkerHostnames
//   - environment flags from generateEnvVarFlags(container.Env)
//
// NOTE(review): component is dereferenced without a nil check — confirm that
// callers always pass a non-nil component.
// NOTE(review): getGPUsPerNode returns 0 when no GPU count is set, which
// yields "-n 0" — confirm that this case cannot occur in practice.
// NOTE(review): ~/.ssh/authorized_keys is listed in both chmod commands, so it
// ends up with mode 644.
func
(
b
*
TRTLLMBackend
)
setupLeaderContainer
(
container
*
corev1
.
Container
,
numberOfNodes
int32
,
multinodeDeploymentType
commonconsts
.
MultinodeDeploymentType
,
serviceName
string
,
component
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
)
{
// Generate the comma-separated host list (leader first, then workers)
workerHosts
:=
b
.
generateWorkerHostnames
(
numberOfNodes
,
multinodeDeploymentType
,
serviceName
)
// Capture the original workload: Args take precedence over Command
var
originalCommand
string
if
len
(
container
.
Args
)
>
0
{
originalCommand
=
strings
.
Join
(
container
.
Args
,
" "
)
}
else
if
len
(
container
.
Command
)
>
0
{
originalCommand
=
strings
.
Join
(
container
.
Command
,
" "
)
}
// SSH client setup steps that run before mpirun
sshSetupCommands
:=
[]
string
{
"mkdir -p ~/.ssh"
,
"ls -la /ssh-pk/"
,
// Debug: list files in ssh-pk directory
"cp /ssh-pk/private.key ~/.ssh/id_rsa"
,
"cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub"
,
"cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys"
,
"chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys"
,
"chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys"
,
fmt
.
Sprintf
(
"printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort %d
\\
n' > ~/.ssh/config"
,
commonconsts
.
MpiRunSshPort
),
}
// Calculate total number of GPUs across all nodes
gpusPerNode
:=
getGPUsPerNode
(
component
.
Resources
)
totalGPUs
:=
numberOfNodes
*
gpusPerNode
// Build mpirun command with explicit SSH configuration and environment variables
// Wrap the entire command (trtllm-llmapi-launch + original command) in bash -c for proper shell interpretation
wrappedCommand
:=
fmt
.
Sprintf
(
"bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch %s'"
,
originalCommand
)
// Generate environment variable flags for mpirun
envVarsStr
:=
generateEnvVarFlags
(
container
.
Env
)
mpirunCmd
:=
fmt
.
Sprintf
(
"mpirun --oversubscribe -n %d -H %s --mca pml ob1 --mca plm_rsh_args
\"
-p %d -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa
\"
%s %s"
,
totalGPUs
,
workerHosts
,
commonconsts
.
MpiRunSshPort
,
envVarsStr
,
wrappedCommand
)
// Combine SSH setup and mpirun command
fullCommand
:=
strings
.
Join
(
append
(
sshSetupCommands
,
mpirunCmd
),
" && "
)
// Run the combined command line through /bin/sh -c
container
.
Command
=
[]
string
{
"/bin/sh"
,
"-c"
}
container
.
Args
=
[]
string
{
fullCommand
}
}
// setupWorkerContainer replaces the worker container's command with
// "/bin/sh -c" running a " && "-chained sequence that:
//   - installs the keypair from /ssh-pk into ~/.ssh and writes an SSH client
//     config for port commonconsts.MpiRunSshPort;
//   - generates rsa, ecdsa and ed25519 host keys under ~/.ssh/host_keys;
//   - writes ~/.ssh/sshd_config for the same port, with password
//     authentication disabled and public-key authentication enabled;
//   - runs "/usr/sbin/sshd -D" with that config as the final step.
//
// Any previous Command and Args on the container are discarded.
// NOTE(review): ~/.ssh/authorized_keys is listed in both chmod commands, so it
// ends up with mode 644.
func
(
b
*
TRTLLMBackend
)
setupWorkerContainer
(
container
*
corev1
.
Container
)
{
// Setup SSH for worker nodes
sshSetupCommands
:=
[]
string
{
"mkdir -p ~/.ssh ~/.ssh/host_keys ~/.ssh/run"
,
"ls -la /ssh-pk/"
,
// Debug: list files in ssh-pk directory
"cp /ssh-pk/private.key ~/.ssh/id_rsa"
,
"cp /ssh-pk/private.key.pub ~/.ssh/id_rsa.pub"
,
"cp /ssh-pk/private.key.pub ~/.ssh/authorized_keys"
,
"chmod 600 ~/.ssh/id_rsa ~/.ssh/authorized_keys"
,
"chmod 644 ~/.ssh/id_rsa.pub ~/.ssh/authorized_keys"
,
fmt
.
Sprintf
(
"printf 'Host *
\\
nIdentityFile ~/.ssh/id_rsa
\\
nStrictHostKeyChecking no
\\
nPort %d
\\
n' > ~/.ssh/config"
,
commonconsts
.
MpiRunSshPort
),
// Generate host keys in user writable directory
"ssh-keygen -t rsa -f ~/.ssh/host_keys/ssh_host_rsa_key -N ''"
,
"ssh-keygen -t ecdsa -f ~/.ssh/host_keys/ssh_host_ecdsa_key -N ''"
,
"ssh-keygen -t ed25519 -f ~/.ssh/host_keys/ssh_host_ed25519_key -N ''"
,
// Create SSH daemon config to use custom host keys location and non-privileged port
fmt
.
Sprintf
(
"printf 'Port %d
\\
nHostKey ~/.ssh/host_keys/ssh_host_rsa_key
\\
nHostKey ~/.ssh/host_keys/ssh_host_ecdsa_key
\\
nHostKey ~/.ssh/host_keys/ssh_host_ed25519_key
\\
nPidFile ~/.ssh/run/sshd.pid
\\
nPermitRootLogin yes
\\
nPasswordAuthentication no
\\
nPubkeyAuthentication yes
\\
nAuthorizedKeysFile ~/.ssh/authorized_keys
\\
n' > ~/.ssh/sshd_config"
,
commonconsts
.
MpiRunSshPort
),
"mkdir -p /run/sshd"
,
"/usr/sbin/sshd -D -f ~/.ssh/sshd_config"
,
}
fullCommand
:=
strings
.
Join
(
sshSetupCommands
,
" && "
)
// Run the SSH setup and daemon through /bin/sh -c
container
.
Command
=
[]
string
{
"/bin/sh"
,
"-c"
}
container
.
Args
=
[]
string
{
fullCommand
}
}
// generateWorkerHostnames creates a comma-separated list of worker hostnames
func
(
b
*
TRTLLMBackend
)
generateWorkerHostnames
(
numberOfNodes
int32
,
multinodeDeploymentType
commonconsts
.
MultinodeDeploymentType
,
serviceName
string
)
string
{
var
hostnames
[]
string
// Add leader hostname first
if
multinodeDeploymentType
==
commonconsts
.
MultinodeDeploymentTypeGrove
{
leaderHostname
:=
generateGroveLeaderHostname
(
serviceName
)
hostnames
=
append
(
hostnames
,
leaderHostname
)
// Add worker hostnames
for
i
:=
int32
(
0
);
i
<
numberOfNodes
-
1
;
i
++
{
workerHostname
:=
fmt
.
Sprintf
(
"${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-%s-%s-%d.${GROVE_HEADLESS_SERVICE}"
,
serviceName
,
commonconsts
.
GroveRoleSuffixWorker
,
i
)
hostnames
=
append
(
hostnames
,
workerHostname
)
}
}
else
{
// For LWS deployment type - using environment variables
hostnames
=
append
(
hostnames
,
"${LWS_LEADER_ADDRESS}"
)
for
i
:=
int32
(
1
);
i
<
numberOfNodes
;
i
++
{
hostnames
=
append
(
hostnames
,
fmt
.
Sprintf
(
"${LWS_WORKER_%d_ADDRESS}"
,
i
))
}
}
return
strings
.
Join
(
hostnames
,
","
)
}
// getGPUsPerNode extracts the number of GPUs per node from resources
func
getGPUsPerNode
(
resources
*
common
.
Resources
)
int32
{
if
resources
!=
nil
&&
resources
.
Requests
!=
nil
&&
resources
.
Requests
.
GPU
!=
""
{
if
gpus
,
err
:=
strconv
.
ParseInt
(
resources
.
Requests
.
GPU
,
10
,
32
);
err
==
nil
{
return
int32
(
gpus
)
}
}
if
resources
!=
nil
&&
resources
.
Limits
!=
nil
&&
resources
.
Limits
.
GPU
!=
""
{
if
gpus
,
err
:=
strconv
.
ParseInt
(
resources
.
Limits
.
GPU
,
10
,
32
);
err
==
nil
{
return
int32
(
gpus
)
}
}
return
0
// Default to 0 GPUs if not specified
}
// getCommonTRTLLMEnvVars returns the set of environment variable names that are
// treated as common to TRTLLM deployments. The set is represented as a map
// whose values are all true; a fresh map is built on every call, so callers
// may add to it freely.
func getCommonTRTLLMEnvVars() map[string]bool {
	names := []string{
		"CUDA_VISIBLE_DEVICES",
		"MODEL_PATH",
		"HF_TOKEN",
		"HUGGING_FACE_HUB_TOKEN",
		"TOKENIZERS_PARALLELISM",
		"NCCL_DEBUG",
		"NCCL_IB_DISABLE",
		"NCCL_P2P_DISABLE",
		"TENSORRT_LLM_CACHE_DIR",
		"HF_HOME",
		"TRANSFORMERS_CACHE",
		"HF_DATASETS_CACHE",
		"PATH",
		"LD_LIBRARY_PATH",
		"PYTHONPATH",
		"HOME",
		"USER",
	}

	set := make(map[string]bool, len(names))
	for _, name := range names {
		set[name] = true
	}
	return set
}
// collectAllEnvVars combines explicit container env vars with common TRTLLM env vars, removing duplicates
func
collectAllEnvVars
(
containerEnvVars
[]
corev1
.
EnvVar
)
[]
string
{
// Initialize set with common environment variables
envVarSet
:=
getCommonTRTLLMEnvVars
()
// Add explicit environment variables from container
for
_
,
env
:=
range
containerEnvVars
{
envVarSet
[
env
.
Name
]
=
true
}
// Convert set to sorted slice for consistent output
envVarNames
:=
make
([]
string
,
0
,
len
(
envVarSet
))
for
envVar
:=
range
envVarSet
{
envVarNames
=
append
(
envVarNames
,
envVar
)
}
sort
.
Strings
(
envVarNames
)
return
envVarNames
}
// formatEnvVarFlags renders each name as an mpirun "-x NAME" flag and returns
// the flags separated by single spaces, in input order. An empty or nil input
// produces the empty string.
func formatEnvVarFlags(envVarNames []string) string {
	var flags strings.Builder
	for idx, name := range envVarNames {
		// Separator goes between flags only, never leading or trailing.
		if idx > 0 {
			flags.WriteByte(' ')
		}
		fmt.Fprintf(&flags, "-x %s", name)
	}
	return flags.String()
}
// generateEnvVarFlags generates the complete environment variable flags string for mpirun
func
generateEnvVarFlags
(
containerEnvVars
[]
corev1
.
EnvVar
)
string
{
envVarNames
:=
collectAllEnvVars
(
containerEnvVars
)
return
formatEnvVarFlags
(
envVarNames
)
}
deploy/cloud/operator/internal/dynamo/backend_trtllm_test.go
0 → 100644
View file @
dabd2267
This diff is collapsed.
Click to expand it.
deploy/cloud/operator/internal/dynamo/backend_vllm.go
0 → 100644
View file @
dabd2267
package
dynamo
import
(
"fmt"
"strings"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
commonconsts
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1
"k8s.io/api/core/v1"
)
// VLLMBackend is the vLLM implementation of the backend hooks defined in this
// file (UpdateContainer and UpdatePodSpec). It has no fields and therefore
// carries no state.
type VLLMBackend struct{}
func
(
b
*
VLLMBackend
)
UpdateContainer
(
container
*
corev1
.
Container
,
numberOfNodes
int32
,
role
Role
,
component
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
,
multinodeDeploymentType
commonconsts
.
MultinodeDeploymentType
,
serviceName
string
)
{
isMultinode
:=
numberOfNodes
>
1
if
isMultinode
{
// Apply multinode-specific argument modifications
updateVLLMMultinodeArgs
(
container
,
role
,
multinodeDeploymentType
,
serviceName
)
// Remove probes for multinode worker and leader
if
role
==
RoleWorker
||
role
==
RoleLeader
{
container
.
LivenessProbe
=
nil
container
.
ReadinessProbe
=
nil
container
.
StartupProbe
=
nil
}
}
}
// UpdatePodSpec is the pod-level counterpart of UpdateContainer. The vLLM
// backend makes no pod-level changes, so the body is intentionally empty and
// every parameter is ignored.
func (b *VLLMBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentOverridesSpec, multinodeDeploymentType commonconsts.MultinodeDeploymentType, serviceName string) {
	// do nothing
}
// updateVLLMMultinodeArgs applies Ray-specific modifications for multinode deployments
func
updateVLLMMultinodeArgs
(
container
*
corev1
.
Container
,
role
Role
,
multinodeDeploymentType
commonconsts
.
MultinodeDeploymentType
,
serviceName
string
)
{
switch
role
{
case
RoleLeader
:
if
len
(
container
.
Args
)
>
0
{
// Prepend ray start --head command to existing args
container
.
Args
=
[]
string
{
fmt
.
Sprintf
(
"ray start --head --port=6379 && %s"
,
strings
.
Join
(
container
.
Args
,
" "
))}
}
case
RoleWorker
:
// Worker nodes only run Ray, completely replace args
if
multinodeDeploymentType
==
commonconsts
.
MultinodeDeploymentTypeGrove
{
leaderHostname
:=
generateGroveLeaderHostname
(
serviceName
)
container
.
Args
=
[]
string
{
fmt
.
Sprintf
(
"ray start --address=%s:6379 --block"
,
leaderHostname
)}
}
else
{
container
.
Args
=
[]
string
{
"ray start --address=${LWS_LEADER_ADDRESS}:6379 --block"
}
}
}
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment