Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
81c27803
Unverified
Commit
81c27803
authored
Aug 14, 2025
by
mohammedabdulwahhab
Committed by
GitHub
Aug 14, 2025
Browse files
fix: operator defaults (#2398)
Signed-off-by:
mohammedabdulwahhab
<
furkhan324@berkeley.edu
>
parent
9ddb3efd
Changes
26
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
443 additions
and
149 deletions
+443
-149
deploy/cloud/operator/internal/dynamo/component_frontend.go
deploy/cloud/operator/internal/dynamo/component_frontend.go
+89
-0
deploy/cloud/operator/internal/dynamo/component_planner.go
deploy/cloud/operator/internal/dynamo/component_planner.go
+46
-0
deploy/cloud/operator/internal/dynamo/component_worker.go
deploy/cloud/operator/internal/dynamo/component_worker.go
+103
-0
deploy/cloud/operator/internal/dynamo/graph.go
deploy/cloud/operator/internal/dynamo/graph.go
+54
-34
deploy/cloud/operator/internal/dynamo/graph_test.go
deploy/cloud/operator/internal/dynamo/graph_test.go
+150
-114
examples/runtime/hello_world/deploy/hello_world.yaml
examples/runtime/hello_world/deploy/hello_world.yaml
+1
-1
No files found.
deploy/cloud/operator/internal/dynamo/component_frontend.go
0 → 100644
View file @
81c27803
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package
dynamo
import
(
"fmt"
commonconsts
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/intstr"
)
// FrontendDefaults implements ComponentDefaults for Frontend components
type
FrontendDefaults
struct
{
*
BaseComponentDefaults
}
func
NewFrontendDefaults
()
*
FrontendDefaults
{
return
&
FrontendDefaults
{
&
BaseComponentDefaults
{}}
}
func
(
f
*
FrontendDefaults
)
GetBaseContainer
(
numberOfNodes
int32
)
(
corev1
.
Container
,
error
)
{
// Frontend doesn't need backend-specific config
container
:=
f
.
getCommonContainer
()
// Add HTTP port
container
.
Ports
=
[]
corev1
.
ContainerPort
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoContainerPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoServicePort
),
},
}
// Add frontend-specific defaults
container
.
LivenessProbe
=
&
corev1
.
Probe
{
ProbeHandler
:
corev1
.
ProbeHandler
{
HTTPGet
:
&
corev1
.
HTTPGetAction
{
Path
:
"/health"
,
Port
:
intstr
.
FromString
(
commonconsts
.
DynamoContainerPortName
),
},
},
InitialDelaySeconds
:
60
,
PeriodSeconds
:
60
,
TimeoutSeconds
:
30
,
FailureThreshold
:
10
,
}
container
.
ReadinessProbe
=
&
corev1
.
Probe
{
ProbeHandler
:
corev1
.
ProbeHandler
{
Exec
:
&
corev1
.
ExecAction
{
Command
:
[]
string
{
"/bin/sh"
,
"-c"
,
"curl -s http://localhost:${DYNAMO_PORT}/health | jq -e
\"
.status ==
\\\"
healthy
\\\"\"
"
,
},
},
},
InitialDelaySeconds
:
60
,
PeriodSeconds
:
60
,
TimeoutSeconds
:
30
,
FailureThreshold
:
10
,
}
container
.
Resources
=
corev1
.
ResourceRequirements
{
Requests
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"1"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"2Gi"
),
},
Limits
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"1"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"2Gi"
),
},
}
// Add standard environment variables
container
.
Env
=
[]
corev1
.
EnvVar
{
{
Name
:
commonconsts
.
EnvDynamoServicePort
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
),
},
}
return
container
,
nil
}
deploy/cloud/operator/internal/dynamo/component_planner.go
0 → 100644
View file @
81c27803
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package
dynamo
import
(
commonconsts
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)
// PlannerDefaults implements ComponentDefaults for Planner components
type
PlannerDefaults
struct
{
*
BaseComponentDefaults
}
func
NewPlannerDefaults
()
*
PlannerDefaults
{
return
&
PlannerDefaults
{
&
BaseComponentDefaults
{}}
}
func
(
p
*
PlannerDefaults
)
GetBaseContainer
(
numberOfNodes
int32
)
(
corev1
.
Container
,
error
)
{
container
:=
p
.
getCommonContainer
()
// Add planner-specific defaults
container
.
Resources
=
corev1
.
ResourceRequirements
{
Requests
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"2"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"2Gi"
),
},
Limits
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"2"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"2Gi"
),
},
}
return
container
,
nil
}
func
(
p
*
PlannerDefaults
)
GetBasePodSpec
(
numberOfNodes
int32
)
(
corev1
.
PodSpec
,
error
)
{
podSpec
:=
corev1
.
PodSpec
{
ServiceAccountName
:
commonconsts
.
PlannerServiceAccountName
,
}
return
podSpec
,
nil
}
deploy/cloud/operator/internal/dynamo/component_worker.go
0 → 100644
View file @
81c27803
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package
dynamo
import
(
"fmt"
commonconsts
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/intstr"
)
// WorkerDefaults implements ComponentDefaults for Worker components
type
WorkerDefaults
struct
{
*
BaseComponentDefaults
}
func
NewWorkerDefaults
()
*
WorkerDefaults
{
return
&
WorkerDefaults
{
&
BaseComponentDefaults
{}}
}
func
(
w
*
WorkerDefaults
)
GetBaseContainer
(
numberOfNodes
int32
)
(
corev1
.
Container
,
error
)
{
container
:=
w
.
getCommonContainer
()
// Add system port
container
.
Ports
=
[]
corev1
.
ContainerPort
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoSystemPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoSystemPort
),
},
}
// Add worker base defaults
container
.
Resources
=
corev1
.
ResourceRequirements
{
Requests
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"10"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"20Gi"
),
},
Limits
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"10"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"20Gi"
),
"nvidia.com/gpu"
:
resource
.
MustParse
(
"1"
),
},
}
container
.
LivenessProbe
=
&
corev1
.
Probe
{
ProbeHandler
:
corev1
.
ProbeHandler
{
HTTPGet
:
&
corev1
.
HTTPGetAction
{
Path
:
"/live"
,
Port
:
intstr
.
FromString
(
commonconsts
.
DynamoSystemPortName
),
},
},
PeriodSeconds
:
5
,
TimeoutSeconds
:
30
,
FailureThreshold
:
1
,
}
container
.
ReadinessProbe
=
&
corev1
.
Probe
{
ProbeHandler
:
corev1
.
ProbeHandler
{
HTTPGet
:
&
corev1
.
HTTPGetAction
{
Path
:
"/health"
,
Port
:
intstr
.
FromString
(
commonconsts
.
DynamoSystemPortName
),
},
},
PeriodSeconds
:
10
,
TimeoutSeconds
:
30
,
FailureThreshold
:
60
,
}
container
.
StartupProbe
=
&
corev1
.
Probe
{
ProbeHandler
:
corev1
.
ProbeHandler
{
HTTPGet
:
&
corev1
.
HTTPGetAction
{
Path
:
"/live"
,
Port
:
intstr
.
FromString
(
commonconsts
.
DynamoSystemPortName
),
},
},
PeriodSeconds
:
10
,
TimeoutSeconds
:
5
,
FailureThreshold
:
60
,
}
container
.
Env
=
[]
corev1
.
EnvVar
{
{
Name
:
"DYN_SYSTEM_ENABLED"
,
Value
:
"true"
,
},
{
Name
:
"DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"
,
Value
:
"[
\"
generate
\"
]"
,
},
{
Name
:
"DYN_SYSTEM_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoSystemPort
),
},
}
return
container
,
nil
}
deploy/cloud/operator/internal/dynamo/graph.go
View file @
81c27803
...
...
@@ -21,6 +21,7 @@ import (
"context"
"encoding/json"
"fmt"
"maps"
"regexp"
"sort"
"strconv"
...
...
@@ -191,7 +192,7 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD
// finally set the service account name
deployment
.
Spec
.
ExtraPodSpec
.
PodSpec
.
ServiceAccountName
=
commonconsts
.
PlannerServiceAccountName
}
if
deployment
.
Is
Main
Component
()
&&
defaultIngressSpec
!=
nil
&&
deployment
.
Spec
.
Ingress
==
nil
{
if
deployment
.
Is
Frontend
Component
()
&&
defaultIngressSpec
!=
nil
&&
deployment
.
Spec
.
Ingress
==
nil
{
deployment
.
Spec
.
Ingress
=
defaultIngressSpec
}
// merge the envs from the parent deployment with the envs from the service
...
...
@@ -219,7 +220,7 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD
// updateDynDeploymentConfig updates the runtime config object for the given dynamoDeploymentComponent
// It updates the port for the given service (if it is the main component)
func
updateDynDeploymentConfig
(
dynamoDeploymentComponent
*
v1alpha1
.
DynamoComponentDeployment
,
newPort
int
)
error
{
if
dynamoDeploymentComponent
.
Is
Main
Component
()
{
if
dynamoDeploymentComponent
.
Is
Frontend
Component
()
{
dynamoDeploymentConfig
:=
dynamoDeploymentComponent
.
GetDynamoDeploymentConfig
()
if
dynamoDeploymentConfig
!=
nil
{
var
config
map
[
string
]
any
...
...
@@ -668,11 +669,6 @@ func isWorkerComponent(componentType string) bool {
// addStandardEnvVars adds the standard environment variables that are common to both Grove and Controller
func
addStandardEnvVars
(
container
*
corev1
.
Container
,
controllerConfig
controller_common
.
Config
)
{
container
.
Env
=
append
(
container
.
Env
,
corev1
.
EnvVar
{
Name
:
commonconsts
.
EnvDynamoServicePort
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
),
})
if
controllerConfig
.
NatsAddress
!=
""
{
container
.
Env
=
append
(
container
.
Env
,
corev1
.
EnvVar
{
Name
:
"NATS_SERVER"
,
...
...
@@ -702,47 +698,60 @@ func GenerateBasePodSpec(
multinodeDeploymentType
commonconsts
.
MultinodeDeploymentType
,
serviceName
string
,
)
(
corev1
.
PodSpec
,
error
)
{
container
:=
corev1
.
Container
{
Name
:
"main"
,
LivenessProbe
:
component
.
LivenessProbe
,
ReadinessProbe
:
component
.
ReadinessProbe
,
Env
:
component
.
Envs
,
Ports
:
[]
corev1
.
ContainerPort
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoContainerPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoServicePort
),
},
},
}
// Add system port for worker components
if
component
.
ComponentType
==
commonconsts
.
ComponentTypeWorker
{
container
.
Ports
=
append
(
container
.
Ports
,
corev1
.
ContainerPort
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoSystemPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoSystemPort
),
})
// Start with base container generated per component type
componentDefaults
:=
ComponentDefaultsFactory
(
component
.
ComponentType
,
numberOfNodes
)
container
,
err
:=
componentDefaults
.
GetBaseContainer
(
numberOfNodes
)
if
err
!=
nil
{
return
corev1
.
PodSpec
{},
fmt
.
Errorf
(
"failed to get base container: %w"
,
err
)
}
// First merge the mainContainer from extraPodSpec to get the base command and args
if
component
.
ExtraPodSpec
!=
nil
&&
component
.
ExtraPodSpec
.
MainContainer
!=
nil
{
main
:=
component
.
ExtraPodSpec
.
MainContainer
.
DeepCopy
()
if
main
!=
nil
{
// merge the extraPodSpec from the parent deployment with the extraPodSpec from the service
err
:
=
mergo
.
Merge
(
&
container
,
*
main
,
mergo
.
WithOverride
)
err
=
mergo
.
Merge
(
&
container
,
*
main
,
mergo
.
WithOverride
)
if
err
!=
nil
{
return
corev1
.
PodSpec
{},
fmt
.
Errorf
(
"failed to merge extraPodSpec: %w"
,
err
)
}
// main container fields that require special handling
container
.
Env
=
MergeEnvs
(
component
.
Envs
,
container
.
Env
)
// Note: startup probe does not have its own top level field so it must be passed in extraPodSpec.MainContainer
// We want to overwrite entirely if provided rather than merge
if
main
.
StartupProbe
!=
nil
{
container
.
StartupProbe
=
main
.
StartupProbe
}
}
}
resourcesConfig
,
err
:=
controller_common
.
GetResourcesConfig
(
component
.
Resources
)
// Merge probes entirely if they are passed (no partial merge)
if
component
.
LivenessProbe
!=
nil
{
container
.
LivenessProbe
=
component
.
LivenessProbe
.
DeepCopy
()
}
if
component
.
ReadinessProbe
!=
nil
{
container
.
ReadinessProbe
=
component
.
ReadinessProbe
.
DeepCopy
()
}
overrideResources
,
err
:=
controller_common
.
GetResourcesConfig
(
component
.
Resources
)
if
err
!=
nil
{
return
corev1
.
PodSpec
{},
fmt
.
Errorf
(
"failed to get resources config: %w"
,
err
)
}
if
resourcesConfig
!=
nil
{
container
.
Resources
=
*
resourcesConfig
// Requests
if
overrideResources
!=
nil
&&
len
(
overrideResources
.
Requests
)
>
0
{
if
container
.
Resources
.
Requests
==
nil
{
container
.
Resources
.
Requests
=
corev1
.
ResourceList
{}
}
maps
.
Copy
(
container
.
Resources
.
Requests
,
overrideResources
.
Requests
)
}
// Limits
if
overrideResources
!=
nil
&&
len
(
overrideResources
.
Limits
)
>
0
{
if
container
.
Resources
.
Limits
==
nil
{
container
.
Resources
.
Limits
=
corev1
.
ResourceList
{}
}
maps
.
Copy
(
container
.
Resources
.
Limits
,
overrideResources
.
Limits
)
}
imagePullSecrets
:=
[]
corev1
.
LocalObjectReference
{}
if
secretsRetriever
!=
nil
&&
component
.
ExtraPodSpec
!=
nil
&&
component
.
ExtraPodSpec
.
MainContainer
!=
nil
&&
component
.
ExtraPodSpec
.
MainContainer
.
Image
!=
""
{
secretsName
,
err
:=
secretsRetriever
.
GetSecrets
(
namespace
,
component
.
ExtraPodSpec
.
MainContainer
.
Image
)
...
...
@@ -780,15 +789,26 @@ func GenerateBasePodSpec(
shmVolume
,
shmVolumeMount
:=
generateSharedMemoryVolumeAndMount
(
&
container
.
Resources
)
volumes
=
append
(
volumes
,
shmVolume
)
container
.
VolumeMounts
=
append
(
container
.
VolumeMounts
,
shmVolumeMount
)
// Apply backend-specific container modifications
backend
:=
BackendFactory
(
backendFramework
)
if
backend
==
nil
{
return
corev1
.
PodSpec
{},
fmt
.
Errorf
(
"unsupported backend framework: %s"
,
backendFramework
)
}
backend
.
UpdateContainer
(
&
container
,
numberOfNodes
,
role
,
component
,
multinodeDeploymentType
,
serviceName
)
var
podSpec
corev1
.
PodSpec
// get base podspec from component
podSpec
,
err
:=
componentDefaults
.
GetBasePodSpec
(
numberOfNodes
)
if
err
!=
nil
{
return
corev1
.
PodSpec
{},
fmt
.
Errorf
(
"failed to get base podspec: %w"
,
err
)
}
if
component
.
ExtraPodSpec
!=
nil
&&
component
.
ExtraPodSpec
.
PodSpec
!=
nil
{
podSpec
=
*
component
.
ExtraPodSpec
.
PodSpec
.
DeepCopy
()
// merge extraPodSpec PodSpec with base podspec
err
:=
mergo
.
Merge
(
&
podSpec
,
component
.
ExtraPodSpec
.
PodSpec
.
DeepCopy
(),
mergo
.
WithOverride
)
if
err
!=
nil
{
return
corev1
.
PodSpec
{},
fmt
.
Errorf
(
"failed to merge extraPodSpec: %w"
,
err
)
}
}
podSpec
.
Containers
=
append
(
podSpec
.
Containers
,
container
)
podSpec
.
Volumes
=
append
(
podSpec
.
Volumes
,
volumes
...
)
...
...
deploy/cloud/operator/internal/dynamo/graph_test.go
View file @
81c27803
...
...
@@ -63,7 +63,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1"
:
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
DynamoNamespace
:
&
[]
string
{
"default"
}[
0
],
ComponentType
:
"
main
"
,
ComponentType
:
"
frontend
"
,
Replicas
:
&
[]
int32
{
3
}[
0
],
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
...
...
@@ -107,7 +107,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ServiceName
:
"service1"
,
DynamoNamespace
:
&
[]
string
{
"default"
}[
0
],
ComponentType
:
"
main
"
,
ComponentType
:
"
frontend
"
,
Replicas
:
&
[]
int32
{
3
}[
0
],
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
...
...
@@ -171,7 +171,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1"
:
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
DynamoNamespace
:
nil
,
ComponentType
:
"
main
"
,
ComponentType
:
"
frontend
"
,
Replicas
:
&
[]
int32
{
3
}[
0
],
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
...
...
@@ -215,7 +215,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ServiceName
:
"service1"
,
DynamoNamespace
:
&
[]
string
{
"dynamo-test-dynamographdeployment"
}[
0
],
ComponentType
:
"
main
"
,
ComponentType
:
"
frontend
"
,
Replicas
:
&
[]
int32
{
3
}[
0
],
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
...
...
@@ -279,7 +279,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1"
:
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
DynamoNamespace
:
&
[]
string
{
"default"
}[
0
],
ComponentType
:
"
main
"
,
ComponentType
:
"
frontend
"
,
Replicas
:
&
[]
int32
{
3
}[
0
],
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
...
...
@@ -325,7 +325,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1"
:
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
DynamoNamespace
:
nil
,
ComponentType
:
"
main
"
,
ComponentType
:
"
frontend
"
,
Replicas
:
&
[]
int32
{
3
}[
0
],
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
...
...
@@ -373,7 +373,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ServiceName
:
"service1"
,
DynamoNamespace
:
&
[]
string
{
"dynamo-test-dynamographdeployment"
}[
0
],
ComponentType
:
"
main
"
,
ComponentType
:
"
frontend
"
,
Replicas
:
&
[]
int32
{
3
}[
0
],
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
...
...
@@ -447,7 +447,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1"
:
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
DynamoNamespace
:
nil
,
ComponentType
:
"
main
"
,
ComponentType
:
"
frontend
"
,
Replicas
:
&
[]
int32
{
3
}[
0
],
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
...
...
@@ -491,7 +491,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ServiceName
:
"service1"
,
DynamoNamespace
:
&
[]
string
{
"dynamo-test-dynamographdeployment"
}[
0
],
ComponentType
:
"
main
"
,
ComponentType
:
"
frontend
"
,
Replicas
:
&
[]
int32
{
3
}[
0
],
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
...
...
@@ -574,7 +574,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
"service1"
:
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
DynamoNamespace
:
&
[]
string
{
"default"
}[
0
],
ComponentType
:
"
main
"
,
ComponentType
:
"
frontend
"
,
Replicas
:
&
[]
int32
{
3
}[
0
],
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
...
...
@@ -625,7 +625,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ServiceName
:
"service1"
,
DynamoNamespace
:
&
[]
string
{
"default"
}[
0
],
ComponentType
:
"
main
"
,
ComponentType
:
"
frontend
"
,
Replicas
:
&
[]
int32
{
3
}[
0
],
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
...
...
@@ -1121,6 +1121,15 @@ func Test_mergeEnvs(t *testing.T) {
}
}
func
sortEnvVars
(
envs
[]
corev1
.
EnvVar
)
[]
corev1
.
EnvVar
{
sorted
:=
make
([]
corev1
.
EnvVar
,
len
(
envs
))
copy
(
sorted
,
envs
)
sort
.
Slice
(
sorted
,
func
(
i
,
j
int
)
bool
{
return
sorted
[
i
]
.
Name
<
sorted
[
j
]
.
Name
})
return
sorted
}
func
TestGenerateGrovePodGangSet
(
t
*
testing
.
T
)
{
type
args
struct
{
ctx
context
.
Context
...
...
@@ -1159,7 +1168,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Services
:
map
[
string
]
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
"Frontend"
:
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ComponentType
:
"
main
"
,
// Frontend component
ComponentType
:
"
frontend
"
,
// Frontend component
ExtraPodMetadata
:
&
common
.
ExtraPodMetadata
{
Annotations
:
map
[
string
]
string
{
"nvidia.com/annotation1"
:
"annotation1"
,
...
...
@@ -1308,7 +1317,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Labels
:
map
[
string
]
string
{
commonconsts
.
KubeLabelDynamoSelector
:
"test-dynamo-graph-deployment-frontend"
,
commonconsts
.
KubeLabelMetricsEnabled
:
commonconsts
.
KubeLabelValueTrue
,
commonconsts
.
KubeLabelDynamoComponentType
:
commonconsts
.
ComponentType
Main
,
commonconsts
.
KubeLabelDynamoComponentType
:
commonconsts
.
ComponentType
Frontend
,
"nvidia.com/label1"
:
"label1"
,
"nvidia.com/label2"
:
"label2"
,
},
...
...
@@ -1503,10 +1512,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name
:
"PLANNER_ENV_1"
,
Value
:
"2"
,
},
{
Name
:
"DYNAMO_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
),
},
{
Name
:
"NATS_SERVER"
,
Value
:
"nats-address"
,
...
...
@@ -1537,13 +1542,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
MountPath
:
"/dev/shm"
,
},
},
Ports
:
[]
corev1
.
ContainerPort
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoContainerPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoServicePort
),
},
},
},
},
},
...
...
@@ -1594,6 +1592,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
GPU
:
"1"
,
},
},
ComponentType
:
commonconsts
.
ComponentTypeFrontend
,
Envs
:
[]
corev1
.
EnvVar
{
{
Name
:
"FRONTEND_ENV_1"
,
...
...
@@ -1813,11 +1812,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"python3 -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-worker-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 3 --node-rank 0 --custom-flag custom-value"
,
},
Ports
:
[]
corev1
.
ContainerPort
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoContainerPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoServicePort
),
},
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoSystemPortName
,
...
...
@@ -1830,12 +1824,20 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Value
:
"1"
,
},
{
Name
:
"
WORKER_ENV_1
"
,
Value
:
"
1
"
,
Name
:
"
DYN_SYSTEM_ENABLED
"
,
Value
:
"
true
"
,
},
{
Name
:
"DYNAMO_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
),
Name
:
"DYN_SYSTEM_PORT"
,
Value
:
"9090"
,
},
{
Name
:
"DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"
,
Value
:
`["generate"]`
,
},
{
Name
:
"WORKER_ENV_1"
,
Value
:
"1"
,
},
{
Name
:
"NATS_SERVER"
,
...
...
@@ -1909,11 +1911,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"python3 -m dynamo.sglang.worker --dist-init-addr ${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-worker-ldr-0.${GROVE_HEADLESS_SERVICE}:29500 --nnodes 3 --node-rank $((GROVE_PCLQ_POD_INDEX + 1)) --custom-flag custom-value"
,
},
Ports
:
[]
corev1
.
ContainerPort
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoContainerPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoServicePort
),
},
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoSystemPortName
,
...
...
@@ -1926,12 +1923,20 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Value
:
"1"
,
},
{
Name
:
"
WORKER_ENV_1
"
,
Value
:
"
1
"
,
Name
:
"
DYN_SYSTEM_ENABLED
"
,
Value
:
"
true
"
,
},
{
Name
:
"DYNAMO_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
),
Name
:
"DYN_SYSTEM_PORT"
,
Value
:
"9090"
,
},
{
Name
:
"DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"
,
Value
:
`["generate"]`
,
},
{
Name
:
"WORKER_ENV_1"
,
Value
:
"1"
,
},
{
Name
:
"NATS_SERVER"
,
...
...
@@ -1967,8 +1972,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
{
Name
:
"frontend"
,
Labels
:
map
[
string
]
string
{
commonconsts
.
KubeLabelMetricsEnabled
:
commonconsts
.
KubeLabelValueTrue
,
commonconsts
.
KubeLabelDynamoSelector
:
"test-dynamo-graph-deployment-frontend"
,
commonconsts
.
KubeLabelMetricsEnabled
:
commonconsts
.
KubeLabelValueTrue
,
commonconsts
.
KubeLabelDynamoSelector
:
"test-dynamo-graph-deployment-frontend"
,
commonconsts
.
KubeLabelDynamoComponentType
:
commonconsts
.
ComponentTypeFrontend
,
},
Annotations
:
map
[
string
]
string
{},
Spec
:
grovev1alpha1
.
PodCliqueSpec
{
...
...
@@ -2158,10 +2164,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name
:
"PLANNER_ENV_1"
,
Value
:
"2"
,
},
{
Name
:
"DYNAMO_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
),
},
{
Name
:
"NATS_SERVER"
,
Value
:
"nats-address"
,
...
...
@@ -2192,13 +2194,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
MountPath
:
"/dev/shm"
,
},
},
Ports
:
[]
corev1
.
ContainerPort
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoContainerPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoServicePort
),
},
},
},
},
},
...
...
@@ -2237,7 +2232,8 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Services
:
map
[
string
]
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
"Frontend"
:
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
Replicas
:
&
[]
int32
{
1
}[
0
],
Replicas
:
&
[]
int32
{
1
}[
0
],
ComponentType
:
commonconsts
.
ComponentTypeFrontend
,
Resources
:
&
common
.
Resources
{
Requests
:
&
common
.
ResourceItem
{
CPU
:
"1"
,
...
...
@@ -2492,11 +2488,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"ray start --head --port=6379 && python3 -m dynamo.vllm --custom-flag custom-value"
,
},
Ports
:
[]
corev1
.
ContainerPort
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoContainerPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoServicePort
),
},
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoSystemPortName
,
...
...
@@ -2509,12 +2500,20 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Value
:
"1"
,
},
{
Name
:
"
WORKER_ENV_1
"
,
Value
:
"
1
"
,
Name
:
"
DYN_SYSTEM_ENABLED
"
,
Value
:
"
true
"
,
},
{
Name
:
"DYNAMO_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
),
Name
:
"DYN_SYSTEM_PORT"
,
Value
:
"9090"
,
},
{
Name
:
"DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"
,
Value
:
`["generate"]`
,
},
{
Name
:
"WORKER_ENV_1"
,
Value
:
"1"
,
},
{
Name
:
"NATS_SERVER"
,
...
...
@@ -2591,11 +2590,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
"ray start --address=${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-worker-ldr-0.${GROVE_HEADLESS_SERVICE}:6379 --block"
,
},
Ports
:
[]
corev1
.
ContainerPort
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoContainerPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoServicePort
),
},
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoSystemPortName
,
...
...
@@ -2608,12 +2602,20 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Value
:
"1"
,
},
{
Name
:
"
WORKER_ENV_1
"
,
Value
:
"
1
"
,
Name
:
"
DYN_SYSTEM_ENABLED
"
,
Value
:
"
true
"
,
},
{
Name
:
"DYNAMO_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
),
Name
:
"DYN_SYSTEM_PORT"
,
Value
:
"9090"
,
},
{
Name
:
"DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"
,
Value
:
`["generate"]`
,
},
{
Name
:
"WORKER_ENV_1"
,
Value
:
"1"
,
},
{
Name
:
"NATS_SERVER"
,
...
...
@@ -2649,8 +2651,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
{
Name
:
"frontend"
,
Labels
:
map
[
string
]
string
{
commonconsts
.
KubeLabelMetricsEnabled
:
commonconsts
.
KubeLabelValueTrue
,
commonconsts
.
KubeLabelDynamoSelector
:
"test-dynamo-graph-deployment-frontend"
,
commonconsts
.
KubeLabelDynamoComponentType
:
commonconsts
.
ComponentTypeFrontend
,
commonconsts
.
KubeLabelMetricsEnabled
:
commonconsts
.
KubeLabelValueTrue
,
commonconsts
.
KubeLabelDynamoSelector
:
"test-dynamo-graph-deployment-frontend"
,
},
Annotations
:
map
[
string
]
string
{},
Spec
:
grovev1alpha1
.
PodCliqueSpec
{
...
...
@@ -2840,10 +2843,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name
:
"PLANNER_ENV_1"
,
Value
:
"2"
,
},
{
Name
:
"DYNAMO_PORT"
,
Value
:
fmt
.
Sprintf
(
"%d"
,
commonconsts
.
DynamoServicePort
),
},
{
Name
:
"NATS_SERVER"
,
Value
:
"nats-address"
,
...
...
@@ -2874,13 +2873,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
MountPath
:
"/dev/shm"
,
},
},
Ports
:
[]
corev1
.
ContainerPort
{
{
Protocol
:
corev1
.
ProtocolTCP
,
Name
:
commonconsts
.
DynamoContainerPortName
,
ContainerPort
:
int32
(
commonconsts
.
DynamoServicePort
),
},
},
},
},
},
...
...
@@ -2906,6 +2898,19 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
sort
.
Slice
(
tt
.
want
.
Spec
.
Template
.
Cliques
,
func
(
i
,
j
int
)
bool
{
return
tt
.
want
.
Spec
.
Template
.
Cliques
[
i
]
.
Name
<
tt
.
want
.
Spec
.
Template
.
Cliques
[
j
]
.
Name
})
// Sort environment variables for all containers in all cliques
for
_
,
clique
:=
range
got
.
Spec
.
Template
.
Cliques
{
for
i
:=
range
clique
.
Spec
.
PodSpec
.
Containers
{
clique
.
Spec
.
PodSpec
.
Containers
[
i
]
.
Env
=
sortEnvVars
(
clique
.
Spec
.
PodSpec
.
Containers
[
i
]
.
Env
)
}
}
for
_
,
clique
:=
range
tt
.
want
.
Spec
.
Template
.
Cliques
{
for
i
:=
range
clique
.
Spec
.
PodSpec
.
Containers
{
clique
.
Spec
.
PodSpec
.
Containers
[
i
]
.
Env
=
sortEnvVars
(
clique
.
Spec
.
PodSpec
.
Containers
[
i
]
.
Env
)
}
}
if
diff
:=
cmp
.
Diff
(
got
,
tt
.
want
);
diff
!=
""
{
t
.
Errorf
(
"GenerateGrovePodGangSet() mismatch (-want +got):
\n
%s"
,
diff
)
}
...
...
@@ -3018,31 +3023,6 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
expectError
:
false
,
expectContains
:
[]
string
{},
},
{
name
:
"SGLang with resources"
,
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ComponentType
:
commonconsts
.
ComponentTypeWorker
,
ExtraPodSpec
:
&
common
.
ExtraPodSpec
{
MainContainer
:
&
corev1
.
Container
{
Args
:
[]
string
{
"python3"
,
"-m"
,
"dynamo.sglang.worker"
},
Resources
:
corev1
.
ResourceRequirements
{
Requests
:
corev1
.
ResourceList
{
corev1
.
ResourceCPU
:
resource
.
MustParse
(
"1"
),
corev1
.
ResourceMemory
:
resource
.
MustParse
(
"2Gi"
),
},
},
},
},
},
},
backendFramework
:
BackendFrameworkSGLang
,
role
:
RoleMain
,
numberOfNodes
:
1
,
expectError
:
false
,
expectContains
:
[]
string
{
"python3 -m dynamo.sglang.worker"
},
},
}
for
_
,
tt
:=
range
tests
{
...
...
@@ -3574,7 +3554,7 @@ func TestDetermineBackendFramework(t *testing.T) {
}{
{
name
:
"non-worker component returns noop"
,
componentType
:
"
main
"
,
componentType
:
"
frontend
"
,
command
:
[]
string
{
"/bin/sh"
,
"-c"
},
args
:
[]
string
{
"echo hello world"
},
expected
:
BackendFrameworkNoop
,
...
...
@@ -3735,7 +3715,7 @@ func TestGetBackendFrameworkFromComponent(t *testing.T) {
name
:
"non-worker component returns noop"
,
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ComponentType
:
"
main
"
,
// Frontend component
ComponentType
:
"
frontend
"
,
// Frontend component
},
},
deployment
:
&
v1alpha1
.
DynamoGraphDeployment
{},
...
...
@@ -4145,3 +4125,59 @@ func TestGenerateGrovePodGangSet_StartsAfterDependencies(t *testing.T) {
})
}
}
func
TestGenerateBasePodSpec_PlannerServiceAccount
(
t
*
testing
.
T
)
{
secretsRetriever
:=
&
mockSecretsRetriever
{}
controllerConfig
:=
controller_common
.
Config
{}
tests
:=
[]
struct
{
name
string
component
*
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
expectedServiceAcc
string
}{
{
name
:
"Planner component should have planner service account"
,
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ComponentType
:
commonconsts
.
ComponentTypePlanner
,
},
},
expectedServiceAcc
:
commonconsts
.
PlannerServiceAccountName
,
},
{
name
:
"Planner service account should not be set for non-planner components"
,
component
:
&
v1alpha1
.
DynamoComponentDeploymentOverridesSpec
{
DynamoComponentDeploymentSharedSpec
:
v1alpha1
.
DynamoComponentDeploymentSharedSpec
{
ComponentType
:
commonconsts
.
ComponentTypeWorker
,
},
},
expectedServiceAcc
:
""
,
},
}
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
podSpec
,
err
:=
GenerateBasePodSpec
(
tt
.
component
,
BackendFrameworkSGLang
,
secretsRetriever
,
"default"
,
RoleMain
,
1
,
controllerConfig
,
commonconsts
.
MultinodeDeploymentTypeGrove
,
"test-service"
,
)
if
err
!=
nil
{
t
.
Errorf
(
"GenerateBasePodSpec() error = %v"
,
err
)
return
}
if
podSpec
.
ServiceAccountName
!=
tt
.
expectedServiceAcc
{
t
.
Errorf
(
"GenerateBasePodSpec() serviceAccountName = %v, want %v"
,
podSpec
.
ServiceAccountName
,
tt
.
expectedServiceAcc
)
}
})
}
}
examples/runtime/hello_world/deploy/hello_world.yaml
View file @
81c27803
...
...
@@ -27,7 +27,7 @@ spec:
timeoutSeconds
:
2
failureThreshold
:
3
dynamoNamespace
:
hello-world
componentType
:
main
componentType
:
frontend
replicas
:
1
resources
:
requests
:
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment