Unverified Commit 024422b9 authored by Rohan Varma's avatar Rohan Varma Committed by GitHub
Browse files

feat: Add LWS to Dynamo Operator (#998)


Co-authored-by: default avatarRohan Varma <rohanv@rohanv-mlt.client.nvidia.com>
Co-authored-by: default avatarJulien Mancuso <jmancuso@nvidia.com>
Co-authored-by: default avatarjulienmancuso <161955438+julienmancuso@users.noreply.github.com>
parent eb133e3f
......@@ -21,8 +21,8 @@ VERSION 0.8
############### SHARED LIBRARY TARGETS ##############################
golang-base:
FROM golang:1.23
RUN apt-get update && apt-get install -y git && apt-get clean && rm -rf /var/lib/apt/lists/* && curl -sSfL https://github.com/golangci/golangci-lint/releases/download/v1.61.0/golangci-lint-1.61.0-linux-amd64.tar.gz | tar -xzv && mv golangci-lint-1.61.0-linux-amd64/golangci-lint /usr/local/bin/
FROM golang:1.24
RUN apt-get update && apt-get install -y git && apt-get clean && rm -rf /var/lib/apt/lists/* && go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.64.8
operator-src:
FROM +golang-base
......
......@@ -37,6 +37,7 @@ export ISTIO_ENABLED="${ISTIO_ENABLED:=false}"
export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}"
export INGRESS_CLASS="${INGRESS_CLASS:=nginx}"
export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}"
export ENABLE_LWS="${ENABLE_LWS:=false}"
# Add command line options
INTERACTIVE=false
......@@ -143,7 +144,7 @@ echo "ISTIO_GATEWAY: $ISTIO_GATEWAY"
echo "DYNAMO_INGRESS_SUFFIX: $DYNAMO_INGRESS_SUFFIX"
echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS"
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS}' < dynamo-platform-values.yaml > generated-values.yaml
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS}' < dynamo-platform-values.yaml > generated-values.yaml
echo "generated file contents:"
cat generated-values.yaml
......
......@@ -27,6 +27,7 @@ dynamo-operator:
- name: ${DOCKER_SECRET_NAME}
dynamo:
enableLWS: ${ENABLE_LWS}
ingress:
enabled: ${INGRESS_ENABLED}
className: ${INGRESS_CLASS}
......
......@@ -89,6 +89,9 @@ spec:
{{- if .Values.dynamo.virtualServiceSupportsHTTPS }}
- --virtual-service-supports-https={{ .Values.dynamo.virtualServiceSupportsHTTPS }}
{{- end }}
{{- if .Values.dynamo.enableLWS }}
- --enable-lws
{{- end }}
command:
- /manager
......
......@@ -407,6 +407,32 @@ rules:
- patch
- update
- watch
{{- if .Values.dynamo.enableLWS }}
- apiGroups:
- leaderworkerset.x-k8s.io
resources:
- leaderworkersets
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- scheduling.volcano.sh
resources:
- podgroups
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
{{- end }}
---
apiVersion: rbac.authorization.k8s.io/v1
{{- if .Values.namespaceRestriction.enabled }}
......
......@@ -73,6 +73,7 @@ controllerManager:
annotations: {}
dynamo:
enableLWS: false
apiStore:
endpoint: http://dynamo-server.dynamo-system.svc.cluster.local
clusterName: default
......
......@@ -37,6 +37,7 @@ dynamo-operator:
- --health-probe-bind-address=:8081
- --metrics-bind-address=127.0.0.1:8080
dynamo:
enableLWS: false
apiStore:
endpoint: http://dynamo-store
clusterName: default
......
......@@ -40,7 +40,6 @@ linters:
enable:
- dupl
- errcheck
- exportloopref
- goconst
- gocyclo
- gofmt
......
......@@ -166,7 +166,7 @@ GOLANGCI_LINT = $(LOCALBIN)/golangci-lint-$(GOLANGCI_LINT_VERSION)
KUSTOMIZE_VERSION ?= v5.5.0
CONTROLLER_TOOLS_VERSION ?= v0.16.4
ENVTEST_VERSION ?= release-0.19
GOLANGCI_LINT_VERSION ?= v1.61.0
GOLANGCI_LINT_VERSION ?= v1.64.8
.PHONY: kustomize
kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary.
......
......@@ -13,7 +13,7 @@ Built with [Kubebuilder](https://book.kubebuilder.io/), it follows Kubernetes be
### Pre-requisites
- [Go](https://go.dev/doc/install) >= 1.23
- [Go](https://go.dev/doc/install) >= 1.24
- [Kubebuilder](https://book.kubebuilder.io/quick-start.html)
### Build
......
......@@ -40,6 +40,9 @@ import (
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"sigs.k8s.io/controller-runtime/pkg/webhook"
lwsscheme "sigs.k8s.io/lws/client-go/clientset/versioned/scheme"
volcanoscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
......@@ -57,6 +60,10 @@ func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(nvidiacomv1alpha1.AddToScheme(scheme))
utilruntime.Must(lwsscheme.AddToScheme(scheme))
utilruntime.Must(volcanoscheme.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}
......@@ -75,6 +82,7 @@ func main() {
var ingressControllerClassName string
var ingressControllerTLSSecretName string
var ingressHostSuffix string
var enableLWS bool
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
......@@ -100,6 +108,8 @@ func main() {
"The name of the ingress controller TLS secret to use")
flag.StringVar(&ingressHostSuffix, "ingress-host-suffix", "",
"The suffix to use for the ingress host")
flag.BoolVar(&enableLWS, "enable-lws", false,
"If set, enable leader worker set")
opts := zap.Options{
Development: true,
}
......@@ -111,6 +121,7 @@ func main() {
ctrlConfig := commonController.Config{
RestrictedNamespace: restrictedNamespace,
VirtualServiceSupportsHTTPS: virtualServiceSupportsHTTPS,
EnableLWS: enableLWS,
}
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
......
......@@ -100,6 +100,18 @@ rules:
- patch
- update
- watch
- apiGroups:
- leaderworkerset.x-k8s.io
resources:
- leaderworkersets
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- networking.istio.io
resources:
......@@ -157,3 +169,15 @@ rules:
- get
- patch
- update
- apiGroups:
- scheduling.volcano.sh
resources:
- podgroups
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
module github.com/ai-dynamo/dynamo/deploy/cloud/operator
go 1.23.0
go 1.24.0
toolchain go1.23.4
toolchain go1.24.3
require (
dario.cat/mergo v1.0.1
emperror.dev/errors v0.8.1
github.com/apparentlymart/go-shquot v0.0.1
github.com/bsm/gomega v1.27.10
github.com/google/go-cmp v0.7.0
github.com/huandu/xstrings v1.4.0
github.com/mitchellh/hashstructure/v2 v2.0.2
github.com/onsi/ginkgo/v2 v2.19.0
github.com/onsi/gomega v1.33.1
github.com/onsi/ginkgo/v2 v2.23.4
github.com/onsi/gomega v1.37.0
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2
github.com/prune998/docker-registry-client v0.0.0-20200114164314-f8cd511a014c
github.com/rs/xid v1.4.0
github.com/sergeymakinen/go-quote v1.1.0
github.com/sirupsen/logrus v1.9.3
go.etcd.io/etcd/client/v3 v3.5.14
go.etcd.io/etcd/client/v3 v3.5.16
gopkg.in/yaml.v2 v2.4.0
istio.io/api v1.23.1
istio.io/client-go v1.23.1
k8s.io/api v0.31.3
k8s.io/apiextensions-apiserver v0.31.0
k8s.io/apimachinery v0.31.3
k8s.io/client-go v0.31.3
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
k8s.io/api v0.32.3
k8s.io/apiextensions-apiserver v0.32.3
k8s.io/apimachinery v0.32.3
k8s.io/client-go v0.32.3
k8s.io/utils v0.0.0-20241210054802-24370beab758
resty.dev/v3 v3.0.0-beta.2
sigs.k8s.io/controller-runtime v0.19.4
sigs.k8s.io/controller-runtime v0.20.4
sigs.k8s.io/lws v0.6.1
volcano.sh/apis v1.11.0
)
......@@ -41,29 +43,28 @@ require (
github.com/distribution/reference v0.6.0 // indirect
github.com/docker/distribution v2.8.3+incompatible // indirect
github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/emicklei/go-restful/v3 v3.12.0 // indirect
github.com/evanphx/json-patch v5.7.0+incompatible // indirect
github.com/evanphx/json-patch/v5 v5.9.0 // indirect
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/zapr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.20.2 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.22.8 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/btree v1.1.3 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af // indirect
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/mux v1.8.1 // indirect
github.com/imdario/mergo v0.3.13 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
......@@ -71,34 +72,36 @@ require (
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.19.1 // indirect
github.com/prometheus/client_golang v1.20.2 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.etcd.io/etcd/api/v3 v3.5.14 // indirect
go.etcd.io/etcd/client/pkg/v3 v3.5.14 // indirect
go.etcd.io/etcd/api/v3 v3.5.16 // indirect
go.etcd.io/etcd/client/pkg/v3 v3.5.16 // indirect
go.uber.org/automaxprocs v1.6.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.26.0 // indirect
golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc // indirect
golang.org/x/net v0.33.0 // indirect
golang.org/x/oauth2 v0.21.0 // indirect
golang.org/x/sys v0.28.0 // indirect
golang.org/x/term v0.27.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.6.0 // indirect
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/net v0.37.0 // indirect
golang.org/x/oauth2 v0.23.0 // indirect
golang.org/x/sync v0.12.0 // indirect
golang.org/x/sys v0.32.0 // indirect
golang.org/x/term v0.30.0 // indirect
golang.org/x/text v0.23.0 // indirect
golang.org/x/time v0.7.0 // indirect
golang.org/x/tools v0.31.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240826202546-f6391c0de4c7 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7 // indirect
google.golang.org/grpc v1.65.0 // indirect
google.golang.org/protobuf v1.34.2 // indirect
google.golang.org/protobuf v1.36.6 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
)
This diff is collapsed.
......@@ -323,7 +323,9 @@ func overrideWithDynDeploymentConfig(ctx context.Context, dynamoDeploymentCompon
}
componentDynConfig := dynDeploymentConfig[dynamoDeploymentComponent.Spec.ServiceName]
if componentDynConfig != nil {
if componentDynConfig.ServiceArgs != nil && componentDynConfig.ServiceArgs.Workers != nil {
if componentDynConfig.ServiceArgs != nil && componentDynConfig.ServiceArgs.Workers != nil && dynamoDeploymentComponent.Spec.Replicas == nil {
// we only override the replicas if it is not set in the CRD.
// replicas, if set in the CRD set in the CRD must always be the source of truth.
dynamoDeploymentComponent.Spec.Replicas = componentDynConfig.ServiceArgs.Workers
}
if componentDynConfig.ServiceArgs != nil && componentDynConfig.ServiceArgs.Resources != nil {
......@@ -362,6 +364,9 @@ func overrideWithDynDeploymentConfig(ctx context.Context, dynamoDeploymentCompon
requests.Custom = componentDynConfig.ServiceArgs.Resources.Custom
limits.Custom = componentDynConfig.ServiceArgs.Resources.Custom
}
if err := dynamo.SetLwsAnnotations(componentDynConfig.ServiceArgs, dynamoDeploymentComponent); err != nil {
return err
}
}
}
break
......
......@@ -26,6 +26,7 @@ import (
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/bsm/gomega"
"github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1"
)
......@@ -269,7 +270,7 @@ func Test_overrideWithDynDeploymentConfig(t *testing.T) {
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3, "Resources":{"CPU":"2", "Memory":"2Gi", "GPU":"2"}}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: &[]int32{1}[0],
Replicas: nil,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
......@@ -309,6 +310,64 @@ func Test_overrideWithDynDeploymentConfig(t *testing.T) {
},
},
},
{
name: "override workers and resources with gpusPerNode",
args: args{
ctx: context.Background(),
dynamoDeploymentComponent: &nvidiacomv1alpha1.DynamoComponentDeployment{
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "Frontend",
Envs: []corev1.EnvVar{
{
Name: "DYN_DEPLOYMENT_CONFIG",
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3, "Resources":{"CPU":"2", "Memory":"2Gi", "GPU":"8"}, "total_gpus":16}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: nil,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
Memory: "1Gi",
GPU: "1",
},
},
},
},
},
},
wantErr: false,
expected: &nvidiacomv1alpha1.DynamoComponentDeployment{
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "Frontend",
Envs: []corev1.EnvVar{
{
Name: "DYN_DEPLOYMENT_CONFIG",
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3, "Resources":{"CPU":"2", "Memory":"2Gi", "GPU":"8"}, "total_gpus":16}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "2",
Memory: "2Gi",
GPU: "8",
},
Limits: &common.ResourceItem{
CPU: "2",
Memory: "2Gi",
GPU: "8",
},
},
Annotations: map[string]string{
"nvidia.com/deployment-type": "leader-worker",
"nvidia.com/lws-size": "2",
},
},
},
},
},
{
name: "override subset of resources",
args: args{
......@@ -323,7 +382,7 @@ func Test_overrideWithDynDeploymentConfig(t *testing.T) {
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3, "Resources":{"GPU":"2"}}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: &[]int32{1}[0],
Replicas: nil,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
......@@ -363,14 +422,64 @@ func Test_overrideWithDynDeploymentConfig(t *testing.T) {
},
},
},
{
name: "do not override replicas if explicitly set in the CRD !",
args: args{
ctx: context.Background(),
dynamoDeploymentComponent: &nvidiacomv1alpha1.DynamoComponentDeployment{
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "Frontend",
Envs: []corev1.EnvVar{
{
Name: "DYN_DEPLOYMENT_CONFIG",
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: &[]int32{1}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
Memory: "1Gi",
GPU: "1",
},
},
},
},
},
},
wantErr: false,
expected: &nvidiacomv1alpha1.DynamoComponentDeployment{
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "Frontend",
Envs: []corev1.EnvVar{
{
Name: "DYN_DEPLOYMENT_CONFIG",
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: &[]int32{1}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
Memory: "1Gi",
GPU: "1",
},
},
},
},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
g := gomega.NewGomegaWithT(t)
if err := overrideWithDynDeploymentConfig(tt.args.ctx, tt.args.dynamoDeploymentComponent); (err != nil) != tt.wantErr {
t.Errorf("overrideWithDynDeploymentConfig() error = %v, wantErr %v", err, tt.wantErr)
}
g.Expect(tt.args.dynamoDeploymentComponent).To(gomega.Equal(tt.expected))
if diff := cmp.Diff(tt.args.dynamoDeploymentComponent, tt.expected); diff != "" {
t.Errorf("overrideWithDynDeploymentConfig() mismatch (-want +got):\n%s", diff)
}
})
}
}
......@@ -32,6 +32,7 @@ type Config struct {
RestrictedNamespace string
// If true, assume VirtualService endpoints are HTTPS
VirtualServiceSupportsHTTPS bool
EnableLWS bool
}
func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
......
......@@ -24,6 +24,7 @@ import (
"fmt"
"io"
"net/http"
"strconv"
"strings"
"emperror.dev/errors"
......@@ -80,6 +81,7 @@ type Config struct {
HttpExposed bool `yaml:"http_exposed,omitempty"`
ApiEndpoints []string `yaml:"api_endpoints,omitempty"`
Workers *int32 `yaml:"workers,omitempty"`
TotalGpus *int32 `yaml:"total_gpus,omitempty"`
}
type ServiceConfig struct {
......@@ -99,6 +101,7 @@ type DynDeploymentServiceConfig struct {
type ServiceArgs struct {
Workers *int32 `json:"workers,omitempty"`
Resources *Resources `json:"resources,omitempty"`
TotalGpus *int32 `json:"total_gpus,omitempty"`
}
func (s ServiceConfig) GetNamespace() *string {
......@@ -253,6 +256,31 @@ func GetDynamoGraphConfig(ctx context.Context, dynamoDeployment *v1alpha1.Dynamo
return ParseDynamoGraphConfig(ctx, yamlContent)
}
func SetLwsAnnotations(serviceArgs *ServiceArgs, deployment *v1alpha1.DynamoComponentDeployment) error {
if serviceArgs.Resources != nil &&
serviceArgs.Resources.GPU != nil && *serviceArgs.Resources.GPU != "" && *serviceArgs.Resources.GPU != "0" &&
serviceArgs.TotalGpus != nil && *serviceArgs.TotalGpus > 0 {
gpusPerNodeStr := *serviceArgs.Resources.GPU
gpusPerNode, errGpusPerNode := strconv.Atoi(gpusPerNodeStr)
if errGpusPerNode != nil {
return fmt.Errorf("failed to parse GPUs per node value '%s' for service %s: %w", gpusPerNodeStr, deployment.Spec.ServiceName, errGpusPerNode)
}
// Calculate lwsSize using ceiling division to ensure enough nodes for all GPUs
lwsSize := (int(*serviceArgs.TotalGpus) + gpusPerNode - 1) / gpusPerNode
if lwsSize > 1 {
if deployment.Spec.Annotations == nil {
deployment.Spec.Annotations = make(map[string]string)
}
deployment.Spec.Annotations["nvidia.com/lws-size"] = strconv.Itoa(lwsSize)
deployment.Spec.Annotations["nvidia.com/deployment-type"] = "leader-worker"
}
}
return nil
}
// GenerateDynamoComponentsDeployments generates a map of DynamoComponentDeployments from a DynamoGraphConfig
func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphDeployment *v1alpha1.DynamoGraphDeployment, config *DynamoGraphConfig, ingressSpec *v1alpha1.IngressSpec) (map[string]*v1alpha1.DynamoComponentDeployment, error) {
dynamoServices := make(map[string]string)
......@@ -321,6 +349,15 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD
deployment.Spec.Resources.Requests.GPU = *service.Config.Resources.GPU
deployment.Spec.Resources.Limits.GPU = *service.Config.Resources.GPU
}
serviceArgs := ServiceArgs{
Resources: service.Config.Resources,
TotalGpus: service.Config.TotalGpus,
Workers: service.Config.Workers,
}
if err := SetLwsAnnotations(&serviceArgs, deployment); err != nil {
return nil, err
}
}
deployment.Spec.Autoscaling = &v1alpha1.Autoscaling{
Enabled: false,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment