"vllm/vscode:/vscode.git/clone" did not exist on "3cc328a4be4976f75ce016f60bc55beee4701d1b"
Unverified Commit 024422b9 authored by Rohan Varma, committed by GitHub
Browse files

feat: Add LWS to Dynamo Operator (#998)


Co-authored-by: Rohan Varma <rohanv@rohanv-mlt.client.nvidia.com>
Co-authored-by: Julien Mancuso <jmancuso@nvidia.com>
Co-authored-by: julienmancuso <161955438+julienmancuso@users.noreply.github.com>
parent eb133e3f
......@@ -21,8 +21,8 @@ VERSION 0.8
############### SHARED LIBRARY TARGETS ##############################
golang-base:
FROM golang:1.23
RUN apt-get update && apt-get install -y git && apt-get clean && rm -rf /var/lib/apt/lists/* && curl -sSfL https://github.com/golangci/golangci-lint/releases/download/v1.61.0/golangci-lint-1.61.0-linux-amd64.tar.gz | tar -xzv && mv golangci-lint-1.61.0-linux-amd64/golangci-lint /usr/local/bin/
FROM golang:1.24
RUN apt-get update && apt-get install -y git && apt-get clean && rm -rf /var/lib/apt/lists/* && go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.64.8
operator-src:
FROM +golang-base
......
......@@ -37,6 +37,7 @@ export ISTIO_ENABLED="${ISTIO_ENABLED:=false}"
export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}"
export INGRESS_CLASS="${INGRESS_CLASS:=nginx}"
export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}"
export ENABLE_LWS="${ENABLE_LWS:=false}"
# Add command line options
INTERACTIVE=false
......@@ -143,7 +144,7 @@ echo "ISTIO_GATEWAY: $ISTIO_GATEWAY"
echo "DYNAMO_INGRESS_SUFFIX: $DYNAMO_INGRESS_SUFFIX"
echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS"
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS}' < dynamo-platform-values.yaml > generated-values.yaml
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS}' < dynamo-platform-values.yaml > generated-values.yaml
echo "generated file contents:"
cat generated-values.yaml
......
......@@ -27,6 +27,7 @@ dynamo-operator:
- name: ${DOCKER_SECRET_NAME}
dynamo:
enableLWS: ${ENABLE_LWS}
ingress:
enabled: ${INGRESS_ENABLED}
className: ${INGRESS_CLASS}
......
......@@ -89,6 +89,9 @@ spec:
{{- if .Values.dynamo.virtualServiceSupportsHTTPS }}
- --virtual-service-supports-https={{ .Values.dynamo.virtualServiceSupportsHTTPS }}
{{- end }}
{{- if .Values.dynamo.enableLWS }}
- --enable-lws
{{- end }}
command:
- /manager
......
......@@ -407,6 +407,32 @@ rules:
- patch
- update
- watch
{{- if .Values.dynamo.enableLWS }}
- apiGroups:
- leaderworkerset.x-k8s.io
resources:
- leaderworkersets
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- scheduling.volcano.sh
resources:
- podgroups
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
{{- end }}
---
apiVersion: rbac.authorization.k8s.io/v1
{{- if .Values.namespaceRestriction.enabled }}
......
......@@ -73,6 +73,7 @@ controllerManager:
annotations: {}
dynamo:
enableLWS: false
apiStore:
endpoint: http://dynamo-server.dynamo-system.svc.cluster.local
clusterName: default
......
......@@ -37,6 +37,7 @@ dynamo-operator:
- --health-probe-bind-address=:8081
- --metrics-bind-address=127.0.0.1:8080
dynamo:
enableLWS: false
apiStore:
endpoint: http://dynamo-store
clusterName: default
......
......@@ -40,7 +40,6 @@ linters:
enable:
- dupl
- errcheck
- exportloopref
- goconst
- gocyclo
- gofmt
......
......@@ -166,7 +166,7 @@ GOLANGCI_LINT = $(LOCALBIN)/golangci-lint-$(GOLANGCI_LINT_VERSION)
KUSTOMIZE_VERSION ?= v5.5.0
CONTROLLER_TOOLS_VERSION ?= v0.16.4
ENVTEST_VERSION ?= release-0.19
GOLANGCI_LINT_VERSION ?= v1.61.0
GOLANGCI_LINT_VERSION ?= v1.64.8
.PHONY: kustomize
kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary.
......
......@@ -13,7 +13,7 @@ Built with [Kubebuilder](https://book.kubebuilder.io/), it follows Kubernetes be
### Pre-requisites
- [Go](https://go.dev/doc/install) >= 1.23
- [Go](https://go.dev/doc/install) >= 1.24
- [Kubebuilder](https://book.kubebuilder.io/quick-start.html)
### Build
......
......@@ -40,6 +40,9 @@ import (
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"sigs.k8s.io/controller-runtime/pkg/webhook"
lwsscheme "sigs.k8s.io/lws/client-go/clientset/versioned/scheme"
volcanoscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
......@@ -57,6 +60,10 @@ func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(nvidiacomv1alpha1.AddToScheme(scheme))
utilruntime.Must(lwsscheme.AddToScheme(scheme))
utilruntime.Must(volcanoscheme.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}
......@@ -75,6 +82,7 @@ func main() {
var ingressControllerClassName string
var ingressControllerTLSSecretName string
var ingressHostSuffix string
var enableLWS bool
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
......@@ -100,6 +108,8 @@ func main() {
"The name of the ingress controller TLS secret to use")
flag.StringVar(&ingressHostSuffix, "ingress-host-suffix", "",
"The suffix to use for the ingress host")
flag.BoolVar(&enableLWS, "enable-lws", false,
"If set, enable leader worker set")
opts := zap.Options{
Development: true,
}
......@@ -111,6 +121,7 @@ func main() {
ctrlConfig := commonController.Config{
RestrictedNamespace: restrictedNamespace,
VirtualServiceSupportsHTTPS: virtualServiceSupportsHTTPS,
EnableLWS: enableLWS,
}
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
......
......@@ -100,6 +100,18 @@ rules:
- patch
- update
- watch
- apiGroups:
- leaderworkerset.x-k8s.io
resources:
- leaderworkersets
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- networking.istio.io
resources:
......@@ -157,3 +169,15 @@ rules:
- get
- patch
- update
- apiGroups:
- scheduling.volcano.sh
resources:
- podgroups
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
module github.com/ai-dynamo/dynamo/deploy/cloud/operator
go 1.23.0
go 1.24.0
toolchain go1.23.4
toolchain go1.24.3
require (
dario.cat/mergo v1.0.1
emperror.dev/errors v0.8.1
github.com/apparentlymart/go-shquot v0.0.1
github.com/bsm/gomega v1.27.10
github.com/google/go-cmp v0.7.0
github.com/huandu/xstrings v1.4.0
github.com/mitchellh/hashstructure/v2 v2.0.2
github.com/onsi/ginkgo/v2 v2.19.0
github.com/onsi/gomega v1.33.1
github.com/onsi/ginkgo/v2 v2.23.4
github.com/onsi/gomega v1.37.0
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2
github.com/prune998/docker-registry-client v0.0.0-20200114164314-f8cd511a014c
github.com/rs/xid v1.4.0
github.com/sergeymakinen/go-quote v1.1.0
github.com/sirupsen/logrus v1.9.3
go.etcd.io/etcd/client/v3 v3.5.14
go.etcd.io/etcd/client/v3 v3.5.16
gopkg.in/yaml.v2 v2.4.0
istio.io/api v1.23.1
istio.io/client-go v1.23.1
k8s.io/api v0.31.3
k8s.io/apiextensions-apiserver v0.31.0
k8s.io/apimachinery v0.31.3
k8s.io/client-go v0.31.3
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
k8s.io/api v0.32.3
k8s.io/apiextensions-apiserver v0.32.3
k8s.io/apimachinery v0.32.3
k8s.io/client-go v0.32.3
k8s.io/utils v0.0.0-20241210054802-24370beab758
resty.dev/v3 v3.0.0-beta.2
sigs.k8s.io/controller-runtime v0.19.4
sigs.k8s.io/controller-runtime v0.20.4
sigs.k8s.io/lws v0.6.1
volcano.sh/apis v1.11.0
)
......@@ -41,29 +43,28 @@ require (
github.com/distribution/reference v0.6.0 // indirect
github.com/docker/distribution v2.8.3+incompatible // indirect
github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/emicklei/go-restful/v3 v3.12.0 // indirect
github.com/evanphx/json-patch v5.7.0+incompatible // indirect
github.com/evanphx/json-patch/v5 v5.9.0 // indirect
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/zapr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.20.2 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.22.8 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/btree v1.1.3 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af // indirect
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/mux v1.8.1 // indirect
github.com/imdario/mergo v0.3.13 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
......@@ -71,34 +72,36 @@ require (
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.19.1 // indirect
github.com/prometheus/client_golang v1.20.2 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.etcd.io/etcd/api/v3 v3.5.14 // indirect
go.etcd.io/etcd/client/pkg/v3 v3.5.14 // indirect
go.etcd.io/etcd/api/v3 v3.5.16 // indirect
go.etcd.io/etcd/client/pkg/v3 v3.5.16 // indirect
go.uber.org/automaxprocs v1.6.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.26.0 // indirect
golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc // indirect
golang.org/x/net v0.33.0 // indirect
golang.org/x/oauth2 v0.21.0 // indirect
golang.org/x/sys v0.28.0 // indirect
golang.org/x/term v0.27.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.6.0 // indirect
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/net v0.37.0 // indirect
golang.org/x/oauth2 v0.23.0 // indirect
golang.org/x/sync v0.12.0 // indirect
golang.org/x/sys v0.32.0 // indirect
golang.org/x/term v0.30.0 // indirect
golang.org/x/text v0.23.0 // indirect
golang.org/x/time v0.7.0 // indirect
golang.org/x/tools v0.31.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240826202546-f6391c0de4c7 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7 // indirect
google.golang.org/grpc v1.65.0 // indirect
google.golang.org/protobuf v1.34.2 // indirect
google.golang.org/protobuf v1.36.6 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
)
......@@ -14,7 +14,6 @@ github.com/coreos/go-semver v0.3.1 h1:yi21YpKnrx1gt5R+la8n5WgS0kCrsPp33dmEyHReZr
github.com/coreos/go-semver v0.3.1/go.mod h1:irMmmIw/7yzSRPWryHsK7EYSg09caPQL03VsM8rvUec=
github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
......@@ -25,12 +24,12 @@ github.com/docker/distribution v2.8.3+incompatible h1:AtKxIZ36LoNK51+Z6RpzLpddBi
github.com/docker/distribution v2.8.3+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w=
github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1 h1:ZClxb8laGDf5arXfYcAtECDFgAgHklGI8CxgjHnXKJ4=
github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1/go.mod h1:cyGadeNEkKy96OOhEzfZl+yxihPEzKnqJwvfuSUqbZE=
github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g=
github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/emicklei/go-restful/v3 v3.12.0 h1:y2DdzBAURM29NFF94q6RaY4vjIH1rtwDapwQtU84iWk=
github.com/emicklei/go-restful/v3 v3.12.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI=
github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg=
github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ=
github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU=
github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM=
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
......@@ -39,54 +38,51 @@ github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
github.com/go-openapi/jsonpointer v0.20.2 h1:mQc3nmndL8ZBzStEo3JYF8wzmeWffDH4VbXz58sAx6Q=
github.com/go-openapi/jsonpointer v0.20.2/go.mod h1:bHen+N0u1KEO3YlmqOjTT9Adn1RfD91Ar825/PuiRVs=
github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE=
github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
github.com/go-openapi/swag v0.22.8 h1:/9RjDSQ0vbFR+NyjGMkFTsA1IA0fmhKSThmfGZjicbw=
github.com/go-openapi/swag v0.22.8/go.mod h1:6QT22icPLEqAM/z/TChgb4WAveCHF92+2gF0CNjHpPI=
github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ=
github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4=
github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE=
github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ=
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg=
github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I=
github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af h1:kmjWCqn2qkEml422C2Rrd27c3VGxi6a/6HNq8QmHRKM=
github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo=
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8=
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
github.com/huandu/xstrings v1.4.0 h1:D17IlohoQq4UcpqD7fDk80P7l+lwAmlFaBHgOipl2FU=
github.com/huandu/xstrings v1.4.0/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk=
github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA=
github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mitchellh/hashstructure/v2 v2.0.2 h1:vGKWl0YJqUNxE8d+h8f6NJLcCJrgbhC4NcD46KavDd4=
......@@ -98,10 +94,10 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA=
github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To=
github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk=
github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0=
github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus=
github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8=
github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y=
github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
......@@ -111,10 +107,12 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U=
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2 h1:HZdPRm0ApWPg7F4sHgbqWkL+ddWfpTZsopm5HM/2g4o=
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2/go.mod h1:3RiUkFmR9kmPZi9r/8a5jw0a9yg+LMmr7qa0wjqvSiI=
github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE=
github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho=
github.com/prometheus/client_golang v1.20.2 h1:5ctymQzZlyOON1666svgwn3s6IKWgfbjsejTMiXIyjg=
github.com/prometheus/client_golang v1.20.2/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
......@@ -134,85 +132,82 @@ github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVs
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.etcd.io/etcd/api/v3 v3.5.14 h1:vHObSCxyB9zlF60w7qzAdTcGaglbJOpSj1Xj9+WGxq0=
go.etcd.io/etcd/api/v3 v3.5.14/go.mod h1:BmtWcRlQvwa1h3G2jvKYwIQy4PkHlDej5t7uLMUdJUU=
go.etcd.io/etcd/client/pkg/v3 v3.5.14 h1:SaNH6Y+rVEdxfpA2Jr5wkEvN6Zykme5+YnbCkxvuWxQ=
go.etcd.io/etcd/client/pkg/v3 v3.5.14/go.mod h1:8uMgAokyG1czCtIdsq+AGyYQMvpIKnSvPjFMunkgeZI=
go.etcd.io/etcd/client/v3 v3.5.14 h1:CWfRs4FDaDoSz81giL7zPpZH2Z35tbOrAJkkjMqOupg=
go.etcd.io/etcd/client/v3 v3.5.14/go.mod h1:k3XfdV/VIHy/97rqWjoUzrj9tk7GgJGH9J8L4dNXmAk=
go.etcd.io/etcd/api/v3 v3.5.16 h1:WvmyJVbjWqK4R1E+B12RRHz3bRGy9XVfh++MgbN+6n0=
go.etcd.io/etcd/api/v3 v3.5.16/go.mod h1:1P4SlIP/VwkDmGo3OlOD7faPeP8KDIFhqvciH5EfN28=
go.etcd.io/etcd/client/pkg/v3 v3.5.16 h1:ZgY48uH6UvB+/7R9Yf4x574uCO3jIx0TRDyetSfId3Q=
go.etcd.io/etcd/client/pkg/v3 v3.5.16/go.mod h1:V8acl8pcEK0Y2g19YlOV9m9ssUe6MgiDSobSoaBAM0E=
go.etcd.io/etcd/client/v3 v3.5.16 h1:sSmVYOAHeC9doqi0gv7v86oY/BTld0SEFGaxsU9eRhE=
go.etcd.io/etcd/client/v3 v3.5.16/go.mod h1:X+rExSGkyqxvu276cr2OwPLBaeqFu1cIl4vmRjAD/50=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs=
go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8=
go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc h1:mCRnTeVUjcrhlRmO0VK8a6k6Rrf6TF9htwo2pJVSjIU=
golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc/go.mod h1:V1LtkGg67GoY2N1AnLN78QLrzxkLyJw7RJb1gzOOz9w=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs=
golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/net v0.37.0 h1:1zLorHbz+LYj7MQlSf1+2tPIIgibq2eL5xkrGk6f+2c=
golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs=
golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20=
golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U=
golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ=
golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/tools v0.31.0 h1:0EedkvKDbh+qistFTd0Bcwe/YLh4vHwWEkiI0toFIBU=
golang.org/x/tools v0.31.0/go.mod h1:naFTU+Cev749tSJRXJlna0T3WxKvb1kWEx15xA4SdmQ=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 h1:7whR9kGa5LUwFtpLm2ArCEejtnxlGeLbAyjFY8sGNFw=
google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157/go.mod h1:99sLkeliLXfdj2J75X3Ho+rrVCaJze0uwN7zDDkjPVU=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 h1:BwIjyKYGsK9dMCBOorzRri8MQwmi7mT9rGHsCEinZkA=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY=
google.golang.org/genproto/googleapis/api v0.0.0-20240826202546-f6391c0de4c7 h1:YcyjlL1PRr2Q17/I0dPk2JmYS5CDXfcdb2Z3YRioEbw=
google.golang.org/genproto/googleapis/api v0.0.0-20240826202546-f6391c0de4c7/go.mod h1:OCdP9MfskevB/rbYvHTsXTtKC+3bHWajPdoKgjcYkfo=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7 h1:2035KHhUv+EpyB+hWgJnaWKJOdX1E95w2S8Rr4uWKTs=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240826202546-f6391c0de4c7/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU=
google.golang.org/grpc v1.65.0 h1:bs/cUb4lp1G5iImFFd3u5ixQzweKizoZJAwBNLR42lc=
google.golang.org/grpc v1.65.0/go.mod h1:WgYC2ypjlB0EiQi6wdKixMqukr6lBc0Vo+oOgjrM5ZQ=
google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
......@@ -220,39 +215,41 @@ gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSP
gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M=
gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
istio.io/api v1.23.1 h1:bm2XF0j058FfzWVHUfpmMj4sFDkcD1X609qs5AU97Pc=
istio.io/api v1.23.1/go.mod h1:QPSTGXuIQdnZFEm3myf9NZ5uBMwCdJWUvfj9ZZ+2oBM=
istio.io/client-go v1.23.1 h1:IX2cgUUXnVYo+9H6bFGSp/vuKVLPUkmiN8qk1/mvsYs=
istio.io/client-go v1.23.1/go.mod h1:+fxu+O2GkITM3HEREUWdobvRXqI/UhAAI7hfxqqpRh0=
k8s.io/api v0.31.3 h1:umzm5o8lFbdN/hIXbrK9oRpOproJO62CV1zqxXrLgk8=
k8s.io/api v0.31.3/go.mod h1:UJrkIp9pnMOI9K2nlL6vwpxRzzEX5sWgn8kGQe92kCE=
k8s.io/apiextensions-apiserver v0.31.0 h1:fZgCVhGwsclj3qCw1buVXCV6khjRzKC5eCFt24kyLSk=
k8s.io/apiextensions-apiserver v0.31.0/go.mod h1:b9aMDEYaEe5sdK+1T0KU78ApR/5ZVp4i56VacZYEHxk=
k8s.io/apimachinery v0.31.3 h1:6l0WhcYgasZ/wk9ktLq5vLaoXJJr5ts6lkaQzgeYPq4=
k8s.io/apimachinery v0.31.3/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo=
k8s.io/client-go v0.31.3 h1:CAlZuM+PH2cm+86LOBemaJI/lQ5linJ6UFxKX/SoG+4=
k8s.io/client-go v0.31.3/go.mod h1:2CgjPUTpv3fE5dNygAr2NcM8nhHzXvxB8KL5gYc3kJs=
k8s.io/api v0.32.3 h1:Hw7KqxRusq+6QSplE3NYG4MBxZw1BZnq4aP4cJVINls=
k8s.io/api v0.32.3/go.mod h1:2wEDTXADtm/HA7CCMD8D8bK4yuBUptzaRhYcYEEYA3k=
k8s.io/apiextensions-apiserver v0.32.3 h1:4D8vy+9GWerlErCwVIbcQjsWunF9SUGNu7O7hiQTyPY=
k8s.io/apiextensions-apiserver v0.32.3/go.mod h1:8YwcvVRMVzw0r1Stc7XfGAzB/SIVLunqApySV5V7Dss=
k8s.io/apimachinery v0.32.3 h1:JmDuDarhDmA/Li7j3aPrwhpNBA94Nvk5zLeOge9HH1U=
k8s.io/apimachinery v0.32.3/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE=
k8s.io/client-go v0.32.3 h1:RKPVltzopkSgHS7aS98QdscAgtgah/+zmpAogooIqVU=
k8s.io/client-go v0.32.3/go.mod h1:3v0+3k4IcT9bXTc4V2rt+d2ZPPG700Xy6Oi0Gdl2PaY=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag=
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98=
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A=
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y=
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4=
k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJJI8IUa1AmH/qa0=
k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
resty.dev/v3 v3.0.0-beta.2 h1:xu4mGAdbCLuc3kbk7eddWfWm4JfhwDtdapwss5nCjnQ=
resty.dev/v3 v3.0.0-beta.2/go.mod h1:OgkqiPvTDtOuV4MGZuUDhwOpkY8enjOsjjMzeOHefy4=
sigs.k8s.io/controller-runtime v0.19.4 h1:SUmheabttt0nx8uJtoII4oIP27BVVvAKFvdvGFwV/Qo=
sigs.k8s.io/controller-runtime v0.19.4/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4=
sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08=
sigs.k8s.io/controller-runtime v0.20.4 h1:X3c+Odnxz+iPTRobG4tp092+CvBU9UK0t/bRf+n0DGU=
sigs.k8s.io/controller-runtime v0.20.4/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
sigs.k8s.io/lws v0.6.1 h1:cWiRmMSflo8hQPBrmIIZtoaX3XuVkmAgFKkmjxlPULI=
sigs.k8s.io/lws v0.6.1/go.mod h1:aoT5ROMriBtN/H8JH0POBF6e2uyFCOxKGKtXSA3DVV8=
sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016 h1:kXv6kKdoEtedwuqMmkqhbkgvYKeycVbC8+iPCP9j5kQ=
sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI=
sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps=
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=
volcano.sh/apis v1.11.0 h1:Z5ZXxxgUNfXv1OhfVXXfGPN7StoSsozQM+8CAPoNWY8=
......
......@@ -59,6 +59,9 @@ import (
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
leaderworkersetv1 "sigs.k8s.io/lws/api/leaderworkerset/v1"
volcanov1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)
const (
......@@ -74,6 +77,11 @@ const (
HeaderNameDebug = "X-Nvidia-Debug"
DefaultIngressSuffix = "local"
KubernetesDeploymentStrategy = "kubernetes"
KubeAnnotationDeploymentType = "nvidia.com/deployment-type"
KubeAnnotationLWSSize = "nvidia.com/lws-size"
DeploymentTypeStandard = "standard"
DeploymentTypeLeaderWorker = "leader-worker"
)
// DynamoComponentDeploymentReconciler reconciles a DynamoComponentDeployment object
......@@ -104,6 +112,9 @@ type DynamoComponentDeploymentReconciler struct {
//+kubebuilder:rbac:groups=networking.istio.io,resources=virtualservices,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;create;delete
// +kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=leaderworkerset.x-k8s.io,resources=leaderworkersets,verbs=get;list;watch;create;update;patch;delete
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// TODO(user): Modify the Reconcile function to compare the state specified by
......@@ -239,19 +250,125 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
return ctrl.Result{}, err
}
// create or update api-server deployment
modified_, deployment, err := r.createOrUpdateOrDeleteDeployments(ctx, generateResourceOption{
// Determine deployment type
deploymentType := GetDeploymentType(dynamoComponentDeployment)
logs.Info("Using deployment type", "type", deploymentType)
// Create the appropriate workload resource based on deployment type
var leaderWorkerSets []*leaderworkersetv1.LeaderWorkerSet
var deployment *appsv1.Deployment
if r.Config.EnableLWS && deploymentType == DeploymentTypeLeaderWorker {
desiredReplicas := int32(1)
if dynamoComponentDeployment.Spec.Replicas != nil {
desiredReplicas = *dynamoComponentDeployment.Spec.Replicas
}
anyModified := false
for i := range int(desiredReplicas) {
modified_, _, err := commonController.SyncResource(ctx, r, dynamoComponentDeployment, func(ctx context.Context) (*volcanov1beta1.PodGroup, bool, error) {
return r.generateVolcanoPodGroup(ctx, generateResourceOption{
dynamoComponentDeployment: dynamoComponentDeployment,
dynamoComponent: dynamoComponentCR,
isStealingTrafficDebugModeEnabled: false,
containsStealingTrafficDebugModeEnabled: false,
instanceID: &i,
})
})
if err != nil {
return
return ctrl.Result{}, err
}
if modified_ {
anyModified = true
}
modified_, lwsObj, err := commonController.SyncResource(ctx, r, dynamoComponentDeployment, func(ctx context.Context) (*leaderworkersetv1.LeaderWorkerSet, bool, error) {
return r.generateLeaderWorkerSet(ctx, generateResourceOption{
dynamoComponentDeployment: dynamoComponentDeployment,
dynamoComponent: dynamoComponentCR,
isStealingTrafficDebugModeEnabled: false,
containsStealingTrafficDebugModeEnabled: false,
instanceID: &i,
})
})
if err != nil {
return ctrl.Result{}, err
}
if modified_ {
anyModified = true
}
leaderWorkerSets = append(leaderWorkerSets, lwsObj)
}
// Clean up any excess LeaderWorkerSets (if replicas were decreased)
baseKubeName := r.getKubeName(dynamoComponentDeployment, dynamoComponentCR, false)
for i := int(desiredReplicas); ; i++ {
// Try to find a LeaderWorkerSet with the next index
nextLWSName := fmt.Sprintf("%s-%d", baseKubeName, i)
lwsToDelete := &leaderworkersetv1.LeaderWorkerSet{}
err := r.Get(ctx, types.NamespacedName{
Name: nextLWSName,
Namespace: dynamoComponentDeployment.Namespace,
}, lwsToDelete)
if err != nil {
if k8serrors.IsNotFound(err) {
break
}
return ctrl.Result{}, err
}
err = r.Delete(ctx, lwsToDelete)
if err != nil {
return ctrl.Result{}, err
}
podGroupName := nextLWSName
podGroupToDelete := &volcanov1beta1.PodGroup{}
err = r.Get(ctx, types.NamespacedName{
Name: podGroupName,
Namespace: dynamoComponentDeployment.Namespace,
}, podGroupToDelete)
if err != nil {
if !k8serrors.IsNotFound(err) {
logs.Error(err, "Failed to get PodGroup for deletion", "podGroupName", podGroupName)
}
} else {
err = r.Delete(ctx, podGroupToDelete)
if err != nil {
logs.Error(err, "Failed to delete PodGroup", "podGroupName", podGroupName)
}
}
anyModified = true
}
modified = anyModified
} else {
modified_, obj, err := r.createOrUpdateOrDeleteDeployments(ctx, generateResourceOption{
dynamoComponentDeployment: dynamoComponentDeployment,
dynamoComponent: dynamoComponentCR,
})
if err != nil {
return ctrl.Result{}, err
}
if modified_ {
modified = true
}
deployment = obj
// create or update api-server hpa
modified_, _, err = commonController.SyncResource(ctx, r, dynamoComponentDeployment, func(ctx context.Context) (*autoscalingv2.HorizontalPodAutoscaler, bool, error) {
return r.generateHPA(generateResourceOption{
......@@ -260,15 +377,17 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
})
})
if err != nil {
return
return ctrl.Result{}, err
}
if modified_ {
modified = true
}
}
// create or update api-server service
modified_, err = r.createOrUpdateOrDeleteServices(ctx, generateResourceOption{
modified_, err := r.createOrUpdateOrDeleteServices(ctx, generateResourceOption{
dynamoComponentDeployment: dynamoComponentDeployment,
dynamoComponent: dynamoComponentCR,
})
......@@ -299,10 +418,305 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
logs.Info("Finished reconciling.")
r.Recorder.Eventf(dynamoComponentDeployment, corev1.EventTypeNormal, "Update", "All resources updated!")
if deploymentType == DeploymentTypeLeaderWorker {
err = r.computeAvailableStatusConditionForLeaderWorkerSets(ctx, req, leaderWorkerSets)
} else {
err = r.computeAvailableStatusCondition(ctx, req, deployment)
}
return
}
// computeAvailableStatusConditionForLeaderWorkerSets writes the Available
// status condition for the object named by req, based on the readiness of the
// supplied LeaderWorkerSets.
//
// The condition is set to true only when IsLeaderWorkerSetReady holds for
// every element of leaderWorkerSets (which is trivially the case for an empty
// slice); as soon as one element is not ready the condition is set to false.
// The error returned by setStatusConditions is passed back unchanged.
func (r *DynamoComponentDeploymentReconciler) computeAvailableStatusConditionForLeaderWorkerSets(ctx context.Context, req ctrl.Request, leaderWorkerSets []*leaderworkersetv1.LeaderWorkerSet) error {
	logger := log.FromContext(ctx)

	// Stop scanning at the first LeaderWorkerSet that is not ready.
	foundNotReady := false
	for idx := 0; idx < len(leaderWorkerSets) && !foundNotReady; idx++ {
		foundNotReady = !IsLeaderWorkerSetReady(leaderWorkerSets[idx])
	}

	// Start from the "everything ready" condition and downgrade it if needed.
	condition := metav1.Condition{
		Type:    v1alpha1.DynamoGraphDeploymentConditionTypeAvailable,
		Status:  metav1.ConditionTrue,
		Reason:  "AllLeaderWorkerSetsReady",
		Message: "All LeaderWorkerSets are ready",
	}
	if foundNotReady {
		logger.Info("Not all LeaderWorkerSets are ready. Setting available status condition to false.")
		condition.Status = metav1.ConditionFalse
		condition.Reason = "LeaderWorkerSetsNotReady"
		condition.Message = "Not all LeaderWorkerSets are ready"
	} else {
		logger.Info("All LeaderWorkerSets are ready. Setting available status condition to true.")
	}

	_, err := r.setStatusConditions(ctx, req, condition)
	return err
}
// GetDeploymentType reports which workload kind a DynamoComponentDeployment
// asks for. It looks up KubeAnnotationDeploymentType in the annotations
// returned by getResourceAnnotations and falls back to DeploymentTypeStandard
// when the annotation is absent or empty. The value is returned as-is; it is
// not checked against the known deployment types.
func GetDeploymentType(dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment) string {
	if requested := getResourceAnnotations(dynamoComponentDeployment)[KubeAnnotationDeploymentType]; requested != "" {
		return requested
	}
	return DeploymentTypeStandard
}
// IsLeaderWorkerSetReady reports whether a LeaderWorkerSet has reached its
// desired state.
//
// A nil LeaderWorkerSet is never ready. The desired replica count is taken
// from Spec.Replicas and defaults to 1 when that field is unset. A desired
// count of zero is always ready. Otherwise the set is ready only if
// Status.ReadyReplicas has reached the desired count and the first status
// condition of type LeaderWorkerSetAvailable has status True; a missing
// Available condition means not ready.
func IsLeaderWorkerSetReady(leaderWorkerSet *leaderworkersetv1.LeaderWorkerSet) bool {
	if leaderWorkerSet == nil {
		return false
	}

	var wanted int32 = 1
	if replicas := leaderWorkerSet.Spec.Replicas; replicas != nil {
		wanted = *replicas
	}

	switch {
	case wanted == 0:
		// Nothing was requested, so there is nothing to wait for.
		return true
	case leaderWorkerSet.Status.ReadyReplicas < wanted:
		return false
	}

	// Only the Available condition decides; the first match wins.
	availableType := string(leaderworkersetv1.LeaderWorkerSetAvailable)
	conditions := leaderWorkerSet.Status.Conditions
	for idx := range conditions {
		if conditions[idx].Type != availableType {
			continue
		}
		return conditions[idx].Status == metav1.ConditionTrue
	}
	return false
}
// generateVolcanoPodGroup builds the Volcano PodGroup for one instance of a
// leader-worker deployment.
//
// The PodGroup is named "<kube name>-<instanceID>", lives in the namespace of
// the DynamoComponentDeployment, carries an "instance-id" label, and has
// MinMember set to the value of the KubeAnnotationLWSSize annotation read from
// Spec.Annotations.
//
// An error is returned when opt.instanceID is nil or negative, when the size
// annotation is missing or is not a base-10 integer that fits in 32 bits, or
// when the size is less than 2 (a size of 1 is rejected with a hint to use the
// standard deployment type instead). The parse error is wrapped with %w so
// callers can still inspect the underlying strconv error.
//
// The boolean result is always false.
// NOTE(review): presumably this is the "to delete" flag expected by
// SyncResource generators — confirm against SyncResource.
func (r *DynamoComponentDeploymentReconciler) generateVolcanoPodGroup(ctx context.Context, opt generateResourceOption) (*volcanov1beta1.PodGroup, bool, error) {
	logs := log.FromContext(ctx)
	logs.Info("Generating Volcano PodGroup")

	if opt.instanceID == nil {
		return nil, false, errors.New("generateVolcanoPodGroup: instanceID cannot be nil")
	}
	instanceID := *opt.instanceID
	if instanceID < 0 {
		return nil, false, fmt.Errorf("generateVolcanoPodGroup: instanceID cannot be negative, got %d", instanceID)
	}

	podGroupName := r.getKubeName(opt.dynamoComponentDeployment, opt.dynamoComponent, opt.isStealingTrafficDebugModeEnabled)
	podGroupName = fmt.Sprintf("%s-%d", podGroupName, instanceID)
	kubeNs := opt.dynamoComponentDeployment.Namespace

	labels := make(map[string]string)
	labels["instance-id"] = fmt.Sprintf("%d", instanceID)

	lwsSizeStr, ok := opt.dynamoComponentDeployment.Spec.Annotations[KubeAnnotationLWSSize]
	if !ok {
		return nil, false, fmt.Errorf("generateVolcanoPodGroup: missing required annotation %s", KubeAnnotationLWSSize)
	}
	// bitSize 32 makes ParseInt reject values that would not fit the int32
	// conversion below.
	lwsSize, err := strconv.ParseInt(lwsSizeStr, 10, 32)
	if err != nil {
		return nil, false, fmt.Errorf("generateVolcanoPodGroup: invalid value for annotation %s: %w", KubeAnnotationLWSSize, err)
	}
	if lwsSize <= 0 {
		return nil, false, fmt.Errorf("generateVolcanoPodGroup: LWS size must be greater than 0, got %d", lwsSize)
	}
	if lwsSize == 1 {
		return nil, false, errors.New("generateVolcanoPodGroup: LWS size of 1 means that the LWS is not needed, change 'nvidia.com/deployment-type' to 'standard'/disable whatever flag you used to enable LWS")
	}
	minMember := int32(lwsSize)

	podGroup := &volcanov1beta1.PodGroup{
		ObjectMeta: metav1.ObjectMeta{
			Name:      podGroupName,
			Namespace: kubeNs,
			Labels:    labels,
		},
		Spec: volcanov1beta1.PodGroupSpec{
			MinMember: minMember,
		},
	}
	return podGroup, false, nil
}
// generateLeaderPodTemplateSpec builds the pod template used for the leader
// pod of a LeaderWorkerSet.
//
// It starts from generatePodTemplateSpec and then:
//   - replaces the template's labels with the supplied labels map (a fresh map
//     when labels is nil), sets "role" to "leader" and "instance-id" to
//     instanceID, and removes the KubeLabelDynamoSelector label. The supplied
//     map is modified in place, so callers should pass a map they own;
//   - sets the "scheduling.k8s.io/group-name" annotation to kubeName and the
//     scheduler name to "volcano";
//   - prefixes the first argument of the first container with
//     "ray start --head --port=6379 && ".
//
// An error is returned when the base template cannot be generated, when the
// template has no containers, when the first container has a nil Command or
// no Args, or when no GPU limit is set in the deployment's Spec.Resources.
func (r *DynamoComponentDeploymentReconciler) generateLeaderPodTemplateSpec(ctx context.Context, opt generateResourceOption, kubeName string, labels map[string]string, instanceID int) (*corev1.PodTemplateSpec, error) {
	leaderPodTemplateSpec, err := r.generatePodTemplateSpec(ctx, opt)
	if err != nil {
		return nil, errors.Wrap(err, "failed to generate leader pod template")
	}

	if labels != nil {
		leaderPodTemplateSpec.ObjectMeta.Labels = labels
	} else {
		leaderPodTemplateSpec.ObjectMeta.Labels = make(map[string]string)
	}
	leaderPodTemplateSpec.ObjectMeta.Labels["role"] = "leader"
	leaderPodTemplateSpec.ObjectMeta.Labels["instance-id"] = fmt.Sprintf("%d", instanceID)
	delete(leaderPodTemplateSpec.ObjectMeta.Labels, commonconsts.KubeLabelDynamoSelector)

	if leaderPodTemplateSpec.ObjectMeta.Annotations == nil {
		leaderPodTemplateSpec.ObjectMeta.Annotations = make(map[string]string)
	}
	leaderPodTemplateSpec.ObjectMeta.Annotations["scheduling.k8s.io/group-name"] = kubeName
	leaderPodTemplateSpec.Spec.SchedulerName = "volcano"

	// Guard the Containers[0] accesses below: indexing an empty slice would
	// panic inside the reconciler instead of surfacing an error.
	if len(leaderPodTemplateSpec.Spec.Containers) == 0 {
		return nil, errors.New("generateLeaderPodTemplateSpec: pod template has no containers for Ray leader pod")
	}
	mainContainer := &leaderPodTemplateSpec.Spec.Containers[0]

	if mainContainer.Command == nil {
		return nil, errors.New("generateLeaderPodTemplateSpec: container Command cannot be nil for Ray leader pod")
	}
	if len(mainContainer.Args) == 0 {
		return nil, errors.New("generateLeaderPodTemplateSpec: container Args cannot be empty for Ray leader pod")
	}

	resources := opt.dynamoComponentDeployment.Spec.Resources
	if resources == nil || resources.Limits == nil || resources.Limits.GPU == "" {
		return nil, errors.New("generateLeaderPodTemplateSpec: GPU limit is not set for Ray leader pod")
	}

	// Start the Ray head node first, then run the original command line.
	mainContainer.Args[0] = fmt.Sprintf("ray start --head --port=6379 && %s", mainContainer.Args[0])
	return leaderPodTemplateSpec, nil
}
// generateWorkerPodTemplateSpec builds the pod template used for the worker
// pods of a LeaderWorkerSet.
//
// It starts from generatePodTemplateSpec and then:
//   - replaces the template's labels with the supplied labels map (a fresh map
//     when labels is nil), sets "role" to "worker" and "instance-id" to
//     instanceID, and removes the KubeLabelDynamoSelector label. The supplied
//     map is modified in place, so callers should pass a map they own;
//   - sets the scheduler name to "volcano" and the
//     "scheduling.k8s.io/group-name" annotation to kubeName;
//   - overwrites the first argument of the first container with a
//     "ray start" command that joins the leader at $(LWS_LEADER_ADDRESS):6379
//     and blocks.
//
// An error is returned when the base template cannot be generated, when the
// template has no containers, when the first container has a nil Command or
// no Args, or when no GPU limit is set in the deployment's Spec.Resources.
func (r *DynamoComponentDeploymentReconciler) generateWorkerPodTemplateSpec(ctx context.Context, opt generateResourceOption, kubeName string, labels map[string]string, instanceID int) (*corev1.PodTemplateSpec, error) {
	workerPodTemplateSpec, err := r.generatePodTemplateSpec(ctx, opt)
	if err != nil {
		return nil, errors.Wrap(err, "failed to generate worker pod template")
	}

	if labels != nil {
		workerPodTemplateSpec.ObjectMeta.Labels = labels
	} else {
		workerPodTemplateSpec.ObjectMeta.Labels = make(map[string]string)
	}
	workerPodTemplateSpec.ObjectMeta.Labels["role"] = "worker"
	workerPodTemplateSpec.ObjectMeta.Labels["instance-id"] = fmt.Sprintf("%d", instanceID)
	delete(workerPodTemplateSpec.ObjectMeta.Labels, commonconsts.KubeLabelDynamoSelector)

	workerPodTemplateSpec.Spec.SchedulerName = "volcano"
	if workerPodTemplateSpec.ObjectMeta.Annotations == nil {
		workerPodTemplateSpec.ObjectMeta.Annotations = make(map[string]string)
	}
	workerPodTemplateSpec.ObjectMeta.Annotations["scheduling.k8s.io/group-name"] = kubeName

	// Guard the Containers[0] accesses below: indexing an empty slice would
	// panic inside the reconciler instead of surfacing an error.
	if len(workerPodTemplateSpec.Spec.Containers) == 0 {
		return nil, errors.New("generateWorkerPodTemplateSpec: pod template has no containers for Ray worker pod")
	}
	mainContainer := &workerPodTemplateSpec.Spec.Containers[0]

	if mainContainer.Command == nil {
		return nil, errors.New("generateWorkerPodTemplateSpec: container Command cannot be nil for Ray worker pod")
	}
	if len(mainContainer.Args) == 0 {
		return nil, errors.New("generateWorkerPodTemplateSpec: container Args cannot be empty for Ray worker pod")
	}

	resources := opt.dynamoComponentDeployment.Spec.Resources
	if resources == nil || resources.Limits == nil || resources.Limits.GPU == "" {
		return nil, errors.New("generateWorkerPodTemplateSpec: GPU limit is not set for Ray worker pod")
	}

	// Workers only join the Ray cluster started by the leader; the original
	// first argument is discarded.
	mainContainer.Args[0] = "ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"
	return workerPodTemplateSpec, nil
}
// generateLeaderWorkerSet builds the LeaderWorkerSet for one instance of a
// DynamoComponentDeployment.
//
// The object is named "<kube name>-<instanceID>", placed in the deployment's
// namespace and labelled with the deployment's kube labels plus
// "instance-id". Leader and worker pod templates each receive their own copy
// of those labels. The spec always requests a single replica with the
// LeaderCreated startup policy; the group size comes from the
// KubeAnnotationLWSSize annotation in Spec.Annotations.
//
// An error is returned when opt.instanceID is nil or negative, when either pod
// template cannot be generated, or when the size annotation is missing, is not
// a 32-bit base-10 integer, or is smaller than 1. The boolean result is always
// false.
func (r *DynamoComponentDeploymentReconciler) generateLeaderWorkerSet(ctx context.Context, opt generateResourceOption) (*leaderworkersetv1.LeaderWorkerSet, bool, error) {
	log.FromContext(ctx).Info("Generating LeaderWorkerSet")

	if opt.instanceID == nil {
		return nil, false, errors.New("generateLeaderWorkerSet: instanceID cannot be nil")
	}
	id := *opt.instanceID
	if id < 0 {
		return nil, false, fmt.Errorf("generateLeaderWorkerSet: instanceID cannot be negative, got %d", id)
	}

	baseName := r.getKubeName(opt.dynamoComponentDeployment, opt.dynamoComponent, opt.isStealingTrafficDebugModeEnabled)
	lwsName := fmt.Sprintf("%s-%d", baseName, id)

	setLabels := r.getKubeLabels(opt.dynamoComponentDeployment, opt.dynamoComponent)
	if setLabels == nil {
		setLabels = make(map[string]string)
	}
	setLabels["instance-id"] = fmt.Sprintf("%d", id)

	// The template generators modify the label map they are given, so each
	// one gets an independent copy of the set-level labels.
	copyLabels := func() map[string]string {
		dup := make(map[string]string)
		for key, value := range setLabels {
			dup[key] = value
		}
		return dup
	}

	leaderTemplate, err := r.generateLeaderPodTemplateSpec(ctx, opt, lwsName, copyLabels(), id)
	if err != nil {
		return nil, false, errors.Wrap(err, "generateLeaderWorkerSet: failed to generate leader pod template")
	}

	workerTemplate, err := r.generateWorkerPodTemplateSpec(ctx, opt, lwsName, copyLabels(), id)
	if err != nil {
		return nil, false, errors.Wrap(err, "generateLeaderWorkerSet: failed to generate worker pod template")
	}

	rawSize, present := opt.dynamoComponentDeployment.Spec.Annotations[KubeAnnotationLWSSize]
	if !present {
		return nil, false, fmt.Errorf("generateLeaderWorkerSet: LWS size annotation '%s' is required", KubeAnnotationLWSSize)
	}
	parsedSize, err := strconv.ParseInt(rawSize, 10, 32)
	if err != nil {
		return nil, false, errors.Wrap(err, "generateLeaderWorkerSet: LWS size annotation value must be an integer")
	}
	if parsedSize < 1 {
		return nil, false, fmt.Errorf("generateLeaderWorkerSet: LWS size must be greater than 0, got %d", parsedSize)
	}

	// Each individual LeaderWorkerSet always has exactly 1 replica.
	var (
		replicaCount int32 = 1
		groupSize          = int32(parsedSize)
	)

	return &leaderworkersetv1.LeaderWorkerSet{
		ObjectMeta: metav1.ObjectMeta{
			Name:      lwsName,
			Namespace: opt.dynamoComponentDeployment.Namespace,
			Labels:    setLabels,
		},
		Spec: leaderworkersetv1.LeaderWorkerSetSpec{
			Replicas:      &replicaCount,
			StartupPolicy: leaderworkersetv1.LeaderCreatedStartupPolicy,
			LeaderWorkerTemplate: leaderworkersetv1.LeaderWorkerTemplate{
				LeaderTemplate: leaderTemplate,
				WorkerTemplate: *workerTemplate,
				Size:           &groupSize,
			},
		},
	}, false, nil
}
func (r *DynamoComponentDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment) error {
logger := log.FromContext(ctx)
logger.Info("Finalizing the DynamoComponentDeployment", "dynamoComponentDeployment", dynamoComponentDeployment)
......@@ -528,7 +942,7 @@ func (r *DynamoComponentDeploymentReconciler) createOrUpdateOrDeleteServices(ctx
containsStealingTrafficDebugModeEnabled := checkIfContainsStealingTrafficDebugModeEnabled(opt.dynamoComponentDeployment)
// main generic service
modified, _, err = commonController.SyncResource(ctx, r, opt.dynamoComponentDeployment, func(ctx context.Context) (*corev1.Service, bool, error) {
return r.generateService(ctx, generateResourceOption{
return r.generateService(generateResourceOption{
dynamoComponentDeployment: opt.dynamoComponentDeployment,
dynamoComponent: opt.dynamoComponent,
isStealingTrafficDebugModeEnabled: false,
......@@ -543,7 +957,7 @@ func (r *DynamoComponentDeploymentReconciler) createOrUpdateOrDeleteServices(ctx
// debug production service (if enabled)
modified_, _, err := commonController.SyncResource(ctx, r, opt.dynamoComponentDeployment, func(ctx context.Context) (*corev1.Service, bool, error) {
return r.generateService(ctx, generateResourceOption{
return r.generateService(generateResourceOption{
dynamoComponentDeployment: opt.dynamoComponentDeployment,
dynamoComponent: opt.dynamoComponent,
isStealingTrafficDebugModeEnabled: false,
......@@ -558,7 +972,7 @@ func (r *DynamoComponentDeploymentReconciler) createOrUpdateOrDeleteServices(ctx
modified = modified || modified_
// debug service (if enabled)
modified_, _, err = commonController.SyncResource(ctx, r, opt.dynamoComponentDeployment, func(ctx context.Context) (*corev1.Service, bool, error) {
return r.generateService(ctx, generateResourceOption{
return r.generateService(generateResourceOption{
dynamoComponentDeployment: opt.dynamoComponentDeployment,
dynamoComponent: opt.dynamoComponent,
isStealingTrafficDebugModeEnabled: true,
......@@ -845,6 +1259,7 @@ type generateResourceOption struct {
containsStealingTrafficDebugModeEnabled bool
isDebugPodReceiveProductionTraffic bool
isGenericService bool
instanceID *int
}
func (r *DynamoComponentDeploymentReconciler) generateHPA(opt generateResourceOption) (*autoscalingv2.HorizontalPodAutoscaler, bool, error) {
......@@ -917,7 +1332,7 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
podLabels[commonconsts.KubeLabelDynamoDeploymentTargetType] = DeploymentTargetTypeDebug
}
podAnnotations := r.getKubeAnnotations(opt.dynamoComponentDeployment, opt.dynamoComponent)
podAnnotations := make(map[string]string)
kubeName := r.getKubeName(opt.dynamoComponentDeployment, opt.dynamoComponent, opt.isStealingTrafficDebugModeEnabled)
......@@ -1388,8 +1803,7 @@ func getResourcesConfig(resources *dynamoCommon.Resources) (corev1.ResourceRequi
return currentResources, nil
}
//nolint:nakedret
func (r *DynamoComponentDeploymentReconciler) generateService(_ context.Context, opt generateResourceOption) (kubeService *corev1.Service, toDelete bool, err error) {
func (r *DynamoComponentDeploymentReconciler) generateService(opt generateResourceOption) (*corev1.Service, bool, error) {
var kubeName string
if opt.isGenericService {
kubeName = r.getGenericServiceName(opt.dynamoComponentDeployment, opt.dynamoComponent)
......@@ -1399,7 +1813,7 @@ func (r *DynamoComponentDeploymentReconciler) generateService(_ context.Context,
kubeNs := opt.dynamoComponentDeployment.Namespace
kubeService = &corev1.Service{
kubeService := &corev1.Service{
ObjectMeta: metav1.ObjectMeta{
Name: kubeName,
Namespace: kubeNs,
......@@ -1419,6 +1833,14 @@ func (r *DynamoComponentDeploymentReconciler) generateService(_ context.Context,
selector[k] = v
}
// Check if we're using LeaderWorkerSet
deploymentType := GetDeploymentType(opt.dynamoComponentDeployment)
// If using LeaderWorkerSet, modify selector to only target leaders
if deploymentType == DeploymentTypeLeaderWorker {
selector["role"] = "leader"
}
if opt.isStealingTrafficDebugModeEnabled {
selector[commonconsts.KubeLabelDynamoDeploymentTargetType] = DeploymentTargetTypeDebug
}
......@@ -1443,7 +1865,7 @@ func (r *DynamoComponentDeploymentReconciler) generateService(_ context.Context,
kubeService.ObjectMeta.Labels = labels
kubeService.Spec = spec
return
return kubeService, false, nil
}
type TLSModeOpt string
......@@ -1465,7 +1887,6 @@ type IngressConfig struct {
// SetupWithManager sets up the controller with the Manager.
func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error {
m := ctrl.NewControllerManagedBy(mgr).
For(&v1alpha1.DynamoComponentDeployment{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
Owns(&appsv1.Deployment{}, builder.WithPredicates(predicate.Funcs{
......@@ -1480,6 +1901,23 @@ func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager)
Owns(&corev1.PersistentVolumeClaim{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
WithEventFilter(controller_common.EphemeralDeploymentEventFilter(r.Config))
if r.Config.EnableLWS {
m.Owns(&leaderworkersetv1.LeaderWorkerSet{}, builder.WithPredicates(predicate.Funcs{
// ignore creation cause we don't want to be called again after we create the LeaderWorkerSet
CreateFunc: func(ce event.CreateEvent) bool { return false },
DeleteFunc: func(de event.DeleteEvent) bool { return true },
UpdateFunc: func(de event.UpdateEvent) bool { return true },
GenericFunc: func(ge event.GenericEvent) bool { return true },
})).
Owns(&volcanov1beta1.PodGroup{}, builder.WithPredicates(predicate.Funcs{
// ignore creation because we don't want to be called again after we create the PodGroup
CreateFunc: func(ce event.CreateEvent) bool { return false },
DeleteFunc: func(de event.DeleteEvent) bool { return true },
UpdateFunc: func(de event.UpdateEvent) bool { return true },
GenericFunc: func(ge event.GenericEvent) bool { return true },
}))
}
if r.UseVirtualService {
m.Owns(&networkingv1beta1.VirtualService{}, builder.WithPredicates(predicate.GenerationChangedPredicate{}))
}
......
......@@ -24,14 +24,27 @@ import (
"fmt"
"testing"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/bsm/gomega"
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/google/go-cmp/cmp"
"github.com/onsi/gomega"
"github.com/onsi/gomega/format"
istioNetworking "istio.io/api/networking/v1beta1"
networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
networkingv1 "k8s.io/api/networking/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/record"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
leaderworkersetv1 "sigs.k8s.io/lws/api/leaderworkerset/v1"
volcanov1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)
func TestIsDeploymentReady(t *testing.T) {
......@@ -478,3 +491,661 @@ func TestDynamoComponentDeploymentReconciler_generateVirtualService(t *testing.T
})
}
}
func TestDynamoComponentDeploymentReconciler_generateVolcanoPodGroup(t *testing.T) {
type fields struct {
Client client.Client
Recorder record.EventRecorder
Config controller_common.Config
NatsAddr string
EtcdAddr string
EtcdStorage etcdStorage
UseVirtualService bool
}
type args struct {
ctx context.Context
opt generateResourceOption
}
tests := []struct {
name string
fields fields
args args
want *volcanov1beta1.PodGroup
want1 bool
wantErr bool
}{
{
name: "generate volcano pod group",
args: args{
ctx: context.Background(),
opt: generateResourceOption{
dynamoComponentDeployment: &v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "service1",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service1",
DynamoNamespace: &[]string{"default"}[0],
Annotations: map[string]string{
"nvidia.com/deployment-type": "leader-worker",
"nvidia.com/lws-size": "2",
},
},
},
},
dynamoComponent: &v1alpha1.DynamoComponent{
ObjectMeta: metav1.ObjectMeta{
Name: "service1",
Namespace: "default",
},
},
instanceID: ptr.To(5),
},
},
want: &volcanov1beta1.PodGroup{
ObjectMeta: metav1.ObjectMeta{
Name: "service1-5",
Namespace: "default",
Labels: map[string]string{
"instance-id": "5",
},
},
Spec: volcanov1beta1.PodGroupSpec{
MinMember: 2,
},
},
want1: false,
wantErr: false,
},
{
name: "missing lws size annotation",
args: args{
ctx: context.Background(),
opt: generateResourceOption{
dynamoComponentDeployment: &v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "service-missing-lws-size",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service-missing-lws-size",
DynamoNamespace: &[]string{"default"}[0],
Annotations: map[string]string{
"nvidia.com/deployment-type": "leader-worker",
// "nvidia.com/lws-size" is missing
},
},
},
},
dynamoComponent: &v1alpha1.DynamoComponent{
ObjectMeta: metav1.ObjectMeta{
Name: "service-missing-lws-size",
Namespace: "default",
},
},
instanceID: ptr.To(0),
},
},
want: nil,
want1: false,
wantErr: true,
},
{
name: "invalid lws size annotation (non-integer)",
args: args{
ctx: context.Background(),
opt: generateResourceOption{
dynamoComponentDeployment: &v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "service-invalid-lws-size-non-int",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service-invalid-lws-size-non-int",
DynamoNamespace: &[]string{"default"}[0],
Annotations: map[string]string{
"nvidia.com/deployment-type": "leader-worker",
"nvidia.com/lws-size": "abc",
},
},
},
},
dynamoComponent: &v1alpha1.DynamoComponent{
ObjectMeta: metav1.ObjectMeta{
Name: "service-invalid-lws-size-non-int",
Namespace: "default",
},
},
instanceID: ptr.To(1),
},
},
want: nil,
want1: false,
wantErr: true,
},
{
name: "invalid lws size annotation (zero)",
args: args{
ctx: context.Background(),
opt: generateResourceOption{
dynamoComponentDeployment: &v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "service-invalid-lws-size-zero",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service-invalid-lws-size-zero",
DynamoNamespace: &[]string{"default"}[0],
Annotations: map[string]string{
"nvidia.com/deployment-type": "leader-worker",
"nvidia.com/lws-size": "0",
},
},
},
},
dynamoComponent: &v1alpha1.DynamoComponent{
ObjectMeta: metav1.ObjectMeta{
Name: "service-invalid-lws-size-zero",
Namespace: "default",
},
},
instanceID: ptr.To(2),
},
},
want: nil,
want1: false,
wantErr: true,
},
{
name: "invalid lws size annotation (negative)",
args: args{
ctx: context.Background(),
opt: generateResourceOption{
dynamoComponentDeployment: &v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "service-invalid-lws-size-negative",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service-invalid-lws-size-negative",
DynamoNamespace: &[]string{"default"}[0],
Annotations: map[string]string{
"nvidia.com/deployment-type": "leader-worker",
"nvidia.com/lws-size": "-1",
},
},
},
},
dynamoComponent: &v1alpha1.DynamoComponent{
ObjectMeta: metav1.ObjectMeta{
Name: "service-invalid-lws-size-negative",
Namespace: "default",
},
},
instanceID: ptr.To(3),
},
},
want: nil,
want1: false,
wantErr: true,
},
{
name: "lws size of 1 - lws should not be used",
args: args{
ctx: context.Background(),
opt: generateResourceOption{
dynamoComponentDeployment: &v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "service-valid-lws-size-one",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service-valid-lws-size-one",
DynamoNamespace: &[]string{"default"}[0],
Annotations: map[string]string{
"nvidia.com/deployment-type": "leader-worker",
"nvidia.com/lws-size": "1",
},
},
},
},
dynamoComponent: &v1alpha1.DynamoComponent{
ObjectMeta: metav1.ObjectMeta{
Name: "service-valid-lws-size-one",
Namespace: "default",
},
},
instanceID: ptr.To(4),
},
},
want: nil,
want1: false,
wantErr: true,
},
{
name: "nil instanceID",
args: args{
ctx: context.Background(),
opt: generateResourceOption{
dynamoComponentDeployment: &v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "service-nil-instanceid",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service-nil-instanceid",
DynamoNamespace: &[]string{"default"}[0],
Annotations: map[string]string{
"nvidia.com/deployment-type": "leader-worker",
"nvidia.com/lws-size": "2",
},
},
},
},
dynamoComponent: &v1alpha1.DynamoComponent{
ObjectMeta: metav1.ObjectMeta{
Name: "service-nil-instanceid",
Namespace: "default",
},
},
instanceID: nil,
},
},
want: nil,
want1: false,
wantErr: true,
},
{
name: "negative instanceID",
args: args{
ctx: context.Background(),
opt: generateResourceOption{
dynamoComponentDeployment: &v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "service-negative-instanceid",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service-negative-instanceid",
DynamoNamespace: &[]string{"default"}[0],
Annotations: map[string]string{
"nvidia.com/deployment-type": "leader-worker",
"nvidia.com/lws-size": "2",
},
},
},
},
dynamoComponent: &v1alpha1.DynamoComponent{
ObjectMeta: metav1.ObjectMeta{
Name: "service-negative-instanceid",
Namespace: "default",
},
},
instanceID: ptr.To(-1),
},
},
want: nil,
want1: false,
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
g := gomega.NewGomegaWithT(t)
r := &DynamoComponentDeploymentReconciler{
Client: tt.fields.Client,
Recorder: tt.fields.Recorder,
Config: tt.fields.Config,
NatsAddr: tt.fields.NatsAddr,
EtcdAddr: tt.fields.EtcdAddr,
EtcdStorage: tt.fields.EtcdStorage,
UseVirtualService: tt.fields.UseVirtualService,
}
got, got1, err := r.generateVolcanoPodGroup(tt.args.ctx, tt.args.opt)
if (err != nil) != tt.wantErr {
t.Errorf("DynamoComponentDeploymentReconciler.generateVolcanoPodGroup() error = %v, wantErr %v", err, tt.wantErr)
return
}
g.Expect(got).To(gomega.Equal(tt.want))
g.Expect(got1).To(gomega.Equal(tt.want1))
})
}
}
func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.T) {
var limit = ptr.To(resource.MustParse("250Mi"))
limit.SetMilli(ptr.To(resource.MustParse("1Gi")).MilliValue() / 2)
type fields struct {
Client client.Client
Recorder record.EventRecorder
Config controller_common.Config
NatsAddr string
EtcdAddr string
EtcdStorage etcdStorage
UseVirtualService bool
}
type args struct {
ctx context.Context
opt generateResourceOption
// Add expected ServiceAccountName if you want to verify it's picked up
// For now, we'll ensure a default one exists for the happy path
mockServiceAccounts []client.Object
}
tests := []struct {
name string
fields fields
args args
want *leaderworkersetv1.LeaderWorkerSet
want1 bool // toDelete
wantErr bool
}{
{
name: "generateLeaderWorkerSet - nominal case",
fields: fields{
Recorder: record.NewFakeRecorder(100),
Config: controller_common.Config{}, // Provide default or test-specific config
},
args: args{
ctx: context.Background(),
opt: generateResourceOption{
dynamoComponentDeployment: &v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-lws-deploy",
Namespace: "default",
},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
DynamoComponent: "test-lws-component",
DynamoTag: "test-tag",
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "test-lws-deploy-service",
DynamoNamespace: &[]string{"default"}[0],
Annotations: map[string]string{
"nvidia.com/deployment-type": "leader-worker",
"nvidia.com/lws-size": "2",
},
Resources: &common.Resources{
Limits: &common.ResourceItem{
GPU: "1",
},
},
},
},
},
dynamoComponent: &v1alpha1.DynamoComponent{
ObjectMeta: metav1.ObjectMeta{Name: "test-lws-component", Namespace: "default"},
Spec: v1alpha1.DynamoComponentSpec{Image: "test-image:latest"},
},
instanceID: ptr.To(0),
},
// Define a mock ServiceAccount that should be found by r.List
mockServiceAccounts: []client.Object{
&corev1.ServiceAccount{
ObjectMeta: metav1.ObjectMeta{
Name: "default-test-sa", // Name it will be resolved to
Namespace: "default", // Must match dynamoComponentDeployment.Namespace
Labels: map[string]string{
commonconsts.KubeLabelDynamoDeploymentPod: commonconsts.KubeLabelValueTrue,
},
},
},
},
},
want: &leaderworkersetv1.LeaderWorkerSet{
ObjectMeta: metav1.ObjectMeta{
Name: "test-lws-deploy-0",
Namespace: "default",
Labels: map[string]string{
commonconsts.KubeLabelDynamoComponent: "test-lws-component",
commonconsts.KubeLabelDynamoComponentType: commonconsts.DynamoApiServerComponentName,
"instance-id": "0",
},
},
Spec: leaderworkersetv1.LeaderWorkerSetSpec{
Replicas: ptr.To(int32(1)),
StartupPolicy: leaderworkersetv1.LeaderCreatedStartupPolicy,
LeaderWorkerTemplate: leaderworkersetv1.LeaderWorkerTemplate{
Size: ptr.To(int32(2)),
LeaderTemplate: &corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"instance-id": "0",
"role": "leader",
commonconsts.KubeLabelDynamoComponent: "test-lws-component",
commonconsts.KubeLabelDynamoComponentType: commonconsts.DynamoApiServerComponentName,
},
Annotations: map[string]string{
"scheduling.k8s.io/group-name": "test-lws-deploy-0",
},
},
Spec: corev1.PodSpec{
SchedulerName: "volcano",
Containers: []corev1.Container{
{
Name: "main",
Image: "test-image:latest",
Command: []string{"sh", "-c"},
Args: []string{"ray start --head --port=6379 && cd src && uv run dynamo serve --service-name test-lws-deploy-service test-tag --test-lws-deploy-service.ServiceArgs.dynamo.namespace=default --test-lws-deploy-service.environment=kubernetes"},
Env: []corev1.EnvVar{{Name: "DYNAMO_PORT", Value: "3000"}},
VolumeMounts: []corev1.VolumeMount{
{
Name: "shared-memory", MountPath: "/dev/shm",
},
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP, Name: "http", ContainerPort: 3000,
},
},
TTY: true,
Stdin: true,
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("300m"),
corev1.ResourceMemory: resource.MustParse("500Mi"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("500m"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
"nvidia.com/gpu": resource.MustParse("1"),
},
},
},
},
Volumes: []corev1.Volume{{Name: "shared-memory", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{Medium: corev1.StorageMediumMemory, SizeLimit: limit}}}},
ImagePullSecrets: []corev1.LocalObjectReference{{Name: ""}}, // Assuming default config gives empty secret name
ServiceAccountName: "default-test-sa", // Updated to reflect mocked SA
},
},
WorkerTemplate: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"instance-id": "0",
"role": "worker",
commonconsts.KubeLabelDynamoComponent: "test-lws-component",
commonconsts.KubeLabelDynamoComponentType: commonconsts.DynamoApiServerComponentName,
},
Annotations: map[string]string{
"scheduling.k8s.io/group-name": "test-lws-deploy-0",
},
},
Spec: corev1.PodSpec{
SchedulerName: "volcano",
Containers: []corev1.Container{
{
Name: "main",
Image: "test-image:latest",
Command: []string{"sh", "-c"},
Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
Env: []corev1.EnvVar{{Name: "DYNAMO_PORT", Value: "3000"}},
VolumeMounts: []corev1.VolumeMount{{Name: "shared-memory", MountPath: "/dev/shm"}},
Ports: []corev1.ContainerPort{{Protocol: corev1.ProtocolTCP, Name: "http", ContainerPort: 3000}},
TTY: true,
Stdin: true,
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("300m"), corev1.ResourceMemory: resource.MustParse("500Mi")},
Limits: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("500m"), corev1.ResourceMemory: resource.MustParse("1Gi"), "nvidia.com/gpu": resource.MustParse("1")},
},
},
},
Volumes: []corev1.Volume{{Name: "shared-memory", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{Medium: corev1.StorageMediumMemory, SizeLimit: limit}}}},
ImagePullSecrets: []corev1.LocalObjectReference{{Name: ""}},
ServiceAccountName: "default-test-sa", // Updated to reflect mocked SA
},
},
},
},
},
want1: false,
wantErr: false,
},
{
name: "nil instanceID", // This test should fail before r.List is called in generatePodTemplateSpec
fields: fields{
Recorder: record.NewFakeRecorder(100),
},
args: args{
ctx: context.Background(),
opt: generateResourceOption{
dynamoComponentDeployment: &v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{Name: "test-lws-nil-id", Namespace: "default", Annotations: map[string]string{KubeAnnotationLWSSize: "2"}},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
DynamoComponent: "test-comp", DynamoTag: "test",
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
Resources: &common.Resources{
Limits: &common.ResourceItem{
GPU: "1",
},
},
},
},
},
dynamoComponent: &v1alpha1.DynamoComponent{
ObjectMeta: metav1.ObjectMeta{Namespace: "default"},
Spec: v1alpha1.DynamoComponentSpec{Image: "test-image:latest"},
},
instanceID: nil,
},
mockServiceAccounts: []client.Object{ // Provide a default SA for consistency, though not strictly needed here
&corev1.ServiceAccount{
ObjectMeta: metav1.ObjectMeta{
Name: "default-test-sa", Namespace: "default", // Match namespace
Labels: map[string]string{commonconsts.KubeLabelDynamoDeploymentPod: commonconsts.KubeLabelValueTrue},
},
},
},
},
want: nil,
want1: false,
wantErr: true,
},
{
name: "error from generateLeaderPodTemplateSpec", // This case involves an error from generatePodTemplateSpec
fields: fields{
Recorder: record.NewFakeRecorder(100),
},
args: args{
ctx: context.Background(),
opt: generateResourceOption{
dynamoComponentDeployment: &v1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{Name: "test-lws-leader-err", Namespace: "default", Annotations: map[string]string{KubeAnnotationLWSSize: "2"}},
Spec: v1alpha1.DynamoComponentDeploymentSpec{
DynamoComponent: "test-comp", DynamoTag: "test",
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
Resources: &common.Resources{
Limits: &common.ResourceItem{
GPU: "1",
},
},
},
},
},
dynamoComponent: &v1alpha1.DynamoComponent{ // Image is missing, will cause error in generatePodTemplateSpec
ObjectMeta: metav1.ObjectMeta{Name: "test-lws-component-leader-err", Namespace: "default"},
Spec: v1alpha1.DynamoComponentSpec{Image: ""},
},
instanceID: ptr.To(0),
},
// No specific SA needed if error is before SA listing, but good to be consistent
mockServiceAccounts: []client.Object{
&corev1.ServiceAccount{
ObjectMeta: metav1.ObjectMeta{
Name: "default-test-sa", Namespace: "default", // Match namespace
Labels: map[string]string{commonconsts.KubeLabelDynamoDeploymentPod: commonconsts.KubeLabelValueTrue},
},
},
},
},
want: nil,
want1: false,
wantErr: true,
},
}
// Initialize scheme & add API types
s := scheme.Scheme
if err := v1alpha1.AddToScheme(s); err != nil {
t.Fatalf("Failed to add v1alpha1 to scheme: %v", err)
}
if err := corev1.AddToScheme(s); err != nil {
t.Fatalf("Failed to add corev1 to scheme: %v", err)
}
// Add LeaderWorkerSet to scheme if not already present globally for tests
if err := leaderworkersetv1.AddToScheme(s); err != nil {
t.Fatalf("Failed to add leaderworkersetv1 to scheme: %v", err)
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
format.MaxLength = 0
g := gomega.NewGomegaWithT(t)
// Build initial objects for fake client for this test case
var initialClientObjects []client.Object
if tt.args.opt.dynamoComponentDeployment != nil {
initialClientObjects = append(initialClientObjects, tt.args.opt.dynamoComponentDeployment)
}
if tt.args.opt.dynamoComponent != nil {
initialClientObjects = append(initialClientObjects, tt.args.opt.dynamoComponent)
}
if len(tt.args.mockServiceAccounts) > 0 {
initialClientObjects = append(initialClientObjects, tt.args.mockServiceAccounts...)
}
fakeKubeClient := fake.NewClientBuilder().
WithScheme(s).
WithObjects(initialClientObjects...).
Build()
r := &DynamoComponentDeploymentReconciler{
Client: fakeKubeClient, // Use the fake client
Recorder: tt.fields.Recorder,
Config: tt.fields.Config,
NatsAddr: tt.fields.NatsAddr,
EtcdAddr: tt.fields.EtcdAddr,
EtcdStorage: tt.fields.EtcdStorage,
UseVirtualService: tt.fields.UseVirtualService,
// Scheme: s, // Pass scheme if reconciler uses it directly, often client uses it
}
got, got1, err := r.generateLeaderWorkerSet(tt.args.ctx, tt.args.opt)
if (err != nil) != tt.wantErr {
t.Errorf("DynamoComponentDeploymentReconciler.generateLeaderWorkerSet() error = %v, wantErr %v", err, tt.wantErr)
return
}
if diff := cmp.Diff(tt.want, got); diff != "" {
t.Errorf("Mismatch (-expected +actual):\n%s", diff)
}
// Use gomega.Equal for deep comparison of complex structs
g.Expect(got).To(gomega.BeEquivalentTo(tt.want))
g.Expect(got1).To(gomega.BeEquivalentTo(tt.want1))
})
}
}
......@@ -323,7 +323,9 @@ func overrideWithDynDeploymentConfig(ctx context.Context, dynamoDeploymentCompon
}
componentDynConfig := dynDeploymentConfig[dynamoDeploymentComponent.Spec.ServiceName]
if componentDynConfig != nil {
if componentDynConfig.ServiceArgs != nil && componentDynConfig.ServiceArgs.Workers != nil {
if componentDynConfig.ServiceArgs != nil && componentDynConfig.ServiceArgs.Workers != nil && dynamoDeploymentComponent.Spec.Replicas == nil {
// we only override the replicas if it is not set in the CRD.
// replicas, if set in the CRD, must always be the source of truth.
dynamoDeploymentComponent.Spec.Replicas = componentDynConfig.ServiceArgs.Workers
}
if componentDynConfig.ServiceArgs != nil && componentDynConfig.ServiceArgs.Resources != nil {
......@@ -362,6 +364,9 @@ func overrideWithDynDeploymentConfig(ctx context.Context, dynamoDeploymentCompon
requests.Custom = componentDynConfig.ServiceArgs.Resources.Custom
limits.Custom = componentDynConfig.ServiceArgs.Resources.Custom
}
if err := dynamo.SetLwsAnnotations(componentDynConfig.ServiceArgs, dynamoDeploymentComponent); err != nil {
return err
}
}
}
break
......
......@@ -26,6 +26,7 @@ import (
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/bsm/gomega"
"github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1"
)
......@@ -269,7 +270,7 @@ func Test_overrideWithDynDeploymentConfig(t *testing.T) {
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3, "Resources":{"CPU":"2", "Memory":"2Gi", "GPU":"2"}}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: &[]int32{1}[0],
Replicas: nil,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
......@@ -309,6 +310,64 @@ func Test_overrideWithDynDeploymentConfig(t *testing.T) {
},
},
},
{
name: "override workers and resources with gpusPerNode",
args: args{
ctx: context.Background(),
dynamoDeploymentComponent: &nvidiacomv1alpha1.DynamoComponentDeployment{
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "Frontend",
Envs: []corev1.EnvVar{
{
Name: "DYN_DEPLOYMENT_CONFIG",
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3, "Resources":{"CPU":"2", "Memory":"2Gi", "GPU":"8"}, "total_gpus":16}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: nil,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
Memory: "1Gi",
GPU: "1",
},
},
},
},
},
},
wantErr: false,
expected: &nvidiacomv1alpha1.DynamoComponentDeployment{
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "Frontend",
Envs: []corev1.EnvVar{
{
Name: "DYN_DEPLOYMENT_CONFIG",
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3, "Resources":{"CPU":"2", "Memory":"2Gi", "GPU":"8"}, "total_gpus":16}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: &[]int32{3}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "2",
Memory: "2Gi",
GPU: "8",
},
Limits: &common.ResourceItem{
CPU: "2",
Memory: "2Gi",
GPU: "8",
},
},
Annotations: map[string]string{
"nvidia.com/deployment-type": "leader-worker",
"nvidia.com/lws-size": "2",
},
},
},
},
},
{
name: "override subset of resources",
args: args{
......@@ -323,7 +382,7 @@ func Test_overrideWithDynDeploymentConfig(t *testing.T) {
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3, "Resources":{"GPU":"2"}}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: &[]int32{1}[0],
Replicas: nil,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
......@@ -363,14 +422,64 @@ func Test_overrideWithDynDeploymentConfig(t *testing.T) {
},
},
},
{
name: "do not override replicas if explicitly set in the CRD !",
args: args{
ctx: context.Background(),
dynamoDeploymentComponent: &nvidiacomv1alpha1.DynamoComponentDeployment{
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "Frontend",
Envs: []corev1.EnvVar{
{
Name: "DYN_DEPLOYMENT_CONFIG",
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: &[]int32{1}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
Memory: "1Gi",
GPU: "1",
},
},
},
},
},
},
wantErr: false,
expected: &nvidiacomv1alpha1.DynamoComponentDeployment{
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "Frontend",
Envs: []corev1.EnvVar{
{
Name: "DYN_DEPLOYMENT_CONFIG",
Value: `{"Frontend":{"port":8080,"ServiceArgs":{"Workers":3}},"Planner":{"environment":"kubernetes"}}`,
},
},
Replicas: &[]int32{1}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
Memory: "1Gi",
GPU: "1",
},
},
},
},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
g := gomega.NewGomegaWithT(t)
if err := overrideWithDynDeploymentConfig(tt.args.ctx, tt.args.dynamoDeploymentComponent); (err != nil) != tt.wantErr {
t.Errorf("overrideWithDynDeploymentConfig() error = %v, wantErr %v", err, tt.wantErr)
}
g.Expect(tt.args.dynamoDeploymentComponent).To(gomega.Equal(tt.expected))
if diff := cmp.Diff(tt.args.dynamoDeploymentComponent, tt.expected); diff != "" {
t.Errorf("overrideWithDynDeploymentConfig() mismatch (-want +got):\n%s", diff)
}
})
}
}
......@@ -32,6 +32,7 @@ type Config struct {
RestrictedNamespace string
// If true, assume VirtualService endpoints are HTTPS
VirtualServiceSupportsHTTPS bool
EnableLWS bool
}
func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
......
......@@ -24,6 +24,7 @@ import (
"fmt"
"io"
"net/http"
"strconv"
"strings"
"emperror.dev/errors"
......@@ -80,6 +81,7 @@ type Config struct {
HttpExposed bool `yaml:"http_exposed,omitempty"`
ApiEndpoints []string `yaml:"api_endpoints,omitempty"`
Workers *int32 `yaml:"workers,omitempty"`
TotalGpus *int32 `yaml:"total_gpus,omitempty"`
}
type ServiceConfig struct {
......@@ -99,6 +101,7 @@ type DynDeploymentServiceConfig struct {
// ServiceArgs carries the per-service arguments decoded from JSON. All fields
// are optional pointers so that "not provided" can be told apart from a zero
// value.
type ServiceArgs struct {
// Workers is the requested number of workers for the service.
Workers *int32 `json:"workers,omitempty"`
// Resources holds the resource settings of the service; SetLwsAnnotations
// reads its GPU value as the number of GPUs per node.
Resources *Resources `json:"resources,omitempty"`
// TotalGpus is the total number of GPUs for the service; SetLwsAnnotations
// divides it by the per-node GPU count to derive the LWS size.
TotalGpus *int32 `json:"total_gpus,omitempty"`
}
func (s ServiceConfig) GetNamespace() *string {
......@@ -253,6 +256,31 @@ func GetDynamoGraphConfig(ctx context.Context, dynamoDeployment *v1alpha1.Dynamo
return ParseDynamoGraphConfig(ctx, yamlContent)
}
// SetLwsAnnotations marks deployment as a leader-worker deployment when the
// GPUs described by serviceArgs do not fit on a single node.
//
// serviceArgs.Resources.GPU is read as the number of GPUs per node and
// serviceArgs.TotalGpus as the total number of GPUs. The LWS size is
// ceil(TotalGpus / GPUs per node); when it is greater than one, the
// "nvidia.com/lws-size" and "nvidia.com/deployment-type" annotations are
// written to deployment.Spec.Annotations, allocating the map if necessary.
//
// The deployment is left untouched and nil is returned when serviceArgs or
// deployment is nil, when either GPU figure is missing, empty or not positive,
// or when a single node is enough.
//
// An error is returned only when the per-node GPU value is not an integer.
func SetLwsAnnotations(serviceArgs *ServiceArgs, deployment *v1alpha1.DynamoComponentDeployment) error {
	if serviceArgs == nil || deployment == nil {
		return nil
	}
	resources := serviceArgs.Resources
	if resources == nil || resources.GPU == nil || serviceArgs.TotalGpus == nil {
		return nil
	}

	gpusPerNodeStr := *resources.GPU
	totalGpus := int(*serviceArgs.TotalGpus)
	if gpusPerNodeStr == "" || gpusPerNodeStr == "0" || totalGpus <= 0 {
		return nil
	}

	gpusPerNode, err := strconv.Atoi(gpusPerNodeStr)
	if err != nil {
		return fmt.Errorf("failed to parse GPUs per node value '%s' for service %s: %w", gpusPerNodeStr, deployment.Spec.ServiceName, err)
	}
	// Spellings such as "00" or "+0" pass the string check above but parse to
	// zero; dividing by that would panic. Non-positive counts mean "no LWS".
	if gpusPerNode <= 0 {
		return nil
	}

	// Ceiling division so that enough nodes are reserved for all GPUs. Since
	// totalGpus >= 1 this form cannot overflow, unlike (total+perNode-1)/perNode.
	lwsSize := (totalGpus-1)/gpusPerNode + 1
	if lwsSize <= 1 {
		return nil
	}

	if deployment.Spec.Annotations == nil {
		deployment.Spec.Annotations = make(map[string]string)
	}
	deployment.Spec.Annotations["nvidia.com/lws-size"] = strconv.Itoa(lwsSize)
	deployment.Spec.Annotations["nvidia.com/deployment-type"] = "leader-worker"
	return nil
}
// GenerateDynamoComponentsDeployments generates a map of DynamoComponentDeployments from a DynamoGraphConfig
func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphDeployment *v1alpha1.DynamoGraphDeployment, config *DynamoGraphConfig, ingressSpec *v1alpha1.IngressSpec) (map[string]*v1alpha1.DynamoComponentDeployment, error) {
dynamoServices := make(map[string]string)
......@@ -321,6 +349,15 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD
deployment.Spec.Resources.Requests.GPU = *service.Config.Resources.GPU
deployment.Spec.Resources.Limits.GPU = *service.Config.Resources.GPU
}
serviceArgs := ServiceArgs{
Resources: service.Config.Resources,
TotalGpus: service.Config.TotalGpus,
Workers: service.Config.Workers,
}
if err := SetLwsAnnotations(&serviceArgs, deployment); err != nil {
return nil, err
}
}
deployment.Spec.Autoscaling = &v1alpha1.Autoscaling{
Enabled: false,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment