Unverified Commit ee3a8e42 authored by julienmancuso's avatar julienmancuso Committed by GitHub
Browse files

feat: add initial Grove support (#2012)

parent 19a77ae7
...@@ -148,6 +148,12 @@ spec: ...@@ -148,6 +148,12 @@ spec:
stabilizationWindowSeconds: stabilizationWindowSeconds:
format: int32 format: int32
type: integer type: integer
tolerance:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object type: object
scaleUp: scaleUp:
properties: properties:
...@@ -174,6 +180,12 @@ spec: ...@@ -174,6 +180,12 @@ spec:
stabilizationWindowSeconds: stabilizationWindowSeconds:
format: int32 format: int32
type: integer type: integer
tolerance:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object type: object
type: object type: object
enabled: enabled:
...@@ -1218,6 +1230,8 @@ spec: ...@@ -1218,6 +1230,8 @@ spec:
- port - port
type: object type: object
type: object type: object
stopSignal:
type: string
type: object type: object
livenessProbe: livenessProbe:
properties: properties:
...@@ -1897,6 +1911,8 @@ spec: ...@@ -1897,6 +1911,8 @@ spec:
- port - port
type: object type: object
type: object type: object
stopSignal:
type: string
type: object type: object
livenessProbe: livenessProbe:
properties: properties:
......
...@@ -98,6 +98,18 @@ rules: ...@@ -98,6 +98,18 @@ rules:
- patch - patch
- update - update
- watch - watch
- apiGroups:
- grove.io
resources:
- podgangsets
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups: - apiGroups:
- leaderworkerset.x-k8s.io - leaderworkerset.x-k8s.io
resources: resources:
......
...@@ -6,27 +6,29 @@ toolchain go1.24.3 ...@@ -6,27 +6,29 @@ toolchain go1.24.3
require ( require (
emperror.dev/errors v0.8.1 emperror.dev/errors v0.8.1
github.com/NVIDIA/grove/operator/api v0.0.0-20250717114148-daac6e53774f
github.com/bsm/gomega v1.27.10 github.com/bsm/gomega v1.27.10
github.com/google/go-cmp v0.7.0 github.com/google/go-cmp v0.7.0
github.com/imdario/mergo v0.3.6 github.com/imdario/mergo v0.3.6
github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/ginkgo/v2 v2.23.4
github.com/onsi/gomega v1.37.0 github.com/onsi/gomega v1.37.0
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2 github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2
go.etcd.io/etcd/client/v3 v3.5.16 go.etcd.io/etcd/client/v3 v3.5.21
istio.io/api v1.23.1 istio.io/api v1.23.1
istio.io/client-go v1.23.1 istio.io/client-go v1.23.1
k8s.io/api v0.32.3 k8s.io/api v0.33.3
k8s.io/apiextensions-apiserver v0.32.3 k8s.io/apiextensions-apiserver v0.33.3
k8s.io/apimachinery v0.32.3 k8s.io/apimachinery v0.33.3
k8s.io/client-go v0.32.3 k8s.io/client-go v0.33.3
k8s.io/utils v0.0.0-20241210054802-24370beab758 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979
sigs.k8s.io/controller-runtime v0.20.4 sigs.k8s.io/controller-runtime v0.21.0
sigs.k8s.io/lws v0.6.1 sigs.k8s.io/lws v0.6.1
volcano.sh/apis v1.11.0 volcano.sh/apis v1.11.0
) )
require ( require (
github.com/beorn7/perks v1.0.1 // indirect github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/coreos/go-semver v0.3.1 // indirect github.com/coreos/go-semver v0.3.1 // indirect
github.com/coreos/go-systemd/v22 v22.5.0 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect
...@@ -45,28 +47,24 @@ require ( ...@@ -45,28 +47,24 @@ require (
github.com/gogo/protobuf v1.3.2 // indirect github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.4 // indirect github.com/golang/protobuf v1.5.4 // indirect
github.com/google/btree v1.1.3 // indirect github.com/google/btree v1.1.3 // indirect
github.com/google/gnostic-models v0.6.8 // indirect github.com/google/gnostic-models v0.6.9 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect
github.com/google/uuid v1.6.0 // indirect github.com/google/uuid v1.6.0 // indirect
github.com/josharian/intern v1.0.0 // indirect github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.18.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect github.com/mailru/easyjson v0.7.7 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.20.2 // indirect github.com/prometheus/client_golang v1.22.0 // indirect
github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect github.com/prometheus/common v0.62.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect github.com/prometheus/procfs v0.15.1 // indirect
github.com/rogpeppe/go-internal v1.13.1 // indirect
github.com/spf13/pflag v1.0.6 // indirect github.com/spf13/pflag v1.0.6 // indirect
github.com/stretchr/testify v1.10.0 // indirect
github.com/x448/float16 v0.8.4 // indirect github.com/x448/float16 v0.8.4 // indirect
go.etcd.io/etcd/api/v3 v3.5.16 // indirect go.etcd.io/etcd/api/v3 v3.5.21 // indirect
go.etcd.io/etcd/client/pkg/v3 v3.5.16 // indirect go.etcd.io/etcd/client/pkg/v3 v3.5.21 // indirect
go.opentelemetry.io/otel v1.36.0 // indirect go.opentelemetry.io/otel v1.36.0 // indirect
go.opentelemetry.io/otel/sdk v1.36.0 // indirect go.opentelemetry.io/otel/sdk v1.36.0 // indirect
go.opentelemetry.io/otel/sdk/metric v1.35.0 // indirect go.opentelemetry.io/otel/sdk/metric v1.35.0 // indirect
...@@ -79,7 +77,7 @@ require ( ...@@ -79,7 +77,7 @@ require (
golang.org/x/sys v0.33.0 // indirect golang.org/x/sys v0.33.0 // indirect
golang.org/x/term v0.32.0 // indirect golang.org/x/term v0.32.0 // indirect
golang.org/x/text v0.25.0 // indirect golang.org/x/text v0.25.0 // indirect
golang.org/x/time v0.7.0 // indirect golang.org/x/time v0.9.0 // indirect
golang.org/x/tools v0.33.0 // indirect golang.org/x/tools v0.33.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250519155744-55703ea1f237 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20250519155744-55703ea1f237 // indirect
...@@ -90,8 +88,9 @@ require ( ...@@ -90,8 +88,9 @@ require (
gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect sigs.k8s.io/yaml v1.4.0 // indirect
) )
emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0= emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE= emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE=
github.com/NVIDIA/grove/operator/api v0.0.0-20250717114148-daac6e53774f h1:2ePSNDm7/Tep8F99yCQVH8/vmn86L1cUzTbVlyNopmQ=
github.com/NVIDIA/grove/operator/api v0.0.0-20250717114148-daac6e53774f/go.mod h1:nJL33lsBe+9xCcZLYkNYg1wucE4hJfa4ZfHm1zamuG0=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
...@@ -45,8 +49,8 @@ github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek ...@@ -45,8 +49,8 @@ github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg=
github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw=
github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
...@@ -95,12 +99,12 @@ github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4 ...@@ -95,12 +99,12 @@ github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4
github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U=
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2 h1:HZdPRm0ApWPg7F4sHgbqWkL+ddWfpTZsopm5HM/2g4o= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2 h1:HZdPRm0ApWPg7F4sHgbqWkL+ddWfpTZsopm5HM/2g4o=
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2/go.mod h1:3RiUkFmR9kmPZi9r/8a5jw0a9yg+LMmr7qa0wjqvSiI= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2/go.mod h1:3RiUkFmR9kmPZi9r/8a5jw0a9yg+LMmr7qa0wjqvSiI=
github.com/prometheus/client_golang v1.20.2 h1:5ctymQzZlyOON1666svgwn3s6IKWgfbjsejTMiXIyjg= github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q=
github.com/prometheus/client_golang v1.20.2/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0=
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io=
github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
...@@ -108,6 +112,8 @@ github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWN ...@@ -108,6 +112,8 @@ github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWN
github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o=
github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
...@@ -115,12 +121,12 @@ github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= ...@@ -115,12 +121,12 @@ github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.etcd.io/etcd/api/v3 v3.5.16 h1:WvmyJVbjWqK4R1E+B12RRHz3bRGy9XVfh++MgbN+6n0= go.etcd.io/etcd/api/v3 v3.5.21 h1:A6O2/JDb3tvHhiIz3xf9nJ7REHvtEFJJ3veW3FbCnS8=
go.etcd.io/etcd/api/v3 v3.5.16/go.mod h1:1P4SlIP/VwkDmGo3OlOD7faPeP8KDIFhqvciH5EfN28= go.etcd.io/etcd/api/v3 v3.5.21/go.mod h1:c3aH5wcvXv/9dqIw2Y810LDXJfhSYdHQ0vxmP3CCHVY=
go.etcd.io/etcd/client/pkg/v3 v3.5.16 h1:ZgY48uH6UvB+/7R9Yf4x574uCO3jIx0TRDyetSfId3Q= go.etcd.io/etcd/client/pkg/v3 v3.5.21 h1:lPBu71Y7osQmzlflM9OfeIV2JlmpBjqBNlLtcoBqUTc=
go.etcd.io/etcd/client/pkg/v3 v3.5.16/go.mod h1:V8acl8pcEK0Y2g19YlOV9m9ssUe6MgiDSobSoaBAM0E= go.etcd.io/etcd/client/pkg/v3 v3.5.21/go.mod h1:BgqT/IXPjK9NkeSDjbzwsHySX3yIle2+ndz28nVsjUs=
go.etcd.io/etcd/client/v3 v3.5.16 h1:sSmVYOAHeC9doqi0gv7v86oY/BTld0SEFGaxsU9eRhE= go.etcd.io/etcd/client/v3 v3.5.21 h1:T6b1Ow6fNjOLOtM0xSoKNQt1ASPCLWrF9XMHcH9pEyY=
go.etcd.io/etcd/client/v3 v3.5.16/go.mod h1:X+rExSGkyqxvu276cr2OwPLBaeqFu1cIl4vmRjAD/50= go.etcd.io/etcd/client/v3 v3.5.21/go.mod h1:mFYy67IOqmbRf/kRUvsHixzo3iG+1OF2W2+jVIQRAnU=
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
go.opentelemetry.io/otel v1.36.0 h1:UumtzIklRBY6cI/lllNZlALOF5nNIzJVb16APdvgTXg= go.opentelemetry.io/otel v1.36.0 h1:UumtzIklRBY6cI/lllNZlALOF5nNIzJVb16APdvgTXg=
...@@ -172,8 +178,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= ...@@ -172,8 +178,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4=
golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA=
golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY=
golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
...@@ -209,28 +215,29 @@ istio.io/api v1.23.1 h1:bm2XF0j058FfzWVHUfpmMj4sFDkcD1X609qs5AU97Pc= ...@@ -209,28 +215,29 @@ istio.io/api v1.23.1 h1:bm2XF0j058FfzWVHUfpmMj4sFDkcD1X609qs5AU97Pc=
istio.io/api v1.23.1/go.mod h1:QPSTGXuIQdnZFEm3myf9NZ5uBMwCdJWUvfj9ZZ+2oBM= istio.io/api v1.23.1/go.mod h1:QPSTGXuIQdnZFEm3myf9NZ5uBMwCdJWUvfj9ZZ+2oBM=
istio.io/client-go v1.23.1 h1:IX2cgUUXnVYo+9H6bFGSp/vuKVLPUkmiN8qk1/mvsYs= istio.io/client-go v1.23.1 h1:IX2cgUUXnVYo+9H6bFGSp/vuKVLPUkmiN8qk1/mvsYs=
istio.io/client-go v1.23.1/go.mod h1:+fxu+O2GkITM3HEREUWdobvRXqI/UhAAI7hfxqqpRh0= istio.io/client-go v1.23.1/go.mod h1:+fxu+O2GkITM3HEREUWdobvRXqI/UhAAI7hfxqqpRh0=
k8s.io/api v0.32.3 h1:Hw7KqxRusq+6QSplE3NYG4MBxZw1BZnq4aP4cJVINls= k8s.io/api v0.33.3 h1:SRd5t//hhkI1buzxb288fy2xvjubstenEKL9K51KBI8=
k8s.io/api v0.32.3/go.mod h1:2wEDTXADtm/HA7CCMD8D8bK4yuBUptzaRhYcYEEYA3k= k8s.io/api v0.33.3/go.mod h1:01Y/iLUjNBM3TAvypct7DIj0M0NIZc+PzAHCIo0CYGE=
k8s.io/apiextensions-apiserver v0.32.3 h1:4D8vy+9GWerlErCwVIbcQjsWunF9SUGNu7O7hiQTyPY= k8s.io/apiextensions-apiserver v0.33.3 h1:qmOcAHN6DjfD0v9kxL5udB27SRP6SG/MTopmge3MwEs=
k8s.io/apiextensions-apiserver v0.32.3/go.mod h1:8YwcvVRMVzw0r1Stc7XfGAzB/SIVLunqApySV5V7Dss= k8s.io/apiextensions-apiserver v0.33.3/go.mod h1:oROuctgo27mUsyp9+Obahos6CWcMISSAPzQ77CAQGz8=
k8s.io/apimachinery v0.32.3 h1:JmDuDarhDmA/Li7j3aPrwhpNBA94Nvk5zLeOge9HH1U= k8s.io/apimachinery v0.33.3 h1:4ZSrmNa0c/ZpZJhAgRdcsFcZOw1PQU1bALVQ0B3I5LA=
k8s.io/apimachinery v0.32.3/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= k8s.io/apimachinery v0.33.3/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM=
k8s.io/client-go v0.32.3 h1:RKPVltzopkSgHS7aS98QdscAgtgah/+zmpAogooIqVU= k8s.io/client-go v0.33.3 h1:M5AfDnKfYmVJif92ngN532gFqakcGi6RvaOF16efrpA=
k8s.io/client-go v0.32.3/go.mod h1:3v0+3k4IcT9bXTc4V2rt+d2ZPPG700Xy6Oi0Gdl2PaY= k8s.io/client-go v0.33.3/go.mod h1:luqKBQggEf3shbxHY4uVENAxrDISLOarxpTKMiUuujg=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y= k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4=
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4= k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8=
k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJJI8IUa1AmH/qa0= k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97RvvF3a8J3fP/Lg=
k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/controller-runtime v0.20.4 h1:X3c+Odnxz+iPTRobG4tp092+CvBU9UK0t/bRf+n0DGU= sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8=
sigs.k8s.io/controller-runtime v0.20.4/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY= sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
sigs.k8s.io/lws v0.6.1 h1:cWiRmMSflo8hQPBrmIIZtoaX3XuVkmAgFKkmjxlPULI= sigs.k8s.io/lws v0.6.1 h1:cWiRmMSflo8hQPBrmIIZtoaX3XuVkmAgFKkmjxlPULI=
sigs.k8s.io/lws v0.6.1/go.mod h1:aoT5ROMriBtN/H8JH0POBF6e2uyFCOxKGKtXSA3DVV8= sigs.k8s.io/lws v0.6.1/go.mod h1:aoT5ROMriBtN/H8JH0POBF6e2uyFCOxKGKtXSA3DVV8=
sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016 h1:kXv6kKdoEtedwuqMmkqhbkgvYKeycVbC8+iPCP9j5kQ=
sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=
sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI= sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI=
sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps=
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
......
...@@ -17,6 +17,8 @@ const ( ...@@ -17,6 +17,8 @@ const (
KubeLabelDynamoSelector = "nvidia.com/selector" KubeLabelDynamoSelector = "nvidia.com/selector"
KubeAnnotationEnableGrove = "nvidia.com/enable-grove"
KubeLabelDynamoComponent = "nvidia.com/dynamo-component" KubeLabelDynamoComponent = "nvidia.com/dynamo-component"
KubeLabelDynamoNamespace = "nvidia.com/dynamo-namespace" KubeLabelDynamoNamespace = "nvidia.com/dynamo-namespace"
KubeLabelDynamoDeploymentTargetType = "nvidia.com/dynamo-deployment-target-type" KubeLabelDynamoDeploymentTargetType = "nvidia.com/dynamo-deployment-target-type"
...@@ -33,4 +35,6 @@ const ( ...@@ -33,4 +35,6 @@ const (
ComponentTypePlanner = "planner" ComponentTypePlanner = "planner"
ComponentTypeMain = "main" ComponentTypeMain = "main"
PlannerServiceAccountName = "planner-serviceaccount" PlannerServiceAccountName = "planner-serviceaccount"
DefaultIngressSuffix = "local"
) )
...@@ -18,8 +18,6 @@ ...@@ -18,8 +18,6 @@
package controller package controller
import ( import (
"fmt"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
...@@ -51,18 +49,6 @@ func getPvcName(crd metav1.Object, defaultName *string) string { ...@@ -51,18 +49,6 @@ func getPvcName(crd metav1.Object, defaultName *string) string {
return crd.GetName() return crd.GetName()
} }
func getIngressHost(ingressSpec v1alpha1.IngressSpec) string {
host := ingressSpec.Host
if ingressSpec.HostPrefix != nil {
host = *ingressSpec.HostPrefix + host
}
ingressSuffix := DefaultIngressSuffix
if ingressSpec.HostSuffix != nil {
ingressSuffix = *ingressSpec.HostSuffix
}
return fmt.Sprintf("%s.%s", host, ingressSuffix)
}
type dockerSecretRetriever interface { type dockerSecretRetriever interface {
// returns a list of secret names associated with the docker registry // returns a list of secret names associated with the docker registry
GetSecrets(namespace, registry string) ([]string, error) GetSecrets(namespace, registry string) ([]string, error)
......
...@@ -24,7 +24,6 @@ import ( ...@@ -24,7 +24,6 @@ import (
"fmt" "fmt"
"os" "os"
"strconv" "strconv"
"strings"
"time" "time"
"github.com/imdario/mergo" "github.com/imdario/mergo"
...@@ -41,7 +40,7 @@ import ( ...@@ -41,7 +40,7 @@ import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common" commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
istioNetworking "istio.io/api/networking/v1beta1" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/dynamo"
networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1" networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
k8serrors "k8s.io/apimachinery/pkg/api/errors" k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/meta"
...@@ -73,7 +72,6 @@ const ( ...@@ -73,7 +72,6 @@ const (
DeploymentTargetTypeProduction = "production" DeploymentTargetTypeProduction = "production"
DeploymentTargetTypeDebug = "debug" DeploymentTargetTypeDebug = "debug"
HeaderNameDebug = "X-Nvidia-Debug" HeaderNameDebug = "X-Nvidia-Debug"
DefaultIngressSuffix = "local"
KubernetesDeploymentStrategy = "kubernetes" KubernetesDeploymentStrategy = "kubernetes"
KubeAnnotationDeploymentType = "nvidia.com/deployment-type" KubeAnnotationDeploymentType = "nvidia.com/deployment-type"
...@@ -88,10 +86,7 @@ type DynamoComponentDeploymentReconciler struct { ...@@ -88,10 +86,7 @@ type DynamoComponentDeploymentReconciler struct {
client.Client client.Client
Recorder record.EventRecorder Recorder record.EventRecorder
Config controller_common.Config Config controller_common.Config
NatsAddr string
EtcdAddr string
EtcdStorage etcdStorage EtcdStorage etcdStorage
UseVirtualService bool
DockerSecretRetriever dockerSecretRetriever DockerSecretRetriever dockerSecretRetriever
} }
...@@ -952,7 +947,7 @@ func (r *DynamoComponentDeploymentReconciler) createOrUpdateOrDeleteIngress(ctx ...@@ -952,7 +947,7 @@ func (r *DynamoComponentDeploymentReconciler) createOrUpdateOrDeleteIngress(ctx
if err != nil { if err != nil {
return false, err return false, err
} }
if r.UseVirtualService { if r.Config.IngressConfig.UseVirtualService() {
modified_, _, err := commonController.SyncResource(ctx, r, opt.dynamoComponentDeployment, func(ctx context.Context) (*networkingv1beta1.VirtualService, bool, error) { modified_, _, err := commonController.SyncResource(ctx, r, opt.dynamoComponentDeployment, func(ctx context.Context) (*networkingv1beta1.VirtualService, bool, error) {
return r.generateVirtualService(ctx, opt) return r.generateVirtualService(ctx, opt)
}) })
...@@ -975,49 +970,11 @@ func (r *DynamoComponentDeploymentReconciler) generateIngress(ctx context.Contex ...@@ -975,49 +970,11 @@ func (r *DynamoComponentDeploymentReconciler) generateIngress(ctx context.Contex
}, },
} }
if !opt.dynamoComponentDeployment.Spec.Ingress.Enabled || opt.dynamoComponentDeployment.Spec.Ingress.IngressControllerClassName == nil { if opt.dynamoComponentDeployment.Spec.Ingress == nil || !opt.dynamoComponentDeployment.Spec.Ingress.Enabled || opt.dynamoComponentDeployment.Spec.Ingress.IngressControllerClassName == nil {
log.Info("Ingress is not enabled") log.Info("Ingress is not enabled")
return ingress, true, nil return ingress, true, nil
} }
host := getIngressHost(opt.dynamoComponentDeployment.Spec.Ingress) return dynamo.GenerateComponentIngress(ctx, opt.dynamoComponentDeployment.Name, opt.dynamoComponentDeployment.Namespace, *opt.dynamoComponentDeployment.Spec.Ingress), false, nil
ingress.Spec = networkingv1.IngressSpec{
IngressClassName: opt.dynamoComponentDeployment.Spec.Ingress.IngressControllerClassName,
Rules: []networkingv1.IngressRule{
{
Host: host,
IngressRuleValue: networkingv1.IngressRuleValue{
HTTP: &networkingv1.HTTPIngressRuleValue{
Paths: []networkingv1.HTTPIngressPath{
{
Path: "/",
PathType: &[]networkingv1.PathType{networkingv1.PathTypePrefix}[0],
Backend: networkingv1.IngressBackend{
Service: &networkingv1.IngressServiceBackend{
Name: opt.dynamoComponentDeployment.Name,
Port: networkingv1.ServiceBackendPort{
Number: commonconsts.DynamoServicePort,
},
},
},
},
},
},
},
},
},
}
if opt.dynamoComponentDeployment.Spec.Ingress.TLS != nil {
ingress.Spec.TLS = []networkingv1.IngressTLS{
{
Hosts: []string{host},
SecretName: opt.dynamoComponentDeployment.Spec.Ingress.TLS.SecretName,
},
}
}
return ingress, false, nil
} }
func (r *DynamoComponentDeploymentReconciler) generateVirtualService(ctx context.Context, opt generateResourceOption) (*networkingv1beta1.VirtualService, bool, error) { func (r *DynamoComponentDeploymentReconciler) generateVirtualService(ctx context.Context, opt generateResourceOption) (*networkingv1beta1.VirtualService, bool, error) {
...@@ -1031,40 +988,12 @@ func (r *DynamoComponentDeploymentReconciler) generateVirtualService(ctx context ...@@ -1031,40 +988,12 @@ func (r *DynamoComponentDeploymentReconciler) generateVirtualService(ctx context
}, },
} }
vsEnabled := opt.dynamoComponentDeployment.Spec.Ingress.Enabled && opt.dynamoComponentDeployment.Spec.Ingress.UseVirtualService && opt.dynamoComponentDeployment.Spec.Ingress.VirtualServiceGateway != nil vsEnabled := opt.dynamoComponentDeployment.Spec.Ingress != nil && opt.dynamoComponentDeployment.Spec.Ingress.Enabled && opt.dynamoComponentDeployment.Spec.Ingress.UseVirtualService && opt.dynamoComponentDeployment.Spec.Ingress.VirtualServiceGateway != nil
if !vsEnabled { if !vsEnabled {
log.Info("VirtualService is not enabled") log.Info("VirtualService is not enabled")
return vs, true, nil return vs, true, nil
} }
return dynamo.GenerateComponentVirtualService(ctx, opt.dynamoComponentDeployment.Name, opt.dynamoComponentDeployment.Namespace, *opt.dynamoComponentDeployment.Spec.Ingress), false, nil
vs.Spec = istioNetworking.VirtualService{
Hosts: []string{
getIngressHost(opt.dynamoComponentDeployment.Spec.Ingress),
},
Gateways: []string{*opt.dynamoComponentDeployment.Spec.Ingress.VirtualServiceGateway},
Http: []*istioNetworking.HTTPRoute{
{
Match: []*istioNetworking.HTTPMatchRequest{
{
Uri: &istioNetworking.StringMatch{
MatchType: &istioNetworking.StringMatch_Prefix{Prefix: "/"},
},
},
},
Route: []*istioNetworking.HTTPRouteDestination{
{
Destination: &istioNetworking.Destination{
Host: opt.dynamoComponentDeployment.Name,
Port: &istioNetworking.PortSelector{
Number: commonconsts.DynamoServicePort,
},
},
},
},
},
},
}
return vs, false, nil
} }
func (r *DynamoComponentDeploymentReconciler) getKubeName(dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment, debug bool) string { func (r *DynamoComponentDeploymentReconciler) getKubeName(dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment, debug bool) string {
...@@ -1274,7 +1203,6 @@ func (r *DynamoComponentDeploymentReconciler) generateHPA(opt generateResourceOp ...@@ -1274,7 +1203,6 @@ func (r *DynamoComponentDeploymentReconciler) generateHPA(opt generateResourceOp
//nolint:gocyclo,nakedret //nolint:gocyclo,nakedret
func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx context.Context, opt generateResourceOption) (podTemplateSpec *corev1.PodTemplateSpec, err error) { func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx context.Context, opt generateResourceOption) (podTemplateSpec *corev1.PodTemplateSpec, err error) {
logs := log.FromContext(ctx)
podLabels := r.getKubeLabels(opt.dynamoComponentDeployment) podLabels := r.getKubeLabels(opt.dynamoComponentDeployment)
if opt.isStealingTrafficDebugModeEnabled { if opt.isStealingTrafficDebugModeEnabled {
podLabels[commonconsts.KubeLabelDynamoDeploymentTargetType] = DeploymentTargetTypeDebug podLabels[commonconsts.KubeLabelDynamoDeploymentTargetType] = DeploymentTargetTypeDebug
...@@ -1333,17 +1261,17 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex ...@@ -1333,17 +1261,17 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
}, },
} }
if r.NatsAddr != "" { if r.Config.NatsAddress != "" {
defaultEnvs = append(defaultEnvs, corev1.EnvVar{ defaultEnvs = append(defaultEnvs, corev1.EnvVar{
Name: "NATS_SERVER", Name: "NATS_SERVER",
Value: r.NatsAddr, Value: r.Config.NatsAddress,
}) })
} }
if r.EtcdAddr != "" { if r.Config.EtcdAddress != "" {
defaultEnvs = append(defaultEnvs, corev1.EnvVar{ defaultEnvs = append(defaultEnvs, corev1.EnvVar{
Name: "ETCD_ENDPOINTS", Name: "ETCD_ENDPOINTS",
Value: r.EtcdAddr, Value: r.Config.EtcdAddress,
}) })
} }
...@@ -1366,34 +1294,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex ...@@ -1366,34 +1294,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
volumes := make([]corev1.Volume, 0) volumes := make([]corev1.Volume, 0)
volumeMounts := make([]corev1.VolumeMount, 0) volumeMounts := make([]corev1.VolumeMount, 0)
args := make([]string, 0)
args = append(args, "cd", "src", "&&", "uv", "run", "dynamo", "serve")
// ensure liveness and readiness probes are enabled for the dynamo components
args = append(args, "--system-app-port", fmt.Sprintf("%d", commonconsts.DynamoHealthPort))
args = append(args, "--enable-system-app")
args = append(args, "--use-default-health-checks")
if opt.dynamoComponentDeployment.Spec.ServiceName != "" {
args = append(args, []string{"--service-name", opt.dynamoComponentDeployment.Spec.ServiceName}...)
args = append(args, opt.dynamoComponentDeployment.Spec.DynamoTag)
if opt.dynamoComponentDeployment.Spec.DynamoNamespace != nil && *opt.dynamoComponentDeployment.Spec.DynamoNamespace != "" {
args = append(args, fmt.Sprintf("--%s.ServiceArgs.dynamo.namespace=%s", opt.dynamoComponentDeployment.Spec.ServiceName, *opt.dynamoComponentDeployment.Spec.DynamoNamespace))
}
if componentType, exists := opt.dynamoComponentDeployment.Labels[commonconsts.KubeLabelDynamoComponent]; exists && componentType == ComponentTypePlanner {
args = append(args, fmt.Sprintf("--%s.environment=%s", opt.dynamoComponentDeployment.Spec.ServiceName, KubernetesDeploymentStrategy))
}
}
if len(opt.dynamoComponentDeployment.Spec.Envs) > 0 {
for _, env := range opt.dynamoComponentDeployment.Spec.Envs {
if env.Name == "DYNAMO_CONFIG_PATH" {
args = append(args, "-f", env.Value)
}
}
}
dynamoResources := opt.dynamoComponentDeployment.Spec.Resources dynamoResources := opt.dynamoComponentDeployment.Spec.Resources
resources, err := getResourcesConfig(dynamoResources) resources, err := getResourcesConfig(dynamoResources)
...@@ -1468,8 +1368,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex ...@@ -1468,8 +1368,6 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
container := corev1.Container{ container := corev1.Container{
Name: "main", Name: "main",
Image: imageName, Image: imageName,
Command: []string{"sh", "-c"},
Args: []string{strings.Join(args, " ")},
LivenessProbe: livenessProbe, LivenessProbe: livenessProbe,
ReadinessProbe: readinessProbe, ReadinessProbe: readinessProbe,
Resources: resources, Resources: resources,
...@@ -1566,23 +1464,8 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex ...@@ -1566,23 +1464,8 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
if opt.dynamoComponentDeployment.Spec.ExtraPodSpec != nil { if opt.dynamoComponentDeployment.Spec.ExtraPodSpec != nil {
extraPodSpecMainContainer := opt.dynamoComponentDeployment.Spec.ExtraPodSpec.MainContainer extraPodSpecMainContainer := opt.dynamoComponentDeployment.Spec.ExtraPodSpec.MainContainer
if extraPodSpecMainContainer != nil { if extraPodSpecMainContainer != nil {
if len(extraPodSpecMainContainer.Command) > 0 { // Merge non empty fields from extraPodSpecMainContainer into container, only overriding empty fields
logs.Info("Overriding container '" + container.Name + "' Command with: " + strings.Join(extraPodSpecMainContainer.Command, " ")) err := mergo.Merge(&container, extraPodSpecMainContainer.DeepCopy())
container.Command = extraPodSpecMainContainer.Command
}
if len(extraPodSpecMainContainer.Args) > 0 {
// Special case: if command is "sh -c", we must collapse args into a single string
if len(container.Command) == 2 && container.Command[0] == "sh" && container.Command[1] == "-c" {
joinedArgs := strings.Join(extraPodSpecMainContainer.Args, " ")
logs.Info("Special case detected for container '" + container.Name + "': Command is 'sh -c'; collapsing Args to: " + joinedArgs)
container.Args = []string{joinedArgs}
} else {
logs.Info("Overriding container '" + container.Name + "' Args with: " + strings.Join(extraPodSpecMainContainer.Args, " "))
container.Args = extraPodSpecMainContainer.Args
}
}
// finally, Merge non empty fields from extraPodSpecMainContainer into container, only overriding empty fields
err := mergo.Merge(&container, extraPodSpecMainContainer)
if err != nil { if err != nil {
err = errors.Wrapf(err, "failed to merge extraPodSpecMainContainer into container") err = errors.Wrapf(err, "failed to merge extraPodSpecMainContainer into container")
return nil, err return nil, err
...@@ -1723,7 +1606,7 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex ...@@ -1723,7 +1606,7 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
} }
func getResourcesConfig(resources *dynamoCommon.Resources) (corev1.ResourceRequirements, error) { func getResourcesConfig(resources *dynamoCommon.Resources) (corev1.ResourceRequirements, error) {
currentResources := corev1.ResourceRequirements{ defaultResources := corev1.ResourceRequirements{
Requests: corev1.ResourceList{ Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("300m"), corev1.ResourceCPU: resource.MustParse("300m"),
corev1.ResourceMemory: resource.MustParse("500Mi"), corev1.ResourceMemory: resource.MustParse("500Mi"),
...@@ -1733,86 +1616,18 @@ func getResourcesConfig(resources *dynamoCommon.Resources) (corev1.ResourceRequi ...@@ -1733,86 +1616,18 @@ func getResourcesConfig(resources *dynamoCommon.Resources) (corev1.ResourceRequi
corev1.ResourceMemory: resource.MustParse("1Gi"), corev1.ResourceMemory: resource.MustParse("1Gi"),
}, },
} }
if resources == nil { if resources == nil {
return currentResources, nil return defaultResources, nil
}
if resources.Limits != nil {
if resources.Limits.CPU != "" {
q, err := resource.ParseQuantity(resources.Limits.CPU)
if err != nil {
return currentResources, errors.Wrapf(err, "parse limits cpu quantity")
} }
if currentResources.Limits == nil { resourcesConfig, err := controller_common.GetResourcesConfig(resources)
currentResources.Limits = make(corev1.ResourceList)
}
currentResources.Limits[corev1.ResourceCPU] = q
}
if resources.Limits.Memory != "" {
q, err := resource.ParseQuantity(resources.Limits.Memory)
if err != nil { if err != nil {
return currentResources, errors.Wrapf(err, "parse limits memory quantity") return corev1.ResourceRequirements{}, errors.Wrapf(err, "failed to get resources config")
}
if currentResources.Limits == nil {
currentResources.Limits = make(corev1.ResourceList)
} }
currentResources.Limits[corev1.ResourceMemory] = q err = mergo.Merge(resourcesConfig, defaultResources.DeepCopy())
}
if resources.Limits.GPU != "" {
q, err := resource.ParseQuantity(resources.Limits.GPU)
if err != nil { if err != nil {
return currentResources, errors.Wrapf(err, "parse limits gpu quantity") return corev1.ResourceRequirements{}, errors.Wrapf(err, "failed to merge resources config")
}
if currentResources.Limits == nil {
currentResources.Limits = make(corev1.ResourceList)
}
currentResources.Limits[commonconsts.KubeResourceGPUNvidia] = q
}
for k, v := range resources.Limits.Custom {
q, err := resource.ParseQuantity(v)
if err != nil {
return currentResources, errors.Wrapf(err, "parse limits %s quantity", k)
}
if currentResources.Limits == nil {
currentResources.Limits = make(corev1.ResourceList)
}
currentResources.Limits[corev1.ResourceName(k)] = q
}
}
if resources.Requests != nil {
if resources.Requests.CPU != "" {
q, err := resource.ParseQuantity(resources.Requests.CPU)
if err != nil {
return currentResources, errors.Wrapf(err, "parse requests cpu quantity")
}
if currentResources.Requests == nil {
currentResources.Requests = make(corev1.ResourceList)
}
currentResources.Requests[corev1.ResourceCPU] = q
}
if resources.Requests.Memory != "" {
q, err := resource.ParseQuantity(resources.Requests.Memory)
if err != nil {
return currentResources, errors.Wrapf(err, "parse requests memory quantity")
}
if currentResources.Requests == nil {
currentResources.Requests = make(corev1.ResourceList)
}
currentResources.Requests[corev1.ResourceMemory] = q
}
for k, v := range resources.Requests.Custom {
q, err := resource.ParseQuantity(v)
if err != nil {
return currentResources, errors.Wrapf(err, "parse requests %s quantity", k)
}
if currentResources.Requests == nil {
currentResources.Requests = make(corev1.ResourceList)
}
currentResources.Requests[corev1.ResourceName(k)] = q
}
} }
return currentResources, nil return *resourcesConfig, nil
} }
func (r *DynamoComponentDeploymentReconciler) generateService(opt generateResourceOption) (*corev1.Service, bool, error) { func (r *DynamoComponentDeploymentReconciler) generateService(opt generateResourceOption) (*corev1.Service, bool, error) {
...@@ -1930,7 +1745,7 @@ func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) ...@@ -1930,7 +1745,7 @@ func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager)
})) }))
} }
if r.UseVirtualService { if r.Config.IngressConfig.UseVirtualService() {
m.Owns(&networkingv1beta1.VirtualService{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})) m.Owns(&networkingv1beta1.VirtualService{}, builder.WithPredicates(predicate.GenerationChangedPredicate{}))
} }
m.Owns(&autoscalingv2.HorizontalPodAutoscaler{}) m.Owns(&autoscalingv2.HorizontalPodAutoscaler{})
......
...@@ -278,7 +278,7 @@ func TestDynamoComponentDeploymentReconciler_generateIngress(t *testing.T) { ...@@ -278,7 +278,7 @@ func TestDynamoComponentDeploymentReconciler_generateIngress(t *testing.T) {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{ DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service1", ServiceName: "service1",
DynamoNamespace: &[]string{"default"}[0], DynamoNamespace: &[]string{"default"}[0],
Ingress: v1alpha1.IngressSpec{ Ingress: &v1alpha1.IngressSpec{
Enabled: true, Enabled: true,
Host: "someservice", Host: "someservice",
IngressControllerClassName: &[]string{"nginx"}[0], IngressControllerClassName: &[]string{"nginx"}[0],
...@@ -337,7 +337,7 @@ func TestDynamoComponentDeploymentReconciler_generateIngress(t *testing.T) { ...@@ -337,7 +337,7 @@ func TestDynamoComponentDeploymentReconciler_generateIngress(t *testing.T) {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{ DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service1", ServiceName: "service1",
DynamoNamespace: &[]string{"default"}[0], DynamoNamespace: &[]string{"default"}[0],
Ingress: v1alpha1.IngressSpec{ Ingress: &v1alpha1.IngressSpec{
Enabled: false, Enabled: false,
}, },
}, },
...@@ -400,7 +400,7 @@ func TestDynamoComponentDeploymentReconciler_generateVirtualService(t *testing.T ...@@ -400,7 +400,7 @@ func TestDynamoComponentDeploymentReconciler_generateVirtualService(t *testing.T
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{ DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service1", ServiceName: "service1",
DynamoNamespace: &[]string{"default"}[0], DynamoNamespace: &[]string{"default"}[0],
Ingress: v1alpha1.IngressSpec{ Ingress: &v1alpha1.IngressSpec{
Enabled: true, Enabled: true,
}, },
}, },
...@@ -432,7 +432,7 @@ func TestDynamoComponentDeploymentReconciler_generateVirtualService(t *testing.T ...@@ -432,7 +432,7 @@ func TestDynamoComponentDeploymentReconciler_generateVirtualService(t *testing.T
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{ DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
ServiceName: "service1", ServiceName: "service1",
DynamoNamespace: &[]string{"default"}[0], DynamoNamespace: &[]string{"default"}[0],
Ingress: v1alpha1.IngressSpec{ Ingress: &v1alpha1.IngressSpec{
Enabled: true, Enabled: true,
Host: "someservice", Host: "someservice",
UseVirtualService: true, UseVirtualService: true,
...@@ -498,10 +498,7 @@ func TestDynamoComponentDeploymentReconciler_generateVolcanoPodGroup(t *testing. ...@@ -498,10 +498,7 @@ func TestDynamoComponentDeploymentReconciler_generateVolcanoPodGroup(t *testing.
Client client.Client Client client.Client
Recorder record.EventRecorder Recorder record.EventRecorder
Config controller_common.Config Config controller_common.Config
NatsAddr string
EtcdAddr string
EtcdStorage etcdStorage EtcdStorage etcdStorage
UseVirtualService bool
} }
type args struct { type args struct {
ctx context.Context ctx context.Context
...@@ -758,10 +755,7 @@ func TestDynamoComponentDeploymentReconciler_generateVolcanoPodGroup(t *testing. ...@@ -758,10 +755,7 @@ func TestDynamoComponentDeploymentReconciler_generateVolcanoPodGroup(t *testing.
Client: tt.fields.Client, Client: tt.fields.Client,
Recorder: tt.fields.Recorder, Recorder: tt.fields.Recorder,
Config: tt.fields.Config, Config: tt.fields.Config,
NatsAddr: tt.fields.NatsAddr,
EtcdAddr: tt.fields.EtcdAddr,
EtcdStorage: tt.fields.EtcdStorage, EtcdStorage: tt.fields.EtcdStorage,
UseVirtualService: tt.fields.UseVirtualService,
} }
got, got1, err := r.generateVolcanoPodGroup(tt.args.ctx, tt.args.opt) got, got1, err := r.generateVolcanoPodGroup(tt.args.ctx, tt.args.opt)
if (err != nil) != tt.wantErr { if (err != nil) != tt.wantErr {
...@@ -789,10 +783,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -789,10 +783,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Client client.Client Client client.Client
Recorder record.EventRecorder Recorder record.EventRecorder
Config controller_common.Config Config controller_common.Config
NatsAddr string
EtcdAddr string
EtcdStorage etcdStorage EtcdStorage etcdStorage
UseVirtualService bool
DockerSecretRetriever *mockDockerSecretRetriever DockerSecretRetriever *mockDockerSecretRetriever
} }
type args struct { type args struct {
...@@ -847,6 +838,13 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -847,6 +838,13 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
ExtraPodSpec: &dynamoCommon.ExtraPodSpec{ ExtraPodSpec: &dynamoCommon.ExtraPodSpec{
MainContainer: &corev1.Container{ MainContainer: &corev1.Container{
Image: "test-image:latest", Image: "test-image:latest",
Command: []string{
"sh",
"-c",
},
Args: []string{
"some dynamo command",
},
}, },
}, },
}, },
...@@ -897,7 +895,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -897,7 +895,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Name: "main", Name: "main",
Image: "test-image:latest", Image: "test-image:latest",
Command: []string{"sh", "-c"}, Command: []string{"sh", "-c"},
Args: []string{"ray start --head --port=6379 && cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --use-default-health-checks --service-name test-lws-deploy-service test-tag --test-lws-deploy-service.ServiceArgs.dynamo.namespace=default"}, Args: []string{"ray start --head --port=6379 && some dynamo command"},
Env: []corev1.EnvVar{{Name: "DYNAMO_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort)}}, Env: []corev1.EnvVar{{Name: "DYNAMO_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort)}},
VolumeMounts: []corev1.VolumeMount{ VolumeMounts: []corev1.VolumeMount{
{ {
...@@ -1095,10 +1093,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -1095,10 +1093,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Client: fakeKubeClient, // Use the fake client Client: fakeKubeClient, // Use the fake client
Recorder: tt.fields.Recorder, Recorder: tt.fields.Recorder,
Config: tt.fields.Config, Config: tt.fields.Config,
NatsAddr: tt.fields.NatsAddr,
EtcdAddr: tt.fields.EtcdAddr,
EtcdStorage: tt.fields.EtcdStorage, EtcdStorage: tt.fields.EtcdStorage,
UseVirtualService: tt.fields.UseVirtualService,
DockerSecretRetriever: tt.fields.DockerSecretRetriever, DockerSecretRetriever: tt.fields.DockerSecretRetriever,
// Scheme: s, // Pass scheme if reconciler uses it directly, often client uses it // Scheme: s, // Pass scheme if reconciler uses it directly, often client uses it
} }
......
...@@ -21,6 +21,10 @@ import ( ...@@ -21,6 +21,10 @@ import (
"context" "context"
"fmt" "fmt"
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
corev1 "k8s.io/api/core/v1"
networkingv1 "k8s.io/api/networking/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/record" "k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime" ctrl "sigs.k8s.io/controller-runtime"
...@@ -31,14 +35,19 @@ import ( ...@@ -31,14 +35,19 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/predicate"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common" commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/dynamo" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/dynamo"
) )
type State string
type Reason string
type Message string
const ( const (
FailedState = "failed" FailedState State = "failed"
ReadyState = "successful" ReadyState State = "successful"
PendingState = "pending" PendingState State = "pending"
) )
type etcdStorage interface { type etcdStorage interface {
...@@ -50,15 +59,13 @@ type DynamoGraphDeploymentReconciler struct { ...@@ -50,15 +59,13 @@ type DynamoGraphDeploymentReconciler struct {
client.Client client.Client
Config commonController.Config Config commonController.Config
Recorder record.EventRecorder Recorder record.EventRecorder
VirtualServiceGateway string DockerSecretRetriever dockerSecretRetriever
IngressControllerClassName string
IngressControllerTLSSecret string
IngressHostSuffix string
} }
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
// +kubebuilder:rbac:groups=grove.io,resources=podgangsets,verbs=get;list;watch;create;update;patch;delete
// Reconcile is part of the main kubernetes reconciliation loop which aims to // Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state. // move the current state of the cluster closer to the desired state.
...@@ -73,8 +80,9 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr ...@@ -73,8 +80,9 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
var err error var err error
reason := "undefined" reason := Reason("undefined")
message := "" message := Message("")
state := PendingState
readyStatus := metav1.ConditionFalse readyStatus := metav1.ConditionFalse
// retrieve the CRD // retrieve the CRD
dynamoDeployment := &nvidiacomv1alpha1.DynamoGraphDeployment{} dynamoDeployment := &nvidiacomv1alpha1.DynamoGraphDeployment{}
...@@ -88,16 +96,20 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr ...@@ -88,16 +96,20 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
defer func() { defer func() {
if err != nil { if err != nil {
dynamoDeployment.SetState(FailedState) state = FailedState
message = err.Error() message = Message(err.Error())
logger.Error(err, "Reconciliation failed") logger.Error(err, "Reconciliation failed")
} }
dynamoDeployment.SetState(string(state))
if state == ReadyState {
readyStatus = metav1.ConditionTrue
}
// update the CRD status condition // update the CRD status condition
dynamoDeployment.AddStatusCondition(metav1.Condition{ dynamoDeployment.AddStatusCondition(metav1.Condition{
Type: "Ready", Type: "Ready",
Status: readyStatus, Status: readyStatus,
Reason: reason, Reason: string(reason),
Message: message, Message: string(message),
LastTransitionTime: metav1.Now(), LastTransitionTime: metav1.Now(),
}) })
err = r.Status().Update(ctx, dynamoDeployment) err = r.Status().Update(ctx, dynamoDeployment)
...@@ -116,81 +128,164 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr ...@@ -116,81 +128,164 @@ func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctr
if deleted { if deleted {
return ctrl.Result{}, nil return ctrl.Result{}, nil
} }
state, reason, message, err = r.reconcileResources(ctx, dynamoDeployment)
// generate the dynamoComponentsDeployments from the config
dynamoComponentsDeployments, err := dynamo.GenerateDynamoComponentsDeployments(ctx, dynamoDeployment, r.generateDefaultIngressSpec(dynamoDeployment))
if err != nil { if err != nil {
logger.Error(err, "failed to generate the DynamoComponentsDeployments and DynamoComponents") logger.Error(err, "failed to reconcile the resources")
reason = "failed_to_generate_the_DynamoComponentsDeployments" reason = "failed_to_reconcile_the_resources"
return ctrl.Result{}, err return ctrl.Result{}, err
} }
return ctrl.Result{}, nil
}
// merge the dynamoComponentsDeployments with the dynamoComponentsDeployments from the CRD type Resource interface {
for _, deployment := range dynamoComponentsDeployments { IsReady() bool
if deployment.Spec.Ingress.Enabled { GetName() string
dynamoDeployment.SetEndpointStatus(r.isEndpointSecured(), getIngressHost(deployment.Spec.Ingress)) }
func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
logger := log.FromContext(ctx)
if r.Config.EnableGrove {
// check if explicit opt out of grove
if dynamoDeployment.Annotations[consts.KubeAnnotationEnableGrove] == consts.KubeLabelValueFalse {
logger.Info("Grove is explicitly disabled for this deployment, skipping grove resources reconciliation")
return r.reconcileDynamoComponentsDeployments(ctx, dynamoDeployment)
} }
return r.reconcileGroveResources(ctx, dynamoDeployment)
} }
return r.reconcileDynamoComponentsDeployments(ctx, dynamoDeployment)
notReadyDeployments := []string{} }
// reconcile the dynamoComponentsDeployments
for serviceName, dynamoComponentDeployment := range dynamoComponentsDeployments { func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
logger.Info("Reconciling the DynamoComponentDeployment", "serviceName", serviceName, "dynamoComponentDeployment", dynamoComponentDeployment) logger := log.FromContext(ctx)
_, dynamoComponentDeployment, err = commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoComponentDeployment, bool, error) { // generate the dynamoComponentsDeployments from the config
return dynamoComponentDeployment, false, nil groveGangSet, err := dynamo.GenerateGrovePodGangSet(ctx, dynamoDeployment, r.Config, r.DockerSecretRetriever)
if err != nil {
logger.Error(err, "failed to generate the Grove GangSet")
return "", "", "", fmt.Errorf("failed to generate the Grove GangSet: %w", err)
}
_, syncedGroveGangSet, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*grovev1alpha1.PodGangSet, bool, error) {
return groveGangSet, false, nil
}) })
if err != nil { if err != nil {
logger.Error(err, "failed to sync the DynamoComponentDeployment") logger.Error(err, "failed to sync the Grove GangSet")
reason = "failed_to_sync_the_DynamoComponentDeployment" return "", "", "", fmt.Errorf("failed to sync the Grove GangSet: %w", err)
return ctrl.Result{}, err
} }
if !dynamoComponentDeployment.Status.IsReady() { groveGangSetAsResource := commonController.WrapResource(syncedGroveGangSet, func() bool {
notReadyDeployments = append(notReadyDeployments, dynamoComponentDeployment.Name) if syncedGroveGangSet.Status.LastOperation != nil && syncedGroveGangSet.Status.LastOperation.State == grovev1alpha1.LastOperationStateSucceeded {
return true
} }
return false
})
resources := []Resource{groveGangSetAsResource}
for componentName, component := range dynamoDeployment.Spec.Services {
if component.ComponentType == consts.ComponentTypeMain {
// generate the main component service
mainComponentService, err := dynamo.GenerateComponentService(ctx, dynamo.GetDynamoComponentName(dynamoDeployment, componentName), dynamoDeployment.Namespace)
if err != nil {
logger.Error(err, "failed to generate the main component service")
return "", "", "", fmt.Errorf("failed to generate the main component service: %w", err)
} }
if len(notReadyDeployments) == 0 { _, syncedMainComponentService, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*corev1.Service, bool, error) {
dynamoDeployment.SetState(ReadyState) return mainComponentService, false, nil
reason = "all_deployments_are_ready" })
message = "All deployments are ready" if err != nil {
readyStatus = metav1.ConditionTrue logger.Error(err, "failed to sync the main component service")
} else { return "", "", "", fmt.Errorf("failed to sync the main component service: %w", err)
reason = "some_deployments_are_not_ready"
message = fmt.Sprintf("The following deployments are not ready: %v", notReadyDeployments)
dynamoDeployment.SetState(PendingState)
} }
mainComponentServiceAsResource := commonController.WrapResource(syncedMainComponentService, func() bool {
return ctrl.Result{}, nil return true
})
} resources = append(resources, mainComponentServiceAsResource)
// generate the main component ingress
func (r *DynamoGraphDeploymentReconciler) generateDefaultIngressSpec(dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) *nvidiacomv1alpha1.IngressSpec { ingressSpec := dynamo.GenerateDefaultIngressSpec(dynamoDeployment, r.Config.IngressConfig)
res := &nvidiacomv1alpha1.IngressSpec{ if component.Ingress != nil {
Enabled: r.VirtualServiceGateway != "" || r.IngressControllerClassName != "", ingressSpec = *component.Ingress
Host: dynamoDeployment.Name, }
UseVirtualService: r.VirtualServiceGateway != "", mainComponentIngress := dynamo.GenerateComponentIngress(ctx, dynamo.GetDynamoComponentName(dynamoDeployment, componentName), dynamoDeployment.Namespace, ingressSpec)
if err != nil {
logger.Error(err, "failed to generate the main component ingress")
return "", "", "", fmt.Errorf("failed to generate the main component ingress: %w", err)
}
_, syncedMainComponentIngress, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*networkingv1.Ingress, bool, error) {
if !ingressSpec.Enabled || ingressSpec.IngressControllerClassName == nil {
logger.Info("Ingress is not enabled")
return mainComponentIngress, true, nil
}
return mainComponentIngress, false, nil
})
if err != nil {
logger.Error(err, "failed to sync the main component ingress")
return "", "", "", fmt.Errorf("failed to sync the main component ingress: %w", err)
} }
if r.IngressControllerClassName != "" { resources = append(resources, commonController.WrapResource(syncedMainComponentIngress, func() bool {
res.IngressControllerClassName = &r.IngressControllerClassName return true
}))
// generate the main component virtual service
mainComponentVirtualService := dynamo.GenerateComponentVirtualService(ctx, dynamo.GetDynamoComponentName(dynamoDeployment, componentName), dynamoDeployment.Namespace, ingressSpec)
if err != nil {
logger.Error(err, "failed to generate the main component virtual service")
return "", "", "", fmt.Errorf("failed to generate the main component virtual service: %w", err)
} }
if r.IngressControllerTLSSecret != "" { _, syncedMainComponentVirtualService, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*networkingv1beta1.VirtualService, bool, error) {
res.TLS = &nvidiacomv1alpha1.IngressTLSSpec{ vsEnabled := ingressSpec.Enabled && ingressSpec.UseVirtualService && ingressSpec.VirtualServiceGateway != nil
SecretName: r.IngressControllerTLSSecret, if !vsEnabled {
logger.Info("VirtualService is not enabled")
return mainComponentVirtualService, true, nil
} }
return mainComponentVirtualService, false, nil
})
if err != nil {
logger.Error(err, "failed to sync the main component virtual service")
return "", "", "", fmt.Errorf("failed to sync the main component virtual service: %w", err)
} }
if r.IngressHostSuffix != "" { resources = append(resources, commonController.WrapResource(syncedMainComponentVirtualService, func() bool {
res.HostSuffix = &r.IngressHostSuffix return true
}))
} }
if r.VirtualServiceGateway != "" {
res.VirtualServiceGateway = &r.VirtualServiceGateway
} }
return res return r.checkResourcesReadiness(resources)
} }
func (r *DynamoGraphDeploymentReconciler) isEndpointSecured() bool { func (r *DynamoGraphDeploymentReconciler) checkResourcesReadiness(resources []Resource) (State, Reason, Message, error) {
if r.VirtualServiceGateway != "" && r.Config.VirtualServiceSupportsHTTPS { notReadyResources := []string{}
return true for _, resource := range resources {
if !resource.IsReady() {
notReadyResources = append(notReadyResources, resource.GetName())
}
}
if len(notReadyResources) == 0 {
return ReadyState, "all_resources_are_ready", Message("All resources are ready"), nil
} }
return r.IngressControllerTLSSecret != "" return PendingState, "some_resources_are_not_ready", Message(fmt.Sprintf("%d resources not ready: %v", len(notReadyResources), notReadyResources)), nil
}
func (r *DynamoGraphDeploymentReconciler) reconcileDynamoComponentsDeployments(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
resources := []Resource{}
logger := log.FromContext(ctx)
// generate the dynamoComponentsDeployments from the config
defaultIngressSpec := dynamo.GenerateDefaultIngressSpec(dynamoDeployment, r.Config.IngressConfig)
dynamoComponentsDeployments, err := dynamo.GenerateDynamoComponentsDeployments(ctx, dynamoDeployment, &defaultIngressSpec)
if err != nil {
logger.Error(err, "failed to generate the DynamoComponentsDeployments")
return "", "", "", fmt.Errorf("failed to generate the DynamoComponentsDeployments: %w", err)
}
// reconcile the dynamoComponentsDeployments
for serviceName, dynamoComponentDeployment := range dynamoComponentsDeployments {
logger.Info("Reconciling the DynamoComponentDeployment", "serviceName", serviceName, "dynamoComponentDeployment", dynamoComponentDeployment)
_, dynamoComponentDeployment, err = commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoComponentDeployment, bool, error) {
return dynamoComponentDeployment, false, nil
})
if err != nil {
logger.Error(err, "failed to sync the DynamoComponentDeployment")
return "", "", "", fmt.Errorf("failed to sync the DynamoComponentDeployment: %w", err)
}
resources = append(resources, dynamoComponentDeployment)
}
return r.checkResourcesReadiness(resources)
} }
func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error { func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error {
...@@ -200,7 +295,7 @@ func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, ...@@ -200,7 +295,7 @@ func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context,
// SetupWithManager sets up the controller with the Manager. // SetupWithManager sets up the controller with the Manager.
func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error { func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr). ctrlBuilder := ctrl.NewControllerManagedBy(mgr).
For(&nvidiacomv1alpha1.DynamoGraphDeployment{}, builder.WithPredicates( For(&nvidiacomv1alpha1.DynamoGraphDeployment{}, builder.WithPredicates(
predicate.GenerationChangedPredicate{}, predicate.GenerationChangedPredicate{},
)). )).
...@@ -212,8 +307,17 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err ...@@ -212,8 +307,17 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
UpdateFunc: func(de event.UpdateEvent) bool { return true }, UpdateFunc: func(de event.UpdateEvent) bool { return true },
GenericFunc: func(ge event.GenericEvent) bool { return true }, GenericFunc: func(ge event.GenericEvent) bool { return true },
})). })).
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)). WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config))
Complete(r) if r.Config.EnableGrove {
ctrlBuilder = ctrlBuilder.Owns(&grovev1alpha1.PodGangSet{}, builder.WithPredicates(predicate.Funcs{
// ignore creation cause we don't want to be called again after we create the pod gang set
CreateFunc: func(ce event.CreateEvent) bool { return false },
DeleteFunc: func(de event.DeleteEvent) bool { return true },
UpdateFunc: func(de event.UpdateEvent) bool { return true },
GenericFunc: func(ge event.GenericEvent) bool { return true },
}))
}
return ctrlBuilder.Complete(r)
} }
func (r *DynamoGraphDeploymentReconciler) GetRecorder() record.EventRecorder { func (r *DynamoGraphDeploymentReconciler) GetRecorder() record.EventRecorder {
......
...@@ -30,9 +30,22 @@ import ( ...@@ -30,9 +30,22 @@ import (
type Config struct { type Config struct {
// Enable resources filtering, only the resources belonging to the given namespace will be handled. // Enable resources filtering, only the resources belonging to the given namespace will be handled.
RestrictedNamespace string RestrictedNamespace string
// If true, assume VirtualService endpoints are HTTPS
VirtualServiceSupportsHTTPS bool
EnableLWS bool EnableLWS bool
EnableGrove bool
EtcdAddress string
NatsAddress string
IngressConfig IngressConfig
}
type IngressConfig struct {
VirtualServiceGateway string
IngressControllerClassName string
IngressControllerTLSSecret string
IngressHostSuffix string
}
func (i *IngressConfig) UseVirtualService() bool {
return i.VirtualServiceGateway != ""
} }
func EphemeralDeploymentEventFilter(config Config) predicate.Predicate { func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
......
...@@ -25,8 +25,11 @@ import ( ...@@ -25,8 +25,11 @@ import (
"reflect" "reflect"
"sort" "sort"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/types"
...@@ -352,3 +355,104 @@ func firstKey(m map[string]interface{}) string { ...@@ -352,3 +355,104 @@ func firstKey(m map[string]interface{}) string {
sort.Strings(keys) sort.Strings(keys)
return keys[0] return keys[0]
} }
func GetResourcesConfig(resources *common.Resources) (*corev1.ResourceRequirements, error) {
if resources == nil {
return nil, nil
}
currentResources := &corev1.ResourceRequirements{}
if resources.Limits != nil {
if resources.Limits.CPU != "" {
q, err := resource.ParseQuantity(resources.Limits.CPU)
if err != nil {
return nil, fmt.Errorf("parse limits cpu quantity: %w", err)
}
if currentResources.Limits == nil {
currentResources.Limits = make(corev1.ResourceList)
}
currentResources.Limits[corev1.ResourceCPU] = q
}
if resources.Limits.Memory != "" {
q, err := resource.ParseQuantity(resources.Limits.Memory)
if err != nil {
return nil, fmt.Errorf("parse limits memory quantity: %w", err)
}
if currentResources.Limits == nil {
currentResources.Limits = make(corev1.ResourceList)
}
currentResources.Limits[corev1.ResourceMemory] = q
}
if resources.Limits.GPU != "" {
q, err := resource.ParseQuantity(resources.Limits.GPU)
if err != nil {
return nil, fmt.Errorf("parse limits gpu quantity: %w", err)
}
if currentResources.Limits == nil {
currentResources.Limits = make(corev1.ResourceList)
}
currentResources.Limits[corev1.ResourceName(consts.KubeResourceGPUNvidia)] = q
}
for k, v := range resources.Limits.Custom {
q, err := resource.ParseQuantity(v)
if err != nil {
return nil, fmt.Errorf("parse limits %s quantity: %w", k, err)
}
if currentResources.Limits == nil {
currentResources.Limits = make(corev1.ResourceList)
}
currentResources.Limits[corev1.ResourceName(k)] = q
}
}
if resources.Requests != nil {
if resources.Requests.CPU != "" {
q, err := resource.ParseQuantity(resources.Requests.CPU)
if err != nil {
return nil, fmt.Errorf("parse requests cpu quantity: %w", err)
}
if currentResources.Requests == nil {
currentResources.Requests = make(corev1.ResourceList)
}
currentResources.Requests[corev1.ResourceCPU] = q
}
if resources.Requests.Memory != "" {
q, err := resource.ParseQuantity(resources.Requests.Memory)
if err != nil {
return nil, fmt.Errorf("parse requests memory quantity: %w", err)
}
if currentResources.Requests == nil {
currentResources.Requests = make(corev1.ResourceList)
}
currentResources.Requests[corev1.ResourceMemory] = q
}
for k, v := range resources.Requests.Custom {
q, err := resource.ParseQuantity(v)
if err != nil {
return nil, fmt.Errorf("parse requests %s quantity: %w", k, err)
}
if currentResources.Requests == nil {
currentResources.Requests = make(corev1.ResourceList)
}
currentResources.Requests[corev1.ResourceName(k)] = q
}
}
return currentResources, nil
}
type Resource struct {
client.Object
isReady func() bool
}
func WrapResource[T client.Object](resource T, isReady func() bool) *Resource {
return &Resource{
Object: resource,
isReady: isReady,
}
}
func (r *Resource) IsReady() bool {
return r.isReady()
}
...@@ -21,13 +21,24 @@ import ( ...@@ -21,13 +21,24 @@ import (
"context" "context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"sort"
"strconv" "strconv"
"strings" "strings"
istioNetworking "istio.io/api/networking/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/imdario/mergo"
networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
networkingv1 "k8s.io/api/networking/v1"
) )
// ServiceConfig represents the YAML configuration structure for a service // ServiceConfig represents the YAML configuration structure for a service
...@@ -129,13 +140,13 @@ func SetLwsAnnotations(serviceArgs *ServiceArgs, deployment *v1alpha1.DynamoComp ...@@ -129,13 +140,13 @@ func SetLwsAnnotations(serviceArgs *ServiceArgs, deployment *v1alpha1.DynamoComp
} }
// GenerateDynamoComponentsDeployments generates a map of DynamoComponentDeployments from a DynamoGraphConfig // GenerateDynamoComponentsDeployments generates a map of DynamoComponentDeployments from a DynamoGraphConfig
func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphDeployment *v1alpha1.DynamoGraphDeployment, ingressSpec *v1alpha1.IngressSpec) (map[string]*v1alpha1.DynamoComponentDeployment, error) { func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphDeployment *v1alpha1.DynamoGraphDeployment, defaultIngressSpec *v1alpha1.IngressSpec) (map[string]*v1alpha1.DynamoComponentDeployment, error) {
deployments := make(map[string]*v1alpha1.DynamoComponentDeployment) deployments := make(map[string]*v1alpha1.DynamoComponentDeployment)
graphDynamoNamespace := "" graphDynamoNamespace := ""
for componentName, component := range parentDynamoGraphDeployment.Spec.Services { for componentName, component := range parentDynamoGraphDeployment.Spec.Services {
deployment := &v1alpha1.DynamoComponentDeployment{} deployment := &v1alpha1.DynamoComponentDeployment{}
deployment.Spec.DynamoComponentDeploymentSharedSpec = component.DynamoComponentDeploymentSharedSpec deployment.Spec.DynamoComponentDeploymentSharedSpec = component.DynamoComponentDeploymentSharedSpec
deployment.Name = getDynamoComponentName(parentDynamoGraphDeployment, componentName) deployment.Name = GetDynamoComponentName(parentDynamoGraphDeployment, componentName)
deployment.Namespace = parentDynamoGraphDeployment.Namespace deployment.Namespace = parentDynamoGraphDeployment.Namespace
deployment.Spec.ServiceName = componentName deployment.Spec.ServiceName = componentName
dynamoNamespace := GetDefaultDynamoNamespace(ctx, parentDynamoGraphDeployment) dynamoNamespace := GetDefaultDynamoNamespace(ctx, parentDynamoGraphDeployment)
...@@ -160,8 +171,8 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD ...@@ -160,8 +171,8 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD
} }
deployment.Spec.ExtraPodSpec.ServiceAccountName = commonconsts.PlannerServiceAccountName deployment.Spec.ExtraPodSpec.ServiceAccountName = commonconsts.PlannerServiceAccountName
} }
if deployment.IsMainComponent() && ingressSpec != nil { if deployment.IsMainComponent() && defaultIngressSpec != nil && deployment.Spec.Ingress == nil {
deployment.Spec.Ingress = *ingressSpec deployment.Spec.Ingress = defaultIngressSpec
} }
// merge the envs from the parent deployment with the envs from the service // merge the envs from the parent deployment with the envs from the service
if len(parentDynamoGraphDeployment.Spec.Envs) > 0 { if len(parentDynamoGraphDeployment.Spec.Envs) > 0 {
...@@ -286,9 +297,271 @@ func mergeEnvs(common, specific []corev1.EnvVar) []corev1.EnvVar { ...@@ -286,9 +297,271 @@ func mergeEnvs(common, specific []corev1.EnvVar) []corev1.EnvVar {
for _, env := range envMap { for _, env := range envMap {
merged = append(merged, env) merged = append(merged, env)
} }
sort.Slice(merged, func(i, j int) bool {
return merged[i].Name < merged[j].Name
})
return merged return merged
} }
func getDynamoComponentName(dynamoDeployment *v1alpha1.DynamoGraphDeployment, component string) string { func GetDynamoComponentName(dynamoDeployment *v1alpha1.DynamoGraphDeployment, component string) string {
return fmt.Sprintf("%s-%s", dynamoDeployment.Name, strings.ToLower(component)) return fmt.Sprintf("%s-%s", dynamoDeployment.Name, strings.ToLower(component))
} }
type SecretsRetriever interface {
GetSecrets(namespace, registry string) ([]string, error)
}
func GenerateGrovePodGangSet(ctx context.Context, dynamoDeployment *v1alpha1.DynamoGraphDeployment, controllerConfig controller_common.Config, secretsRetriever SecretsRetriever) (*grovev1alpha1.PodGangSet, error) {
gangSet := &grovev1alpha1.PodGangSet{}
gangSet.Name = dynamoDeployment.Name
gangSet.Namespace = dynamoDeployment.Namespace
gangSet.Spec.Replicas = 1
for componentName, component := range dynamoDeployment.Spec.Services {
container := corev1.Container{
Name: "main",
LivenessProbe: component.LivenessProbe,
ReadinessProbe: component.ReadinessProbe,
Env: component.Envs,
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoHealthPortName,
ContainerPort: int32(commonconsts.DynamoHealthPort),
},
},
}
resourcesConfig, err := controller_common.GetResourcesConfig(component.Resources)
if err != nil {
return nil, fmt.Errorf("failed to get resources config: %w", err)
}
container.Resources = *resourcesConfig
if component.ExtraPodSpec != nil && component.ExtraPodSpec.MainContainer != nil {
// merge the extraPodSpec from the parent deployment with the extraPodSpec from the service
err := mergo.Merge(&container, *component.ExtraPodSpec.MainContainer.DeepCopy(), mergo.WithOverride)
if err != nil {
return nil, fmt.Errorf("failed to merge extraPodSpec: %w", err)
}
}
// retrieve the image pull secrets for the container
imagePullSecrets := []corev1.LocalObjectReference{}
if secretsRetriever != nil {
secretsName, err := secretsRetriever.GetSecrets(dynamoDeployment.Namespace, container.Image)
if err != nil {
return nil, fmt.Errorf("failed to get secrets for component %s and image %s: %w", componentName, container.Image, err)
}
for _, secretName := range secretsName {
imagePullSecrets = append(imagePullSecrets, corev1.LocalObjectReference{
Name: secretName,
})
}
}
// merge the envs from the parent deployment with the envs from the service
if len(dynamoDeployment.Spec.Envs) > 0 {
container.Env = mergeEnvs(dynamoDeployment.Spec.Envs, container.Env)
}
container.Env = append(container.Env, corev1.EnvVar{
Name: commonconsts.EnvDynamoServicePort,
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
})
if controllerConfig.NatsAddress != "" {
container.Env = append(container.Env, corev1.EnvVar{
Name: "NATS_SERVER",
Value: controllerConfig.NatsAddress,
})
}
if controllerConfig.EtcdAddress != "" {
container.Env = append(container.Env, corev1.EnvVar{
Name: "ETCD_ENDPOINTS",
Value: controllerConfig.EtcdAddress,
})
}
if component.EnvFromSecret != nil {
container.EnvFrom = append(container.EnvFrom, corev1.EnvFromSource{
SecretRef: &corev1.SecretEnvSource{
LocalObjectReference: corev1.LocalObjectReference{Name: *component.EnvFromSecret},
},
})
}
gangSet.Spec.Template.Cliques = append(gangSet.Spec.Template.Cliques, &grovev1alpha1.PodCliqueTemplateSpec{
Name: strings.ToLower(componentName),
Labels: map[string]string{
commonconsts.KubeLabelDynamoSelector: GetDynamoComponentName(dynamoDeployment, componentName),
},
Spec: grovev1alpha1.PodCliqueSpec{
RoleName: strings.ToLower(componentName),
Replicas: func() int32 {
if component.Replicas != nil {
return *component.Replicas
}
return 1
}(),
PodSpec: corev1.PodSpec{
Containers: []corev1.Container{container},
ImagePullSecrets: imagePullSecrets,
},
},
})
if component.PVC != nil {
cliqueIndex := len(gangSet.Spec.Template.Cliques) - 1
gangSet.Spec.Template.Cliques[cliqueIndex].Spec.PodSpec.Volumes = append(gangSet.Spec.Template.Cliques[cliqueIndex].Spec.PodSpec.Volumes, corev1.Volume{
Name: *component.PVC.Name,
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: *component.PVC.Name,
},
},
})
gangSet.Spec.Template.Cliques[cliqueIndex].Spec.PodSpec.Containers[0].VolumeMounts = append(gangSet.Spec.Template.Cliques[cliqueIndex].Spec.PodSpec.Containers[0].VolumeMounts, corev1.VolumeMount{
Name: *component.PVC.Name,
MountPath: *component.PVC.MountPoint,
})
}
}
return gangSet, nil
}
func GenerateComponentService(ctx context.Context, componentName, componentNamespace string) (*corev1.Service, error) {
service := &corev1.Service{
ObjectMeta: metav1.ObjectMeta{
Name: componentName,
Namespace: componentNamespace,
},
Spec: corev1.ServiceSpec{
Selector: map[string]string{
commonconsts.KubeLabelDynamoSelector: componentName,
},
Ports: []corev1.ServicePort{
{
Name: commonconsts.DynamoServicePortName,
Port: commonconsts.DynamoServicePort,
TargetPort: intstr.FromString(commonconsts.DynamoContainerPortName),
Protocol: corev1.ProtocolTCP,
},
},
},
}
return service, nil
}
func GenerateComponentIngress(ctx context.Context, componentName, componentNamespace string, ingressSpec v1alpha1.IngressSpec) *networkingv1.Ingress {
resourceName := componentName
ingress := &networkingv1.Ingress{
ObjectMeta: metav1.ObjectMeta{
Name: resourceName,
Namespace: componentNamespace,
},
}
host := getIngressHost(ingressSpec)
ingress.Spec = networkingv1.IngressSpec{
IngressClassName: ingressSpec.IngressControllerClassName,
Rules: []networkingv1.IngressRule{
{
Host: host,
IngressRuleValue: networkingv1.IngressRuleValue{
HTTP: &networkingv1.HTTPIngressRuleValue{
Paths: []networkingv1.HTTPIngressPath{
{
Path: "/",
PathType: &[]networkingv1.PathType{networkingv1.PathTypePrefix}[0],
Backend: networkingv1.IngressBackend{
Service: &networkingv1.IngressServiceBackend{
Name: resourceName,
Port: networkingv1.ServiceBackendPort{
Number: commonconsts.DynamoServicePort,
},
},
},
},
},
},
},
},
},
}
if ingressSpec.TLS != nil {
ingress.Spec.TLS = []networkingv1.IngressTLS{
{
Hosts: []string{host},
SecretName: ingressSpec.TLS.SecretName,
},
}
}
return ingress
}
func getIngressHost(ingressSpec v1alpha1.IngressSpec) string {
host := ingressSpec.Host
if ingressSpec.HostPrefix != nil {
host = *ingressSpec.HostPrefix + host
}
ingressSuffix := commonconsts.DefaultIngressSuffix
if ingressSpec.HostSuffix != nil {
ingressSuffix = *ingressSpec.HostSuffix
}
return fmt.Sprintf("%s.%s", host, ingressSuffix)
}
func GenerateComponentVirtualService(ctx context.Context, componentName, componentNamespace string, ingressSpec v1alpha1.IngressSpec) *networkingv1beta1.VirtualService {
vs := &networkingv1beta1.VirtualService{
ObjectMeta: metav1.ObjectMeta{
Name: componentName,
Namespace: componentNamespace,
},
}
vs.Spec = istioNetworking.VirtualService{
Hosts: []string{
getIngressHost(ingressSpec),
},
Gateways: []string{*ingressSpec.VirtualServiceGateway},
Http: []*istioNetworking.HTTPRoute{
{
Match: []*istioNetworking.HTTPMatchRequest{
{
Uri: &istioNetworking.StringMatch{
MatchType: &istioNetworking.StringMatch_Prefix{Prefix: "/"},
},
},
},
Route: []*istioNetworking.HTTPRouteDestination{
{
Destination: &istioNetworking.Destination{
Host: componentName,
Port: &istioNetworking.PortSelector{
Number: commonconsts.DynamoServicePort,
},
},
},
},
},
},
}
return vs
}
func GenerateDefaultIngressSpec(dynamoDeployment *v1alpha1.DynamoGraphDeployment, ingressConfig controller_common.IngressConfig) v1alpha1.IngressSpec {
res := v1alpha1.IngressSpec{
Enabled: ingressConfig.VirtualServiceGateway != "" || ingressConfig.IngressControllerClassName != "",
Host: dynamoDeployment.Name,
UseVirtualService: ingressConfig.VirtualServiceGateway != "",
}
if ingressConfig.IngressControllerClassName != "" {
res.IngressControllerClassName = &ingressConfig.IngressControllerClassName
}
if ingressConfig.IngressControllerTLSSecret != "" {
res.TLS = &v1alpha1.IngressTLSSpec{
SecretName: ingressConfig.IngressControllerTLSSecret,
}
}
if ingressConfig.IngressHostSuffix != "" {
res.HostSuffix = &ingressConfig.IngressHostSuffix
}
if ingressConfig.VirtualServiceGateway != "" {
res.VirtualServiceGateway = &ingressConfig.VirtualServiceGateway
}
return res
}
...@@ -24,15 +24,18 @@ import ( ...@@ -24,15 +24,18 @@ import (
"sort" "sort"
"testing" "testing"
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
compounaiCommon "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common" compounaiCommon "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
) )
func TestGenerateDynamoComponentsDeployments(t *testing.T) { func TestGenerateDynamoComponentsDeployments(t *testing.T) {
...@@ -88,7 +91,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { ...@@ -88,7 +91,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
}, },
}, },
}, },
ingressSpec: &v1alpha1.IngressSpec{},
}, },
want: map[string]*v1alpha1.DynamoComponentDeployment{ want: map[string]*v1alpha1.DynamoComponentDeployment{
"service1": { "service1": {
...@@ -197,7 +199,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { ...@@ -197,7 +199,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
}, },
}, },
}, },
ingressSpec: &v1alpha1.IngressSpec{},
}, },
want: map[string]*v1alpha1.DynamoComponentDeployment{ want: map[string]*v1alpha1.DynamoComponentDeployment{
"service1": { "service1": {
...@@ -306,7 +307,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { ...@@ -306,7 +307,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
}, },
}, },
}, },
ingressSpec: &v1alpha1.IngressSpec{},
}, },
want: nil, want: nil,
wantErr: true, wantErr: true,
...@@ -387,7 +387,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { ...@@ -387,7 +387,7 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
commonconsts.KubeLabelDynamoNamespace: "dynamo-test-dynamographdeployment", commonconsts.KubeLabelDynamoNamespace: "dynamo-test-dynamographdeployment",
}, },
Autoscaling: nil, Autoscaling: nil,
Ingress: v1alpha1.IngressSpec{ Ingress: &v1alpha1.IngressSpec{
Enabled: true, Enabled: true,
Host: "test-dynamographdeployment", Host: "test-dynamographdeployment",
}, },
...@@ -607,7 +607,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { ...@@ -607,7 +607,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
}, },
}, },
}, },
ingressSpec: &v1alpha1.IngressSpec{},
}, },
want: map[string]*v1alpha1.DynamoComponentDeployment{ want: map[string]*v1alpha1.DynamoComponentDeployment{
"service1": { "service1": {
...@@ -1117,3 +1116,398 @@ func Test_mergeEnvs(t *testing.T) { ...@@ -1117,3 +1116,398 @@ func Test_mergeEnvs(t *testing.T) {
}) })
} }
} }
func TestGenerateGrovePodGangSet(t *testing.T) {
type args struct {
ctx context.Context
dynamoDeployment *v1alpha1.DynamoGraphDeployment
controllerConfig controller_common.Config
}
tests := []struct {
name string
args args
want *grovev1alpha1.PodGangSet
wantErr bool
}{
{
name: "test_generate_grove_pod_gang_set",
args: args{
ctx: context.Background(),
controllerConfig: controller_common.Config{
EtcdAddress: "etcd-address",
NatsAddress: "nats-address",
},
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dynamo-graph-deployment",
Namespace: "test-namespace",
},
Spec: v1alpha1.DynamoGraphDeploymentSpec{
Envs: []corev1.EnvVar{
{
Name: "DYNAMO_POD_GANG_SET_REPLICAS",
Value: "1",
},
},
Services: map[string]*v1alpha1.DynamoComponentDeploymentOverridesSpec{
"Frontend": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
Replicas: &[]int32{1}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "1",
Memory: "1Gi",
},
Limits: &common.ResourceItem{
CPU: "1",
Memory: "1Gi",
GPU: "1",
},
},
Envs: []corev1.EnvVar{
{
Name: "FRONTEND_ENV_1",
Value: "1",
},
},
EnvFromSecret: &[]string{"frontend-secret"}[0],
LivenessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/health",
Port: intstr.FromInt(8080),
},
},
},
ReadinessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/ready",
Port: intstr.FromInt(8080),
},
},
},
ExtraPodSpec: &common.ExtraPodSpec{
MainContainer: &corev1.Container{
Command: []string{
"/bin/sh",
"-c",
"echo $FRONTEND_ENV_1",
},
Args: []string{
"--frontend-env-1",
"1",
},
Image: "frontend-image",
},
},
},
},
"Planner": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
Replicas: &[]int32{2}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "2",
Memory: "2Gi",
},
Limits: &common.ResourceItem{
CPU: "2",
Memory: "2Gi",
GPU: "2",
},
},
Envs: []corev1.EnvVar{
{
Name: "PLANNER_ENV_1",
Value: "2",
},
},
PVC: &v1alpha1.PVC{
Name: &[]string{"planner-pvc"}[0],
MountPoint: &[]string{"/planner"}[0],
},
EnvFromSecret: &[]string{"planner-secret"}[0],
LivenessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/health",
Port: intstr.FromInt(8080),
},
},
},
ReadinessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/ready",
Port: intstr.FromInt(8080),
},
},
},
ExtraPodSpec: &common.ExtraPodSpec{
MainContainer: &corev1.Container{
Command: []string{
"/bin/sh",
"-c",
"echo $PLANNER_ENV_1",
},
Args: []string{
"--planner-env-1",
"1",
},
Image: "planner-image",
},
},
},
},
},
},
},
},
want: &grovev1alpha1.PodGangSet{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dynamo-graph-deployment",
Namespace: "test-namespace",
},
Spec: grovev1alpha1.PodGangSetSpec{
Replicas: 1,
Template: grovev1alpha1.PodGangSetTemplateSpec{
Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{
{
Name: "frontend",
Labels: map[string]string{
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend",
},
Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "frontend",
Replicas: 1,
PodSpec: corev1.PodSpec{
ImagePullSecrets: []corev1.LocalObjectReference{},
Containers: []corev1.Container{
{
Name: "main",
Image: "frontend-image",
Command: []string{
"/bin/sh",
"-c",
"echo $FRONTEND_ENV_1",
},
Args: []string{
"--frontend-env-1",
"1",
},
EnvFrom: []corev1.EnvFromSource{
{
SecretRef: &corev1.SecretEnvSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: "frontend-secret",
},
},
},
},
LivenessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/health",
Port: intstr.FromInt(8080),
},
},
},
ReadinessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/ready",
Port: intstr.FromInt(8080),
},
},
},
Env: []corev1.EnvVar{
{
Name: "DYNAMO_POD_GANG_SET_REPLICAS",
Value: "1",
},
{
Name: "FRONTEND_ENV_1",
Value: "1",
},
{
Name: "DYNAMO_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
},
{
Name: "NATS_SERVER",
Value: "nats-address",
},
{
Name: "ETCD_ENDPOINTS",
Value: "etcd-address",
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"),
},
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoHealthPortName,
ContainerPort: int32(commonconsts.DynamoHealthPort),
},
},
},
},
},
},
},
{
Name: "planner",
Labels: map[string]string{
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner",
},
Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "planner",
Replicas: 2,
PodSpec: corev1.PodSpec{
ImagePullSecrets: []corev1.LocalObjectReference{},
Volumes: []corev1.Volume{
{
Name: "planner-pvc",
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "planner-pvc",
},
},
},
},
Containers: []corev1.Container{
{
Name: "main",
Image: "planner-image",
Command: []string{
"/bin/sh",
"-c",
"echo $PLANNER_ENV_1",
},
Args: []string{
"--planner-env-1",
"1",
},
EnvFrom: []corev1.EnvFromSource{
{
SecretRef: &corev1.SecretEnvSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: "planner-secret",
},
},
},
},
LivenessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/health",
Port: intstr.FromInt(8080),
},
},
},
ReadinessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/ready",
Port: intstr.FromInt(8080),
},
},
},
Env: []corev1.EnvVar{
{
Name: "DYNAMO_POD_GANG_SET_REPLICAS",
Value: "1",
},
{
Name: "PLANNER_ENV_1",
Value: "2",
},
{
Name: "DYNAMO_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
},
{
Name: "NATS_SERVER",
Value: "nats-address",
},
{
Name: "ETCD_ENDPOINTS",
Value: "etcd-address",
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("2"),
corev1.ResourceMemory: resource.MustParse("2Gi"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("2"),
corev1.ResourceMemory: resource.MustParse("2Gi"),
corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"),
},
},
VolumeMounts: []corev1.VolumeMount{
{
Name: "planner-pvc",
MountPath: "/planner",
},
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoHealthPortName,
ContainerPort: int32(commonconsts.DynamoHealthPort),
},
},
},
},
},
},
},
},
},
},
},
wantErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := GenerateGrovePodGangSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, nil)
if (err != nil) != tt.wantErr {
t.Errorf("GenerateGrovePodGangSet() error = %v, wantErr %v", err, tt.wantErr)
return
}
sort.Slice(got.Spec.Template.Cliques, func(i, j int) bool {
return got.Spec.Template.Cliques[i].Name < got.Spec.Template.Cliques[j].Name
})
sort.Slice(tt.want.Spec.Template.Cliques, func(i, j int) bool {
return tt.want.Spec.Template.Cliques[i].Name < tt.want.Spec.Template.Cliques[j].Name
})
if diff := cmp.Diff(got, tt.want); diff != "" {
t.Errorf("GenerateGrovePodGangSet() mismatch (-want +got):\n%s", diff)
}
})
}
}
...@@ -38,8 +38,14 @@ spec: ...@@ -38,8 +38,14 @@ spec:
- name: {{ $.Release.Name }}-{{ $serviceName | lower }} - name: {{ $.Release.Name }}-{{ $serviceName | lower }}
image: {{ $serviceSpec.extraPodSpec.mainContainer.image }} image: {{ $serviceSpec.extraPodSpec.mainContainer.image }}
workingDir: {{ $serviceSpec.extraPodSpec.mainContainer.workingDir }} workingDir: {{ $serviceSpec.extraPodSpec.mainContainer.workingDir }}
{{- if $serviceSpec.extraPodSpec.mainContainer.command }}
command:
{{- $serviceSpec.extraPodSpec.mainContainer.command | toYaml | nindent 8 }}
{{- end }}
{{- if $serviceSpec.extraPodSpec.mainContainer.args }}
args: args:
{{- $serviceSpec.extraPodSpec.mainContainer.args | toYaml | nindent 8 }} {{- $serviceSpec.extraPodSpec.mainContainer.args | toYaml | nindent 8 }}
{{- end }}
{{ if $serviceSpec.resources }} {{ if $serviceSpec.resources }}
resources: resources:
requests: requests:
...@@ -83,8 +89,8 @@ spec: ...@@ -83,8 +89,8 @@ spec:
- name: health - name: health
containerPort: {{ $.Values.healthPort | default 5000 }} containerPort: {{ $.Values.healthPort | default 5000 }}
livenessProbe: livenessProbe:
{{- if $serviceSpec.extraPodSpec.mainContainer.livenessProbe }} {{- if $serviceSpec.livenessProbe }}
{{ $serviceSpec.extraPodSpec.mainContainer.livenessProbe | toYaml | nindent 10 }} {{ $serviceSpec.livenessProbe | toYaml | nindent 10 }}
{{- else }} {{- else }}
initialDelaySeconds: 60 initialDelaySeconds: 60
periodSeconds: 60 periodSeconds: 60
...@@ -97,8 +103,8 @@ spec: ...@@ -97,8 +103,8 @@ spec:
scheme: HTTP scheme: HTTP
{{- end }} {{- end }}
readinessProbe: readinessProbe:
{{- if $serviceSpec.extraPodSpec.mainContainer.readinessProbe }} {{- if $serviceSpec.readinessProbe }}
{{ $serviceSpec.extraPodSpec.mainContainer.readinessProbe | toYaml | nindent 10 }} {{ $serviceSpec.readinessProbe | toYaml | nindent 10 }}
{{- else }} {{- else }}
initialDelaySeconds: 60 initialDelaySeconds: 60
periodSeconds: 60 periodSeconds: 60
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment