Commit 5ddc7f7d authored by Maksim Khadkevich's avatar Maksim Khadkevich Committed by GitHub
Browse files

feat: moved compoundAI operator, APIserver, and examples (#10)

parent 14ce7e03
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: nvidia.com/v1alpha1
kind: CompoundAIDeployment
metadata:
labels:
app.kubernetes.io/name: compoundai-kubernetes-operator
app.kubernetes.io/managed-by: kustomize
name: compoundaideployment-sample
spec:
compoundAINim: frontend:2c4romhs6s33e4w7
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: nvidia.com/v1alpha1
kind: CompoundAINim
metadata:
labels:
app.kubernetes.io/name: compoundai-kubernetes-operator
app.kubernetes.io/managed-by: kustomize
name: compoundainim-sample
spec:
# TODO(user): Add fields here
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: nvidia.com/v1alpha1
kind: CompoundAINimDeployment
metadata:
labels:
app.kubernetes.io/name: compoundai-kubernetes-operator
app.kubernetes.io/managed-by: kustomize
name: compoundainimdeployment-sample
spec:
# TODO(user): Add fields here
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: nvidia.com/v1alpha1
kind: CompoundAINimRequest
metadata:
labels:
app.kubernetes.io/name: compoundai-kubernetes-operator
app.kubernetes.io/managed-by: kustomize
name: compoundainimrequest-sample
spec:
# TODO(user): Add fields here
module github.com/dynemo-ai/dynemo/deploy/compoundai/operator
go 1.23.0
toolchain go1.23.4
require (
dario.cat/mergo v1.0.1
emperror.dev/errors v0.8.1
github.com/apparentlymart/go-shquot v0.0.1
github.com/cisco-open/k8s-objectmatcher v1.9.0
github.com/ettle/strcase v0.2.0
github.com/huandu/xstrings v1.4.0
github.com/jinzhu/copier v0.4.0
github.com/mitchellh/hashstructure/v2 v2.0.2
github.com/onsi/ginkgo/v2 v2.19.0
github.com/onsi/gomega v1.33.1
github.com/pkg/errors v0.9.1
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2
github.com/prometheus/common v0.55.0
github.com/prune998/docker-registry-client v0.0.0-20200114164314-f8cd511a014c
github.com/rs/xid v1.4.0
github.com/sergeymakinen/go-quote v1.1.0
github.com/sirupsen/logrus v1.9.3
gopkg.in/yaml.v2 v2.4.0
istio.io/api v1.23.1
istio.io/client-go v1.23.1
k8s.io/api v0.31.3
k8s.io/apiextensions-apiserver v0.31.0
k8s.io/apimachinery v0.31.3
k8s.io/client-go v0.31.3
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
sigs.k8s.io/controller-runtime v0.19.4
volcano.sh/apis v1.11.0
)
require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/distribution/reference v0.6.0 // indirect
github.com/docker/distribution v2.8.3+incompatible // indirect
github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/evanphx/json-patch v5.7.0+incompatible // indirect
github.com/evanphx/json-patch/v5 v5.9.0 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/zapr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.20.2 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.22.8 // indirect
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/mux v1.8.1 // indirect
github.com/imdario/mergo v0.3.13 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.0 // indirect
github.com/prometheus/client_golang v1.19.1 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.26.0 // indirect
golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc // indirect
golang.org/x/net v0.33.0 // indirect
golang.org/x/oauth2 v0.21.0 // indirect
golang.org/x/sys v0.28.0 // indirect
golang.org/x/term v0.27.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.6.0 // indirect
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 // indirect
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
)
dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s=
dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk=
emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE=
github.com/apparentlymart/go-shquot v0.0.1 h1:MGV8lwxF4zw75lN7e0MGs7o6AFYn7L6AZaExUpLh0Mo=
github.com/apparentlymart/go-shquot v0.0.1/go.mod h1:lw58XsE5IgUXZ9h0cxnypdx31p9mPFIVEQ9P3c7MlrU=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cisco-open/k8s-objectmatcher v1.9.0 h1:/sfuO0BD09fpynZjXsqeZrh28Juc4VEwc2P6Ov/Q6fM=
github.com/cisco-open/k8s-objectmatcher v1.9.0/go.mod h1:CH4E6qAK+q+JwKFJn0DaTNqxrbmWCaDQzGthKLK4nZ0=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
github.com/docker/distribution v2.8.3+incompatible h1:AtKxIZ36LoNK51+Z6RpzLpddBirtxJnzDrHLEKxTAYk=
github.com/docker/distribution v2.8.3+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w=
github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1 h1:ZClxb8laGDf5arXfYcAtECDFgAgHklGI8CxgjHnXKJ4=
github.com/docker/libtrust v0.0.0-20150114040149-fa567046d9b1/go.mod h1:cyGadeNEkKy96OOhEzfZl+yxihPEzKnqJwvfuSUqbZE=
github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g=
github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/ettle/strcase v0.2.0 h1:fGNiVF21fHXpX1niBgk0aROov1LagYsOwV/xqKDKR/Q=
github.com/ettle/strcase v0.2.0/go.mod h1:DajmHElDSaX76ITe3/VHVyMin4LWSJN5Z909Wp+ED1A=
github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI=
github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg=
github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ=
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
github.com/go-openapi/jsonpointer v0.20.2 h1:mQc3nmndL8ZBzStEo3JYF8wzmeWffDH4VbXz58sAx6Q=
github.com/go-openapi/jsonpointer v0.20.2/go.mod h1:bHen+N0u1KEO3YlmqOjTT9Adn1RfD91Ar825/PuiRVs=
github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE=
github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
github.com/go-openapi/swag v0.22.8 h1:/9RjDSQ0vbFR+NyjGMkFTsA1IA0fmhKSThmfGZjicbw=
github.com/go-openapi/swag v0.22.8/go.mod h1:6QT22icPLEqAM/z/TChgb4WAveCHF92+2gF0CNjHpPI=
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I=
github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af h1:kmjWCqn2qkEml422C2Rrd27c3VGxi6a/6HNq8QmHRKM=
github.com/google/pprof v0.0.0-20240525223248-4bfdf5a9a2af/go.mod h1:K1liHPHnj73Fdn/EKuT8nrFqBihUSKXoLYU0BuatOYo=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
github.com/huandu/xstrings v1.4.0 h1:D17IlohoQq4UcpqD7fDk80P7l+lwAmlFaBHgOipl2FU=
github.com/huandu/xstrings v1.4.0/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk=
github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg=
github.com/jinzhu/copier v0.4.0 h1:w3ciUoD19shMCRargcpm0cm91ytaBhDvuRpz1ODO/U8=
github.com/jinzhu/copier v0.4.0/go.mod h1:DfbEm0FYsaqBcKcFuvmOZb218JkPGtvSHsKg8S8hyyg=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mitchellh/hashstructure/v2 v2.0.2 h1:vGKWl0YJqUNxE8d+h8f6NJLcCJrgbhC4NcD46KavDd4=
github.com/mitchellh/hashstructure/v2 v2.0.2/go.mod h1:MG3aRVU/N29oo/V/IhBX8GR/zz4kQkprJgF2EVszyDE=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA=
github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To=
github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk=
github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2 h1:HZdPRm0ApWPg7F4sHgbqWkL+ddWfpTZsopm5HM/2g4o=
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2/go.mod h1:3RiUkFmR9kmPZi9r/8a5jw0a9yg+LMmr7qa0wjqvSiI=
github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE=
github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho=
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/prune998/docker-registry-client v0.0.0-20200114164314-f8cd511a014c h1:YppmMj184YDFwHUCLq6n9m3MC6J+82OTi4hRGYVAZc4=
github.com/prune998/docker-registry-client v0.0.0-20200114164314-f8cd511a014c/go.mod h1:0mv86yBh00jRSWDYBNe2Ev+RYZT/iVDWGeFS+B3gpMI=
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
github.com/rs/xid v1.4.0 h1:qd7wPTDkN6KQx2VmMBLrpHkiyQwgFXRnkOLacUiaSNY=
github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/sergeymakinen/go-quote v1.1.0 h1:mwCRejFVH26bf6TFaBNdXixeB5LtNU1yVHrfsNAmnjc=
github.com/sergeymakinen/go-quote v1.1.0/go.mod h1:AuXYBfIQbIXlzf9KawRyfSxc/YGAyVLtMUUtmc5oGHA=
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc h1:mCRnTeVUjcrhlRmO0VK8a6k6Rrf6TF9htwo2pJVSjIU=
golang.org/x/exp v0.0.0-20230515195305-f3d0a9c9a5cc/go.mod h1:V1LtkGg67GoY2N1AnLN78QLrzxkLyJw7RJb1gzOOz9w=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs=
golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U=
golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157 h1:7whR9kGa5LUwFtpLm2ArCEejtnxlGeLbAyjFY8sGNFw=
google.golang.org/genproto/googleapis/api v0.0.0-20240528184218-531527333157/go.mod h1:99sLkeliLXfdj2J75X3Ho+rrVCaJze0uwN7zDDkjPVU=
google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4=
gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M=
gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
istio.io/api v1.23.1 h1:bm2XF0j058FfzWVHUfpmMj4sFDkcD1X609qs5AU97Pc=
istio.io/api v1.23.1/go.mod h1:QPSTGXuIQdnZFEm3myf9NZ5uBMwCdJWUvfj9ZZ+2oBM=
istio.io/client-go v1.23.1 h1:IX2cgUUXnVYo+9H6bFGSp/vuKVLPUkmiN8qk1/mvsYs=
istio.io/client-go v1.23.1/go.mod h1:+fxu+O2GkITM3HEREUWdobvRXqI/UhAAI7hfxqqpRh0=
k8s.io/api v0.31.3 h1:umzm5o8lFbdN/hIXbrK9oRpOproJO62CV1zqxXrLgk8=
k8s.io/api v0.31.3/go.mod h1:UJrkIp9pnMOI9K2nlL6vwpxRzzEX5sWgn8kGQe92kCE=
k8s.io/apiextensions-apiserver v0.31.0 h1:fZgCVhGwsclj3qCw1buVXCV6khjRzKC5eCFt24kyLSk=
k8s.io/apiextensions-apiserver v0.31.0/go.mod h1:b9aMDEYaEe5sdK+1T0KU78ApR/5ZVp4i56VacZYEHxk=
k8s.io/apimachinery v0.31.3 h1:6l0WhcYgasZ/wk9ktLq5vLaoXJJr5ts6lkaQzgeYPq4=
k8s.io/apimachinery v0.31.3/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo=
k8s.io/client-go v0.31.3 h1:CAlZuM+PH2cm+86LOBemaJI/lQ5linJ6UFxKX/SoG+4=
k8s.io/client-go v0.31.3/go.mod h1:2CgjPUTpv3fE5dNygAr2NcM8nhHzXvxB8KL5gYc3kJs=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag=
k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98=
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A=
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/controller-runtime v0.19.4 h1:SUmheabttt0nx8uJtoII4oIP27BVVvAKFvdvGFwV/Qo=
sigs.k8s.io/controller-runtime v0.19.4/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4=
sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08=
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=
volcano.sh/apis v1.11.0 h1:Z5ZXxxgUNfXv1OhfVXXfGPN7StoSsozQM+8CAPoNWY8=
volcano.sh/apis v1.11.0/go.mod h1:FOdmG++9+8lgENJ9XXDh+O3Jcb9YVRnlMSpgIh3NSVI=
/*
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
Copyright 2024.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
\ No newline at end of file
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package archive
import (
"archive/tar"
"bytes"
"fmt"
"io"
"path/filepath"
)
func ExtractFileFromTar(tarData []byte, fileName string) (*bytes.Buffer, error) {
// Create a tar reader
tarReader := tar.NewReader(bytes.NewReader(tarData))
// Iterate through tar archive
for {
header, err := tarReader.Next()
if err == io.EOF {
break // End of archive
}
if err != nil {
return nil, fmt.Errorf("error reading tar file: %w", err)
}
// Check if the current file is the desired YAML file
if header.Typeflag == tar.TypeReg && (header.Name == fileName || filepath.Base(header.Name) == fileName) {
var content bytes.Buffer
_, err = content.ReadFrom(tarReader)
if err != nil {
return nil, fmt.Errorf("error extracting file: %w", err)
}
return &content, nil
}
}
return nil, fmt.Errorf("file %s not found in tar archive", fileName)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package archive
import (
"bytes"
"os"
"reflect"
"testing"
)
// TestExtractFileFromTar exercises ExtractFileFromTar against two fixture
// archives checked in next to this test:
//   - test.tar  contains test.yaml at the archive root
//   - test2.tar contains test.yaml nested inside a folder
//
// It also covers the file-not-found and corrupt-archive error paths.
//
// Fix over the previous version: the first two table entries reused the exact
// same subtest name, which made `go test -run` selection and failure output
// ambiguous (Go disambiguates with a `#01` suffix). Each case now has a
// distinct descriptive name.
func TestExtractFileFromTar(t *testing.T) {
	// read test.tar file
	// it contains test.yaml at the root
	tarData, err := os.ReadFile("test.tar")
	if err != nil {
		t.Fatalf("Failed to read test.tar: %v", err)
	}
	// read test2.tar file
	// it contains test2.yaml inside a folder
	tarData2, err := os.ReadFile("test2.tar")
	if err != nil {
		t.Fatalf("Failed to read test2.tar: %v", err)
	}
	type args struct {
		tarData      []byte
		yamlFileName string
	}
	tests := []struct {
		name    string
		args    args
		want    *bytes.Buffer
		wantErr bool
	}{
		{
			name: "file at archive root",
			args: args{
				tarData:      tarData,
				yamlFileName: "test.yaml",
			},
			want:    bytes.NewBufferString("property1: true\n"),
			wantErr: false,
		},
		{
			name: "file nested inside a folder",
			args: args{
				tarData:      tarData2,
				yamlFileName: "test.yaml",
			},
			want:    bytes.NewBufferString("property1: true\n"),
			wantErr: false,
		},
		{
			name: "file not found",
			args: args{
				tarData:      tarData,
				yamlFileName: "test2.yaml",
			},
			want:    nil,
			wantErr: true,
		},
		{
			name: "invalid tar content",
			args: args{
				tarData:      []byte("invalid content"),
				yamlFileName: "test.yaml",
			},
			want:    nil,
			wantErr: true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := ExtractFileFromTar(tt.args.tarData, tt.args.yamlFileName)
			if (err != nil) != tt.wantErr {
				t.Errorf("ExtractFileFromTar() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if !reflect.DeepEqual(got, tt.want) {
				t.Errorf("ExtractFileFromTar() = %v, want %v", got, tt.want)
			}
		})
	}
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"strings"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/v1alpha1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// Constants shared by this controller: entity-handler event/state strings,
// aggregate CRD states, and PVC-related defaults.
const (
	// EntityHandlerTypeJob identifies the "job" kind of entity handler.
	EntityHandlerTypeJob = "job"
	// EntityHandlerCreated indicates that the EntityHandler is created.
	EntityHandlerCreated = "ENTITY_HANDLER_CREATED"
	// EntityHandlerCompleted indicates that the EntityHandler has completed.
	EntityHandlerCompleted = "ENTITY_HANDLER_COMPLETED"
	// EntityHandlerPending indicates that the EntityHandler is in pending state.
	EntityHandlerPending = "ENTITY_HANDLER_PENDING"

	// Human-readable lifecycle state names for an EntityHandler.
	EntityHandlerCreatedState   = "EntityHandlerCreated"
	EntityHandlerCompletedState = "EntityHandlerCompleted"
	EntityHandlerFailedState    = "EntityHandlerFailed"
	EntityHandlerPendingState   = "EntityHandlerPending"
	EntityHandlerRunningState   = "EntityHandlerRunning"

	// PVCCreatedState reports that the backing PersistentVolumeClaim exists.
	PVCCreatedState = "PVCCreated"

	// Aggregate CRD state values.
	CrdRunning    = "running"
	CrdFailed     = "failed"
	CrdSuccessful = "successful"

	// PVCMountPath is the mount path used for PVC-backed volumes.
	PVCMountPath = "/pvc"
	// TrainingJobPVCCreated indicates that the caching pvc is created.
	PVCCreated = "PVC_CREATED"
)
// constructPVC builds a PersistentVolumeClaim for the given owner CRD from
// the PVC settings declared on the custom resource. The claim is placed in
// the CRD's namespace and named via getPvcName.
func constructPVC(crd metav1.Object, pvcConfig v1alpha1.PVC) *corev1.PersistentVolumeClaim {
	// Take a local copy so the claim holds its own pointer to the class name.
	storageClass := pvcConfig.StorageClass
	claim := corev1.PersistentVolumeClaim{
		ObjectMeta: metav1.ObjectMeta{
			Name:      getPvcName(crd, pvcConfig.Name),
			Namespace: crd.GetNamespace(),
		},
	}
	claim.Spec = corev1.PersistentVolumeClaimSpec{
		AccessModes:      []corev1.PersistentVolumeAccessMode{pvcConfig.VolumeAccessMode},
		StorageClassName: &storageClass,
		Resources: corev1.VolumeResourceRequirements{
			Requests: corev1.ResourceList{
				corev1.ResourceStorage: pvcConfig.Size,
			},
		},
	}
	return &claim
}
// getPvcName resolves the claim name: an explicitly configured name wins,
// otherwise the owning CRD's own name is used as the fallback.
func getPvcName(crd metav1.Object, defaultName *string) string {
	if defaultName == nil {
		return crd.GetName()
	}
	return *defaultName
}
// generateCompoundAINimRequestName derives a request name from a NIM tag by
// dropping the version suffix: "frontend:abc123" becomes "frontend". A tag
// without a colon is returned unchanged.
func generateCompoundAINimRequestName(tag string) string {
	name, _, _ := strings.Cut(tag, ":")
	return name
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"context"
"dario.cat/mergo"
"emperror.dev/errors"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
nvidiacomv1alpha1 "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/v1alpha1"
commonController "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/internal/controller_common"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/internal/nim"
)
// State values written to a CompoundAIDeployment's status by Reconcile.
const (
	// FailedState marks a deployment whose reconciliation returned an error.
	FailedState = "failed"
	// ReadyState marks a deployment whose CompoundAINimDeployments are all ready.
	ReadyState = "successful"
	// PendingState marks a deployment still waiting for one or more
	// CompoundAINimDeployments to become ready.
	PendingState = "pending"
)
// CompoundAIDeploymentReconciler reconciles a CompoundAIDeployment object
type CompoundAIDeploymentReconciler struct {
	client.Client
	// Scheme maps Go types to GroupVersionKinds; used when setting owner
	// references on child resources.
	Scheme *runtime.Scheme
	// Config carries shared controller configuration (see controller_common).
	Config commonController.Config
	// Recorder publishes Kubernetes events for this controller.
	Recorder record.EventRecorder
}
// +kubebuilder:rbac:groups=nvidia.com,resources=compoundaideployments,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=compoundaideployments/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=compoundaideployments/finalizers,verbs=update
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// It resolves the referenced CompoundAINim into a CompoundAINimRequest plus
// one CompoundAINimDeployment per service, syncs those child resources, and
// mirrors their readiness into the CompoundAIDeployment state and conditions.
//
// Fixes over the previous version:
//   - removed a dead `if err != nil` branch directly after the Get early
//     return (err is provably nil at that point);
//   - SetControllerReference errors previously shadowed the outer err with
//     `:=`, so the deferred status writer never saw them and skipped setting
//     FailedState; they now assign the outer err.
//
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
func (r *CompoundAIDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)
	var err error
	reason := "undefined"
	readyStatus := metav1.ConditionFalse
	// retrieve the CRD
	compoundAIDeployment := &nvidiacomv1alpha1.CompoundAIDeployment{}
	if err = r.Get(ctx, req.NamespacedName, compoundAIDeployment); err != nil {
		// A deleted CRD is not an error; anything else requeues via the error.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
	// Always publish the outcome of this pass on the CRD status before returning.
	defer func() {
		message := ""
		if err != nil {
			compoundAIDeployment.SetState(FailedState)
			message = err.Error()
		}
		// update the CRD status condition
		compoundAIDeployment.Status.Conditions = []metav1.Condition{
			{
				Type:               "Ready",
				Status:             readyStatus,
				Reason:             reason,
				Message:            message,
				LastTransitionTime: metav1.Now(),
			},
		}
		err = r.Status().Update(ctx, compoundAIDeployment)
		if err != nil {
			logger.Error(err, "Unable to update the CRD status", "crd", req.NamespacedName)
		}
		logger.Info("Reconciliation done")
	}()
	// fetch the CompoundAINIMConfig
	compoundAINIMConfig, err := nim.GetCompoundAINIMConfig(ctx, compoundAIDeployment, r.getSecret, r.Recorder)
	if err != nil {
		reason = "failed_to_get_the_CompoundAINIMConfig"
		return ctrl.Result{}, err
	}
	// generate the CompoundAINimDeployments from the config
	compoundAINimDeployments, err := nim.GenerateCompoundAINIMDeployments(compoundAIDeployment, compoundAINIMConfig)
	if err != nil {
		reason = "failed_to_generate_the_CompoundAINimDeployments"
		return ctrl.Result{}, err
	}
	// merge the generated CompoundAINimDeployments with the per-service
	// overrides declared on the CRD (CRD values win).
	for serviceName, deployment := range compoundAINimDeployments {
		if _, ok := compoundAIDeployment.Spec.Services[serviceName]; ok {
			err = mergo.Merge(deployment, compoundAIDeployment.Spec.Services[serviceName], mergo.WithOverride)
			if err != nil {
				reason = "failed_to_merge_the_CompoundAINimDeployments"
				return ctrl.Result{}, err
			}
		}
	}
	// reconcile the compoundAINimRequest
	compoundAINimRequest := &nvidiacomv1alpha1.CompoundAINimRequest{
		ObjectMeta: metav1.ObjectMeta{
			Name:      generateCompoundAINimRequestName(compoundAIDeployment.Spec.CompoundAINim),
			Namespace: compoundAIDeployment.Namespace,
		},
		Spec: nvidiacomv1alpha1.CompoundAINimRequestSpec{
			BentoTag: compoundAIDeployment.Spec.CompoundAINim,
		},
	}
	// Assign (not shadow) err so the deferred status writer records failures.
	if err = ctrl.SetControllerReference(compoundAIDeployment, compoundAINimRequest, r.Scheme); err != nil {
		reason = "failed_to_set_the_controller_reference_for_the_CompoundAINimRequest"
		return ctrl.Result{}, err
	}
	_, err = commonController.SyncResource(ctx, r.Client, compoundAINimRequest, types.NamespacedName{Name: compoundAINimRequest.Name, Namespace: compoundAINimRequest.Namespace}, true)
	if err != nil {
		reason = "failed_to_sync_the_CompoundAINimRequest"
		return ctrl.Result{}, err
	}
	allAreReady := true
	// reconcile the CompoundAINimDeployments
	for serviceName, compoundAINimDeployment := range compoundAINimDeployments {
		logger.Info("Reconciling the CompoundAINimDeployment", "serviceName", serviceName, "compoundAINimDeployment", compoundAINimDeployment)
		if err = ctrl.SetControllerReference(compoundAIDeployment, compoundAINimDeployment, r.Scheme); err != nil {
			reason = "failed_to_set_the_controller_reference_for_the_CompoundAINimDeployment"
			return ctrl.Result{}, err
		}
		compoundAINimDeployment, err = commonController.SyncResource(ctx, r.Client, compoundAINimDeployment, types.NamespacedName{Name: compoundAINimDeployment.Name, Namespace: compoundAINimDeployment.Namespace}, true)
		if err != nil {
			reason = "failed_to_sync_the_CompoundAINimDeployment"
			return ctrl.Result{}, err
		}
		if !compoundAINimDeployment.Status.IsReady() {
			allAreReady = false
		}
	}
	// The deployment is ready only once every child CompoundAINimDeployment is.
	if allAreReady {
		compoundAIDeployment.SetState(ReadyState)
		readyStatus = metav1.ConditionTrue
	} else {
		compoundAIDeployment.SetState(PendingState)
	}
	return ctrl.Result{}, nil
}
// getSecret fetches the named Secret from the given namespace via the
// controller's client, wrapping any lookup error with context.
func (r *CompoundAIDeploymentReconciler) getSecret(ctx context.Context, namespace, name string) (*corev1.Secret, error) {
	var secret corev1.Secret
	key := types.NamespacedName{Namespace: namespace, Name: name}
	if err := r.Get(ctx, key, &secret); err != nil {
		return &secret, errors.Wrap(err, "get secret")
	}
	return &secret, nil
}
// SetupWithManager sets up the controller with the Manager.
func (r *CompoundAIDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error {
	// React to delete/update/generic events on owned CompoundAINimDeployments,
	// but ignore creation: we just created the deployment ourselves and don't
	// want to be called again for it.
	ownedEvents := predicate.Funcs{
		CreateFunc:  func(event.CreateEvent) bool { return false },
		DeleteFunc:  func(event.DeleteEvent) bool { return true },
		UpdateFunc:  func(event.UpdateEvent) bool { return true },
		GenericFunc: func(event.GenericEvent) bool { return true },
	}
	return ctrl.NewControllerManagedBy(mgr).
		For(&nvidiacomv1alpha1.CompoundAIDeployment{}).
		Named("compoundaideployment").
		Owns(&nvidiacomv1alpha1.CompoundAINimDeployment{}, builder.WithPredicates(ownedEvents)).
		WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)).
		Complete(r)
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"context"
"encoding/json"
"fmt"
"os"
"reflect"
"sort"
"strconv"
"strings"
"time"
appsv1 "k8s.io/api/apps/v1"
autoscalingv2 "k8s.io/api/autoscaling/v2"
corev1 "k8s.io/api/core/v1"
networkingv1 "k8s.io/api/networking/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"emperror.dev/errors"
"github.com/cisco-open/k8s-objectmatcher/patch"
compoundaiCommon "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/compoundai/common"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/compoundai/modelschemas"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/compoundai/schemasv1"
yataiclient "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/compoundai/yatai-client"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/v1alpha1"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/internal/controller_common"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/internal/envoy"
commonconfig "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/pkg/compoundai/config"
commonconsts "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/pkg/compoundai/consts"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/pkg/compoundai/system"
"github.com/huandu/xstrings"
"github.com/jinzhu/copier"
"github.com/prometheus/common/version"
istioNetworking "istio.io/api/networking/v1beta1"
networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/record"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/config"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
compounadaiConversion "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/compoundai/conversion"
)
// Constants used when rendering CompoundAINimDeployment workloads: cluster
// defaults, Yatai annotation keys, deployment-target types, and port names.
const (
	// Fallback cluster / service-account names when none are configured.
	DefaultClusterName        = "default"
	DefaultServiceAccountName = "default"

	// KubeValueNameSharedMemory names the shared-memory volume.
	KubeValueNameSharedMemory = "shared-memory"

	// Yatai annotation keys read from the deployment's metadata to toggle
	// debug behavior and size the proxy sidecar.
	KubeAnnotationDeploymentStrategy                          = "yatai.ai/deployment-strategy"
	KubeAnnotationYataiEnableStealingTrafficDebugMode         = "yatai.ai/enable-stealing-traffic-debug-mode"
	KubeAnnotationYataiEnableDebugMode                        = "yatai.ai/enable-debug-mode"
	KubeAnnotationYataiEnableDebugPodReceiveProductionTraffic = "yatai.ai/enable-debug-pod-receive-production-traffic"
	KubeAnnotationYataiProxySidecarResourcesLimitsCPU         = "yatai.ai/proxy-sidecar-resources-limits-cpu"
	KubeAnnotationYataiProxySidecarResourcesLimitsMemory      = "yatai.ai/proxy-sidecar-resources-limits-memory"
	KubeAnnotationYataiProxySidecarResourcesRequestsCPU       = "yatai.ai/proxy-sidecar-resources-requests-cpu"
	KubeAnnotationYataiProxySidecarResourcesRequestsMemory    = "yatai.ai/proxy-sidecar-resources-requests-memory"

	// Deployment target flavors.
	DeploymentTargetTypeProduction = "production"
	DeploymentTargetTypeDebug      = "debug"

	// Named container/service ports.
	ContainerPortNameHTTPProxy  = "http-proxy"
	ServicePortNameHTTPNonProxy = "http-non-proxy"

	// HeaderNameDebug is the HTTP header used to flag debug traffic.
	HeaderNameDebug = "X-Yatai-Debug"
)
// ServicePortHTTPNonProxy is the service port for direct (non-proxy) HTTP
// traffic, defined as one above the standard Bento service port.
var ServicePortHTTPNonProxy = commonconsts.BentoServicePort + 1
// CompoundAINimDeploymentReconciler reconciles a CompoundAINimDeployment object
type CompoundAINimDeploymentReconciler struct {
	client.Client
	// Scheme maps Go types to GroupVersionKinds for owner references.
	Scheme *runtime.Scheme
	// Recorder publishes Kubernetes events for this controller.
	Recorder record.EventRecorder
	// Config carries shared controller configuration (see controller_common).
	Config controller_common.Config
	// NatsAddr is the NATS endpoint address; presumably injected into managed
	// workloads — usage is outside this file, verify against callers.
	NatsAddr string
	// EtcdAddr is the etcd endpoint address; same caveat as NatsAddr.
	EtcdAddr string
}
// +kubebuilder:rbac:groups=nvidia.com,resources=compoundainimdeployments,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=compoundainimdeployments/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=compoundainimdeployments/finalizers,verbs=update
//+kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
//+kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=autoscaling,resources=horizontalpodautoscalers,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=networking.k8s.io,resources=ingressclasses,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=networking.k8s.io,resources=ingresses,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=events.k8s.io,resources=events,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=networking.istio.io,resources=virtualservices,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;create;delete
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// TODO(user): Modify the Reconcile function to compare the state specified by
// the CompoundAINimDeployment object against the actual cluster state, and then
// perform operations to make the cluster state reflect the state specified by
// the user.
//
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.18.2/pkg/reconcile
//
//nolint:gocyclo,nakedret
func (r *CompoundAINimDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
logs := log.FromContext(ctx)
compoundAINimDeployment := &v1alpha1.CompoundAINimDeployment{}
err = r.Get(ctx, req.NamespacedName, compoundAINimDeployment)
if err != nil {
if k8serrors.IsNotFound(err) {
// Object not found, return. Created objects are automatically garbage collected.
// For additional cleanup logic use finalizers.
logs.Info("CompoundAINimDeployment resource not found. Ignoring since object must be deleted.")
err = nil
return
}
// Error reading the object - requeue the request.
logs.Error(err, "Failed to get CompoundAINimDeployment.")
return
}
logs = logs.WithValues("compoundAINimDeployment", compoundAINimDeployment.Name, "namespace", compoundAINimDeployment.Namespace)
if len(compoundAINimDeployment.Status.Conditions) == 0 {
logs.Info("Starting to reconcile CompoundAINimDeployment")
logs.Info("Initializing CompoundAINimDeployment status")
r.Recorder.Event(compoundAINimDeployment, corev1.EventTypeNormal, "Reconciling", "Starting to reconcile CompoundAINimDeployment")
compoundAINimDeployment, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeAvailable,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: "Starting to reconcile CompoundAINimDeployment",
},
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeCompoundAINimFound,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: "Starting to reconcile CompoundAINimDeployment",
},
)
if err != nil {
return
}
}
defer func() {
if err == nil {
return
}
logs.Error(err, "Failed to reconcile CompoundAINimDeployment.")
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "ReconcileError", "Failed to reconcile CompoundAINimDeployment: %v", err)
_, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeAvailable,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: fmt.Sprintf("Failed to reconcile CompoundAINimDeployment: %v", err),
},
)
if err != nil {
return
}
}()
yataiClient, clusterName, err := r.getYataiClientWithAuth(ctx, compoundAINimDeployment)
if err != nil {
err = errors.Wrap(err, "get yatai client with auth")
return
}
compoundAINimFoundCondition := meta.FindStatusCondition(compoundAINimDeployment.Status.Conditions, v1alpha1.CompoundAIDeploymentConditionTypeCompoundAINimFound)
if compoundAINimFoundCondition != nil && compoundAINimFoundCondition.Status == metav1.ConditionUnknown {
logs.Info(fmt.Sprintf("Getting Compound AI NIM %s", compoundAINimDeployment.Spec.CompoundAINim))
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "GetCompoundAINim", "Getting Compound AI NIM %s", compoundAINimDeployment.Spec.CompoundAINim)
}
compoundAINimRequest := &v1alpha1.CompoundAINimRequest{}
compoundAINimCR := &v1alpha1.CompoundAINim{}
err = r.Get(ctx, types.NamespacedName{
Namespace: compoundAINimDeployment.Namespace,
Name: compoundAINimDeployment.Spec.CompoundAINim,
}, compoundAINimCR)
compoundAINimIsNotFound := k8serrors.IsNotFound(err)
if err != nil && !compoundAINimIsNotFound {
err = errors.Wrapf(err, "get CompoundAINim %s/%s", compoundAINimDeployment.Namespace, compoundAINimDeployment.Spec.CompoundAINim)
return
}
if compoundAINimIsNotFound {
if compoundAINimFoundCondition != nil && compoundAINimFoundCondition.Status == metav1.ConditionUnknown {
logs.Info(fmt.Sprintf("CompoundAINim %s not found", compoundAINimDeployment.Spec.CompoundAINim))
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "GetCompoundAINim", "CompoundAINim %s not found", compoundAINimDeployment.Spec.CompoundAINim)
}
compoundAINimDeployment, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeCompoundAINimFound,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: "CompoundAINim not found",
},
)
if err != nil {
return
}
compoundAINimRequestFoundCondition := meta.FindStatusCondition(compoundAINimDeployment.Status.Conditions, v1alpha1.CompoundAIDeploymentConditionTypeCompoundAINimRequestFound)
if compoundAINimRequestFoundCondition == nil || compoundAINimRequestFoundCondition.Status != metav1.ConditionUnknown {
compoundAINimDeployment, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeCompoundAINimRequestFound,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: "CompoundAINim not found",
},
)
if err != nil {
return
}
}
if compoundAINimRequestFoundCondition != nil && compoundAINimRequestFoundCondition.Status == metav1.ConditionUnknown {
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "GetCompoundAINimRequest", "Getting CompoundAINimRequest %s", compoundAINimDeployment.Spec.CompoundAINim)
}
err = r.Get(ctx, types.NamespacedName{
Namespace: compoundAINimDeployment.Namespace,
Name: compoundAINimDeployment.Spec.CompoundAINim,
}, compoundAINimRequest)
if err != nil {
err = errors.Wrapf(err, "get CompoundAINimRequest %s/%s", compoundAINimDeployment.Namespace, compoundAINimDeployment.Spec.CompoundAINim)
compoundAINimDeployment, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeCompoundAINimFound,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: err.Error(),
},
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeCompoundAINimRequestFound,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: err.Error(),
},
)
if err != nil {
return
}
}
if compoundAINimRequestFoundCondition != nil && compoundAINimRequestFoundCondition.Status == metav1.ConditionUnknown {
logs.Info(fmt.Sprintf("CompoundAINimRequest %s found", compoundAINimDeployment.Spec.CompoundAINim))
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "GetCompoundAINimRequest", "CompoundAINimRequest %s is found and waiting for its compoundAINim to be provided", compoundAINimDeployment.Spec.CompoundAINim)
}
compoundAINimDeployment, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeCompoundAINimRequestFound,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: "CompoundAINim not found",
},
)
if err != nil {
return
}
compoundAINimRequestAvailableCondition := meta.FindStatusCondition(compoundAINimRequest.Status.Conditions, v1alpha1.CompoundAIDeploymentConditionTypeAvailable)
if compoundAINimRequestAvailableCondition != nil && compoundAINimRequestAvailableCondition.Status == metav1.ConditionFalse {
err = errors.Errorf("CompoundAINimRequest %s/%s is not available: %s", compoundAINimRequest.Namespace, compoundAINimRequest.Name, compoundAINimRequestAvailableCondition.Message)
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "GetCompoundAINimRequest", err.Error())
_, err_ := r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeCompoundAINimFound,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: err.Error(),
},
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeAvailable,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: err.Error(),
},
)
if err_ != nil {
err = err_
return
}
return
}
return
} else {
if compoundAINimFoundCondition != nil && compoundAINimFoundCondition.Status != metav1.ConditionTrue {
logs.Info(fmt.Sprintf("CompoundAINim %s found", compoundAINimDeployment.Spec.CompoundAINim))
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "GetCompoundAINim", "CompoundAINim %s is found", compoundAINimDeployment.Spec.CompoundAINim)
}
compoundAINimDeployment, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeCompoundAINimFound,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: "CompoundAINim found",
},
)
if err != nil {
return
}
}
modified := false
// Reconcile PVC
_, err = r.reconcilePVC(ctx, compoundAINimDeployment)
if err != nil {
logs.Error(err, "Unable to create PVC", "crd", req.NamespacedName)
return ctrl.Result{}, err
}
// create or update api-server deployment
modified_, err := r.createOrUpdateOrDeleteDeployments(ctx, createOrUpdateOrDeleteDeploymentsOption{
yataiClient: yataiClient,
compoundAINimDeployment: compoundAINimDeployment,
compoundAINim: compoundAINimCR,
clusterName: clusterName,
})
if err != nil {
return
}
if modified_ {
modified = true
}
// create or update api-server hpa
modified_, err = r.createOrUpdateHPA(ctx, compoundAINimDeployment, compoundAINimCR)
if err != nil {
return
}
if modified_ {
modified = true
}
// create or update api-server service
modified_, err = r.createOrUpdateOrDeleteServices(ctx, createOrUpdateOrDeleteServicesOption{
compoundAINimDeployment: compoundAINimDeployment,
compoundAINim: compoundAINimCR,
})
if err != nil {
return
}
if modified_ {
modified = true
}
// create or update api-server ingresses
modified_, err = r.createOrUpdateIngresses(ctx, createOrUpdateIngressOption{
yataiClient: yataiClient,
compoundAINimDeployment: compoundAINimDeployment,
compoundAINim: compoundAINimCR,
})
if err != nil {
return
}
if modified_ {
modified = true
}
if yataiClient != nil && clusterName != nil {
yataiClient_ := *yataiClient
clusterName_ := *clusterName
compoundAINimRepositoryName, compoundAINimVersion := getCompoundAINimRepositoryNameAndCompoundAINimVersion(compoundAINimCR)
_, err = yataiClient_.GetBento(ctx, compoundAINimRepositoryName, compoundAINimVersion)
compoundAINimIsNotFound := isNotFoundError(err)
if err != nil && !compoundAINimIsNotFound {
return
}
if compoundAINimIsNotFound {
compoundAINimDeployment, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeAvailable,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: "Remote compoundAINim from Yatai is not found",
},
)
return
}
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "GetYataiDeployment", "Fetching yatai deployment %s", compoundAINimDeployment.Name)
var oldYataiDeployment *schemasv1.DeploymentSchema
oldYataiDeployment, err = yataiClient_.GetDeployment(ctx, clusterName_, compoundAINimDeployment.Namespace, compoundAINimDeployment.Name)
isNotFound := isNotFoundError(err)
if err != nil && !isNotFound {
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "GetYataiDeployment", "Failed to fetch yatai deployment %s: %s", compoundAINimDeployment.Name, err)
return
}
err = nil
envs := make([]*modelschemas.LabelItemSchema, 0)
specEnvs := compoundAINimDeployment.Spec.Envs
for _, env := range specEnvs {
envs = append(envs, &modelschemas.LabelItemSchema{
Key: env.Name,
Value: env.Value,
})
}
var hpaConf *modelschemas.DeploymentTargetHPAConf
hpaConf, err = TransformToOldHPA(compoundAINimDeployment.Spec.Autoscaling)
if err != nil {
return
}
deploymentTargets := make([]*schemasv1.CreateDeploymentTargetSchema, 0, 1)
deploymentTarget := &schemasv1.CreateDeploymentTargetSchema{
DeploymentTargetTypeSchema: schemasv1.DeploymentTargetTypeSchema{
Type: modelschemas.DeploymentTargetTypeStable,
},
BentoRepository: compoundAINimRepositoryName,
Bento: compoundAINimVersion,
Config: &modelschemas.DeploymentTargetConfig{
KubeResourceUid: string(compoundAINimDeployment.UID),
KubeResourceVersion: compoundAINimDeployment.ResourceVersion,
Resources: compounadaiConversion.ConvertToDeploymentTargetResources(compoundAINimDeployment.Spec.Resources),
HPAConf: hpaConf,
Envs: &envs,
EnableIngress: &compoundAINimDeployment.Spec.Ingress.Enabled,
EnableStealingTrafficDebugMode: &[]bool{checkIfIsStealingTrafficDebugModeEnabled(compoundAINimDeployment.Spec.Annotations)}[0],
EnableDebugMode: &[]bool{checkIfIsDebugModeEnabled(compoundAINimDeployment.Spec.Annotations)}[0],
EnableDebugPodReceiveProductionTraffic: &[]bool{checkIfIsDebugPodReceiveProductionTrafficEnabled(compoundAINimDeployment.Spec.Annotations)}[0],
BentoDeploymentOverrides: &modelschemas.ApiServerBentoDeploymentOverrides{
MonitorExporter: compoundAINimDeployment.Spec.MonitorExporter,
ExtraPodMetadata: compoundAINimDeployment.Spec.ExtraPodMetadata,
ExtraPodSpec: compoundAINimDeployment.Spec.ExtraPodSpec,
},
BentoRequestOverrides: &modelschemas.BentoRequestOverrides{
ImageBuildTimeout: compoundAINimRequest.Spec.ImageBuildTimeout,
ImageBuilderExtraPodSpec: compoundAINimRequest.Spec.ImageBuilderExtraPodSpec,
ImageBuilderExtraPodMetadata: compoundAINimRequest.Spec.ImageBuilderExtraPodMetadata,
ImageBuilderExtraContainerEnv: compoundAINimRequest.Spec.ImageBuilderExtraContainerEnv,
ImageBuilderContainerResources: compoundAINimRequest.Spec.ImageBuilderContainerResources,
DockerConfigJSONSecretName: compoundAINimRequest.Spec.DockerConfigJSONSecretName,
DownloaderContainerEnvFrom: compoundAINimRequest.Spec.DownloaderContainerEnvFrom,
},
},
}
deploymentTargets = append(deploymentTargets, deploymentTarget)
updateSchema := &schemasv1.UpdateDeploymentSchema{
Targets: deploymentTargets,
DoNotDeploy: true,
}
if isNotFound {
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "CreateYataiDeployment", "Creating yatai deployment %s", compoundAINimDeployment.Name)
_, err = yataiClient_.CreateDeployment(ctx, clusterName_, &schemasv1.CreateDeploymentSchema{
Name: compoundAINimDeployment.Name,
KubeNamespace: compoundAINimDeployment.Namespace,
UpdateDeploymentSchema: *updateSchema,
})
if err != nil {
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "CreateYataiDeployment", "Failed to create yatai deployment %s: %s", compoundAINimDeployment.Name, err)
return
}
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "CreateYataiDeployment", "Created yatai deployment %s", compoundAINimDeployment.Name)
} else {
noChange := false
if oldYataiDeployment != nil && oldYataiDeployment.LatestRevision != nil && len(oldYataiDeployment.LatestRevision.Targets) > 0 {
oldYataiDeployment.LatestRevision.Targets[0].Config.KubeResourceUid = updateSchema.Targets[0].Config.KubeResourceUid
oldYataiDeployment.LatestRevision.Targets[0].Config.KubeResourceVersion = updateSchema.Targets[0].Config.KubeResourceVersion
noChange = reflect.DeepEqual(oldYataiDeployment.LatestRevision.Targets[0].Config, updateSchema.Targets[0].Config)
}
if noChange {
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "UpdateYataiDeployment", "No change in yatai deployment %s, skipping", compoundAINimDeployment.Name)
} else {
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "UpdateYataiDeployment", "Updating yatai deployment %s", compoundAINimDeployment.Name)
_, err = yataiClient_.UpdateDeployment(ctx, clusterName_, compoundAINimDeployment.Namespace, compoundAINimDeployment.Name, updateSchema)
if err != nil {
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "UpdateYataiDeployment", "Failed to update yatai deployment %s: %s", compoundAINimDeployment.Name, err)
return
}
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "UpdateYataiDeployment", "Updated yatai deployment %s", compoundAINimDeployment.Name)
}
}
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "SyncYataiDeploymentStatus", "Syncing yatai deployment %s status", compoundAINimDeployment.Name)
_, err = yataiClient_.SyncDeploymentStatus(ctx, clusterName_, compoundAINimDeployment.Namespace, compoundAINimDeployment.Name)
if err != nil {
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "SyncYataiDeploymentStatus", "Failed to sync yatai deployment %s status: %s", compoundAINimDeployment.Name, err)
return
}
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "SyncYataiDeploymentStatus", "Synced yatai deployment %s status", compoundAINimDeployment.Name)
}
if !modified {
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "UpdateYataiDeployment", "No changes to yatai deployment %s", compoundAINimDeployment.Name)
}
logs.Info("Finished reconciling.")
r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "Update", "All resources updated!")
compoundAINimDeployment, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: v1alpha1.CompoundAIDeploymentConditionTypeAvailable,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: "Reconciling",
},
)
return
}
// isNotFoundError reports whether err looks like a "resource not found"
// failure. It matches on the (lowercased) error text rather than on a typed
// error, so it also recognizes not-found responses from remote APIs that do
// not return structured errors.
func isNotFoundError(err error) bool {
	if err == nil {
		return false
	}
	msg := strings.ToLower(err.Error())
	for _, marker := range []string{"not found", "could not find", "404"} {
		if strings.Contains(msg, marker) {
			return true
		}
	}
	return false
}
// reconcilePVC ensures the PersistentVolumeClaim configured on the CRD
// exists, creating it (with an owner reference back to the CRD) when the
// spec opts into creation. It returns nil, nil when the CRD declares no PVC.
func (r *CompoundAINimDeploymentReconciler) reconcilePVC(ctx context.Context, crd *v1alpha1.CompoundAINimDeployment) (*corev1.PersistentVolumeClaim, error) {
	logger := log.FromContext(ctx)
	if crd.Spec.PVC == nil {
		return nil, nil
	}
	pvcConfig := *crd.Spec.PVC

	pvc := &corev1.PersistentVolumeClaim{}
	pvcName := types.NamespacedName{Name: getPvcName(crd, pvcConfig.Name), Namespace: crd.GetNamespace()}
	err := r.Get(ctx, pvcName, pvc)
	if err == nil {
		// The PVC already exists; nothing to do.
		return pvc, nil
	}
	if client.IgnoreNotFound(err) != nil {
		logger.Error(err, "Unable to retrieve PVC", "crd", crd.GetName())
		return nil, err
	}
	// The PVC does not exist. Create it only when the spec asks for it.
	if pvcConfig.Create == nil || !*pvcConfig.Create {
		logger.Error(err, "Unknown PVC", "pvc", pvc.Name)
		return nil, err
	}
	pvc = constructPVC(crd, pvcConfig)
	if err := controllerutil.SetControllerReference(crd, pvc, r.Scheme); err != nil {
		logger.Error(err, "Failed to set controller reference", "pvc", pvc.Name)
		return nil, err
	}
	if err := r.Create(ctx, pvc); err != nil {
		logger.Error(err, "Failed to create pvc", "pvc", pvc.Name)
		return nil, err
	}
	logger.Info("PVC created", "pvc", pvcName)
	return pvc, nil
}
// setStatusConditions re-fetches the CompoundAINimDeployment named in req,
// applies the given conditions to its status, and persists the update,
// retrying up to three times (e.g. on resourceVersion conflicts). On success
// it returns the freshly re-fetched object.
func (r *CompoundAINimDeploymentReconciler) setStatusConditions(ctx context.Context, req ctrl.Request, conditions ...metav1.Condition) (compoundAINimDeployment *v1alpha1.CompoundAINimDeployment, err error) {
	compoundAINimDeployment = &v1alpha1.CompoundAINimDeployment{}
	for i := 0; i < 3; i++ {
		// Re-fetch on every attempt so the update is based on the latest
		// resourceVersion.
		if err = r.Get(ctx, req.NamespacedName, compoundAINimDeployment); err != nil {
			err = errors.Wrap(err, "Failed to re-fetch CompoundAINimDeployment")
			return
		}
		for _, condition := range conditions {
			meta.SetStatusCondition(&compoundAINimDeployment.Status.Conditions, condition)
		}
		if err = r.Status().Update(ctx, compoundAINimDeployment); err != nil {
			// Back off briefly before retrying with a fresh object.
			time.Sleep(100 * time.Millisecond)
		} else {
			break
		}
	}
	if err != nil {
		err = errors.Wrap(err, "Failed to update CompoundAINimDeployment status")
		return
	}
	// Return the object's state as persisted after the status update.
	if err = r.Get(ctx, req.NamespacedName, compoundAINimDeployment); err != nil {
		err = errors.Wrap(err, "Failed to re-fetch CompoundAINimDeployment")
		return
	}
	return
}
// cachedYataiConf caches the Yatai configuration after the first successful
// load so later reconciliations skip re-reading the secret.
// NOTE(review): it is read and written without synchronization — confirm the
// controller never runs concurrent reconciles before relying on this cache.
var cachedYataiConf *commonconfig.YataiConfig
// getYataiClient builds a Yatai API client from the Yatai configuration
// stored in a cluster secret (cached in cachedYataiConf across calls).
//
// Note: when the endpoint is empty it returns a nil yataiClient, nil
// clusterName, and a nil error — callers must nil-check the client before
// dereferencing it. NOTE(review): on the isNotFound return below, err is the
// (wrapped) not-found error and is NOT reset to nil, unlike other isNotFound
// paths in this file — confirm whether a missing secret should be treated as
// "Yatai disabled" (err = nil) rather than a reconcile error.
//
//nolint:nakedret
func (r *CompoundAINimDeploymentReconciler) getYataiClient(ctx context.Context) (yataiClient **yataiclient.YataiClient, clusterName *string, err error) {
	restConfig := config.GetConfigOrDie()
	clientset, err := kubernetes.NewForConfig(restConfig)
	if err != nil {
		err = errors.Wrapf(err, "create kubernetes clientset")
		return
	}
	var yataiConf *commonconfig.YataiConfig
	if cachedYataiConf != nil {
		yataiConf = cachedYataiConf
	} else {
		// Load the Yatai config from the component's secret.
		yataiConf, err = commonconfig.GetYataiConfig(ctx, func(ctx context.Context, namespace, name string) (*corev1.Secret, error) {
			secret, err := clientset.CoreV1().Secrets(namespace).Get(ctx, name, metav1.GetOptions{})
			return secret, errors.Wrap(err, "get secret")
		}, commonconsts.YataiDeploymentComponentName, false)
		isNotFound := k8serrors.IsNotFound(err)
		if err != nil && !isNotFound {
			err = errors.Wrap(err, "get yatai config")
			return
		}
		if isNotFound {
			// No Yatai config secret present.
			return
		}
		cachedYataiConf = yataiConf
	}
	yataiEndpoint := yataiConf.Endpoint
	yataiAPIToken := yataiConf.ApiToken
	if yataiEndpoint == "" {
		// No endpoint configured: Yatai integration is effectively disabled;
		// returns nil client with nil error.
		return
	}
	clusterName_ := yataiConf.ClusterName
	if clusterName_ == "" {
		clusterName_ = DefaultClusterName
	}
	// The token is scoped with component name and cluster so the server can
	// attribute requests.
	yataiClient_ := yataiclient.NewYataiClient(yataiEndpoint, fmt.Sprintf("%s:%s:%s", commonconsts.YataiDeploymentComponentName, clusterName_, yataiAPIToken))
	yataiClient = &yataiClient_
	clusterName = &clusterName_
	return
}
// getYataiClientWithAuth returns a Yatai client pre-configured with CompoundAI
// auth headers (org and user IDs) taken from the deployment's labels, falling
// back to the default org/user when the labels are absent.
//
// Like getYataiClient, it may return a nil client with a nil error when Yatai
// is not configured; callers must nil-check the client.
func (r *CompoundAINimDeploymentReconciler) getYataiClientWithAuth(ctx context.Context, compoundAINimDeployment *v1alpha1.CompoundAINimDeployment) (**yataiclient.YataiClient, *string, error) {
	orgId, ok := compoundAINimDeployment.Labels[commonconsts.NgcOrganizationHeaderName]
	if !ok {
		orgId = commonconsts.DefaultOrgId
	}
	userId, ok := compoundAINimDeployment.Labels[commonconsts.NgcUserHeaderName]
	if !ok {
		userId = commonconsts.DefaultUserId
	}
	auth := yataiclient.CompoundAIAuthHeaders{
		OrgId:  orgId,
		UserId: userId,
	}
	client, clusterName, err := r.getYataiClient(ctx)
	if err != nil {
		return nil, nil, err
	}
	// getYataiClient returns a nil client (with a nil error) when the Yatai
	// endpoint is not configured; dereferencing it unconditionally would
	// panic with a nil-pointer dereference.
	if client == nil {
		return nil, nil, nil
	}
	(*client).SetAuth(auth)
	return client, clusterName, nil
}
// createOrUpdateOrDeleteDeploymentsOption bundles the inputs for
// createOrUpdateOrDeleteDeployments.
type createOrUpdateOrDeleteDeploymentsOption struct {
	// yataiClient may wrap a nil client when Yatai is not configured.
	yataiClient             **yataiclient.YataiClient
	compoundAINimDeployment *v1alpha1.CompoundAINimDeployment
	compoundAINim           *v1alpha1.CompoundAINim
	clusterName             *string
}
// createOrUpdateOrDeleteDeployments reconciles the Kubernetes Deployments
// backing a CompoundAINimDeployment: it always reconciles the production
// Deployment and, when stealing-traffic debug mode is enabled, a second
// debug Deployment; otherwise it deletes any leftover debug Deployment.
// modified reports whether any Deployment was created, updated, or deleted.
//
//nolint:nakedret
func (r *CompoundAINimDeploymentReconciler) createOrUpdateOrDeleteDeployments(ctx context.Context, opt createOrUpdateOrDeleteDeploymentsOption) (modified bool, err error) {
	containsStealingTrafficDebugModeEnabled := checkIfContainsStealingTrafficDebugModeEnabled(opt.compoundAINimDeployment)
	modified, err = r.createOrUpdateDeployment(ctx, createOrUpdateDeploymentOption{
		createOrUpdateOrDeleteDeploymentsOption: opt,
		isStealingTrafficDebugModeEnabled:       false,
		containsStealingTrafficDebugModeEnabled: containsStealingTrafficDebugModeEnabled,
	})
	if err != nil {
		err = errors.Wrap(err, "create or update deployment")
		return
	}
	if containsStealingTrafficDebugModeEnabled {
		// Track the debug Deployment's result separately so a no-op here
		// does not clobber a "modified" result from the production
		// Deployment above (same pattern as createOrUpdateOrDeleteServices).
		var debugModified bool
		debugModified, err = r.createOrUpdateDeployment(ctx, createOrUpdateDeploymentOption{
			createOrUpdateOrDeleteDeploymentsOption: opt,
			isStealingTrafficDebugModeEnabled:       true,
			containsStealingTrafficDebugModeEnabled: containsStealingTrafficDebugModeEnabled,
		})
		if err != nil {
			err = errors.Wrap(err, "create or update deployment")
			return
		}
		if debugModified {
			modified = true
		}
	} else {
		// Debug mode is off: remove the debug Deployment if it still exists.
		debugDeploymentName := r.getKubeName(opt.compoundAINimDeployment, opt.compoundAINim, true)
		debugDeployment := &appsv1.Deployment{}
		err = r.Get(ctx, types.NamespacedName{Name: debugDeploymentName, Namespace: opt.compoundAINimDeployment.Namespace}, debugDeployment)
		isNotFound := k8serrors.IsNotFound(err)
		if err != nil && !isNotFound {
			err = errors.Wrap(err, "get deployment")
			return
		}
		// Clear the not-found error: it only signalled "nothing to delete".
		err = nil
		if !isNotFound {
			err = r.Delete(ctx, debugDeployment)
			if err != nil {
				err = errors.Wrap(err, "delete deployment")
				return
			}
			modified = true
		}
	}
	return
}
// createOrUpdateDeploymentOption selects which Deployment variant
// createOrUpdateDeployment reconciles.
type createOrUpdateDeploymentOption struct {
	createOrUpdateOrDeleteDeploymentsOption
	// isStealingTrafficDebugModeEnabled selects the debug variant when true.
	isStealingTrafficDebugModeEnabled       bool
	// containsStealingTrafficDebugModeEnabled records whether the parent
	// deployment has stealing-traffic debug mode enabled at all.
	containsStealingTrafficDebugModeEnabled bool
}
// createOrUpdateDeployment renders the Deployment described by opt (the
// production variant or, when isStealingTrafficDebugModeEnabled, the debug
// variant) and syncs it to the cluster: create when absent, update only when
// patch.DefaultPatchMaker reports a non-empty diff against the live object
// (based on the last-applied annotation set by patch.DefaultAnnotator).
// Every step is mirrored as an event on the owning CompoundAINimDeployment.
// modified reports whether a create or update actually happened.
//
//nolint:nakedret
func (r *CompoundAINimDeploymentReconciler) createOrUpdateDeployment(ctx context.Context, opt createOrUpdateDeploymentOption) (modified bool, err error) {
	logs := log.FromContext(ctx)
	deployment, err := r.generateDeployment(ctx, generateDeploymentOption{
		compoundAINimDeployment:                 opt.compoundAINimDeployment,
		compoundAINim:                           opt.compoundAINim,
		yataiClient:                             opt.yataiClient,
		clusterName:                             opt.clusterName,
		isStealingTrafficDebugModeEnabled:       opt.isStealingTrafficDebugModeEnabled,
		containsStealingTrafficDebugModeEnabled: opt.containsStealingTrafficDebugModeEnabled,
	})
	if err != nil {
		return
	}
	logs = logs.WithValues("namespace", deployment.Namespace, "deploymentName", deployment.Name)
	deploymentNamespacedName := fmt.Sprintf("%s/%s", deployment.Namespace, deployment.Name)
	r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "GetDeployment", "Getting Deployment %s", deploymentNamespacedName)
	oldDeployment := &appsv1.Deployment{}
	err = r.Get(ctx, types.NamespacedName{Name: deployment.Name, Namespace: deployment.Namespace}, oldDeployment)
	oldDeploymentIsNotFound := k8serrors.IsNotFound(err)
	if err != nil && !oldDeploymentIsNotFound {
		r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "GetDeployment", "Failed to get Deployment %s: %s", deploymentNamespacedName, err)
		logs.Error(err, "Failed to get Deployment.")
		return
	}
	if oldDeploymentIsNotFound {
		logs.Info("Deployment not found. Creating a new one.")
		// Record the desired state in the last-applied annotation so future
		// reconciliations can diff against it.
		err = errors.Wrapf(patch.DefaultAnnotator.SetLastAppliedAnnotation(deployment), "set last applied annotation for deployment %s", deployment.Name)
		if err != nil {
			logs.Error(err, "Failed to set last applied annotation.")
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "SetLastAppliedAnnotation", "Failed to set last applied annotation for Deployment %s: %s", deploymentNamespacedName, err)
			return
		}
		r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "CreateDeployment", "Creating a new Deployment %s", deploymentNamespacedName)
		err = r.Create(ctx, deployment)
		if err != nil {
			logs.Error(err, "Failed to create Deployment.")
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "CreateDeployment", "Failed to create Deployment %s: %s", deploymentNamespacedName, err)
			return
		}
		logs.Info("Deployment created.")
		r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "CreateDeployment", "Created Deployment %s", deploymentNamespacedName)
		modified = true
	} else {
		logs.Info("Deployment found.")
		// Diff the desired object against the live one; only update when
		// something actually changed.
		var patchResult *patch.PatchResult
		patchResult, err = patch.DefaultPatchMaker.Calculate(oldDeployment, deployment)
		if err != nil {
			logs.Error(err, "Failed to calculate patch.")
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "CalculatePatch", "Failed to calculate patch for Deployment %s: %s", deploymentNamespacedName, err)
			return
		}
		if !patchResult.IsEmpty() {
			logs.Info("Deployment spec is different. Updating Deployment.")
			err = errors.Wrapf(patch.DefaultAnnotator.SetLastAppliedAnnotation(deployment), "set last applied annotation for deployment %s", deployment.Name)
			if err != nil {
				logs.Error(err, "Failed to set last applied annotation.")
				r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "SetLastAppliedAnnotation", "Failed to set last applied annotation for Deployment %s: %s", deploymentNamespacedName, err)
				return
			}
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "UpdateDeployment", "Updating Deployment %s", deploymentNamespacedName)
			err = r.Update(ctx, deployment)
			if err != nil {
				logs.Error(err, "Failed to update Deployment.")
				r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "UpdateDeployment", "Failed to update Deployment %s: %s", deploymentNamespacedName, err)
				return
			}
			logs.Info("Deployment updated.")
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "UpdateDeployment", "Updated Deployment %s", deploymentNamespacedName)
			modified = true
		} else {
			logs.Info("Deployment spec is the same. Skipping update.")
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "UpdateDeployment", "Skipping update Deployment %s", deploymentNamespacedName)
		}
	}
	return
}
// createOrUpdateHPA generates the desired HorizontalPodAutoscaler for the
// deployment and syncs it to the cluster: create when absent, update only
// when patch.DefaultPatchMaker reports a non-empty diff against the live
// object. Progress and failures are emitted as events on the
// CompoundAINimDeployment. modified reports whether a create or update
// actually happened.
//
//nolint:nakedret
func (r *CompoundAINimDeploymentReconciler) createOrUpdateHPA(ctx context.Context, compoundAINimDeployment *v1alpha1.CompoundAINimDeployment, compoundAINim *v1alpha1.CompoundAINim) (modified bool, err error) {
	logs := log.FromContext(ctx)
	hpa, err := r.generateHPA(compoundAINimDeployment, compoundAINim)
	if err != nil {
		return
	}
	logs = logs.WithValues("namespace", hpa.Namespace, "hpaName", hpa.Name)
	hpaNamespacedName := fmt.Sprintf("%s/%s", hpa.Namespace, hpa.Name)
	r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "GetHPA", "Getting HPA %s", hpaNamespacedName)
	oldHPA, err := r.getHPA(ctx, hpa)
	oldHPAIsNotFound := k8serrors.IsNotFound(err)
	if err != nil && !oldHPAIsNotFound {
		r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "GetHPA", "Failed to get HPA %s: %s", hpaNamespacedName, err)
		logs.Error(err, "Failed to get HPA.")
		return
	}
	if oldHPAIsNotFound {
		logs.Info("HPA not found. Creating a new one.")
		// Record the desired state in the last-applied annotation so future
		// reconciliations can diff against it.
		err = errors.Wrapf(patch.DefaultAnnotator.SetLastAppliedAnnotation(hpa), "set last applied annotation for hpa %s", hpa.Name)
		if err != nil {
			logs.Error(err, "Failed to set last applied annotation.")
			r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "SetLastAppliedAnnotation", "Failed to set last applied annotation for HPA %s: %s", hpaNamespacedName, err)
			return
		}
		r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "CreateHPA", "Creating a new HPA %s", hpaNamespacedName)
		err = r.Create(ctx, hpa)
		if err != nil {
			logs.Error(err, "Failed to create HPA.")
			r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "CreateHPA", "Failed to create HPA %s: %s", hpaNamespacedName, err)
			return
		}
		logs.Info("HPA created.")
		r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "CreateHPA", "Created HPA %s", hpaNamespacedName)
		modified = true
	} else {
		logs.Info("HPA found.")
		// Diff the desired HPA against the live one; only update on change.
		var patchResult *patch.PatchResult
		patchResult, err = patch.DefaultPatchMaker.Calculate(oldHPA, hpa)
		if err != nil {
			logs.Error(err, "Failed to calculate patch.")
			r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "CalculatePatch", "Failed to calculate patch for HPA %s: %s", hpaNamespacedName, err)
			return
		}
		if !patchResult.IsEmpty() {
			logs.Info(fmt.Sprintf("HPA spec is different. Updating HPA. The patch result is: %s", patchResult.String()))
			err = errors.Wrapf(patch.DefaultAnnotator.SetLastAppliedAnnotation(hpa), "set last applied annotation for hpa %s", hpa.Name)
			if err != nil {
				logs.Error(err, "Failed to set last applied annotation.")
				r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "SetLastAppliedAnnotation", "Failed to set last applied annotation for HPA %s: %s", hpaNamespacedName, err)
				return
			}
			r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "UpdateHPA", "Updating HPA %s", hpaNamespacedName)
			err = r.Update(ctx, hpa)
			if err != nil {
				logs.Error(err, "Failed to update HPA.")
				r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "UpdateHPA", "Failed to update HPA %s: %s", hpaNamespacedName, err)
				return
			}
			logs.Info("HPA updated.")
			r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "UpdateHPA", "Updated HPA %s", hpaNamespacedName)
			modified = true
		} else {
			logs.Info("HPA spec is the same. Skipping update.")
			r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "UpdateHPA", "Skipping update HPA %s", hpaNamespacedName)
		}
	}
	return
}
// getResourceAnnotations returns the deployment's spec annotations,
// substituting an empty (non-nil) map when none are set so callers can index
// the result without a nil check.
func getResourceAnnotations(compoundAINimDeployment *v1alpha1.CompoundAINimDeployment) map[string]string {
	if annotations := compoundAINimDeployment.Spec.Annotations; annotations != nil {
		return annotations
	}
	return map[string]string{}
}
// checkIfIsDebugModeEnabled reports whether the debug-mode annotation is
// explicitly set to "true".
func checkIfIsDebugModeEnabled(annotations map[string]string) bool {
	// Indexing a nil map yields the zero value ("") per the Go spec, so the
	// single expression below also covers annotations == nil.
	return annotations[KubeAnnotationYataiEnableDebugMode] == commonconsts.KubeLabelValueTrue
}
// checkIfIsStealingTrafficDebugModeEnabled reports whether the
// stealing-traffic debug-mode annotation is explicitly set to "true".
func checkIfIsStealingTrafficDebugModeEnabled(annotations map[string]string) bool {
	// Indexing a nil map yields the zero value ("") per the Go spec, so the
	// single expression below also covers annotations == nil.
	return annotations[KubeAnnotationYataiEnableStealingTrafficDebugMode] == commonconsts.KubeLabelValueTrue
}
// checkIfIsDebugPodReceiveProductionTrafficEnabled reports whether the
// annotation allowing debug pods to receive production traffic is explicitly
// set to "true".
func checkIfIsDebugPodReceiveProductionTrafficEnabled(annotations map[string]string) bool {
	// Indexing a nil map yields the zero value ("") per the Go spec, so the
	// single expression below also covers annotations == nil.
	return annotations[KubeAnnotationYataiEnableDebugPodReceiveProductionTraffic] == commonconsts.KubeLabelValueTrue
}
// checkIfContainsStealingTrafficDebugModeEnabled reports whether the
// deployment's spec annotations enable stealing-traffic debug mode.
func checkIfContainsStealingTrafficDebugModeEnabled(compoundAINimDeployment *v1alpha1.CompoundAINimDeployment) bool {
	return checkIfIsStealingTrafficDebugModeEnabled(compoundAINimDeployment.Spec.Annotations)
}
// createOrUpdateOrDeleteServicesOption bundles the inputs for
// createOrUpdateOrDeleteServices.
type createOrUpdateOrDeleteServicesOption struct {
	compoundAINimDeployment *v1alpha1.CompoundAINimDeployment
	compoundAINim           *v1alpha1.CompoundAINim
}
// createOrUpdateOrDeleteServices reconciles the Services for a
// CompoundAINimDeployment. The generic service is always synced; when
// stealing-traffic debug mode is enabled the dedicated production and debug
// services are synced as well, otherwise both are deleted if they still
// exist. modified reports whether anything was created, updated, or deleted.
//
//nolint:nakedret
func (r *CompoundAINimDeploymentReconciler) createOrUpdateOrDeleteServices(ctx context.Context, opt createOrUpdateOrDeleteServicesOption) (modified bool, err error) {
	resourceAnnotations := getResourceAnnotations(opt.compoundAINimDeployment)
	isDebugPodReceiveProductionTrafficEnabled := checkIfIsDebugPodReceiveProductionTrafficEnabled(resourceAnnotations)
	containsStealingTrafficDebugModeEnabled := checkIfContainsStealingTrafficDebugModeEnabled(opt.compoundAINimDeployment)
	// The generic service is reconciled unconditionally.
	modified, err = r.createOrUpdateService(ctx, createOrUpdateServiceOption{
		compoundAINimDeployment:                 opt.compoundAINimDeployment,
		compoundAINim:                           opt.compoundAINim,
		isStealingTrafficDebugModeEnabled:       false,
		isDebugPodReceiveProductionTraffic:      isDebugPodReceiveProductionTrafficEnabled,
		containsStealingTrafficDebugModeEnabled: containsStealingTrafficDebugModeEnabled,
		isGenericService:                        true,
	})
	if err != nil {
		return
	}
	if containsStealingTrafficDebugModeEnabled {
		// Dedicated production service (non-generic, non-debug).
		var modified_ bool
		modified_, err = r.createOrUpdateService(ctx, createOrUpdateServiceOption{
			compoundAINimDeployment:                 opt.compoundAINimDeployment,
			compoundAINim:                           opt.compoundAINim,
			isStealingTrafficDebugModeEnabled:       false,
			isDebugPodReceiveProductionTraffic:      isDebugPodReceiveProductionTrafficEnabled,
			containsStealingTrafficDebugModeEnabled: containsStealingTrafficDebugModeEnabled,
			isGenericService:                        false,
		})
		if err != nil {
			return
		}
		if modified_ {
			modified = true
		}
		// Dedicated debug service.
		modified_, err = r.createOrUpdateService(ctx, createOrUpdateServiceOption{
			compoundAINimDeployment:                 opt.compoundAINimDeployment,
			compoundAINim:                           opt.compoundAINim,
			isStealingTrafficDebugModeEnabled:       true,
			isDebugPodReceiveProductionTraffic:      isDebugPodReceiveProductionTrafficEnabled,
			containsStealingTrafficDebugModeEnabled: containsStealingTrafficDebugModeEnabled,
			isGenericService:                        false,
		})
		if err != nil {
			return
		}
		if modified_ {
			modified = true
		}
	} else {
		// Debug mode is off: delete the dedicated production and debug
		// services if they still exist.
		productionServiceName := r.getServiceName(opt.compoundAINimDeployment, opt.compoundAINim, false)
		svc := &corev1.Service{}
		err = r.Get(ctx, types.NamespacedName{Name: productionServiceName, Namespace: opt.compoundAINimDeployment.Namespace}, svc)
		isNotFound := k8serrors.IsNotFound(err)
		if err != nil && !isNotFound {
			err = errors.Wrapf(err, "Failed to get service %s", productionServiceName)
			return
		}
		// NOTE(review): err is not reset to nil here after a not-found Get
		// (unlike the branch below); it is harmless today only because err
		// is reassigned by the next Get.
		if !isNotFound {
			modified = true
			err = r.Delete(ctx, svc)
			if err != nil {
				err = errors.Wrapf(err, "Failed to delete service %s", productionServiceName)
				return
			}
		}
		debugServiceName := r.getServiceName(opt.compoundAINimDeployment, opt.compoundAINim, true)
		svc = &corev1.Service{}
		err = r.Get(ctx, types.NamespacedName{Name: debugServiceName, Namespace: opt.compoundAINimDeployment.Namespace}, svc)
		isNotFound = k8serrors.IsNotFound(err)
		if err != nil && !isNotFound {
			err = errors.Wrapf(err, "Failed to get service %s", debugServiceName)
			return
		}
		// Clear the not-found error: it only signalled "nothing to delete".
		err = nil
		if !isNotFound {
			modified = true
			err = r.Delete(ctx, svc)
			if err != nil {
				err = errors.Wrapf(err, "Failed to delete service %s", debugServiceName)
				return
			}
		}
	}
	return
}
// createOrUpdateServiceOption selects which Service variant
// createOrUpdateService reconciles.
type createOrUpdateServiceOption struct {
	compoundAINimDeployment *v1alpha1.CompoundAINimDeployment
	compoundAINim           *v1alpha1.CompoundAINim
	// isStealingTrafficDebugModeEnabled selects the debug service when true.
	isStealingTrafficDebugModeEnabled  bool
	// isDebugPodReceiveProductionTraffic mirrors the corresponding
	// deployment annotation.
	isDebugPodReceiveProductionTraffic bool
	// containsStealingTrafficDebugModeEnabled records whether the parent
	// deployment has stealing-traffic debug mode enabled at all.
	containsStealingTrafficDebugModeEnabled bool
	// isGenericService marks the always-present front service as opposed to
	// the dedicated production/debug services.
	isGenericService bool
}
// createOrUpdateService renders the Service described by opt and syncs it to
// the cluster: create when absent, update only when patch.DefaultPatchMaker
// reports a non-empty diff against the live object. Progress and failures
// are emitted as events on the CompoundAINimDeployment. modified reports
// whether a create or update actually happened.
//
//nolint:nakedret
func (r *CompoundAINimDeploymentReconciler) createOrUpdateService(ctx context.Context, opt createOrUpdateServiceOption) (modified bool, err error) {
	logs := log.FromContext(ctx)
	// nolint: gosimple
	service, err := r.generateService(generateServiceOption(opt))
	if err != nil {
		return
	}
	logs = logs.WithValues("namespace", service.Namespace, "serviceName", service.Name, "serviceSelector", service.Spec.Selector)
	serviceNamespacedName := fmt.Sprintf("%s/%s", service.Namespace, service.Name)
	r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "GetService", "Getting Service %s", serviceNamespacedName)
	oldService := &corev1.Service{}
	err = r.Get(ctx, types.NamespacedName{Name: service.Name, Namespace: service.Namespace}, oldService)
	oldServiceIsNotFound := k8serrors.IsNotFound(err)
	if err != nil && !oldServiceIsNotFound {
		r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "GetService", "Failed to get Service %s: %s", serviceNamespacedName, err)
		logs.Error(err, "Failed to get Service.")
		return
	}
	if oldServiceIsNotFound {
		logs.Info("Service not found. Creating a new one.")
		// Record the desired state in the last-applied annotation so future
		// reconciliations can diff against it.
		err = errors.Wrapf(patch.DefaultAnnotator.SetLastAppliedAnnotation(service), "set last applied annotation for service %s", service.Name)
		if err != nil {
			logs.Error(err, "Failed to set last applied annotation.")
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "SetLastAppliedAnnotation", "Failed to set last applied annotation for Service %s: %s", serviceNamespacedName, err)
			return
		}
		r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "CreateService", "Creating a new Service %s", serviceNamespacedName)
		err = r.Create(ctx, service)
		if err != nil {
			logs.Error(err, "Failed to create Service.")
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "CreateService", "Failed to create Service %s: %s", serviceNamespacedName, err)
			return
		}
		logs.Info("Service created.")
		r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "CreateService", "Created Service %s", serviceNamespacedName)
		modified = true
	} else {
		logs.Info("Service found.")
		var patchResult *patch.PatchResult
		patchResult, err = patch.DefaultPatchMaker.Calculate(oldService, service)
		if err != nil {
			logs.Error(err, "Failed to calculate patch.")
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "CalculatePatch", "Failed to calculate patch for Service %s: %s", serviceNamespacedName, err)
			return
		}
		if !patchResult.IsEmpty() {
			logs.Info("Service spec is different. Updating Service.")
			err = errors.Wrapf(patch.DefaultAnnotator.SetLastAppliedAnnotation(service), "set last applied annotation for service %s", service.Name)
			if err != nil {
				logs.Error(err, "Failed to set last applied annotation.")
				r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "SetLastAppliedAnnotation", "Failed to set last applied annotation for Service %s: %s", serviceNamespacedName, err)
				return
			}
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "UpdateService", "Updating Service %s", serviceNamespacedName)
			// Copy the desired labels/annotations/spec onto the live object
			// and update that, so server-populated metadata (such as
			// resourceVersion) is preserved for the update call.
			oldService.Annotations = service.Annotations
			oldService.Labels = service.Labels
			oldService.Spec = service.Spec
			err = r.Update(ctx, oldService)
			if err != nil {
				logs.Error(err, "Failed to update Service.")
				r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeWarning, "UpdateService", "Failed to update Service %s: %s", serviceNamespacedName, err)
				return
			}
			logs.Info("Service updated.")
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "UpdateService", "Updated Service %s", serviceNamespacedName)
			modified = true
		} else {
			logs = logs.WithValues("oldServiceSelector", oldService.Spec.Selector)
			logs.Info("Service spec is the same. Skipping update.")
			r.Recorder.Eventf(opt.compoundAINimDeployment, corev1.EventTypeNormal, "UpdateService", "Skipping update Service %s", serviceNamespacedName)
		}
	}
	return
}
// createOrUpdateVirtualService reconciles the Istio VirtualService exposing
// the deployment through the "istio-system/ingress-alb" gateway. Desired
// state follows Spec.Ingress: when ingress is enabled and UseVirtualService
// is true the VirtualService is created or updated, otherwise any existing
// one is deleted. The first return value reports whether a create, update,
// or delete was performed.
func (r *CompoundAINimDeploymentReconciler) createOrUpdateVirtualService(ctx context.Context, compoundAINimDeployment *v1alpha1.CompoundAINimDeployment) (bool, error) {
	log := log.FromContext(ctx)
	log.Info("Starting createOrUpdateVirtualService")
	// The optional host prefix only affects the external hostname; the
	// VirtualService resource itself keeps the deployment's name.
	// NOTE(review): confirm that asymmetry is intentional.
	vsName := compoundAINimDeployment.Name
	if compoundAINimDeployment.Spec.Ingress.HostPrefix != nil {
		vsName = *compoundAINimDeployment.Spec.Ingress.HostPrefix + vsName
	}
	vs := &networkingv1beta1.VirtualService{
		ObjectMeta: metav1.ObjectMeta{
			Name:      compoundAINimDeployment.Name,
			Namespace: compoundAINimDeployment.Namespace,
		},
		Spec: istioNetworking.VirtualService{
			Hosts: []string{
				fmt.Sprintf("%s.dev.aire.nvidia.com", vsName),
			},
			Gateways: []string{"istio-system/ingress-alb"},
			Http: []*istioNetworking.HTTPRoute{
				{
					Match: []*istioNetworking.HTTPMatchRequest{
						{
							Uri: &istioNetworking.StringMatch{
								MatchType: &istioNetworking.StringMatch_Prefix{Prefix: "/"},
							},
						},
					},
					Route: []*istioNetworking.HTTPRouteDestination{
						{
							Destination: &istioNetworking.Destination{
								Host: fmt.Sprintf("%s.yatai.svc.cluster.local", compoundAINimDeployment.Name),
								Port: &istioNetworking.PortSelector{
									Number: 3000,
								},
							},
						},
					},
				},
			},
		},
	}
	log.Info("VirtualService object constructed", "VirtualService", vs)
	oldVS := &networkingv1beta1.VirtualService{}
	err := r.Get(ctx, types.NamespacedName{Name: vs.Name, Namespace: vs.Namespace}, oldVS)
	if client.IgnoreNotFound(err) != nil {
		log.Error(err, "Failed to get VirtualService")
		return false, err
	}
	vsEnabled := compoundAINimDeployment.Spec.Ingress.Enabled && compoundAINimDeployment.Spec.Ingress.UseVirtualService != nil && *compoundAINimDeployment.Spec.Ingress.UseVirtualService
	if err != nil { // not found (every other Get error returned above)
		if vsEnabled {
			log.Info("VirtualService not found, creating new one")
			if err := r.Create(ctx, vs); err != nil {
				log.Error(err, "Failed to create VirtualService")
				return false, err
			}
			log.Info("VirtualService created successfully", "VirtualService", vs)
			return true, nil
		}
		// Nothing exists and nothing is wanted.
		return false, nil
	}
	if !vsEnabled {
		log.Info("VirtualService found, deleting", "OldVirtualService", oldVS)
		if err := r.Delete(ctx, oldVS); err != nil {
			log.Error(err, "Failed to delete VirtualService")
			return false, err
		}
		return true, nil
	}
	log.Info("VirtualService found, updating", "OldVirtualService", oldVS)
	// Carry the live object's resourceVersion over to the desired object: an
	// update submitted with an empty resourceVersion is rejected by the API
	// server.
	vs.ResourceVersion = oldVS.ResourceVersion
	if err := r.Update(ctx, vs); err != nil {
		log.Error(err, "Failed to update VirtualService")
		return false, err
	}
	log.Info("VirtualService updated successfully", "VirtualService", oldVS)
	return true, nil
}
// createOrUpdateIngressOption bundles the inputs for createOrUpdateIngresses.
type createOrUpdateIngressOption struct {
	// yataiClient may wrap a nil client when Yatai is not configured.
	yataiClient             **yataiclient.YataiClient
	compoundAINimDeployment *v1alpha1.CompoundAINimDeployment
	compoundAINim           *v1alpha1.CompoundAINim
}
// createOrUpdateIngresses reconciles the ingress-layer objects for a
// CompoundAINimDeployment. It first reconciles the Istio VirtualService via
// createOrUpdateVirtualService, then — unless ingress is disabled or a
// VirtualService is used instead of an Ingress — generates the desired
// Ingress objects and creates, updates, or deletes them to match.
//
// It returns whether any object was modified, so the caller can react to
// changes, and the first error encountered.
//
//nolint:nakedret
func (r *CompoundAINimDeploymentReconciler) createOrUpdateIngresses(ctx context.Context, opt createOrUpdateIngressOption) (modified bool, err error) {
	logs := log.FromContext(ctx)
	compoundAINimDeployment := opt.compoundAINimDeployment
	compoundAINim := opt.compoundAINim
	modified, err = r.createOrUpdateVirtualService(ctx, compoundAINimDeployment)
	if err != nil {
		return false, err
	}
	// generateIngresses generates an ingress and actively waits for the ingress to come online ....
	// so disabling it for now unless explicitly enabled
	if !opt.compoundAINimDeployment.Spec.Ingress.Enabled || (opt.compoundAINimDeployment.Spec.Ingress.UseVirtualService != nil && *opt.compoundAINimDeployment.Spec.Ingress.UseVirtualService) {
		// BUGFIX: propagate the VirtualService reconciliation result instead of
		// returning false unconditionally — previously a created/updated/deleted
		// VirtualService was hidden from the caller on this path.
		return modified, nil
	}
	ingresses, err := r.generateIngresses(ctx, generateIngressesOption{
		yataiClient:             opt.yataiClient,
		compoundAINimDeployment: compoundAINimDeployment,
		compoundAINim:           compoundAINim,
	})
	if err != nil {
		return
	}
	for _, ingress := range ingresses {
		logs := logs.WithValues("namespace", ingress.Namespace, "ingressName", ingress.Name)
		ingressNamespacedName := fmt.Sprintf("%s/%s", ingress.Namespace, ingress.Name)
		r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "GetIngress", "Getting Ingress %s", ingressNamespacedName)
		oldIngress := &networkingv1.Ingress{}
		err = r.Get(ctx, types.NamespacedName{Name: ingress.Name, Namespace: ingress.Namespace}, oldIngress)
		oldIngressIsNotFound := k8serrors.IsNotFound(err)
		if err != nil && !oldIngressIsNotFound {
			r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "GetIngress", "Failed to get Ingress %s: %s", ingressNamespacedName, err)
			logs.Error(err, "Failed to get Ingress.")
			return
		}
		err = nil
		if oldIngressIsNotFound {
			// No existing Ingress: create one only when ingress is enabled.
			if !compoundAINimDeployment.Spec.Ingress.Enabled {
				logs.Info("Ingress not enabled. Skipping.")
				r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "GetIngress", "Skipping Ingress %s", ingressNamespacedName)
				continue
			}
			logs.Info("Ingress not found. Creating a new one.")
			err = errors.Wrapf(patch.DefaultAnnotator.SetLastAppliedAnnotation(ingress), "set last applied annotation for ingress %s", ingress.Name)
			if err != nil {
				logs.Error(err, "Failed to set last applied annotation.")
				r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "SetLastAppliedAnnotation", "Failed to set last applied annotation for Ingress %s: %s", ingressNamespacedName, err)
				return
			}
			r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "CreateIngress", "Creating a new Ingress %s", ingressNamespacedName)
			err = r.Create(ctx, ingress)
			if err != nil {
				logs.Error(err, "Failed to create Ingress.")
				r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "CreateIngress", "Failed to create Ingress %s: %s", ingressNamespacedName, err)
				return
			}
			logs.Info("Ingress created.")
			r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "CreateIngress", "Created Ingress %s", ingressNamespacedName)
			modified = true
		} else {
			logs.Info("Ingress found.")
			// Existing Ingress but ingress disabled: delete it.
			if !compoundAINimDeployment.Spec.Ingress.Enabled {
				logs.Info("Ingress not enabled. Deleting.")
				r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "DeleteIngress", "Deleting Ingress %s", ingressNamespacedName)
				err = r.Delete(ctx, ingress)
				if err != nil {
					logs.Error(err, "Failed to delete Ingress.")
					r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "DeleteIngress", "Failed to delete Ingress %s: %s", ingressNamespacedName, err)
					return
				}
				logs.Info("Ingress deleted.")
				r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "DeleteIngress", "Deleted Ingress %s", ingressNamespacedName)
				modified = true
				continue
			}
			// Keep host unchanged
			ingress.Spec.Rules[0].Host = oldIngress.Spec.Rules[0].Host
			var patchResult *patch.PatchResult
			patchResult, err = patch.DefaultPatchMaker.Calculate(oldIngress, ingress)
			if err != nil {
				logs.Error(err, "Failed to calculate patch.")
				r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "CalculatePatch", "Failed to calculate patch for Ingress %s: %s", ingressNamespacedName, err)
				return
			}
			if !patchResult.IsEmpty() {
				logs.Info("Ingress spec is different. Updating Ingress.")
				err = errors.Wrapf(patch.DefaultAnnotator.SetLastAppliedAnnotation(ingress), "set last applied annotation for ingress %s", ingress.Name)
				if err != nil {
					logs.Error(err, "Failed to set last applied annotation.")
					r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "SetLastAppliedAnnotation", "Failed to set last applied annotation for Ingress %s: %s", ingressNamespacedName, err)
					return
				}
				r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "UpdateIngress", "Updating Ingress %s", ingressNamespacedName)
				err = r.Update(ctx, ingress)
				if err != nil {
					logs.Error(err, "Failed to update Ingress.")
					r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "UpdateIngress", "Failed to update Ingress %s: %s", ingressNamespacedName, err)
					return
				}
				logs.Info("Ingress updated.")
				r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "UpdateIngress", "Updated Ingress %s", ingressNamespacedName)
				modified = true
			} else {
				logs.Info("Ingress spec is the same. Skipping update.")
				r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "UpdateIngress", "Skipping update Ingress %s", ingressNamespacedName)
			}
		}
	}
	return
}
// getKubeName returns the Kubernetes object name derived from the
// deployment: the deployment name itself, or the "-d" suffixed variant for
// the debug deployment.
func (r *CompoundAINimDeploymentReconciler) getKubeName(compoundAINimDeployment *v1alpha1.CompoundAINimDeployment, _ *v1alpha1.CompoundAINim, debug bool) string {
	name := compoundAINimDeployment.Name
	if debug {
		name += "-d"
	}
	return name
}
// getServiceName returns the Service name for the deployment: "-d" suffix
// for the debug variant, "-p" suffix for the production variant.
func (r *CompoundAINimDeploymentReconciler) getServiceName(compoundAINimDeployment *v1alpha1.CompoundAINimDeployment, _ *v1alpha1.CompoundAINim, debug bool) string {
	suffix := "-p"
	if debug {
		suffix = "-d"
	}
	return compoundAINimDeployment.Name + suffix
}
// getGenericServiceName returns the variant-independent name for the
// deployment's generic Service (the non-debug kube name).
func (r *CompoundAINimDeploymentReconciler) getGenericServiceName(compoundAINimDeployment *v1alpha1.CompoundAINimDeployment, compoundAINim *v1alpha1.CompoundAINim) string {
	return r.getKubeName(compoundAINimDeployment, compoundAINim, false)
}
// getKubeLabels builds the common label set applied to Kubernetes objects
// created for the given deployment: the owning deployment name, the
// CompoundAINim repository and version (parsed from the image tag), the
// production target type, the creator marker, and the API-server component
// type.
func (r *CompoundAINimDeploymentReconciler) getKubeLabels(compoundAINimDeployment *v1alpha1.CompoundAINimDeployment, compoundAINim *v1alpha1.CompoundAINim) map[string]string {
	// Use the shared tag parser so labels stay consistent with
	// getKubeAnnotations, which parses the same tag.
	compoundAINimRepositoryName, compoundAINimVersion := getCompoundAINimRepositoryNameAndCompoundAINimVersion(compoundAINim)
	return map[string]string{
		commonconsts.KubeLabelYataiBentoDeployment:              compoundAINimDeployment.Name,
		commonconsts.KubeLabelBentoRepository:                   compoundAINimRepositoryName,
		commonconsts.KubeLabelBentoVersion:                      compoundAINimVersion,
		commonconsts.KubeLabelYataiBentoDeploymentTargetType:    DeploymentTargetTypeProduction,
		commonconsts.KubeLabelCreator:                           "yatai-deployment",
		commonconsts.KubeLabelYataiBentoDeploymentComponentType: commonconsts.YataiBentoDeploymentComponentApiServer,
	}
}
// getKubeAnnotations builds the annotation set applied to Kubernetes
// objects for the deployment: the CompoundAINim repository/version parsed
// from the image tag, merged with any annotations from
// Spec.ExtraPodMetadata (extra annotations win on key collisions).
func (r *CompoundAINimDeploymentReconciler) getKubeAnnotations(compoundAINimDeployment *v1alpha1.CompoundAINimDeployment, compoundAINim *v1alpha1.CompoundAINim) map[string]string {
	repositoryName, version := getCompoundAINimRepositoryNameAndCompoundAINimVersion(compoundAINim)
	result := map[string]string{
		commonconsts.KubeAnnotationBentoRepository: repositoryName,
		commonconsts.KubeAnnotationBentoVersion:    version,
	}
	if meta := compoundAINimDeployment.Spec.ExtraPodMetadata; meta != nil {
		for key, value := range meta.Annotations {
			result[key] = value
		}
	}
	return result
}
// generateDeploymentOption bundles the inputs required by generateDeployment.
type generateDeploymentOption struct {
	// compoundAINimDeployment is the deployment being reconciled.
	compoundAINimDeployment *v1alpha1.CompoundAINimDeployment
	// compoundAINim is the CompoundAINim referenced by the deployment.
	compoundAINim *v1alpha1.CompoundAINim
	// yataiClient is an optional Yatai API client (double pointer,
	// presumably so the referenced client can be swapped — TODO confirm).
	yataiClient **yataiclient.YataiClient
	// clusterName optionally overrides the default cluster name.
	clusterName *string
	// isStealingTrafficDebugModeEnabled selects the debug variant of the
	// generated deployment.
	isStealingTrafficDebugModeEnabled bool
	// containsStealingTrafficDebugModeEnabled indicates whether any target
	// of this deployment has the debug mode enabled.
	containsStealingTrafficDebugModeEnabled bool
}
// generateDeployment constructs the desired appsv1.Deployment for the given
// CompoundAINimDeployment. The pod template comes from
// generatePodTemplateSpec; the rollout strategy defaults to a 25%/25%
// rolling update and can be overridden via the
// KubeAnnotationDeploymentStrategy resource annotation. In
// traffic-stealing debug mode the deployment is pinned to one replica;
// otherwise replicas are left nil (managed by the HPA). The returned object
// has its controller reference set to the CompoundAINimDeployment.
//
//nolint:nakedret
func (r *CompoundAINimDeploymentReconciler) generateDeployment(ctx context.Context, opt generateDeploymentOption) (kubeDeployment *appsv1.Deployment, err error) {
	kubeNs := opt.compoundAINimDeployment.Namespace
	// nolint: gosimple
	podTemplateSpec, err := r.generatePodTemplateSpec(ctx, generatePodTemplateSpecOption(opt))
	if err != nil {
		return
	}
	labels := r.getKubeLabels(opt.compoundAINimDeployment, opt.compoundAINim)
	annotations := r.getKubeAnnotations(opt.compoundAINimDeployment, opt.compoundAINim)
	kubeName := r.getKubeName(opt.compoundAINimDeployment, opt.compoundAINim, opt.isStealingTrafficDebugModeEnabled)
	// Default strategy: rolling update with 25% surge / 25% unavailable.
	defaultMaxSurge := intstr.FromString("25%")
	defaultMaxUnavailable := intstr.FromString("25%")
	strategy := appsv1.DeploymentStrategy{
		Type: appsv1.RollingUpdateDeploymentStrategyType,
		RollingUpdate: &appsv1.RollingUpdateDeployment{
			MaxSurge:       &defaultMaxSurge,
			MaxUnavailable: &defaultMaxUnavailable,
		},
	}
	resourceAnnotations := getResourceAnnotations(opt.compoundAINimDeployment)
	strategyStr := resourceAnnotations[KubeAnnotationDeploymentStrategy]
	if strategyStr != "" {
		// An unrecognized annotation value silently keeps the default strategy.
		switch modelschemas.DeploymentStrategy(strategyStr) {
		case modelschemas.DeploymentStrategyRollingUpdate:
			strategy = appsv1.DeploymentStrategy{
				Type: appsv1.RollingUpdateDeploymentStrategyType,
				RollingUpdate: &appsv1.RollingUpdateDeployment{
					MaxSurge:       &defaultMaxSurge,
					MaxUnavailable: &defaultMaxUnavailable,
				},
			}
		case modelschemas.DeploymentStrategyRecreate:
			strategy = appsv1.DeploymentStrategy{
				Type: appsv1.RecreateDeploymentStrategyType,
			}
		case modelschemas.DeploymentStrategyRampedSlowRollout:
			// Roll out one pod at a time without ever dropping capacity.
			maxSurge := intstr.FromInt(1)
			maxUnavailable := intstr.FromInt(0)
			strategy = appsv1.DeploymentStrategy{
				Type: appsv1.RollingUpdateDeploymentStrategyType,
				RollingUpdate: &appsv1.RollingUpdateDeployment{
					MaxSurge:       &maxSurge,
					MaxUnavailable: &maxUnavailable,
				},
			}
		case modelschemas.DeploymentStrategyBestEffortControlledRollout:
			// No surge; tolerate up to 20% of pods unavailable during rollout.
			maxSurge := intstr.FromInt(0)
			maxUnavailable := intstr.FromString("20%")
			strategy = appsv1.DeploymentStrategy{
				Type: appsv1.RollingUpdateDeploymentStrategyType,
				RollingUpdate: &appsv1.RollingUpdateDeployment{
					MaxSurge:       &maxSurge,
					MaxUnavailable: &maxUnavailable,
				},
			}
		}
	}
	var replicas *int32
	if opt.isStealingTrafficDebugModeEnabled {
		// The debug deployment always runs exactly one replica.
		debugReplicas := int32(1)
		replicas = &debugReplicas
	}
	kubeDeployment = &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:        kubeName,
			Namespace:   kubeNs,
			Labels:      labels,
			Annotations: annotations,
		},
		Spec: appsv1.DeploymentSpec{
			Replicas: replicas,
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{
					commonconsts.KubeLabelYataiSelector: kubeName,
				},
			},
			Template: *podTemplateSpec,
			Strategy: strategy,
		},
	}
	err = ctrl.SetControllerReference(opt.compoundAINimDeployment, kubeDeployment, r.Scheme)
	if err != nil {
		err = errors.Wrapf(err, "set deployment %s controller reference", kubeDeployment.Name)
	}
	return
}
// generateHPA builds the desired HorizontalPodAutoscaler for the given
// deployment, targeting the production Deployment. Without an autoscaling
// spec, min and max replicas both default to 1; without configured metrics,
// a default CPU average-utilization target is applied. The controller
// reference is set to the CompoundAINimDeployment.
func (r *CompoundAINimDeploymentReconciler) generateHPA(compoundAINimDeployment *v1alpha1.CompoundAINimDeployment, compoundAINim *v1alpha1.CompoundAINim) (*autoscalingv2.HorizontalPodAutoscaler, error) {
	objectLabels := r.getKubeLabels(compoundAINimDeployment, compoundAINim)
	objectAnnotations := r.getKubeAnnotations(compoundAINimDeployment, compoundAINim)
	name := r.getKubeName(compoundAINimDeployment, compoundAINim, false)
	namespace := compoundAINimDeployment.Namespace

	conf := compoundAINimDeployment.Spec.Autoscaling
	if conf == nil {
		conf = &v1alpha1.Autoscaling{
			MinReplicas: 1,
			MaxReplicas: 1,
		}
	}

	metrics := conf.Metrics
	if len(metrics) == 0 {
		// No metrics configured: fall back to a CPU utilization target.
		defaultUtilization := int32(commonconsts.HPACPUDefaultAverageUtilization)
		metrics = []autoscalingv2.MetricSpec{
			{
				Type: autoscalingv2.ResourceMetricSourceType,
				Resource: &autoscalingv2.ResourceMetricSource{
					Name: corev1.ResourceCPU,
					Target: autoscalingv2.MetricTarget{
						Type:               autoscalingv2.UtilizationMetricType,
						AverageUtilization: &defaultUtilization,
					},
				},
			},
		}
	}

	minReplicas := int32(conf.MinReplicas)
	hpa := &autoscalingv2.HorizontalPodAutoscaler{
		ObjectMeta: metav1.ObjectMeta{
			Name:        name,
			Namespace:   namespace,
			Labels:      objectLabels,
			Annotations: objectAnnotations,
		},
		Spec: autoscalingv2.HorizontalPodAutoscalerSpec{
			MinReplicas: &minReplicas,
			MaxReplicas: int32(conf.MaxReplicas),
			ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{
				APIVersion: "apps/v1",
				Kind:       "Deployment",
				Name:       name,
			},
			Metrics: metrics,
		},
	}
	if err := ctrl.SetControllerReference(compoundAINimDeployment, hpa, r.Scheme); err != nil {
		return nil, errors.Wrapf(err, "set hpa %s controller reference", name)
	}
	return hpa, nil
}
// getHPA fetches the HorizontalPodAutoscaler with the same name/namespace
// as the given object and returns it as a client.Object, along with any
// Get error.
//
// NOTE(review): on success the status is deep-copied into a fresh
// HorizontalPodAutoscalerStatus via copier.Copy and assigned back, which
// appears to be a no-op round-trip — likely a leftover from a legacy
// autoscaling API version conversion. Confirm whether it can be removed.
func (r *CompoundAINimDeploymentReconciler) getHPA(ctx context.Context, hpa *autoscalingv2.HorizontalPodAutoscaler) (client.Object, error) {
	name, ns := hpa.Name, hpa.Namespace
	obj := &autoscalingv2.HorizontalPodAutoscaler{}
	err := r.Get(ctx, types.NamespacedName{Name: name, Namespace: ns}, obj)
	if err == nil {
		legacyStatus := &autoscalingv2.HorizontalPodAutoscalerStatus{}
		if err := copier.Copy(legacyStatus, obj.Status); err != nil {
			return nil, err
		}
		obj.Status = *legacyStatus
	}
	return obj, err
}
// getCompoundAINimRepositoryNameAndCompoundAINimVersion splits the
// CompoundAINim tag of the form "repository:version" into its repository
// name and version parts. If the tag has no ":" separator, the version
// part is empty.
func getCompoundAINimRepositoryNameAndCompoundAINimVersion(compoundAINim *v1alpha1.CompoundAINim) (repositoryName string, version string) {
	name, _, ver := xstrings.Partition(compoundAINim.Spec.Tag, ":")
	return name, ver
}
// generatePodTemplateSpecOption bundles the inputs required by
// generatePodTemplateSpec. It mirrors generateDeploymentOption field-for-field
// so the two can be converted directly.
type generatePodTemplateSpecOption struct {
	// compoundAINimDeployment is the deployment being reconciled.
	compoundAINimDeployment *v1alpha1.CompoundAINimDeployment
	// compoundAINim is the CompoundAINim referenced by the deployment.
	compoundAINim *v1alpha1.CompoundAINim
	// yataiClient is an optional Yatai API client (double pointer,
	// presumably so the referenced client can be swapped — TODO confirm).
	yataiClient **yataiclient.YataiClient
	// clusterName optionally overrides the default cluster name.
	clusterName *string
	// isStealingTrafficDebugModeEnabled selects the debug variant of the
	// generated pod template.
	isStealingTrafficDebugModeEnabled bool
	// containsStealingTrafficDebugModeEnabled indicates whether any target
	// of this deployment has the debug mode enabled.
	containsStealingTrafficDebugModeEnabled bool
}
// generatePodTemplateSpec assembles the pod template for a
// CompoundAINimDeployment. The pod always contains the main serving
// container, a metrics-transformer sidecar, and an envoy proxy sidecar
// (whose configuration ConfigMap is created/updated here as a side effect);
// optionally it adds a monitor-exporter sidecar and a debugger container.
// Environment variables from the spec are merged with controller defaults,
// probes and resources are resolved, and extra pod metadata/spec from the
// deployment are applied last.
//
//nolint:gocyclo,nakedret
func (r *CompoundAINimDeploymentReconciler) generatePodTemplateSpec(ctx context.Context, opt generatePodTemplateSpecOption) (podTemplateSpec *corev1.PodTemplateSpec, err error) {
	compoundAINimRepositoryName, _ := getCompoundAINimRepositoryNameAndCompoundAINimVersion(opt.compoundAINim)
	podLabels := r.getKubeLabels(opt.compoundAINimDeployment, opt.compoundAINim)
	if opt.isStealingTrafficDebugModeEnabled {
		podLabels[commonconsts.KubeLabelYataiBentoDeploymentTargetType] = DeploymentTargetTypeDebug
	}
	podAnnotations := r.getKubeAnnotations(opt.compoundAINimDeployment, opt.compoundAINim)
	kubeName := r.getKubeName(opt.compoundAINimDeployment, opt.compoundAINim, opt.isStealingTrafficDebugModeEnabled)
	// Port allocation: the main container listens on the Bento service port;
	// each sidecar port is handed out by incrementing lastPort.
	containerPort := commonconsts.BentoServicePort
	lastPort := containerPort + 1
	monitorExporter := opt.compoundAINimDeployment.Spec.MonitorExporter
	needMonitorContainer := monitorExporter != nil && monitorExporter.Enabled
	lastPort++
	monitorExporterPort := lastPort
	var envs []corev1.EnvVar
	envsSeen := make(map[string]struct{})
	resourceAnnotations := opt.compoundAINimDeployment.Spec.Annotations
	specEnvs := opt.compoundAINimDeployment.Spec.Envs
	if resourceAnnotations == nil {
		resourceAnnotations = make(map[string]string)
	}
	isDebugModeEnabled := checkIfIsDebugModeEnabled(resourceAnnotations)
	// Copy user-supplied env vars, de-duplicating by name. A user-supplied
	// service-port env overrides the default container port.
	if specEnvs != nil {
		envs = make([]corev1.EnvVar, 0, len(specEnvs)+1)
		for _, env := range specEnvs {
			if _, ok := envsSeen[env.Name]; ok {
				continue
			}
			if env.Name == commonconsts.EnvBentoServicePort {
				// nolint: gosec
				containerPort, err = strconv.Atoi(env.Value)
				if err != nil {
					return nil, errors.Wrapf(err, "invalid port value %s", env.Value)
				}
			}
			envsSeen[env.Name] = struct{}{}
			envs = append(envs, corev1.EnvVar{
				Name:  env.Name,
				Value: env.Value,
			})
		}
	}
	// Controller-provided defaults; user-supplied envs of the same name win.
	defaultEnvs := []corev1.EnvVar{
		{
			Name:  commonconsts.EnvBentoServicePort,
			Value: fmt.Sprintf("%d", containerPort),
		},
		{
			Name:  commonconsts.EnvYataiDeploymentUID,
			Value: string(opt.compoundAINimDeployment.UID),
		},
		{
			Name:  commonconsts.EnvYataiBentoDeploymentName,
			Value: opt.compoundAINimDeployment.Name,
		},
		{
			Name:  commonconsts.EnvYataiBentoDeploymentNamespace,
			Value: opt.compoundAINimDeployment.Namespace,
		},
	}
	if r.NatsAddr != "" {
		defaultEnvs = append(defaultEnvs, corev1.EnvVar{
			Name:  "NATS_SERVER",
			Value: r.NatsAddr,
		})
	}
	if r.EtcdAddr != "" {
		defaultEnvs = append(defaultEnvs, corev1.EnvVar{
			Name:  "ETCD_ENDPOINTS",
			Value: r.EtcdAddr,
		})
	}
	// When a Yatai client is available, expose the Yatai version and cluster
	// UID to the pod as well.
	if opt.yataiClient != nil {
		yataiClient := *opt.yataiClient
		var cluster *schemasv1.ClusterFullSchema
		clusterName := DefaultClusterName
		if opt.clusterName != nil {
			clusterName = *opt.clusterName
		}
		cluster, err = yataiClient.GetCluster(ctx, clusterName)
		if err != nil {
			return
		}
		var version *schemasv1.VersionSchema
		version, err = yataiClient.GetVersion(ctx)
		if err != nil {
			return
		}
		defaultEnvs = append(defaultEnvs, []corev1.EnvVar{
			{
				Name:  commonconsts.EnvYataiVersion,
				Value: fmt.Sprintf("%s-%s", version.Version, version.GitCommit),
			},
			{
				Name:  commonconsts.EnvYataiClusterUID,
				Value: cluster.Uid,
			},
		}...)
	}
	for _, env := range defaultEnvs {
		if _, ok := envsSeen[env.Name]; !ok {
			envs = append(envs, env)
		}
	}
	// Inject BentoML monitoring configuration pointing at the local
	// monitor-exporter sidecar, unless the user already configured monitoring.
	if needMonitorContainer {
		monitoringConfigTemplate := `monitoring.enabled=true
monitoring.type=otlp
monitoring.options.endpoint=http://127.0.0.1:%d
monitoring.options.insecure=true`
		var bentomlOptions string
		index := -1
		for i, env := range envs {
			if env.Name == "BENTOML_CONFIG_OPTIONS" {
				bentomlOptions = env.Value
				index = i
				break
			}
		}
		if index == -1 {
			// BENTOML_CONFIG_OPTIONS not defined
			bentomlOptions = fmt.Sprintf(monitoringConfigTemplate, monitorExporterPort)
			envs = append(envs, corev1.EnvVar{
				Name:  "BENTOML_CONFIG_OPTIONS",
				Value: bentomlOptions,
			})
		} else if !strings.Contains(bentomlOptions, "monitoring") {
			// monitoring config not defined
			envs = append(envs[:index], envs[index+1:]...)
			bentomlOptions = strings.TrimSpace(bentomlOptions) // ' ' -> ''
			if bentomlOptions != "" {
				bentomlOptions += "\n"
			}
			bentomlOptions += fmt.Sprintf(monitoringConfigTemplate, monitorExporterPort)
			envs = append(envs, corev1.EnvVar{
				Name:  "BENTOML_CONFIG_OPTIONS",
				Value: bentomlOptions,
			})
		}
		// otherwise the monitoring config is already defined — do nothing
	}
	// Probes: defaults below are used unless the spec overrides them.
	livenessProbe := &corev1.Probe{
		InitialDelaySeconds: 10,
		TimeoutSeconds:      20,
		FailureThreshold:    6,
		ProbeHandler: corev1.ProbeHandler{
			HTTPGet: &corev1.HTTPGetAction{
				Path: "/livez",
				Port: intstr.FromString(commonconsts.BentoContainerPortName),
			},
		},
	}
	if opt.compoundAINimDeployment.Spec.LivenessProbe != nil {
		livenessProbe = opt.compoundAINimDeployment.Spec.LivenessProbe
	}
	readinessProbe := &corev1.Probe{
		InitialDelaySeconds: 5,
		TimeoutSeconds:      5,
		FailureThreshold:    12,
		ProbeHandler: corev1.ProbeHandler{
			HTTPGet: &corev1.HTTPGetAction{
				Path: "/readyz",
				Port: intstr.FromString(commonconsts.BentoContainerPortName),
			},
		},
	}
	if opt.compoundAINimDeployment.Spec.ReadinessProbe != nil {
		readinessProbe = opt.compoundAINimDeployment.Spec.ReadinessProbe
	}
	volumes := make([]corev1.Volume, 0)
	volumeMounts := make([]corev1.VolumeMount, 0)
	// Build the main container command line ("uv run compoundai start ...").
	args := make([]string, 0)
	args = append(args, "uv", "run", "compoundai", "start")
	if opt.compoundAINimDeployment.Spec.ServiceName != "" {
		args = append(args, []string{"--service-name", opt.compoundAINimDeployment.Spec.ServiceName}...)
	}
	// Wire external service dependencies as --depends flags, in sorted key
	// order for deterministic command lines.
	if len(opt.compoundAINimDeployment.Spec.ExternalServices) > 0 {
		serviceSuffix := fmt.Sprintf("%s.svc.cluster.local:3000", opt.compoundAINimDeployment.Namespace)
		keys := make([]string, 0, len(opt.compoundAINimDeployment.Spec.ExternalServices))
		for key := range opt.compoundAINimDeployment.Spec.ExternalServices {
			keys = append(keys, key)
		}
		sort.Strings(keys)
		for _, key := range keys {
			service := opt.compoundAINimDeployment.Spec.ExternalServices[key]
			// Only the "name" and "nova" selector keys are supported.
			if service.DeploymentSelectorKey == "name" {
				dependsFlag := fmt.Sprintf("--depends \"%s=http://%s.%s\"", key, service.DeploymentSelectorValue, serviceSuffix)
				args = append(args, dependsFlag)
			} else if service.DeploymentSelectorKey == "nova" {
				dependsFlag := fmt.Sprintf("--depends \"%s=nova://%s\"", key, service.DeploymentSelectorValue)
				args = append(args, dependsFlag)
			} else {
				return nil, errors.Errorf("DeploymentSelectorKey '%s' not supported. Only 'name' and 'nova' are supported", service.DeploymentSelectorKey)
			}
		}
	}
	yataiResources := opt.compoundAINimDeployment.Spec.Resources
	resources, err := getResourcesConfig(yataiResources)
	if err != nil {
		err = errors.Wrap(err, "failed to get resources config")
		return nil, err
	}
	// /dev/shm size: half the memory limit when one is set, else 64Mi.
	sharedMemorySizeLimit := resource.MustParse("64Mi")
	memoryLimit := resources.Limits[corev1.ResourceMemory]
	if !memoryLimit.IsZero() {
		sharedMemorySizeLimit.SetMilli(memoryLimit.MilliValue() / 2)
	}
	volumes = append(volumes, corev1.Volume{
		Name: KubeValueNameSharedMemory,
		VolumeSource: corev1.VolumeSource{
			EmptyDir: &corev1.EmptyDirVolumeSource{
				Medium:    corev1.StorageMediumMemory,
				SizeLimit: &sharedMemorySizeLimit,
			},
		},
	})
	volumeMounts = append(volumeMounts, corev1.VolumeMount{
		Name:      KubeValueNameSharedMemory,
		MountPath: "/dev/shm",
	})
	// Optional PVC mount requested in the spec.
	if opt.compoundAINimDeployment.Spec.PVC != nil {
		volumes = append(volumes, corev1.Volume{
			Name: getPvcName(opt.compoundAINimDeployment, opt.compoundAINimDeployment.Spec.PVC.Name),
			VolumeSource: corev1.VolumeSource{
				PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
					ClaimName: getPvcName(opt.compoundAINimDeployment, opt.compoundAINimDeployment.Spec.PVC.Name),
				},
			},
		})
		volumeMounts = append(volumeMounts, corev1.VolumeMount{
			Name:      getPvcName(opt.compoundAINimDeployment, opt.compoundAINimDeployment.Spec.PVC.Name),
			MountPath: *opt.compoundAINimDeployment.Spec.PVC.MountPoint,
		})
	}
	imageName := opt.compoundAINim.Spec.Image
	// Restricted security contexts are opt-in via environment variable; the
	// main container runs as a different non-root UID than the sidecars.
	var securityContext *corev1.SecurityContext
	var mainContainerSecurityContext *corev1.SecurityContext
	enableRestrictedSecurityContext := os.Getenv("ENABLE_RESTRICTED_SECURITY_CONTEXT") == "true"
	if enableRestrictedSecurityContext {
		securityContext = &corev1.SecurityContext{
			AllowPrivilegeEscalation: ptr.To(false),
			RunAsNonRoot:             ptr.To(true),
			RunAsUser:                ptr.To(int64(1000)),
			RunAsGroup:               ptr.To(int64(1000)),
			SeccompProfile: &corev1.SeccompProfile{
				Type: corev1.SeccompProfileTypeRuntimeDefault,
			},
			Capabilities: &corev1.Capabilities{
				Drop: []corev1.Capability{"ALL"},
			},
		}
		mainContainerSecurityContext = securityContext.DeepCopy()
		mainContainerSecurityContext.RunAsUser = ptr.To(int64(1034))
	}
	containers := make([]corev1.Container, 0, 2)
	// TODO: Temporarily disabling probes
	container := corev1.Container{
		Name:           "main",
		Image:          imageName,
		Command:        []string{"sh", "-c"},
		Args:           []string{strings.Join(args, " ")},
		LivenessProbe:  livenessProbe,
		ReadinessProbe: readinessProbe,
		Resources:      resources,
		Env:            envs,
		TTY:            true,
		Stdin:          true,
		VolumeMounts:   volumeMounts,
		Ports: []corev1.ContainerPort{
			{
				Protocol:      corev1.ProtocolTCP,
				Name:          commonconsts.BentoContainerPortName,
				ContainerPort: int32(containerPort), // nolint: gosec
			},
		},
		SecurityContext: mainContainerSecurityContext,
	}
	if opt.compoundAINimDeployment.Spec.EnvFromSecret != nil {
		container.EnvFrom = []corev1.EnvFromSource{
			{
				SecretRef: &corev1.SecretEnvSource{
					LocalObjectReference: corev1.LocalObjectReference{
						Name: *opt.compoundAINimDeployment.Spec.EnvFromSecret,
					},
				},
			},
		}
	}
	// Annotation-driven escape hatches for the main container's security
	// context (privileged / ptrace / run-as-root).
	if resourceAnnotations["yatai.ai/enable-container-privileged"] == commonconsts.KubeLabelValueTrue {
		if container.SecurityContext == nil {
			container.SecurityContext = &corev1.SecurityContext{}
		}
		container.SecurityContext.Privileged = &[]bool{true}[0]
	}
	if resourceAnnotations["yatai.ai/enable-container-ptrace"] == commonconsts.KubeLabelValueTrue {
		if container.SecurityContext == nil {
			container.SecurityContext = &corev1.SecurityContext{}
		}
		container.SecurityContext.Capabilities = &corev1.Capabilities{
			Add: []corev1.Capability{"SYS_PTRACE"},
		}
	}
	if resourceAnnotations["yatai.ai/run-container-as-root"] == commonconsts.KubeLabelValueTrue {
		if container.SecurityContext == nil {
			container.SecurityContext = &corev1.SecurityContext{}
		}
		container.SecurityContext.RunAsUser = &[]int64{0}[0]
	}
	containers = append(containers, container)
	// Sidecar: metrics-transformer re-exports BentoML metrics under a
	// normalized prefix.
	lastPort++
	metricsPort := lastPort
	containers = append(containers, corev1.Container{
		Name:  "metrics-transformer",
		Image: commonconfig.GetInternalImages().MetricsTransformer,
		Resources: corev1.ResourceRequirements{
			Requests: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("10m"),
				corev1.ResourceMemory: resource.MustParse("10Mi"),
			},
			Limits: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("100m"),
				corev1.ResourceMemory: resource.MustParse("100Mi"),
			},
		},
		ReadinessProbe: &corev1.Probe{
			InitialDelaySeconds: 5,
			TimeoutSeconds:      5,
			FailureThreshold:    10,
			ProbeHandler: corev1.ProbeHandler{
				HTTPGet: &corev1.HTTPGetAction{
					Path: "/healthz",
					Port: intstr.FromString("metrics"),
				},
			},
		},
		LivenessProbe: &corev1.Probe{
			InitialDelaySeconds: 5,
			TimeoutSeconds:      5,
			FailureThreshold:    10,
			ProbeHandler: corev1.ProbeHandler{
				HTTPGet: &corev1.HTTPGetAction{
					Path: "/healthz",
					Port: intstr.FromString("metrics"),
				},
			},
		},
		Env: []corev1.EnvVar{
			{
				Name:  "BENTOML_SERVER_HOST",
				Value: "localhost",
			},
			{
				Name:  "BENTOML_SERVER_PORT",
				Value: fmt.Sprintf("%d", containerPort),
			},
			{
				Name:  "PORT",
				Value: fmt.Sprintf("%d", metricsPort),
			},
			{
				Name:  "OLD_METRICS_PREFIX",
				Value: fmt.Sprintf("BENTOML_%s_", strings.ReplaceAll(compoundAINimRepositoryName, "-", ":")),
			},
			{
				Name:  "NEW_METRICS_PREFIX",
				Value: "BENTOML_",
			},
		},
		Ports: []corev1.ContainerPort{
			{
				Protocol:      corev1.ProtocolTCP,
				Name:          "metrics",
				ContainerPort: int32(metricsPort),
			},
		},
		SecurityContext: securityContext,
	})
	// Sidecar: envoy proxy routing debug-header traffic between the debug
	// and production variants. Its resources are tunable via annotations.
	lastPort++
	proxyPort := lastPort
	proxyResourcesRequestsCPUStr := resourceAnnotations[KubeAnnotationYataiProxySidecarResourcesRequestsCPU]
	if proxyResourcesRequestsCPUStr == "" {
		proxyResourcesRequestsCPUStr = "100m"
	}
	var proxyResourcesRequestsCPU resource.Quantity
	proxyResourcesRequestsCPU, err = resource.ParseQuantity(proxyResourcesRequestsCPUStr)
	if err != nil {
		err = errors.Wrapf(err, "failed to parse proxy sidecar resources requests cpu: %s", proxyResourcesRequestsCPUStr)
		return nil, err
	}
	proxyResourcesRequestsMemoryStr := resourceAnnotations[KubeAnnotationYataiProxySidecarResourcesRequestsMemory]
	if proxyResourcesRequestsMemoryStr == "" {
		proxyResourcesRequestsMemoryStr = "200Mi"
	}
	var proxyResourcesRequestsMemory resource.Quantity
	proxyResourcesRequestsMemory, err = resource.ParseQuantity(proxyResourcesRequestsMemoryStr)
	if err != nil {
		err = errors.Wrapf(err, "failed to parse proxy sidecar resources requests memory: %s", proxyResourcesRequestsMemoryStr)
		return nil, err
	}
	proxyResourcesLimitsCPUStr := resourceAnnotations[KubeAnnotationYataiProxySidecarResourcesLimitsCPU]
	if proxyResourcesLimitsCPUStr == "" {
		proxyResourcesLimitsCPUStr = "300m"
	}
	var proxyResourcesLimitsCPU resource.Quantity
	proxyResourcesLimitsCPU, err = resource.ParseQuantity(proxyResourcesLimitsCPUStr)
	if err != nil {
		err = errors.Wrapf(err, "failed to parse proxy sidecar resources limits cpu: %s", proxyResourcesLimitsCPUStr)
		return nil, err
	}
	proxyResourcesLimitsMemoryStr := resourceAnnotations[KubeAnnotationYataiProxySidecarResourcesLimitsMemory]
	if proxyResourcesLimitsMemoryStr == "" {
		proxyResourcesLimitsMemoryStr = "1000Mi"
	}
	var proxyResourcesLimitsMemory resource.Quantity
	proxyResourcesLimitsMemory, err = resource.ParseQuantity(proxyResourcesLimitsMemoryStr)
	if err != nil {
		err = errors.Wrapf(err, "failed to parse proxy sidecar resources limits memory: %s", proxyResourcesLimitsMemoryStr)
		return nil, err
	}
	// In traffic-stealing debug mode this pod serves debug traffic locally
	// and forwards production traffic to the production service; otherwise
	// the roles are reversed.
	var envoyConfigContent string
	if opt.isStealingTrafficDebugModeEnabled {
		productionServiceName := r.getServiceName(opt.compoundAINimDeployment, opt.compoundAINim, false)
		envoyConfigContent, err = envoy.GenerateEnvoyConfigurationContent(envoy.CreateEnvoyConfig{
			ListenPort:              proxyPort,
			DebugHeaderName:         HeaderNameDebug,
			DebugHeaderValue:        commonconsts.KubeLabelValueTrue,
			DebugServerAddress:      "localhost",
			DebugServerPort:         containerPort,
			ProductionServerAddress: fmt.Sprintf("%s.%s.svc.cluster.local", productionServiceName, opt.compoundAINimDeployment.Namespace),
			ProductionServerPort:    ServicePortHTTPNonProxy,
		})
	} else {
		debugServiceName := r.getServiceName(opt.compoundAINimDeployment, opt.compoundAINim, true)
		envoyConfigContent, err = envoy.GenerateEnvoyConfigurationContent(envoy.CreateEnvoyConfig{
			ListenPort:              proxyPort,
			DebugHeaderName:         HeaderNameDebug,
			DebugHeaderValue:        commonconsts.KubeLabelValueTrue,
			DebugServerAddress:      fmt.Sprintf("%s.%s.svc.cluster.local", debugServiceName, opt.compoundAINimDeployment.Namespace),
			DebugServerPort:         ServicePortHTTPNonProxy,
			ProductionServerAddress: "localhost",
			ProductionServerPort:    containerPort,
		})
	}
	if err != nil {
		err = errors.Wrapf(err, "failed to generate envoy configuration content")
		return nil, err
	}
	// Persist the envoy config in a ConfigMap owned by the deployment.
	envoyConfigConfigMapName := fmt.Sprintf("%s-envoy-config", kubeName)
	envoyConfigConfigMap := &corev1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{
			Name:      envoyConfigConfigMapName,
			Namespace: opt.compoundAINimDeployment.Namespace,
		},
		Data: map[string]string{
			"envoy.yaml": envoyConfigContent,
		},
	}
	err = ctrl.SetControllerReference(opt.compoundAINimDeployment, envoyConfigConfigMap, r.Scheme)
	if err != nil {
		err = errors.Wrapf(err, "failed to set controller reference for envoy config config map")
		return nil, err
	}
	_, err = ctrl.CreateOrUpdate(ctx, r.Client, envoyConfigConfigMap, func() error {
		envoyConfigConfigMap.Data["envoy.yaml"] = envoyConfigContent
		return nil
	})
	if err != nil {
		err = errors.Wrapf(err, "failed to create or update envoy config configmap")
		return nil, err
	}
	volumes = append(volumes, corev1.Volume{
		Name: "envoy-config",
		VolumeSource: corev1.VolumeSource{
			ConfigMap: &corev1.ConfigMapVolumeSource{
				LocalObjectReference: corev1.LocalObjectReference{
					Name: envoyConfigConfigMapName,
				},
			},
		},
	})
	proxyImage := "quay.io/bentoml/bentoml-proxy:0.0.1"
	proxyImage_ := os.Getenv("INTERNAL_IMAGES_PROXY")
	if proxyImage_ != "" {
		proxyImage = proxyImage_
	}
	containers = append(containers, corev1.Container{
		Name:  "proxy",
		Image: proxyImage,
		Command: []string{
			"envoy",
			"--config-path",
			"/etc/envoy/envoy.yaml",
		},
		VolumeMounts: []corev1.VolumeMount{
			{
				Name:      "envoy-config",
				MountPath: "/etc/envoy",
			},
		},
		Ports: []corev1.ContainerPort{
			{
				Name:          ContainerPortNameHTTPProxy,
				ContainerPort: int32(proxyPort),
				Protocol:      corev1.ProtocolTCP,
			},
		},
		// Probes query envoy's admin endpoint for the LIVE server state.
		ReadinessProbe: &corev1.Probe{
			InitialDelaySeconds: 5,
			TimeoutSeconds:      5,
			FailureThreshold:    10,
			ProbeHandler: corev1.ProbeHandler{
				Exec: &corev1.ExecAction{
					Command: []string{
						"sh",
						"-c",
						"curl -s localhost:9901/server_info | grep state | grep -q LIVE",
					},
				},
			},
		},
		LivenessProbe: &corev1.Probe{
			InitialDelaySeconds: 5,
			TimeoutSeconds:      5,
			FailureThreshold:    10,
			ProbeHandler: corev1.ProbeHandler{
				Exec: &corev1.ExecAction{
					Command: []string{
						"sh",
						"-c",
						"curl -s localhost:9901/server_info | grep state | grep -q LIVE",
					},
				},
			},
		},
		Resources: corev1.ResourceRequirements{
			Requests: corev1.ResourceList{
				corev1.ResourceCPU:    proxyResourcesRequestsCPU,
				corev1.ResourceMemory: proxyResourcesRequestsMemory,
			},
			Limits: corev1.ResourceList{
				corev1.ResourceCPU:    proxyResourcesLimitsCPU,
				corev1.ResourceMemory: proxyResourcesLimitsMemory,
			},
		},
		SecurityContext: securityContext,
	})
	// Optional sidecar: monitor-exporter (fluent-bit based) shipping
	// monitoring data to the configured output.
	if needMonitorContainer {
		lastPort++
		monitorExporterProbePort := lastPort
		monitorExporterImage := "quay.io/bentoml/bentoml-monitor-exporter:0.0.3"
		monitorExporterImage_ := os.Getenv("INTERNAL_IMAGES_MONITOR_EXPORTER")
		if monitorExporterImage_ != "" {
			monitorExporterImage = monitorExporterImage_
		}
		// Structured options take precedence over plain options with the
		// same (case-insensitive) name.
		monitorOptEnvs := make([]corev1.EnvVar, 0, len(monitorExporter.Options)+len(monitorExporter.StructureOptions))
		monitorOptEnvsSeen := make(map[string]struct{})
		for _, env := range monitorExporter.StructureOptions {
			monitorOptEnvsSeen[strings.ToLower(env.Name)] = struct{}{}
			monitorOptEnvs = append(monitorOptEnvs, corev1.EnvVar{
				Name:      "FLUENTBIT_OUTPUT_OPTION_" + strings.ToUpper(env.Name),
				Value:     env.Value,
				ValueFrom: env.ValueFrom,
			})
		}
		for k, v := range monitorExporter.Options {
			if _, exists := monitorOptEnvsSeen[strings.ToLower(k)]; exists {
				continue
			}
			monitorOptEnvs = append(monitorOptEnvs, corev1.EnvVar{
				Name:  "FLUENTBIT_OUTPUT_OPTION_" + strings.ToUpper(k),
				Value: v,
			})
		}
		monitorVolumeMounts := make([]corev1.VolumeMount, 0, len(monitorExporter.Mounts))
		for idx, mount := range monitorExporter.Mounts {
			volumeName := fmt.Sprintf("monitor-exporter-%d", idx)
			volumes = append(volumes, corev1.Volume{
				Name:         volumeName,
				VolumeSource: mount.VolumeSource,
			})
			monitorVolumeMounts = append(monitorVolumeMounts, corev1.VolumeMount{
				Name:      volumeName,
				MountPath: mount.Path,
				ReadOnly:  mount.ReadOnly,
			})
		}
		containers = append(containers, corev1.Container{
			Name:         "monitor-exporter",
			Image:        monitorExporterImage,
			VolumeMounts: monitorVolumeMounts,
			Env: append([]corev1.EnvVar{
				{
					Name:  "FLUENTBIT_OTLP_PORT",
					Value: fmt.Sprint(monitorExporterPort),
				},
				{
					Name:  "FLUENTBIT_HTTP_PORT",
					Value: fmt.Sprint(monitorExporterProbePort),
				},
				{
					Name:  "FLUENTBIT_OUTPUT",
					Value: monitorExporter.Output,
				},
			}, monitorOptEnvs...),
			Resources: corev1.ResourceRequirements{
				Requests: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("100m"),
					corev1.ResourceMemory: resource.MustParse("24Mi"),
				},
				Limits: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("1000m"),
					corev1.ResourceMemory: resource.MustParse("72Mi"),
				},
			},
			ReadinessProbe: &corev1.Probe{
				InitialDelaySeconds: 5,
				TimeoutSeconds:      5,
				FailureThreshold:    10,
				ProbeHandler: corev1.ProbeHandler{
					HTTPGet: &corev1.HTTPGetAction{
						Path: "/readyz",
						Port: intstr.FromInt(monitorExporterProbePort),
					},
				},
			},
			LivenessProbe: &corev1.Probe{
				InitialDelaySeconds: 5,
				TimeoutSeconds:      5,
				FailureThreshold:    10,
				ProbeHandler: corev1.ProbeHandler{
					HTTPGet: &corev1.HTTPGetAction{
						Path: "/livez",
						Port: intstr.FromInt(monitorExporterProbePort),
					},
				},
			},
			SecurityContext: securityContext,
		})
	}
	// Optional container: an idle debugger with ptrace capability, added
	// only when a debug mode is active.
	debuggerImage := "quay.io/bentoml/bento-debugger:0.0.8"
	debuggerImage_ := os.Getenv("INTERNAL_IMAGES_DEBUGGER")
	if debuggerImage_ != "" {
		debuggerImage = debuggerImage_
	}
	if opt.isStealingTrafficDebugModeEnabled || isDebugModeEnabled {
		containers = append(containers, corev1.Container{
			Name:  "debugger",
			Image: debuggerImage,
			Command: []string{
				"sleep",
				"infinity",
			},
			SecurityContext: &corev1.SecurityContext{
				Capabilities: &corev1.Capabilities{
					Add: []corev1.Capability{"SYS_PTRACE"},
				},
			},
			Resources: corev1.ResourceRequirements{
				Requests: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("100m"),
					corev1.ResourceMemory: resource.MustParse("100Mi"),
				},
				Limits: corev1.ResourceList{
					corev1.ResourceCPU:    resource.MustParse("1000m"),
					corev1.ResourceMemory: resource.MustParse("1000Mi"),
				},
			},
			Stdin: true,
			TTY:   true,
		})
	}
	podLabels[commonconsts.KubeLabelYataiSelector] = kubeName
	podSpec := corev1.PodSpec{
		Containers: containers,
		Volumes:    volumes,
	}
	podSpec.ImagePullSecrets = opt.compoundAINim.Spec.ImagePullSecrets
	// Extra pod metadata from the spec overrides generated labels/annotations.
	extraPodMetadata := opt.compoundAINimDeployment.Spec.ExtraPodMetadata
	if extraPodMetadata != nil {
		for k, v := range extraPodMetadata.Annotations {
			podAnnotations[k] = v
		}
		for k, v := range extraPodMetadata.Labels {
			podLabels[k] = v
		}
	}
	extraPodSpec := opt.compoundAINimDeployment.Spec.ExtraPodSpec
	if extraPodSpec != nil {
		podSpec.SchedulerName = extraPodSpec.SchedulerName
		podSpec.NodeSelector = extraPodSpec.NodeSelector
		podSpec.Affinity = extraPodSpec.Affinity
		podSpec.Tolerations = extraPodSpec.Tolerations
		podSpec.TopologySpreadConstraints = extraPodSpec.TopologySpreadConstraints
		podSpec.Containers = append(podSpec.Containers, extraPodSpec.Containers...)
		podSpec.ServiceAccountName = extraPodSpec.ServiceAccountName
	}
	// Without an explicit service account, pick the first labeled one in the
	// namespace, falling back to the default.
	if podSpec.ServiceAccountName == "" {
		serviceAccounts := &corev1.ServiceAccountList{}
		err = r.List(ctx, serviceAccounts, client.InNamespace(opt.compoundAINimDeployment.Namespace), client.MatchingLabels{
			commonconsts.KubeLabelBentoDeploymentPod: commonconsts.KubeLabelValueTrue,
		})
		if err != nil {
			err = errors.Wrapf(err, "failed to list service accounts in namespace %s", opt.compoundAINimDeployment.Namespace)
			return
		}
		if len(serviceAccounts.Items) > 0 {
			podSpec.ServiceAccountName = serviceAccounts.Items[0].Name
		} else {
			podSpec.ServiceAccountName = DefaultServiceAccountName
		}
	}
	// Annotation-driven host namespace escape hatches.
	if resourceAnnotations["yatai.ai/enable-host-ipc"] == commonconsts.KubeLabelValueTrue {
		podSpec.HostIPC = true
	}
	if resourceAnnotations["yatai.ai/enable-host-network"] == commonconsts.KubeLabelValueTrue {
		podSpec.HostNetwork = true
	}
	if resourceAnnotations["yatai.ai/enable-host-pid"] == commonconsts.KubeLabelValueTrue {
		podSpec.HostPID = true
	}
	// Debug modes share the process namespace so the debugger can attach.
	if opt.isStealingTrafficDebugModeEnabled || isDebugModeEnabled {
		podSpec.ShareProcessNamespace = &[]bool{true}[0]
	}
	podTemplateSpec = &corev1.PodTemplateSpec{
		ObjectMeta: metav1.ObjectMeta{
			Labels:      podLabels,
			Annotations: podAnnotations,
		},
		Spec: podSpec,
	}
	return
}
// getResourcesConfig converts the CRD-level resource spec into Kubernetes
// ResourceRequirements, starting from built-in defaults (requests 300m/500Mi,
// limits 500m/1Gi) and overriding any field the spec provides.
//
// A nil spec yields the defaults unchanged. A parse failure returns the
// requirements accumulated so far together with a wrapped error.
func getResourcesConfig(resources *compoundaiCommon.Resources) (corev1.ResourceRequirements, error) {
	currentResources := corev1.ResourceRequirements{
		Requests: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("300m"),
			corev1.ResourceMemory: resource.MustParse("500Mi"),
		},
		Limits: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("500m"),
			corev1.ResourceMemory: resource.MustParse("1Gi"),
		},
	}
	if resources == nil {
		return currentResources, nil
	}
	// Both resource lists above are always initialized, so the helper can
	// write into them directly (the original code carried dead nil-checks).
	if resources.Limits != nil {
		if err := parseQuantityInto(currentResources.Limits, corev1.ResourceCPU, resources.Limits.CPU, "limits cpu"); err != nil {
			return currentResources, err
		}
		if err := parseQuantityInto(currentResources.Limits, corev1.ResourceMemory, resources.Limits.Memory, "limits memory"); err != nil {
			return currentResources, err
		}
		if err := parseQuantityInto(currentResources.Limits, commonconsts.KubeResourceGPUNvidia, resources.Limits.GPU, "limits gpu"); err != nil {
			return currentResources, err
		}
		for k, v := range resources.Limits.Custom {
			if err := parseQuantityInto(currentResources.Limits, corev1.ResourceName(k), v, "limits "+k); err != nil {
				return currentResources, err
			}
		}
	}
	if resources.Requests != nil {
		if err := parseQuantityInto(currentResources.Requests, corev1.ResourceCPU, resources.Requests.CPU, "requests cpu"); err != nil {
			return currentResources, err
		}
		if err := parseQuantityInto(currentResources.Requests, corev1.ResourceMemory, resources.Requests.Memory, "requests memory"); err != nil {
			return currentResources, err
		}
		for k, v := range resources.Requests.Custom {
			if err := parseQuantityInto(currentResources.Requests, corev1.ResourceName(k), v, "requests "+k); err != nil {
				return currentResources, err
			}
		}
	}
	return currentResources, nil
}

// parseQuantityInto parses value and stores it in list under name.
// An empty value is a no-op; a parse failure is wrapped with desc for context,
// matching the original per-field error messages ("parse <desc> quantity").
func parseQuantityInto(list corev1.ResourceList, name corev1.ResourceName, value, desc string) error {
	if value == "" {
		return nil
	}
	q, err := resource.ParseQuantity(value)
	if err != nil {
		return errors.Wrapf(err, "parse %s quantity", desc)
	}
	list[name] = q
	return nil
}
// generateServiceOption bundles the inputs needed by generateService to build
// a Kubernetes Service for a CompoundAINim deployment.
type generateServiceOption struct {
	// compoundAINimDeployment is the owning deployment CR.
	compoundAINimDeployment *v1alpha1.CompoundAINimDeployment
	// compoundAINim is the NIM resolved for that deployment.
	compoundAINim *v1alpha1.CompoundAINim
	// isStealingTrafficDebugModeEnabled selects the debug-target service name
	// and adds the debug target-type label to the selector.
	isStealingTrafficDebugModeEnabled bool
	// isDebugPodReceiveProductionTraffic is carried for callers; not read by
	// generateService itself.
	isDebugPodReceiveProductionTraffic bool
	// containsStealingTrafficDebugModeEnabled switches the generic service's
	// target port to the HTTP proxy container port.
	containsStealingTrafficDebugModeEnabled bool
	// isGenericService selects the generic (target-type-agnostic) service.
	isGenericService bool
}
// generateService builds (but does not create) the ClusterIP Service for a
// CompoundAINim deployment and sets the deployment as its controller owner.
// The generic variant strips the debug target-type label from the selector
// and, when any debug pod steals traffic, points at the HTTP proxy port.
func (r *CompoundAINimDeploymentReconciler) generateService(opt generateServiceOption) (kubeService *corev1.Service, err error) {
	var name string
	if opt.isGenericService {
		name = r.getGenericServiceName(opt.compoundAINimDeployment, opt.compoundAINim)
	} else {
		name = r.getServiceName(opt.compoundAINimDeployment, opt.compoundAINim, opt.isStealingTrafficDebugModeEnabled)
	}

	labels := r.getKubeLabels(opt.compoundAINimDeployment, opt.compoundAINim)

	// The selector starts as a copy of the labels so that mutating it below
	// never leaks into the object metadata.
	selector := make(map[string]string, len(labels))
	for key, value := range labels {
		selector[key] = value
	}
	if opt.isStealingTrafficDebugModeEnabled {
		selector[commonconsts.KubeLabelYataiBentoDeploymentTargetType] = DeploymentTargetTypeDebug
	}

	targetPort := intstr.FromString(commonconsts.BentoContainerPortName)
	if opt.isGenericService {
		// The generic service must match pods of every target type.
		delete(selector, commonconsts.KubeLabelYataiBentoDeploymentTargetType)
		if opt.containsStealingTrafficDebugModeEnabled {
			targetPort = intstr.FromString(ContainerPortNameHTTPProxy)
		}
	}

	kubeService = &corev1.Service{
		ObjectMeta: metav1.ObjectMeta{
			Name:        name,
			Namespace:   opt.compoundAINimDeployment.Namespace,
			Labels:      labels,
			Annotations: r.getKubeAnnotations(opt.compoundAINimDeployment, opt.compoundAINim),
		},
		Spec: corev1.ServiceSpec{
			Selector: selector,
			Ports: []corev1.ServicePort{
				{
					Name:       commonconsts.BentoServicePortName,
					Port:       commonconsts.BentoServicePort,
					TargetPort: targetPort,
					Protocol:   corev1.ProtocolTCP,
				},
				{
					// Always-direct port that bypasses any debug proxy.
					Name:       ServicePortNameHTTPNonProxy,
					Port:       int32(ServicePortHTTPNonProxy),
					TargetPort: intstr.FromString(commonconsts.BentoContainerPortName),
					Protocol:   corev1.ProtocolTCP,
				},
			},
		},
	}

	if err = ctrl.SetControllerReference(opt.compoundAINimDeployment, kubeService, r.Scheme); err != nil {
		return kubeService, errors.Wrapf(err, "set controller reference for service %s", kubeService.Name)
	}
	return kubeService, nil
}
// generateIngressHost returns the hostname to expose the deployment's ingress
// on; currently it always delegates to the default hostname scheme.
func (r *CompoundAINimDeploymentReconciler) generateIngressHost(ctx context.Context, compoundAINimDeployment *v1alpha1.CompoundAINimDeployment) (string, error) {
	return r.generateDefaultHostname(ctx, compoundAINimDeployment)
}
// cachedDomainSuffix memoizes the cluster domain suffix for the lifetime of
// the process so the configmap lookup happens only once.
// NOTE(review): access is not synchronized; confirm reconcile invocations are
// serialized before relying on this cache under concurrency.
var cachedDomainSuffix *string

// generateDefaultHostname derives "<name>-<namespace>.<domainSuffix>" for the
// given deployment, resolving (and caching) the domain suffix from the
// cluster's network configuration on first use.
func (r *CompoundAINimDeploymentReconciler) generateDefaultHostname(ctx context.Context, compoundAINimDeployment *v1alpha1.CompoundAINimDeployment) (string, error) {
	var domainSuffix string
	if cachedDomainSuffix != nil {
		domainSuffix = *cachedDomainSuffix
	} else {
		// Build a fresh clientset from the in-cluster (or kubeconfig) REST
		// config; GetConfigOrDie terminates the process if none resolves.
		restConfig := config.GetConfigOrDie()
		clientset, err := kubernetes.NewForConfig(restConfig)
		if err != nil {
			return "", errors.Wrapf(err, "create kubernetes clientset")
		}
		domainSuffix, err = system.GetDomainSuffix(ctx, func(ctx context.Context, namespace, name string) (*corev1.ConfigMap, error) {
			configmap, err := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, name, metav1.GetOptions{})
			return configmap, errors.Wrap(err, "get configmap")
		}, clientset)
		if err != nil {
			return "", errors.Wrapf(err, "get domain suffix")
		}
		cachedDomainSuffix = &domainSuffix
	}
	return fmt.Sprintf("%s-%s.%s", compoundAINimDeployment.Name, compoundAINimDeployment.Namespace, domainSuffix), nil
}
// TLSModeOpt selects how ingress TLS is configured.
type TLSModeOpt string

const (
	// TLSModeNone disables TLS on generated ingresses.
	TLSModeNone TLSModeOpt = "none"
	// TLSModeAuto provisions a per-ingress certificate secret named after
	// the ingress resource.
	TLSModeAuto TLSModeOpt = "auto"
	// TLSModeStatic uses a pre-existing, operator-configured TLS secret.
	TLSModeStatic TLSModeOpt = "static"
)
// IngressConfig holds the cluster-wide ingress settings parsed from the
// network configuration configmap.
type IngressConfig struct {
	// ClassName is the ingress class to use; nil means the cluster default.
	ClassName *string
	// Annotations are extra annotations applied to every generated ingress.
	Annotations map[string]string
	// Path is the HTTP path to route; defaults to "/".
	Path string
	// PathType is the Kubernetes path-match semantics for Path.
	PathType networkingv1.PathType
	// TLSMode selects none/auto/static TLS handling.
	TLSMode TLSModeOpt
	// StaticTLSSecretName names the TLS secret used when TLSMode is static.
	StaticTLSSecretName string
}
// cachedIngressConfig memoizes the parsed ingress configuration for the
// lifetime of the process so the configmap is only fetched once.
// NOTE(review): access is not synchronized; confirm reconcile invocations are
// serialized before relying on this cache under concurrency.
var cachedIngressConfig *IngressConfig

// GetIngressConfig loads ingress settings (class name, extra annotations,
// path, path type, TLS mode, static TLS secret name) from the network config
// configmap, validates them, and caches the result.
//
//nolint:nakedret
func (r *CompoundAINimDeploymentReconciler) GetIngressConfig(ctx context.Context) (ingressConfig *IngressConfig, err error) {
	if cachedIngressConfig != nil {
		ingressConfig = cachedIngressConfig
		return
	}
	// Build a fresh clientset from the in-cluster (or kubeconfig) REST
	// config; GetConfigOrDie terminates the process if none resolves.
	restConfig := config.GetConfigOrDie()
	clientset, err := kubernetes.NewForConfig(restConfig)
	if err != nil {
		err = errors.Wrapf(err, "create kubernetes clientset")
		return
	}
	configMap, err := system.GetNetworkConfigConfigMap(ctx, func(ctx context.Context, namespace, name string) (*corev1.ConfigMap, error) {
		configmap, err := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, name, metav1.GetOptions{})
		return configmap, errors.Wrap(err, "get network config configmap")
	})
	if err != nil {
		err = errors.Wrapf(err, "failed to get configmap %s", commonconsts.KubeConfigMapNameNetworkConfig)
		return
	}
	var className *string
	className_ := strings.TrimSpace(configMap.Data[commonconsts.KubeConfigMapKeyNetworkConfigIngressClass])
	if className_ != "" {
		className = &className_
	}
	annotations := make(map[string]string)
	annotations_ := strings.TrimSpace(configMap.Data[commonconsts.KubeConfigMapKeyNetworkConfigIngressAnnotations])
	if annotations_ != "" {
		err = json.Unmarshal([]byte(annotations_), &annotations)
		if err != nil {
			err = errors.Wrapf(err, "failed to json unmarshal %s in configmap %s: %s", commonconsts.KubeConfigMapKeyNetworkConfigIngressAnnotations, commonconsts.KubeConfigMapNameNetworkConfig, annotations_)
			return
		}
	}
	path := strings.TrimSpace(configMap.Data["ingress-path"])
	if path == "" {
		path = "/"
	}
	pathType := networkingv1.PathTypeImplementationSpecific
	pathType_ := strings.TrimSpace(configMap.Data["ingress-path-type"])
	if pathType_ != "" {
		pathType = networkingv1.PathType(pathType_)
	}
	tlsMode := TLSModeNone
	tlsModeStr := strings.TrimSpace(configMap.Data["ingress-tls-mode"])
	if tlsModeStr != "" && tlsModeStr != "none" {
		if tlsModeStr == "auto" || tlsModeStr == "static" {
			tlsMode = TLSModeOpt(tlsModeStr)
		} else {
			// BUG FIX: the original wrapped a nil error here —
			// errors.Wrapf(nil, ...) returns nil — so an invalid TLS mode
			// silently returned (nil, nil) to the caller. Construct a real
			// error instead (the stray fmt.Println is dropped too).
			err = errors.Errorf("invalid TLS mode: %s", tlsModeStr)
			return
		}
	}
	staticTLSSecretName := strings.TrimSpace(configMap.Data["ingress-static-tls-secret-name"])
	if tlsMode == TLSModeStatic && staticTLSSecretName == "" {
		// Same nil-wrap bug as above: build a real error so the
		// misconfiguration is actually reported.
		err = errors.New("TLS mode is static but ingress-static-tls-secret-name isn't set")
		return
	}
	ingressConfig = &IngressConfig{
		ClassName:           className,
		Annotations:         annotations,
		Path:                path,
		PathType:            pathType,
		TLSMode:             tlsMode,
		StaticTLSSecretName: staticTLSSecretName,
	}
	cachedIngressConfig = ingressConfig
	return
}
// generateIngressesOption bundles the inputs for generateIngresses.
type generateIngressesOption struct {
	// yataiClient is carried for callers; not read by generateIngresses
	// itself in this version.
	yataiClient *-*yataiclient.YataiClient
	// compoundAINimDeployment is the deployment being exposed.
	compoundAINimDeployment *v1alpha1.CompoundAINimDeployment
	// compoundAINim is the NIM resolved for that deployment.
	compoundAINim *v1alpha1.CompoundAINim
}
// generateIngresses builds the Ingress objects exposing a CompoundAINim
// deployment: resolves the hostname, merges cluster-wide and CR-level
// annotations/labels, applies the configured TLS mode (CR-level TLS secret
// overrides the cluster default), and routes the configured path to the
// deployment's generic service. Emits events for hostname generation.
//
//nolint:nakedret
func (r *CompoundAINimDeploymentReconciler) generateIngresses(ctx context.Context, opt generateIngressesOption) (ingresses []*networkingv1.Ingress, err error) {
	compoundAINimRepositoryName, compoundAINimVersion := getCompoundAINimRepositoryNameAndCompoundAINimVersion(opt.compoundAINim)
	compoundAINimDeployment := opt.compoundAINimDeployment
	compoundAINim := opt.compoundAINim
	kubeName := r.getKubeName(compoundAINimDeployment, compoundAINim, false)
	r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "GenerateIngressHost", "Generating hostname for ingress")
	internalHost, err := r.generateIngressHost(ctx, compoundAINimDeployment)
	if err != nil {
		r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeWarning, "GenerateIngressHost", "Failed to generate hostname for ingress: %v", err)
		return
	}
	r.Recorder.Eventf(compoundAINimDeployment, corev1.EventTypeNormal, "GenerateIngressHost", "Generated hostname for ingress: %s", internalHost)
	annotations := r.getKubeAnnotations(compoundAINimDeployment, compoundAINim)
	tag := fmt.Sprintf("%s:%s", compoundAINimRepositoryName, compoundAINimVersion)
	orgName := "unknown"
	// Inject identifying response headers via the nginx snippet annotation.
	annotations["nginx.ingress.kubernetes.io/configuration-snippet"] = fmt.Sprintf(`
more_set_headers "X-Powered-By: Yatai";
more_set_headers "X-Yatai-Org-Name: %s";
more_set_headers "X-Yatai-Bento: %s";
`, orgName, tag)
	annotations["nginx.ingress.kubernetes.io/ssl-redirect"] = "false"
	labels := r.getKubeLabels(compoundAINimDeployment, compoundAINim)
	kubeNs := compoundAINimDeployment.Namespace
	ingressConfig, err := r.GetIngressConfig(ctx)
	if err != nil {
		err = errors.Wrapf(err, "get ingress config")
		return
	}
	ingressClassName := ingressConfig.ClassName
	ingressAnnotations := ingressConfig.Annotations
	ingressPath := ingressConfig.Path
	ingressPathType := ingressConfig.PathType
	ingressTLSMode := ingressConfig.TLSMode
	ingressStaticTLSSecretName := ingressConfig.StaticTLSSecretName
	// Cluster-wide annotations first, then CR-level ones so the CR wins.
	for k, v := range ingressAnnotations {
		annotations[k] = v
	}
	for k, v := range opt.compoundAINimDeployment.Spec.Ingress.Annotations {
		annotations[k] = v
	}
	for k, v := range opt.compoundAINimDeployment.Spec.Ingress.Labels {
		labels[k] = v
	}
	var tls []networkingv1.IngressTLS
	// set default tls from network configmap
	switch ingressTLSMode {
	case TLSModeNone:
	case TLSModeAuto:
		tls = make([]networkingv1.IngressTLS, 0, 1)
		tls = append(tls, networkingv1.IngressTLS{
			Hosts:      []string{internalHost},
			SecretName: kubeName,
		})
	case TLSModeStatic:
		tls = make([]networkingv1.IngressTLS, 0, 1)
		tls = append(tls, networkingv1.IngressTLS{
			Hosts:      []string{internalHost},
			SecretName: ingressStaticTLSSecretName,
		})
	default:
		// BUG FIX: the original wrapped a nil error here —
		// errors.Wrapf(nil, ...) returns nil — so an invalid TLS mode
		// silently returned (nil, nil). Construct a real error instead.
		err = errors.Errorf("TLS mode is invalid: %s", ingressTLSMode)
		return
	}
	// override default tls if CompoundAINimDeployment defines its own tls section
	if opt.compoundAINimDeployment.Spec.Ingress.TLS != nil && opt.compoundAINimDeployment.Spec.Ingress.TLS.SecretName != "" {
		tls = make([]networkingv1.IngressTLS, 0, 1)
		tls = append(tls, networkingv1.IngressTLS{
			Hosts:      []string{internalHost},
			SecretName: opt.compoundAINimDeployment.Spec.Ingress.TLS.SecretName,
		})
	}
	serviceName := r.getGenericServiceName(compoundAINimDeployment, compoundAINim)
	interIng := &networkingv1.Ingress{
		ObjectMeta: metav1.ObjectMeta{
			Name:        kubeName,
			Namespace:   kubeNs,
			Labels:      labels,
			Annotations: annotations,
		},
		Spec: networkingv1.IngressSpec{
			IngressClassName: ingressClassName,
			TLS:              tls,
			Rules: []networkingv1.IngressRule{
				{
					Host: internalHost,
					IngressRuleValue: networkingv1.IngressRuleValue{
						HTTP: &networkingv1.HTTPIngressRuleValue{
							Paths: []networkingv1.HTTPIngressPath{
								{
									Path:     ingressPath,
									PathType: &ingressPathType,
									Backend: networkingv1.IngressBackend{
										Service: &networkingv1.IngressServiceBackend{
											Name: serviceName,
											Port: networkingv1.ServiceBackendPort{
												Name: commonconsts.BentoServicePortName,
											},
										},
									},
								},
							},
						},
					},
				},
			},
		},
	}
	err = ctrl.SetControllerReference(compoundAINimDeployment, interIng, r.Scheme)
	if err != nil {
		err = errors.Wrapf(err, "set ingress %s controller reference", interIng.Name)
		return
	}
	ings := []*networkingv1.Ingress{interIng}
	return ings, err
}
// cachedCompoundAINimDeploymentNamespaces memoizes the namespaces that can
// host CompoundAINim deployments so the secret lookup happens only once.
// NOTE(review): written from the cleanup goroutine without synchronization —
// confirm nothing else reads it concurrently.
var cachedCompoundAINimDeploymentNamespaces *[]string

// doCleanUpAbandonedRunnerServices deletes runner Services that no longer
// match any pod. A service is considered abandoned when its selector matches
// zero pods AND it is older than 3 minutes (grace period for pods that are
// still being scheduled). The whole pass is bounded by a 10-minute timeout.
func (r *CompoundAINimDeploymentReconciler) doCleanUpAbandonedRunnerServices() error {
	logs := log.Log.WithValues("func", "doCleanUpAbandonedRunnerServices")
	logs.Info("start cleaning up abandoned runner services")
	ctx, cancel := context.WithTimeout(context.TODO(), time.Minute*10)
	defer cancel()
	var compoundAINimDeploymentNamespaces []string
	if cachedCompoundAINimDeploymentNamespaces != nil {
		compoundAINimDeploymentNamespaces = *cachedCompoundAINimDeploymentNamespaces
	} else {
		// First pass: resolve the candidate namespaces from the operator's
		// configuration secret and cache them for subsequent runs.
		restConfig := config.GetConfigOrDie()
		clientset, err := kubernetes.NewForConfig(restConfig)
		if err != nil {
			return errors.Wrapf(err, "create kubernetes clientset")
		}
		compoundAINimDeploymentNamespaces, err = commonconfig.GetBentoDeploymentNamespaces(ctx, func(ctx context.Context, namespace, name string) (*corev1.Secret, error) {
			secret, err := clientset.CoreV1().Secrets(namespace).Get(ctx, name, metav1.GetOptions{})
			return secret, errors.Wrap(err, "get secret")
		})
		if err != nil {
			err = errors.Wrapf(err, "get compoundAINim deployment namespaces")
			return err
		}
		cachedCompoundAINimDeploymentNamespaces = &compoundAINimDeploymentNamespaces
	}
	for _, compoundAINimDeploymentNamespace := range compoundAINimDeploymentNamespaces {
		// Only services carrying the runner label are candidates.
		serviceList := &corev1.ServiceList{}
		serviceListOpts := []client.ListOption{
			client.HasLabels{commonconsts.KubeLabelYataiBentoDeploymentRunner},
			client.InNamespace(compoundAINimDeploymentNamespace),
		}
		err := r.List(ctx, serviceList, serviceListOpts...)
		if err != nil {
			return errors.Wrap(err, "list services")
		}
		for _, service := range serviceList.Items {
			service := service // capture a stable copy for the Delete call below
			podList := &corev1.PodList{}
			podListOpts := []client.ListOption{
				client.InNamespace(service.Namespace),
				client.MatchingLabels(service.Spec.Selector),
			}
			err := r.List(ctx, podList, podListOpts...)
			if err != nil {
				return errors.Wrap(err, "list pods")
			}
			if len(podList.Items) > 0 {
				continue
			}
			// Grace period: leave freshly-created services alone so their
			// pods have time to appear.
			createdAt := service.ObjectMeta.CreationTimestamp
			if time.Since(createdAt.Time) < time.Minute*3 {
				continue
			}
			logs.Info("deleting abandoned runner service", "name", service.Name, "namespace", service.Namespace)
			err = r.Delete(ctx, &service)
			if err != nil {
				return errors.Wrapf(err, "delete service %s", service.Name)
			}
		}
	}
	logs.Info("finished cleaning up abandoned runner services")
	return nil
}
// cleanUpAbandonedRunnerServices runs one cleanup pass immediately and then
// repeats every 30 seconds forever. Failures are logged, never fatal.
// Intended to be launched as a background goroutine.
func (r *CompoundAINimDeploymentReconciler) cleanUpAbandonedRunnerServices() {
	logs := log.Log.WithValues("func", "cleanUpAbandonedRunnerServices")
	if err := r.doCleanUpAbandonedRunnerServices(); err != nil {
		logs.Error(err, "cleanUpAbandonedRunnerServices")
	}
	ticker := time.NewTicker(30 * time.Second)
	for range ticker.C {
		if err := r.doCleanUpAbandonedRunnerServices(); err != nil {
			logs.Error(err, "cleanUpAbandonedRunnerServices")
		}
	}
}
// doRegisterCompoundComponent registers this operator as a deployment
// component with the Yatai API server, if a Yatai client is configured.
// A nil client is not an error — registration is simply skipped.
// The whole call is bounded by a 5-minute timeout.
func (r *CompoundAINimDeploymentReconciler) doRegisterCompoundComponent() (err error) {
	logs := log.Log.WithValues("func", "doRegisterYataiComponent")
	ctx, cancel := context.WithTimeout(context.TODO(), time.Minute*5)
	defer cancel()

	logs.Info("getting yatai client")
	yataiClient, clusterName, err := r.getYataiClient(ctx)
	if err != nil {
		return errors.Wrap(err, "get yatai client")
	}
	if yataiClient == nil {
		logs.Info("yatai client is nil")
		return nil
	}
	yc := *yataiClient

	// Resolve the namespace this operator is deployed into from the
	// operator's configuration secret.
	namespace, err := commonconfig.GetYataiDeploymentNamespace(ctx, func(ctx context.Context, namespace, name string) (*corev1.Secret, error) {
		secret := &corev1.Secret{}
		getErr := r.Client.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, secret)
		return secret, errors.Wrap(getErr, "get secret")
	})
	if err != nil {
		return errors.Wrap(err, "get yatai deployment namespace")
	}

	_, err = yc.RegisterYataiComponent(ctx, *clusterName, &schemasv1.RegisterYataiComponentSchema{
		Name:          modelschemas.YataiComponentNameDeployment,
		KubeNamespace: namespace,
		Version:       version.Version,
		SelectorLabels: map[string]string{
			"app.kubernetes.io/name": "yatai-deployment",
		},
		Manifest: &modelschemas.YataiComponentManifestSchema{
			SelectorLabels: map[string]string{
				"app.kubernetes.io/name": "yatai-deployment",
			},
			LatestCRDVersion: "v2alpha1",
		},
	})
	return err
}
// registerCompoundComponent registers the component once at startup and then
// re-registers every 5 minutes as a heartbeat. Failures are logged, never
// fatal. Intended to be launched as a background goroutine.
func (r *CompoundAINimDeploymentReconciler) registerCompoundComponent() {
	logs := log.Log.WithValues("func", "registerYataiComponent")
	if err := r.doRegisterCompoundComponent(); err != nil {
		logs.Error(err, "registerYataiComponent")
	}
	ticker := time.NewTicker(5 * time.Minute)
	for range ticker.C {
		if err := r.doRegisterCompoundComponent(); err != nil {
			logs.Error(err, "registerYataiComponent")
		}
	}
}
// SetupWithManager sets up the controller with the Manager.
//
// It also launches two background goroutines (unless disabled by env var):
// the abandoned-runner-service cleaner and the Yatai component registrar.
// The controller watches its owned resources plus CompoundAINimRequest and
// CompoundAINim objects, mapping those back to the deployments that
// reference them.
func (r *CompoundAINimDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error {
	logs := log.Log.WithValues("func", "SetupWithManager")
	if os.Getenv("DISABLE_CLEANUP_ABANDONED_RUNNER_SERVICES") != commonconsts.KubeLabelValueTrue {
		// NOTE(review): goroutine runs for the process lifetime; there is no
		// shutdown hook tied to the manager's context.
		go r.cleanUpAbandonedRunnerServices()
	} else {
		logs.Info("cleanup abandoned runner services is disabled")
	}
	if os.Getenv("DISABLE_YATAI_COMPONENT_REGISTRATION") != commonconsts.KubeLabelValueTrue {
		go r.registerCompoundComponent()
	} else {
		logs.Info("yatai component registration is disabled")
	}
	// GenerationChangedPredicate: only reconcile on spec (generation) changes,
	// ignoring status-only updates.
	m := ctrl.NewControllerManagedBy(mgr).
		For(&v1alpha1.CompoundAINimDeployment{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
		Owns(&appsv1.Deployment{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
		Owns(&corev1.Service{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
		Owns(&networkingv1beta1.VirtualService{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
		Owns(&networkingv1.Ingress{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
		Owns(&corev1.PersistentVolumeClaim{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
		// A CompoundAINimRequest triggers reconciliation of deployments that
		// reference the same NIM name — but only while the NIM itself does
		// not yet exist (once it does, the CompoundAINim watch below covers it).
		Watches(&v1alpha1.CompoundAINimRequest{}, handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, compoundAINimRequest client.Object) []reconcile.Request {
			reqs := make([]reconcile.Request, 0)
			logs := log.Log.WithValues("func", "Watches", "kind", "CompoundAINimRequest", "name", compoundAINimRequest.GetName(), "namespace", compoundAINimRequest.GetNamespace())
			logs.Info("Triggering reconciliation for CompoundAINimRequest", "CompoundAINimRequestName", compoundAINimRequest.GetName(), "Namespace", compoundAINimRequest.GetNamespace())
			compoundAINim := &v1alpha1.CompoundAINim{}
			err := r.Get(context.Background(), types.NamespacedName{
				Name:      compoundAINimRequest.GetName(),
				Namespace: compoundAINimRequest.GetNamespace(),
			}, compoundAINim)
			compoundAINimIsNotFound := k8serrors.IsNotFound(err)
			if err != nil && !compoundAINimIsNotFound {
				logs.Info("Failed to get CompoundAINim", "name", compoundAINimRequest.GetName(), "namespace", compoundAINimRequest.GetNamespace(), "error", err)
				return reqs
			}
			if !compoundAINimIsNotFound {
				logs.Info("CompoundAINim found, skipping enqueue as it's already present", "CompoundAINimName", compoundAINimRequest.GetName())
				return reqs
			}
			compoundAINimDeployments := &v1alpha1.CompoundAINimDeploymentList{}
			err = r.List(context.Background(), compoundAINimDeployments, &client.ListOptions{
				Namespace: compoundAINimRequest.GetNamespace(),
			})
			if err != nil {
				logs.Info("Failed to list CompoundAINimDeployments", "Namespace", compoundAINimRequest.GetNamespace(), "error", err)
				return reqs
			}
			for _, compoundAINimDeployment := range compoundAINimDeployments.Items {
				compoundAINimDeployment := compoundAINimDeployment
				if compoundAINimDeployment.Spec.CompoundAINim == compoundAINimRequest.GetName() {
					reqs = append(reqs, reconcile.Request{
						NamespacedName: client.ObjectKeyFromObject(&compoundAINimDeployment),
					})
				}
			}
			// Log the list of CompoundAINimDeployments being enqueued for reconciliation
			logs.Info("Enqueuing CompoundAINimDeployments for reconciliation", "ReconcileRequests", reqs)
			return reqs
		})).WithEventFilter(controller_common.EphemeralDeploymentEventFilter(r.Config)).
		// A CompoundAINim triggers reconciliation of every deployment in its
		// namespace that references it by name.
		Watches(&v1alpha1.CompoundAINim{}, handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, compoundAINim client.Object) []reconcile.Request {
			logs := log.Log.WithValues("func", "Watches", "kind", "CompoundAINim", "name", compoundAINim.GetName(), "namespace", compoundAINim.GetNamespace())
			logs.Info("Triggering reconciliation for CompoundAINim", "CompoundAINimName", compoundAINim.GetName(), "Namespace", compoundAINim.GetNamespace())
			compoundAINimDeployments := &v1alpha1.CompoundAINimDeploymentList{}
			err := r.List(context.Background(), compoundAINimDeployments, &client.ListOptions{
				Namespace: compoundAINim.GetNamespace(),
			})
			if err != nil {
				logs.Info("Failed to list CompoundAINimDeployments", "Namespace", compoundAINim.GetNamespace(), "error", err)
				return []reconcile.Request{}
			}
			reqs := make([]reconcile.Request, 0)
			for _, compoundAINimDeployment := range compoundAINimDeployments.Items {
				compoundAINimDeployment := compoundAINimDeployment
				if compoundAINimDeployment.Spec.CompoundAINim == compoundAINim.GetName() {
					reqs = append(reqs, reconcile.Request{
						NamespacedName: client.ObjectKeyFromObject(&compoundAINimDeployment),
					})
				}
			}
			// Log the list of CompoundAINimDeployments being enqueued for reconciliation
			logs.Info("Enqueuing CompoundAINimDeployments for reconciliation", "ReconcileRequests", reqs)
			return reqs
		}))
	m.Owns(&autoscalingv2.HorizontalPodAutoscaler{})
	return m.Complete(r)
}
// TransformToOldHPA converts the v1alpha1 Autoscaling spec into the legacy
// Yatai DeploymentTargetHPAConf representation.
//
// Only a subset of metrics is translated: a pods metric named after the QPS
// metric, plus CPU and memory resource metrics. Anything else is skipped.
// A nil input yields (nil, nil).
//
//nolint:nakedret
func TransformToOldHPA(hpa *v1alpha1.Autoscaling) (oldHpa *modelschemas.DeploymentTargetHPAConf, err error) {
	if hpa == nil {
		return
	}
	minReplicas := int32(hpa.MinReplicas)
	maxReplicas := int32(hpa.MaxReplicas)
	oldHpa = &modelschemas.DeploymentTargetHPAConf{
		MinReplicas: &minReplicas,
		MaxReplicas: &maxReplicas,
	}
	for _, metric := range hpa.Metrics {
		if metric.Type == autoscalingv2.PodsMetricSourceType {
			if metric.Pods == nil {
				continue
			}
			if metric.Pods.Metric.Name == commonconsts.KubeHPAQPSMetric {
				// NOTE(review): the type check requires Utilization while the
				// value read is AverageValue — looks inconsistent, but it is
				// preserved here; confirm against how QPS metrics are emitted.
				if metric.Pods.Target.Type != autoscalingv2.UtilizationMetricType {
					continue
				}
				if metric.Pods.Target.AverageValue == nil {
					continue
				}
				qps := metric.Pods.Target.AverageValue.Value()
				oldHpa.QPS = &qps
			}
		} else if metric.Type == autoscalingv2.ResourceMetricSourceType {
			if metric.Resource == nil {
				continue
			}
			if metric.Resource.Name == corev1.ResourceCPU {
				if metric.Resource.Target.Type != autoscalingv2.UtilizationMetricType {
					continue
				}
				if metric.Resource.Target.AverageUtilization == nil {
					continue
				}
				cpu := *metric.Resource.Target.AverageUtilization
				oldHpa.CPU = &cpu
			} else if metric.Resource.Name == corev1.ResourceMemory {
				if metric.Resource.Target.Type != autoscalingv2.UtilizationMetricType {
					continue
				}
				// BUG FIX: the original nil-checked AverageUtilization but then
				// dereferenced AverageValue, panicking whenever a memory metric
				// had no AverageValue set. Guard the field that is actually used.
				if metric.Resource.Target.AverageValue == nil {
					continue
				}
				memory := metric.Resource.Target.AverageValue.String()
				oldHpa.Memory = &memory
			}
		}
	}
	return
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"bytes"
"context"
"crypto/md5"
"encoding/base64"
"encoding/hex"
"encoding/json"
"fmt"
"os"
"path/filepath"
"reflect"
"strconv"
"strings"
"text/template"
"time"
"emperror.dev/errors"
"github.com/apparentlymart/go-shquot/shquot"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/internal/controller_common"
commonconfig "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/pkg/compoundai/config"
commonconsts "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/pkg/compoundai/consts"
"github.com/ettle/strcase"
"github.com/huandu/xstrings"
"github.com/mitchellh/hashstructure/v2"
"github.com/prometheus/common/version"
"github.com/prune998/docker-registry-client/registry"
"github.com/rs/xid"
"github.com/sergeymakinen/go-quote/unix"
"github.com/sirupsen/logrus"
"gopkg.in/yaml.v2"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
compoundaiCommon "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/compoundai/common"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/compoundai/modelschemas"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/compoundai/schemasv1"
yataiclient "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/compoundai/yatai-client"
nvidiacomv1alpha1 "github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/v1alpha1"
)
// Annotation/label keys and store schemes used by the CompoundAINimRequest
// reconciler. The "yatai.ai/bento-*" values are retained for compatibility
// with the upstream Yatai tooling this operator derives from.
const (
	KubeAnnotationCompoundAINimRequestHash = "yatai.ai/bento-request-hash"
	// NOTE(review): "Buider" is a typo in the identifier, kept as-is because
	// renaming the exported constant would break external callers.
	KubeAnnotationCompoundAINimRequestImageBuiderHash = "yatai.ai/bento-request-image-builder-hash"
	KubeAnnotationCompoundAINimRequestModelSeederHash = "yatai.ai/bento-request-model-seeder-hash"
	// KubeLabelYataiImageBuilderSeparateModels marks resources built with
	// models stored separately from the image.
	KubeLabelYataiImageBuilderSeparateModels = "yatai.ai/yatai-image-builder-separate-models"
	KubeAnnotationCompoundAINimStorageNS     = "yatai.ai/bento-storage-namespace"
	KubeAnnotationModelStorageNS             = "yatai.ai/model-storage-namespace"
	// Object-store scheme identifiers.
	StoreSchemaAWS = "aws"
	StoreSchemaGCP = "gcp"
)
// CompoundAINimRequestReconciler reconciles a CompoundAINimRequest object:
// it drives image building and (optionally) model seeding, then creates the
// resulting CompoundAINim resource.
type CompoundAINimRequestReconciler struct {
	client.Client
	// Scheme maps Go types to GroupVersionKinds for owner references.
	Scheme *runtime.Scheme
	// Recorder emits Kubernetes events against reconciled objects.
	Recorder record.EventRecorder
	// Config carries operator-wide settings shared across controllers.
	Config controller_common.Config
}
// +kubebuilder:rbac:groups=nvidia.com,resources=compoundainimrequests,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=compoundainimrequests/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=compoundainimrequests/finalizers,verbs=update
//+kubebuilder:rbac:groups=nvidia.com,resources=compoundainims,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=nvidia.com,resources=compoundainims/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=events.k8s.io,resources=events,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=serviceaccounts,verbs=get;list;watch
//+kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;watch;create;update;patch;delete
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// TODO(user): Modify the Reconcile function to compare the state specified by
// the CompoundAINimRequest object against the actual cluster state, and then
// perform operations to make the cluster state reflect the state specified by
// the user.
//
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.18.2/pkg/reconcile
//
//nolint:gocyclo,nakedret
func (r *CompoundAINimRequestReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) {
logs := log.FromContext(ctx)
compoundAINimRequest := &nvidiacomv1alpha1.CompoundAINimRequest{}
err = r.Get(ctx, req.NamespacedName, compoundAINimRequest)
if err != nil {
if k8serrors.IsNotFound(err) {
// Object not found, return. Created objects are automatically garbage collected.
// For additional cleanup logic use finalizers.
logs.Info("compoundAINimRequest resource not found. Ignoring since object must be deleted")
err = nil
return
}
// Error reading the object - requeue the request.
logs.Error(err, "Failed to get compoundAINimRequest")
return
}
for _, condition := range compoundAINimRequest.Status.Conditions {
if condition.Type == nvidiacomv1alpha1.CompoundAIDeploymentConditionTypeAvailable && condition.Status == metav1.ConditionTrue {
logs.Info("Skip available compoundAINimRequest")
return
}
}
if len(compoundAINimRequest.Status.Conditions) == 0 {
compoundAINimRequest, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: nvidiacomv1alpha1.CompoundAINimRequestConditionTypeModelsSeeding,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: "Starting to reconcile compoundAINimRequest",
},
metav1.Condition{
Type: nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageBuilding,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: "Starting to reconcile compoundAINimRequest",
},
metav1.Condition{
Type: nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageExists,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: "Starting to reconcile compoundAINimRequest",
},
)
if err != nil {
return
}
}
logs = logs.WithValues("compoundAINimRequest", compoundAINimRequest.Name, "compoundAINimRequestNamespace", compoundAINimRequest.Namespace)
defer func() {
if err == nil {
logs.Info("Reconcile success")
return
}
logs.Error(err, "Failed to reconcile compoundAINimRequest.")
r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeWarning, "ReconcileError", "Failed to reconcile compoundAINimRequest: %v", err)
_, err_ := r.setStatusConditions(ctx, req,
metav1.Condition{
Type: nvidiacomv1alpha1.CompoundAINimRequestConditionTypeCompoundAINimAvailable,
Status: metav1.ConditionFalse,
Reason: "Reconciling",
Message: err.Error(),
},
)
if err_ != nil {
logs.Error(err_, "Failed to update compoundAINimRequest status")
return
}
}()
compoundAINimAvailableCondition := meta.FindStatusCondition(compoundAINimRequest.Status.Conditions, nvidiacomv1alpha1.CompoundAINimRequestConditionTypeCompoundAINimAvailable)
if compoundAINimAvailableCondition == nil || compoundAINimAvailableCondition.Status != metav1.ConditionUnknown {
compoundAINimRequest, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: nvidiacomv1alpha1.CompoundAINimRequestConditionTypeCompoundAINimAvailable,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: "Reconciling",
},
)
if err != nil {
return
}
}
separateModels := isSeparateModels(compoundAINimRequest)
modelsExists := false
var modelsExistsResult ctrl.Result
var modelsExistsErr error
if separateModels {
compoundAINimRequest, modelsExists, modelsExistsResult, modelsExistsErr = r.ensureModelsExists(ctx, ensureModelsExistsOption{
compoundAINimRequest: compoundAINimRequest,
req: req,
})
}
compoundAINimRequest, imageInfo, imageExists, imageExistsResult, err := r.ensureImageExists(ctx, ensureImageExistsOption{
compoundAINimRequest: compoundAINimRequest,
req: req,
})
if err != nil {
err = errors.Wrapf(err, "ensure image exists")
return
}
if !imageExists {
result = imageExistsResult
compoundAINimRequest, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: nvidiacomv1alpha1.CompoundAINimRequestConditionTypeCompoundAINimAvailable,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: "CompoundAINim image is building",
},
)
if err != nil {
return
}
return
}
if modelsExistsErr != nil {
err = errors.Wrap(modelsExistsErr, "ensure model exists")
return
}
if separateModels && !modelsExists {
result = modelsExistsResult
compoundAINimRequest, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: nvidiacomv1alpha1.CompoundAINimRequestConditionTypeCompoundAINimAvailable,
Status: metav1.ConditionUnknown,
Reason: "Reconciling",
Message: "Model is seeding",
},
)
if err != nil {
return
}
return
}
compoundAINimCR := &nvidiacomv1alpha1.CompoundAINim{
ObjectMeta: metav1.ObjectMeta{
Name: compoundAINimRequest.Name,
Namespace: compoundAINimRequest.Namespace,
},
Spec: nvidiacomv1alpha1.CompoundAINimSpec{
Tag: compoundAINimRequest.Spec.BentoTag,
Image: imageInfo.ImageName,
ServiceName: compoundAINimRequest.Spec.ServiceName,
Context: compoundAINimRequest.Spec.Context,
Models: compoundAINimRequest.Spec.Models,
},
}
if separateModels {
compoundAINimCR.Annotations = map[string]string{
commonconsts.KubeAnnotationYataiImageBuilderSeparateModels: commonconsts.KubeLabelValueTrue,
}
if isAddNamespacePrefix() { // deprecated
compoundAINimCR.Annotations[commonconsts.KubeAnnotationIsMultiTenancy] = commonconsts.KubeLabelValueTrue
}
compoundAINimCR.Annotations[KubeAnnotationModelStorageNS] = compoundAINimRequest.Annotations[KubeAnnotationModelStorageNS]
}
err = ctrl.SetControllerReference(compoundAINimRequest, compoundAINimCR, r.Scheme)
if err != nil {
err = errors.Wrap(err, "set controller reference")
return
}
if imageInfo.DockerConfigJSONSecretName != "" {
compoundAINimCR.Spec.ImagePullSecrets = []corev1.LocalObjectReference{
{
Name: imageInfo.DockerConfigJSONSecretName,
},
}
}
if compoundAINimRequest.Spec.DownloadURL == "" {
var compoundAINim *schemasv1.BentoFullSchema
compoundAINim, err = r.getCompoundAINim(ctx, compoundAINimRequest)
if err != nil {
err = errors.Wrap(err, "get compoundAINim")
return
}
compoundAINimCR.Spec.Context = &nvidiacomv1alpha1.BentoContext{
BentomlVersion: compoundAINim.Manifest.BentomlVersion,
}
}
r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "CompoundAINimImageBuilder", "Creating CompoundAINim CR %s in namespace %s", compoundAINimCR.Name, compoundAINimCR.Namespace)
err = r.Create(ctx, compoundAINimCR)
isAlreadyExists := k8serrors.IsAlreadyExists(err)
if err != nil && !isAlreadyExists {
err = errors.Wrap(err, "create CompoundAINim resource")
return
}
if isAlreadyExists {
oldCompoundAINimCR := &nvidiacomv1alpha1.CompoundAINim{}
r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "CompoundAINimImageBuilder", "Updating CompoundAINim CR %s in namespace %s", compoundAINimCR.Name, compoundAINimCR.Namespace)
err = r.Get(ctx, types.NamespacedName{Name: compoundAINimCR.Name, Namespace: compoundAINimCR.Namespace}, oldCompoundAINimCR)
if err != nil {
err = errors.Wrap(err, "get CompoundAINim resource")
return
}
if !reflect.DeepEqual(oldCompoundAINimCR.Spec, compoundAINimCR.Spec) {
oldCompoundAINimCR.OwnerReferences = compoundAINimCR.OwnerReferences
oldCompoundAINimCR.Spec = compoundAINimCR.Spec
err = r.Update(ctx, oldCompoundAINimCR)
if err != nil {
err = errors.Wrap(err, "update CompoundAINim resource")
return
}
}
}
compoundAINimRequest, err = r.setStatusConditions(ctx, req,
metav1.Condition{
Type: nvidiacomv1alpha1.CompoundAINimRequestConditionTypeCompoundAINimAvailable,
Status: metav1.ConditionTrue,
Reason: "Reconciling",
Message: "CompoundAINim is generated",
},
)
if err != nil {
return
}
return
}
// isEstargzEnabled reports whether eStargz image layers are enabled via the
// ESTARGZ_ENABLED environment variable.
func isEstargzEnabled() bool {
	enabled := os.Getenv("ESTARGZ_ENABLED")
	return enabled == commonconsts.KubeLabelValueTrue
}
// ensureImageExistsOption bundles the arguments for ensureImageExists.
type ensureImageExistsOption struct {
	// compoundAINimRequest is the CR currently being reconciled.
	compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest
	// req is the reconcile request; used to re-fetch the CR and scope job lookups.
	req ctrl.Request
}
// ensureImageExists makes sure the container image for the request exists in
// the target registry, creating (and garbage-collecting) an image-builder Job
// when it does not. It returns the freshly fetched request, the resolved image
// info, whether the image is usable now, and an optional requeue result.
// NOTE: the function uses naked returns on named results, so every early
// `return` propagates whatever has been assigned to them so far.
//nolint:gocyclo,nakedret
func (r *CompoundAINimRequestReconciler) ensureImageExists(ctx context.Context, opt ensureImageExistsOption) (compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest, imageInfo ImageInfo, imageExists bool, result ctrl.Result, err error) { // nolint: unparam
	logs := log.FromContext(ctx)
	compoundAINimRequest = opt.compoundAINimRequest
	req := opt.req
	// Resolve registry/image name for this request.
	imageInfo, err = r.getImageInfo(ctx, GetImageInfoOption{
		CompoundAINimRequest: compoundAINimRequest,
	})
	if err != nil {
		err = errors.Wrap(err, "get image info")
		return
	}
	imageExistsCheckedCondition := meta.FindStatusCondition(compoundAINimRequest.Status.Conditions, nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageExistsChecked)
	imageExistsCondition := meta.FindStatusCondition(compoundAINimRequest.Status.Conditions, nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageExists)
	// Only probe the registry when we have not already checked this exact
	// image name (the checked-condition's Message records which image was probed).
	if imageExistsCheckedCondition == nil || imageExistsCheckedCondition.Status != metav1.ConditionTrue || imageExistsCheckedCondition.Message != imageInfo.ImageName {
		imageExistsCheckedCondition = &metav1.Condition{
			Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageExistsChecked,
			Status:  metav1.ConditionUnknown,
			Reason:  "Reconciling",
			Message: imageInfo.ImageName,
		}
		compoundAINimAvailableCondition := &metav1.Condition{
			Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeCompoundAINimAvailable,
			Status:  metav1.ConditionUnknown,
			Reason:  "Reconciling",
			Message: "Checking image exists",
		}
		compoundAINimRequest, err = r.setStatusConditions(ctx, req, *imageExistsCheckedCondition, *compoundAINimAvailableCondition)
		if err != nil {
			return
		}
		r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "CheckingImage", "Checking image exists: %s", imageInfo.ImageName)
		imageExists, err = checkImageExists(compoundAINimRequest, imageInfo.DockerRegistry, imageInfo.InClusterImageName)
		if err != nil {
			err = errors.Wrapf(err, "check image %s exists", imageInfo.ImageName)
			return
		}
		// Re-fetch to avoid clobbering concurrent status updates below.
		err = r.Get(ctx, req.NamespacedName, compoundAINimRequest)
		if err != nil {
			logs.Error(err, "Failed to re-fetch compoundAINimRequest")
			return
		}
		if imageExists {
			r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "CheckingImage", "Image exists: %s", imageInfo.ImageName)
			imageExistsCheckedCondition = &metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageExistsChecked,
				Status:  metav1.ConditionTrue,
				Reason:  "Reconciling",
				Message: imageInfo.ImageName,
			}
			imageExistsCondition = &metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageExists,
				Status:  metav1.ConditionTrue,
				Reason:  "Reconciling",
				Message: imageInfo.ImageName,
			}
			compoundAINimRequest, err = r.setStatusConditions(ctx, req, *imageExistsCondition, *imageExistsCheckedCondition)
			if err != nil {
				return
			}
		} else {
			r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "CheckingImage", "Image not exists: %s", imageInfo.ImageName)
			imageExistsCheckedCondition = &metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageExistsChecked,
				Status:  metav1.ConditionFalse,
				Reason:  "Reconciling",
				Message: fmt.Sprintf("Image not exists: %s", imageInfo.ImageName),
			}
			imageExistsCondition = &metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageExists,
				Status:  metav1.ConditionFalse,
				Reason:  "Reconciling",
				Message: fmt.Sprintf("Image %s is not exists", imageInfo.ImageName),
			}
			compoundAINimRequest, err = r.setStatusConditions(ctx, req, *imageExistsCondition, *imageExistsCheckedCondition)
			if err != nil {
				return
			}
		}
	}
	// Hash of the request spec; used to detect stale builder jobs.
	var compoundAINimRequestHashStr string
	compoundAINimRequestHashStr, err = r.getHashStr(compoundAINimRequest)
	if err != nil {
		err = errors.Wrapf(err, "get compoundAINimRequest %s/%s hash string", compoundAINimRequest.Namespace, compoundAINimRequest.Name)
		return
	}
	// The image is usable only when the condition is True AND still refers to
	// the image name we resolved in this pass.
	imageExists = imageExistsCondition != nil && imageExistsCondition.Status == metav1.ConditionTrue && imageExistsCondition.Message == imageInfo.ImageName
	if imageExists {
		return
	}
	// Image missing: find (or create) the image-builder job for this request.
	jobLabels := map[string]string{
		commonconsts.KubeLabelBentoRequest:        compoundAINimRequest.Name,
		commonconsts.KubeLabelIsBentoImageBuilder: commonconsts.KubeLabelValueTrue,
	}
	if isSeparateModels(opt.compoundAINimRequest) {
		jobLabels[KubeLabelYataiImageBuilderSeparateModels] = commonconsts.KubeLabelValueTrue
	} else {
		jobLabels[KubeLabelYataiImageBuilderSeparateModels] = commonconsts.KubeLabelValueFalse
	}
	jobs := &batchv1.JobList{}
	err = r.List(ctx, jobs, client.InNamespace(req.Namespace), client.MatchingLabels(jobLabels))
	if err != nil {
		err = errors.Wrap(err, "list jobs")
		return
	}
	reservedJobs := make([]*batchv1.Job, 0)
	for _, job_ := range jobs.Items {
		job_ := job_
		oldHash := job_.Annotations[KubeAnnotationCompoundAINimRequestHash]
		if oldHash != compoundAINimRequestHashStr {
			logs.Info("Because hash changed, delete old job", "job", job_.Name, "oldHash", oldHash, "newHash", compoundAINimRequestHashStr)
			// --cascade=foreground
			err = r.Delete(ctx, &job_, &client.DeleteOptions{
				PropagationPolicy: &[]metav1.DeletionPropagation{metav1.DeletePropagationForeground}[0],
			})
			if err != nil {
				err = errors.Wrapf(err, "delete job %s", job_.Name)
				return
			}
			// Return right away after deleting a stale job; the next reconcile
			// pass will re-evaluate and create a fresh one.
			return
		} else {
			reservedJobs = append(reservedJobs, &job_)
		}
	}
	var job *batchv1.Job
	if len(reservedJobs) > 0 {
		job = reservedJobs[0]
	}
	// Keep only the first up-to-date job; delete any duplicates.
	if len(reservedJobs) > 1 {
		for _, job_ := range reservedJobs[1:] {
			logs.Info("Because has more than one job, delete old job", "job", job_.Name)
			// --cascade=foreground
			err = r.Delete(ctx, job_, &client.DeleteOptions{
				PropagationPolicy: &[]metav1.DeletionPropagation{metav1.DeletePropagationForeground}[0],
			})
			if err != nil {
				err = errors.Wrapf(err, "delete job %s", job_.Name)
				return
			}
		}
	}
	// No builder job yet: generate and create one, then return and wait.
	if job == nil {
		job, err = r.generateImageBuilderJob(ctx, GenerateImageBuilderJobOption{
			ImageInfo:            imageInfo,
			CompoundAINimRequest: compoundAINimRequest,
		})
		if err != nil {
			err = errors.Wrap(err, "generate image builder job")
			return
		}
		r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderJob", "Creating image builder job: %s", job.Name)
		err = r.Create(ctx, job)
		if err != nil {
			err = errors.Wrapf(err, "create image builder job %s", job.Name)
			return
		}
		r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderJob", "Created image builder job: %s", job.Name)
		return
	}
	// A builder job already exists: report its state through conditions.
	r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "CheckingImageBuilderJob", "Found image builder job: %s", job.Name)
	err = r.Get(ctx, req.NamespacedName, compoundAINimRequest)
	if err != nil {
		logs.Error(err, "Failed to re-fetch compoundAINimRequest")
		return
	}
	imageBuildingCondition := meta.FindStatusCondition(compoundAINimRequest.Status.Conditions, nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageBuilding)
	isJobFailed := false
	isJobRunning := true
	// Derive job state: succeeded when Succeeded == Completions, failed when a
	// JobFailed condition is True, otherwise still running/pending.
	if job.Spec.Completions != nil {
		if job.Status.Succeeded != *job.Spec.Completions {
			if job.Status.Failed > 0 {
				for _, condition := range job.Status.Conditions {
					if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue {
						isJobFailed = true
						break
					}
				}
			}
			isJobRunning = !isJobFailed
		} else {
			isJobRunning = false
		}
	}
	if isJobRunning {
		conditions := make([]metav1.Condition, 0)
		if job.Status.Active > 0 {
			conditions = append(conditions, metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageBuilding,
				Status:  metav1.ConditionTrue,
				Reason:  "Reconciling",
				Message: fmt.Sprintf("Image building job %s is running", job.Name),
			})
		} else {
			conditions = append(conditions, metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageBuilding,
				Status:  metav1.ConditionUnknown,
				Reason:  "Reconciling",
				Message: fmt.Sprintf("Image building job %s is waiting", job.Name),
			})
		}
		// Enforce the optional build timeout, measured from the last
		// ImageBuilding condition transition.
		if compoundAINimRequest.Spec.ImageBuildTimeout != nil {
			if imageBuildingCondition != nil && imageBuildingCondition.LastTransitionTime.Add(time.Duration(*compoundAINimRequest.Spec.ImageBuildTimeout)).Before(time.Now()) {
				conditions = append(conditions, metav1.Condition{
					Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageBuilding,
					Status:  metav1.ConditionFalse,
					Reason:  "Timeout",
					Message: fmt.Sprintf("Image building job %s is timeout", job.Name),
				})
				if _, err = r.setStatusConditions(ctx, req, conditions...); err != nil {
					return
				}
				err = errors.New("image build timeout")
				return
			}
		}
		if compoundAINimRequest, err = r.setStatusConditions(ctx, req, conditions...); err != nil {
			return
		}
		if imageBuildingCondition != nil && imageBuildingCondition.Status != metav1.ConditionTrue && isJobRunning {
			r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "CompoundAINimImageBuilder", "Image is building now")
		}
		return
	}
	if isJobFailed {
		compoundAINimRequest, err = r.setStatusConditions(ctx, req,
			metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageBuilding,
				Status:  metav1.ConditionFalse,
				Reason:  "Reconciling",
				Message: fmt.Sprintf("Image building job %s is failed.", job.Name),
			},
			metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeCompoundAINimAvailable,
				Status:  metav1.ConditionFalse,
				Reason:  "Reconciling",
				Message: fmt.Sprintf("Image building job %s is failed.", job.Name),
			},
		)
		if err != nil {
			return
		}
		return
	}
	// Job succeeded: mark the image as built and existing.
	compoundAINimRequest, err = r.setStatusConditions(ctx, req,
		metav1.Condition{
			Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageBuilding,
			Status:  metav1.ConditionFalse,
			Reason:  "Reconciling",
			Message: fmt.Sprintf("Image building job %s is succeeded.", job.Name),
		},
		metav1.Condition{
			Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeImageExists,
			Status:  metav1.ConditionTrue,
			Reason:  "Reconciling",
			Message: imageInfo.ImageName,
		},
	)
	if err != nil {
		return
	}
	r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "CompoundAINimImageBuilder", "Image has been built successfully")
	imageExists = true
	return
}
// ensureModelsExistsOption bundles the arguments for ensureModelsExists.
type ensureModelsExistsOption struct {
	// compoundAINimRequest is the CR currently being reconciled.
	compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest
	// req is the reconcile request; used when updating status conditions.
	req ctrl.Request
}
// ensureModelsExists makes sure every model referenced by the request has been
// seeded into its PVC, creating model-seeder Jobs (and their PVCs) as needed
// and reflecting progress in the ModelsExists/ModelsSeeding status conditions.
// NOTE: the function uses naked returns on named results, so every early
// `return` propagates whatever has been assigned to them so far.
//nolint:gocyclo,nakedret
func (r *CompoundAINimRequestReconciler) ensureModelsExists(ctx context.Context, opt ensureModelsExistsOption) (compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest, modelsExists bool, result ctrl.Result, err error) { // nolint: unparam
	compoundAINimRequest = opt.compoundAINimRequest
	modelTags := make([]string, 0)
	for _, model := range compoundAINimRequest.Spec.Models {
		modelTags = append(modelTags, model.Tag)
	}
	modelsExistsCondition := meta.FindStatusCondition(compoundAINimRequest.Status.Conditions, nvidiacomv1alpha1.CompoundAINimRequestConditionTypeModelsExists)
	r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "SeparateModels", "Separate models are enabled")
	if modelsExistsCondition == nil || modelsExistsCondition.Status == metav1.ConditionUnknown {
		r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "ModelsExists", "Models are not ready")
		modelsExistsCondition = &metav1.Condition{
			Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeModelsExists,
			Status:  metav1.ConditionFalse,
			Reason:  "Reconciling",
			Message: "Models are not ready",
		}
		compoundAINimRequest, err = r.setStatusConditions(ctx, opt.req, *modelsExistsCondition)
		if err != nil {
			return
		}
	}
	// Models are considered present only when the condition is True and its
	// Message still matches the current storage class + model tag set.
	modelsExists = modelsExistsCondition != nil && modelsExistsCondition.Status == metav1.ConditionTrue && modelsExistsCondition.Message == fmt.Sprintf("%s:%s", getJuiceFSStorageClassName(), strings.Join(modelTags, ", "))
	if modelsExists {
		return
	}
	modelsMap := make(map[string]*nvidiacomv1alpha1.BentoModel)
	for _, model := range compoundAINimRequest.Spec.Models {
		model := model
		modelsMap[model.Tag] = &model
	}
	jobLabels := map[string]string{
		commonconsts.KubeLabelBentoRequest:  compoundAINimRequest.Name,
		commonconsts.KubeLabelIsModelSeeder: "true",
	}
	jobs := &batchv1.JobList{}
	err = r.List(ctx, jobs, client.InNamespace(compoundAINimRequest.Namespace), client.MatchingLabels(jobLabels))
	if err != nil {
		err = errors.Wrap(err, "list jobs")
		return
	}
	var compoundAINimRequestHashStr string
	compoundAINimRequestHashStr, err = r.getHashStr(compoundAINimRequest)
	if err != nil {
		err = errors.Wrapf(err, "get compoundAINimRequest %s/%s hash string", compoundAINimRequest.Namespace, compoundAINimRequest.Name)
		return
	}
	// Garbage-collect seeder jobs that are stale (hash changed) or orphaned
	// (their model is no longer in the spec); remember which models already
	// have an up-to-date job.
	existingJobModelTags := make(map[string]struct{})
	for _, job_ := range jobs.Items {
		job_ := job_
		oldHash := job_.Annotations[KubeAnnotationCompoundAINimRequestHash]
		if oldHash != compoundAINimRequestHashStr {
			r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "DeleteJob", "Because hash changed, delete old job %s, oldHash: %s, newHash: %s", job_.Name, oldHash, compoundAINimRequestHashStr)
			// --cascade=foreground
			err = r.Delete(ctx, &job_, &client.DeleteOptions{
				PropagationPolicy: &[]metav1.DeletionPropagation{metav1.DeletePropagationForeground}[0],
			})
			if err != nil {
				err = errors.Wrapf(err, "delete job %s", job_.Name)
				return
			}
			continue
		}
		modelTag := fmt.Sprintf("%s:%s", job_.Labels[commonconsts.KubeLabelYataiModelRepository], job_.Labels[commonconsts.KubeLabelYataiModel])
		_, ok := modelsMap[modelTag]
		if !ok {
			r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "DeleteJob", "Due to the nonexistence of the model %s, job %s has been deleted.", modelTag, job_.Name)
			// --cascade=foreground
			err = r.Delete(ctx, &job_, &client.DeleteOptions{
				PropagationPolicy: &[]metav1.DeletionPropagation{metav1.DeletePropagationForeground}[0],
			})
			if err != nil {
				err = errors.Wrapf(err, "delete job %s", job_.Name)
				return
			}
		} else {
			existingJobModelTags[modelTag] = struct{}{}
		}
	}
	// Create missing PVCs and seeder jobs for models that don't have one yet.
	for _, model := range compoundAINimRequest.Spec.Models {
		if _, ok := existingJobModelTags[model.Tag]; ok {
			continue
		}
		model := model
		pvc := &corev1.PersistentVolumeClaim{}
		pvcName := r.getModelPVCName(compoundAINimRequest, &model)
		err = r.Get(ctx, client.ObjectKey{
			Namespace: compoundAINimRequest.Namespace,
			Name:      pvcName,
		}, pvc)
		isPVCNotFound := k8serrors.IsNotFound(err)
		if err != nil && !isPVCNotFound {
			err = errors.Wrapf(err, "get PVC %s/%s", compoundAINimRequest.Namespace, pvcName)
			return
		}
		if isPVCNotFound {
			pvc = r.generateModelPVC(GenerateModelPVCOption{
				CompoundAINimRequest: compoundAINimRequest,
				Model:                &model,
			})
			err = r.Create(ctx, pvc)
			isPVCAlreadyExists := k8serrors.IsAlreadyExists(err)
			// AlreadyExists is fine: another reconcile pass won the race.
			if err != nil && !isPVCAlreadyExists {
				err = errors.Wrapf(err, "create model %s/%s pvc", compoundAINimRequest.Namespace, model.Tag)
				return
			}
		}
		var job *batchv1.Job
		job, err = r.generateModelSeederJob(ctx, GenerateModelSeederJobOption{
			CompoundAINimRequest: compoundAINimRequest,
			Model:                &model,
		})
		if err != nil {
			err = errors.Wrap(err, "generate model seeder job")
			return
		}
		oldJob := &batchv1.Job{}
		err = r.Get(ctx, client.ObjectKeyFromObject(job), oldJob)
		oldJobIsNotFound := k8serrors.IsNotFound(err)
		if err != nil && !oldJobIsNotFound {
			err = errors.Wrap(err, "get job")
			return
		}
		if oldJobIsNotFound {
			err = r.Create(ctx, job)
			if err != nil {
				err = errors.Wrap(err, "create job")
				return
			}
			r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "CreateJob", "Job %s has been created.", job.Name)
		} else if !reflect.DeepEqual(job.Labels, oldJob.Labels) || !reflect.DeepEqual(job.Annotations, oldJob.Annotations) {
			job.Labels = oldJob.Labels
			job.Annotations = oldJob.Annotations
			err = r.Update(ctx, job)
			if err != nil {
				err = errors.Wrap(err, "update job")
				return
			}
			r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "UpdateJob", "Job %s has been updated.", job.Name)
		}
	}
	// Re-list to pick up jobs we just created, then classify each job as
	// succeeded, failed, or not yet ready.
	jobs = &batchv1.JobList{}
	err = r.List(ctx, jobs, client.InNamespace(compoundAINimRequest.Namespace), client.MatchingLabels(jobLabels))
	if err != nil {
		err = errors.Wrap(err, "list jobs")
		return
	}
	succeedModelTags := make(map[string]struct{})
	failedJobNames := make([]string, 0)
	notReadyJobNames := make([]string, 0)
	for _, job_ := range jobs.Items {
		if job_.Spec.Completions != nil && job_.Status.Succeeded == *job_.Spec.Completions {
			modelTag := fmt.Sprintf("%s:%s", job_.Labels[commonconsts.KubeLabelYataiModelRepository], job_.Labels[commonconsts.KubeLabelYataiModel])
			succeedModelTags[modelTag] = struct{}{}
			continue
		}
		// BUGFIX: the previous code used `continue` inside the inner
		// conditions loop, which only skipped to the next condition — a
		// failed job was recorded once per matching condition AND also
		// counted as not-ready. Mirror the image-builder logic instead:
		// record the job as failed once and move on to the next job.
		if job_.Status.Failed > 0 {
			isFailed := false
			for _, condition := range job_.Status.Conditions {
				if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue {
					isFailed = true
					break
				}
			}
			if isFailed {
				failedJobNames = append(failedJobNames, job_.Name)
				continue
			}
		}
		notReadyJobNames = append(notReadyJobNames, job_.Name)
	}
	if len(failedJobNames) > 0 {
		msg := fmt.Sprintf("Model seeder jobs failed: %s", strings.Join(failedJobNames, ", "))
		r.Recorder.Event(compoundAINimRequest, corev1.EventTypeNormal, "ModelsExists", msg)
		compoundAINimRequest, err = r.setStatusConditions(ctx, opt.req,
			metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeModelsExists,
				Status:  metav1.ConditionFalse,
				Reason:  "Reconciling",
				Message: msg,
			},
			metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeCompoundAINimAvailable,
				Status:  metav1.ConditionFalse,
				Reason:  "Reconciling",
				Message: msg,
			},
		)
		if err != nil {
			return
		}
		err = errors.New(msg)
		return
	}
	// All models exist only when every spec model has a succeeded seeder job.
	modelsExists = true
	for _, model := range compoundAINimRequest.Spec.Models {
		if _, ok := succeedModelTags[model.Tag]; !ok {
			modelsExists = false
			break
		}
	}
	if modelsExists {
		compoundAINimRequest, err = r.setStatusConditions(ctx, opt.req,
			metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeModelsExists,
				Status:  metav1.ConditionTrue,
				Reason:  "Reconciling",
				Message: fmt.Sprintf("%s:%s", getJuiceFSStorageClassName(), strings.Join(modelTags, ", ")),
			},
			metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeModelsSeeding,
				Status:  metav1.ConditionFalse,
				Reason:  "Reconciling",
				Message: "All models have been seeded.",
			},
		)
		if err != nil {
			return
		}
	} else {
		compoundAINimRequest, err = r.setStatusConditions(ctx, opt.req,
			metav1.Condition{
				Type:    nvidiacomv1alpha1.CompoundAINimRequestConditionTypeModelsSeeding,
				Status:  metav1.ConditionTrue,
				Reason:  "Reconciling",
				Message: fmt.Sprintf("Model seeder jobs are not ready: %s.", strings.Join(notReadyJobNames, ", ")),
			},
		)
		if err != nil {
			return
		}
	}
	return
}
// setStatusConditions re-fetches the CompoundAINimRequest, applies the given
// status conditions, and updates its status, retrying a few times to ride out
// optimistic-concurrency conflicts ("the object has been modified; please
// apply your changes to the latest version and try again"). It returns the
// freshly fetched CR after the update succeeds.
func (r *CompoundAINimRequestReconciler) setStatusConditions(ctx context.Context, req ctrl.Request, conditions ...metav1.Condition) (compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest, err error) {
	compoundAINimRequest = &nvidiacomv1alpha1.CompoundAINimRequest{}
	// Status updates race with other writers, and the API server rejects
	// stale resource versions; retry the full get-mutate-update cycle.
	const maxAttempts = 3
	for attempt := 0; attempt < maxAttempts; attempt++ {
		if err = r.Get(ctx, req.NamespacedName, compoundAINimRequest); err != nil {
			err = errors.Wrap(err, "Failed to re-fetch compoundAINimRequest")
			return
		}
		for _, condition := range conditions {
			meta.SetStatusCondition(&compoundAINimRequest.Status.Conditions, condition)
		}
		err = r.Status().Update(ctx, compoundAINimRequest)
		if err == nil {
			break
		}
		// Conflict or transient failure: back off briefly before retrying.
		time.Sleep(100 * time.Millisecond)
	}
	if err != nil {
		err = errors.Wrap(err, "Failed to update compoundAINimRequest status")
		return
	}
	// Hand back the latest server-side state so callers work with a fresh object.
	if err = r.Get(ctx, req.NamespacedName, compoundAINimRequest); err != nil {
		err = errors.Wrap(err, "Failed to re-fetch compoundAINimRequest")
		return
	}
	return
}
type CompoundAINimImageBuildEngine string
// Supported image build engines.
const (
	// CompoundAINimImageBuildEngineKaniko builds images with kaniko (the default).
	CompoundAINimImageBuildEngineKaniko CompoundAINimImageBuildEngine = "kaniko"
	// CompoundAINimImageBuildEngineBuildkit builds images with BuildKit.
	CompoundAINimImageBuildEngineBuildkit CompoundAINimImageBuildEngine = "buildkit"
	// CompoundAINimImageBuildEngineBuildkitRootless builds images with rootless BuildKit.
	CompoundAINimImageBuildEngineBuildkitRootless CompoundAINimImageBuildEngine = "buildkit-rootless"
)
const (
	// EnvCompoundAINimImageBuildEngine is the environment variable that selects
	// the image build engine; see getCompoundAINimImageBuildEngine.
	EnvCompoundAINimImageBuildEngine = "BENTO_IMAGE_BUILD_ENGINE"
)
// getCompoundAINimImageBuildEngine resolves the image build engine from the
// BENTO_IMAGE_BUILD_ENGINE environment variable, defaulting to kaniko when unset.
func getCompoundAINimImageBuildEngine() CompoundAINimImageBuildEngine {
	if engine := os.Getenv(EnvCompoundAINimImageBuildEngine); engine != "" {
		return CompoundAINimImageBuildEngine(engine)
	}
	return CompoundAINimImageBuildEngineKaniko
}
// makeSureDockerConfigJSONSecret ensures a dockerconfigjson Secret with the
// given registry credentials exists (and is up to date) in the namespace.
// It returns nil without error when the registry has no username configured.
// A create/create race with another reconciler is tolerated by re-fetching.
//nolint:nakedret
func (r *CompoundAINimRequestReconciler) makeSureDockerConfigJSONSecret(ctx context.Context, namespace string, dockerRegistryConf *commonconfig.DockerRegistryConfig) (dockerConfigJSONSecret *corev1.Secret, err error) {
	// No credentials configured: nothing to create.
	if dockerRegistryConf.Username == "" {
		return
	}
	// nolint: gosec
	dockerConfigSecretName := commonconsts.KubeSecretNameRegcred
	// Build the standard .dockerconfigjson payload: {"auths": {server: {"auth": base64(user:pass)}}}.
	dockerConfigObj := struct {
		Auths map[string]struct {
			Auth string `json:"auth"`
		} `json:"auths"`
	}{
		Auths: map[string]struct {
			Auth string `json:"auth"`
		}{
			dockerRegistryConf.Server: {
				Auth: base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", dockerRegistryConf.Username, dockerRegistryConf.Password))),
			},
		},
	}
	dockerConfigContent, err := json.Marshal(dockerConfigObj)
	if err != nil {
		err = errors.Wrap(err, "marshal docker config")
		return nil, err
	}
	dockerConfigJSONSecret = &corev1.Secret{}
	err = r.Get(ctx, types.NamespacedName{Namespace: namespace, Name: dockerConfigSecretName}, dockerConfigJSONSecret)
	dockerConfigIsNotFound := k8serrors.IsNotFound(err)
	// nolint: gocritic
	if err != nil && !dockerConfigIsNotFound {
		err = errors.Wrap(err, "get docker config secret")
		return nil, err
	}
	// Clear the NotFound error before branching; it is handled below.
	err = nil
	if dockerConfigIsNotFound {
		dockerConfigJSONSecret = &corev1.Secret{
			Type: corev1.SecretTypeDockerConfigJson,
			ObjectMeta: metav1.ObjectMeta{
				Name:      dockerConfigSecretName,
				Namespace: namespace,
			},
			Data: map[string][]byte{
				".dockerconfigjson": dockerConfigContent,
			},
		}
		err_ := r.Create(ctx, dockerConfigJSONSecret)
		if err_ != nil {
			// Creation failed — possibly because another reconciler created
			// the secret concurrently. Re-fetch to find out.
			dockerConfigJSONSecret = &corev1.Secret{}
			err = r.Get(ctx, types.NamespacedName{Namespace: namespace, Name: dockerConfigSecretName}, dockerConfigJSONSecret)
			dockerConfigIsNotFound = k8serrors.IsNotFound(err)
			if err != nil && !dockerConfigIsNotFound {
				err = errors.Wrap(err, "get docker config secret")
				return nil, err
			}
			if dockerConfigIsNotFound {
				// Still not there: the create failure was genuine.
				err_ = errors.Wrap(err_, "create docker config secret")
				return nil, err_
			}
			// NOTE(review): at this point err is either nil or was already
			// returned above, so this reset looks like dead defensive code —
			// confirm before removing.
			if err != nil {
				err = nil
			}
		}
	} else {
		// Secret already exists: refresh its credentials in place.
		dockerConfigJSONSecret.Data[".dockerconfigjson"] = dockerConfigContent
		err = r.Update(ctx, dockerConfigJSONSecret)
		if err != nil {
			err = errors.Wrap(err, "update docker config secret")
			return nil, err
		}
	}
	return
}
// getYataiClient builds a Yatai API client from the Yatai config secret.
// It returns nil yataiClient and yataiConf (with a nil error) when the config
// secret is absent or the configured endpoint is empty, so callers MUST
// nil-check the returned client before dereferencing it.
//nolint:nakedret
func (r *CompoundAINimRequestReconciler) getYataiClient(ctx context.Context) (yataiClient **yataiclient.YataiClient, yataiConf **commonconfig.YataiConfig, err error) {
	yataiConf_, err := commonconfig.GetYataiConfig(ctx, func(ctx context.Context, namespace, name string) (*corev1.Secret, error) {
		secret := &corev1.Secret{}
		err := r.Get(ctx, types.NamespacedName{
			Namespace: namespace,
			Name:      name,
		}, secret)
		return secret, errors.Wrap(err, "get secret")
	}, commonconsts.YataiImageBuilderComponentName, false)
	isNotFound := k8serrors.IsNotFound(err)
	if err != nil && !isNotFound {
		err = errors.Wrap(err, "get yatai config")
		return
	}
	// Missing config secret is not an error: Yatai integration is optional.
	if isNotFound {
		return
	}
	if yataiConf_.Endpoint == "" {
		return
	}
	if yataiConf_.ClusterName == "" {
		yataiConf_.ClusterName = "default"
	}
	// The API token is formatted as "<component>:<cluster>:<token>".
	yataiClient_ := yataiclient.NewYataiClient(yataiConf_.Endpoint, fmt.Sprintf("%s:%s:%s", commonconsts.YataiImageBuilderComponentName, yataiConf_.ClusterName, yataiConf_.ApiToken))
	yataiClient = &yataiClient_
	yataiConf = &yataiConf_
	return
}
// getYataiClientWithAuth builds a Yatai client carrying the CompoundAI
// org/user auth headers taken from the request labels, falling back to the
// default org/user when the labels are absent. Like getYataiClient, it may
// return a nil client with a nil error when Yatai is not configured.
func (r *CompoundAINimRequestReconciler) getYataiClientWithAuth(ctx context.Context, compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest) (**yataiclient.YataiClient, **commonconfig.YataiConfig, error) {
	orgId, ok := compoundAINimRequest.Labels[commonconsts.NgcOrganizationHeaderName]
	if !ok {
		orgId = commonconsts.DefaultOrgId
	}
	userId, ok := compoundAINimRequest.Labels[commonconsts.NgcUserHeaderName]
	if !ok {
		userId = commonconsts.DefaultUserId
	}
	auth := yataiclient.CompoundAIAuthHeaders{
		OrgId:  orgId,
		UserId: userId,
	}
	client, yataiConf, err := r.getYataiClient(ctx)
	if err != nil {
		return nil, nil, err
	}
	// BUGFIX: getYataiClient legitimately returns a nil client (with a nil
	// error) when the Yatai config secret is absent or has no endpoint;
	// dereferencing it unconditionally panicked here.
	if client == nil || *client == nil {
		return client, yataiConf, nil
	}
	(*client).SetAuth(auth)
	return client, yataiConf, nil
}
// getDockerRegistry resolves the docker registry settings for a request.
// If the request names a dockerconfigjson secret, credentials are extracted
// from it (preferring the auth entry matching the request image's registry);
// otherwise the operator-wide docker registry config is used to derive the
// bento and model repository URIs.
//nolint:nakedret
func (r *CompoundAINimRequestReconciler) getDockerRegistry(ctx context.Context, compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest) (dockerRegistry modelschemas.DockerRegistrySchema, err error) {
	if compoundAINimRequest != nil && compoundAINimRequest.Spec.DockerConfigJSONSecretName != "" {
		// Per-request credentials: parse the referenced dockerconfigjson secret.
		secret := &corev1.Secret{}
		err = r.Get(ctx, types.NamespacedName{
			Namespace: compoundAINimRequest.Namespace,
			Name:      compoundAINimRequest.Spec.DockerConfigJSONSecretName,
		}, secret)
		if err != nil {
			err = errors.Wrapf(err, "get docker config json secret %s", compoundAINimRequest.Spec.DockerConfigJSONSecretName)
			return
		}
		configJSON, ok := secret.Data[".dockerconfigjson"]
		if !ok {
			err = errors.Errorf("docker config json secret %s does not have .dockerconfigjson key", compoundAINimRequest.Spec.DockerConfigJSONSecretName)
			return
		}
		var configObj struct {
			Auths map[string]struct {
				Auth string `json:"auth"`
			} `json:"auths"`
		}
		err = json.Unmarshal(configJSON, &configObj)
		if err != nil {
			err = errors.Wrapf(err, "unmarshal docker config json secret %s", compoundAINimRequest.Spec.DockerConfigJSONSecretName)
			return
		}
		// The registry host is everything before the first "/" in the image ref.
		imageRegistryURI, _, _ := xstrings.Partition(compoundAINimRequest.Spec.Image, "/")
		var server string
		var auth string
		if imageRegistryURI != "" {
			// First preference: exact match on the registry host.
			for k, v := range configObj.Auths {
				if k == imageRegistryURI {
					server = k
					auth = v.Auth
					break
				}
			}
			// Second preference: any auth entry containing the host substring.
			if server == "" {
				for k, v := range configObj.Auths {
					if strings.Contains(k, imageRegistryURI) {
						server = k
						auth = v.Auth
						break
					}
				}
			}
		}
		// Last resort: pick any configured entry (map iteration order is
		// unspecified, so this is effectively an arbitrary pick).
		if server == "" {
			for k, v := range configObj.Auths {
				server = k
				auth = v.Auth
				break
			}
		}
		if server == "" {
			err = errors.Errorf("no auth in docker config json secret %s", compoundAINimRequest.Spec.DockerConfigJSONSecretName)
			return
		}
		dockerRegistry.Server = server
		// "auth" is base64("username:password").
		var credentials []byte
		credentials, err = base64.StdEncoding.DecodeString(auth)
		if err != nil {
			err = errors.Wrapf(err, "cannot base64 decode auth in docker config json secret %s", compoundAINimRequest.Spec.DockerConfigJSONSecretName)
			return
		}
		dockerRegistry.Username, _, dockerRegistry.Password = xstrings.Partition(string(credentials), ":")
		if compoundAINimRequest.Spec.OCIRegistryInsecure != nil {
			dockerRegistry.Secure = !*compoundAINimRequest.Spec.OCIRegistryInsecure
		}
		return
	}
	// Fallback: operator-wide docker registry configuration.
	dockerRegistryConfig, err := commonconfig.GetDockerRegistryConfig(ctx, func(ctx context.Context, namespace, name string) (*corev1.Secret, error) {
		secret := &corev1.Secret{}
		err := r.Get(ctx, types.NamespacedName{
			Namespace: namespace,
			Name:      name,
		}, secret)
		return secret, errors.Wrap(err, "get secret")
	})
	if err != nil {
		err = errors.Wrap(err, "get docker registry")
		return
	}
	// Repository names default to the legacy Yatai conventions.
	compoundAINimRepositoryName := "yatai-bentos"
	modelRepositoryName := "yatai-models"
	if dockerRegistryConfig.BentoRepositoryName != "" {
		compoundAINimRepositoryName = dockerRegistryConfig.BentoRepositoryName
	}
	if dockerRegistryConfig.ModelRepositoryName != "" {
		modelRepositoryName = dockerRegistryConfig.ModelRepositoryName
	}
	compoundAINimRepositoryURI := fmt.Sprintf("%s/%s", strings.TrimRight(dockerRegistryConfig.Server, "/"), compoundAINimRepositoryName)
	modelRepositoryURI := fmt.Sprintf("%s/%s", strings.TrimRight(dockerRegistryConfig.Server, "/"), modelRepositoryName)
	// Docker Hub repositories are addressed as docker.io/<repo> regardless of
	// the configured server string.
	if strings.Contains(dockerRegistryConfig.Server, "docker.io") {
		compoundAINimRepositoryURI = fmt.Sprintf("docker.io/%s", compoundAINimRepositoryName)
		modelRepositoryURI = fmt.Sprintf("docker.io/%s", modelRepositoryName)
	}
	// In-cluster URIs mirror the external ones unless a separate in-cluster
	// server is configured.
	compoundAINimRepositoryInClusterURI := compoundAINimRepositoryURI
	modelRepositoryInClusterURI := modelRepositoryURI
	if dockerRegistryConfig.InClusterServer != "" {
		compoundAINimRepositoryInClusterURI = fmt.Sprintf("%s/%s", strings.TrimRight(dockerRegistryConfig.InClusterServer, "/"), compoundAINimRepositoryName)
		modelRepositoryInClusterURI = fmt.Sprintf("%s/%s", strings.TrimRight(dockerRegistryConfig.InClusterServer, "/"), modelRepositoryName)
		if strings.Contains(dockerRegistryConfig.InClusterServer, "docker.io") {
			compoundAINimRepositoryInClusterURI = fmt.Sprintf("docker.io/%s", compoundAINimRepositoryName)
			modelRepositoryInClusterURI = fmt.Sprintf("docker.io/%s", modelRepositoryName)
		}
	}
	dockerRegistry = modelschemas.DockerRegistrySchema{
		Server:                       dockerRegistryConfig.Server,
		Username:                     dockerRegistryConfig.Username,
		Password:                     dockerRegistryConfig.Password,
		Secure:                       dockerRegistryConfig.Secure,
		BentosRepositoryURI:          compoundAINimRepositoryURI,
		BentosRepositoryURIInCluster: compoundAINimRepositoryInClusterURI,
		ModelsRepositoryURI:          modelRepositoryURI,
		ModelsRepositoryURIInCluster: modelRepositoryInClusterURI,
	}
	return
}
// isAddNamespacePrefix reports whether image names and model namespaces
// should be prefixed with the request's namespace, as controlled by the
// ADD_NAMESPACE_PREFIX_TO_IMAGE_NAME environment variable.
func isAddNamespacePrefix() bool {
	value := os.Getenv("ADD_NAMESPACE_PREFIX_TO_IMAGE_NAME")
	return value == trueStr
}
// getCompoundAINimImagePrefix returns the dot-terminated prefix used in image
// tags: the storage-namespace annotation when set, otherwise the request's
// namespace when namespace prefixing is enabled, otherwise "". A nil request
// yields "".
func getCompoundAINimImagePrefix(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest) string {
	if compoundAINimRequest == nil {
		return ""
	}
	if ns, ok := compoundAINimRequest.Annotations[KubeAnnotationCompoundAINimStorageNS]; ok && ns != "" {
		return ns + "."
	}
	if isAddNamespacePrefix() {
		return compoundAINimRequest.Namespace + "."
	}
	return ""
}
// getModelNamespace returns the namespace under which models are stored: the
// model-storage annotation when set, otherwise the request's namespace when
// namespace prefixing is enabled, otherwise "". A nil request yields "".
func getModelNamespace(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest) string {
	if compoundAINimRequest == nil {
		return ""
	}
	if ns := compoundAINimRequest.Annotations[KubeAnnotationModelStorageNS]; ns != "" {
		return ns
	}
	if isAddNamespacePrefix() {
		return compoundAINimRequest.Namespace
	}
	return ""
}
// getCompoundAINimImageName builds the image reference ("uri:tag") for a
// CompoundAINim. An explicit Spec.Image on the request wins outright. The tag
// encodes the repository name and version plus optional ".nomodels"/".esgz"
// suffixes, and is progressively hashed when it would exceed the 128-character
// tag limit.
func getCompoundAINimImageName(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest, dockerRegistry modelschemas.DockerRegistrySchema, compoundAINimRepositoryName, compoundAINimVersion string, inCluster bool) string {
	if compoundAINimRequest != nil && compoundAINimRequest.Spec.Image != "" {
		return compoundAINimRequest.Spec.Image
	}
	var uri string
	if inCluster {
		uri = dockerRegistry.BentosRepositoryURIInCluster
	} else {
		uri = dockerRegistry.BentosRepositoryURI
	}
	tail := fmt.Sprintf("%s.%s", compoundAINimRepositoryName, compoundAINimVersion)
	if isSeparateModels(compoundAINimRequest) {
		tail += ".nomodels"
	}
	if isEstargzEnabled() {
		tail += ".esgz"
	}
	tag := fmt.Sprintf("yatai.%s%s", getCompoundAINimImagePrefix(compoundAINimRequest), tail)
	if len(tag) > 128 {
		// First fallback: hash only the tail so the prefix stays readable.
		tag = fmt.Sprintf("yatai.%s%s", getCompoundAINimImagePrefix(compoundAINimRequest), hash(tail))
		if len(tag) > 128 {
			// Last resort: hash prefix+tail together. The result is "yatai."
			// plus a 32-char md5 hex digest (38 chars total), always within the
			// 128 limit.
			// BUG FIX: the previous code sliced this string with [:128], which
			// always panics ("slice bounds out of range") because the string is
			// only 38 characters long.
			tag = fmt.Sprintf("yatai.%s", hash(fmt.Sprintf("%s%s", getCompoundAINimImagePrefix(compoundAINimRequest), tail)))
		}
	}
	return fmt.Sprintf("%s:%s", uri, tag)
}
// isSeparateModels reports whether the request asks for models to be kept out
// of the built image (the yatai separate-models annotation is "true").
//
// A nil request is treated as "not separate". The sibling helpers
// getCompoundAINimImagePrefix and getModelNamespace already tolerate nil, and
// getCompoundAINimImageName may reach this with a nil request — previously
// that dereferenced a nil pointer and panicked.
func isSeparateModels(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest) (separateModels bool) {
	if compoundAINimRequest == nil {
		return false
	}
	return compoundAINimRequest.Annotations[commonconsts.KubeAnnotationYataiImageBuilderSeparateModels] == commonconsts.KubeLabelValueTrue
}
// checkImageExists reports whether the given image reference (name:tag)
// already exists in the docker registry, allowing the build to be skipped.
// The "yatai.ai/force-build-image" annotation forces a rebuild by always
// reporting the image as missing.
func checkImageExists(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest, dockerRegistry modelschemas.DockerRegistrySchema, imageName string) (bool, error) {
	if compoundAINimRequest.Annotations["yatai.ai/force-build-image"] == commonconsts.KubeLabelValueTrue {
		// Rebuild explicitly requested: pretend the image is absent.
		return false, nil
	}
	server, _, imageName := xstrings.Partition(imageName, "/")
	if strings.Contains(server, "docker.io") {
		// Docker Hub's registry API is served from index.docker.io.
		server = "index.docker.io"
	}
	if dockerRegistry.Secure {
		server = "https://" + server
	} else {
		server = "http://" + server
	}
	hub, err := registry.New(server, dockerRegistry.Username, dockerRegistry.Password, logrus.Debugf)
	if err != nil {
		return false, errors.Wrapf(err, "create docker registry client for %s", server)
	}
	repoName, _, wantTag := xstrings.LastPartition(imageName, ":")
	tags, err := hub.Tags(repoName)
	if err != nil {
		if strings.Contains(err.Error(), "404") {
			// Repository does not exist yet, so neither does the image.
			return false, nil
		}
		return false, errors.Wrapf(err, "get tags for docker image %s", repoName)
	}
	for _, existingTag := range tags {
		if existingTag == wantTag {
			return true, nil
		}
	}
	return false, nil
}
// ImageInfo describes where the CompoundAINim image lives and how to reach it.
type ImageInfo struct {
	// DockerRegistry is the resolved registry configuration.
	DockerRegistry modelschemas.DockerRegistrySchema
	// DockerConfigJSONSecretName names the secret holding the
	// .dockerconfigjson credentials used to pull/push the image.
	DockerConfigJSONSecretName string
	// ImageName is the image reference as seen from outside the cluster.
	ImageName string
	// InClusterImageName is the image reference as seen from inside the cluster.
	InClusterImageName string
	// DockerRegistryInsecure indicates the registry should be treated as insecure.
	DockerRegistryInsecure bool
}
// GetImageInfoOption is the input to getImageInfo.
type GetImageInfoOption struct {
	// CompoundAINimRequest is the request being reconciled.
	CompoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest
}
// getImageInfo resolves how the CompoundAINim image is named and accessed:
// the docker registry schema, the image name from outside and from inside the
// cluster, the docker-config secret used for authentication, and whether the
// registry should be treated as insecure.
//nolint:nakedret
func (r *CompoundAINimRequestReconciler) getImageInfo(ctx context.Context, opt GetImageInfoOption) (imageInfo ImageInfo, err error) {
	compoundAINimRepositoryName, _, compoundAINimVersion := xstrings.Partition(opt.CompoundAINimRequest.Spec.BentoTag, ":")
	dockerRegistry, err := r.getDockerRegistry(ctx, opt.CompoundAINimRequest)
	if err != nil {
		err = errors.Wrap(err, "get docker registry")
		return
	}
	imageInfo.DockerRegistry = dockerRegistry
	imageInfo.ImageName = getCompoundAINimImageName(opt.CompoundAINimRequest, dockerRegistry, compoundAINimRepositoryName, compoundAINimVersion, false)
	imageInfo.InClusterImageName = getCompoundAINimImageName(opt.CompoundAINimRequest, dockerRegistry, compoundAINimRepositoryName, compoundAINimVersion, true)
	imageInfo.DockerConfigJSONSecretName = opt.CompoundAINimRequest.Spec.DockerConfigJSONSecretName
	// Insecure flag comes from the annotation first; an explicit
	// Spec.OCIRegistryInsecure overrides it.
	imageInfo.DockerRegistryInsecure = opt.CompoundAINimRequest.Annotations[commonconsts.KubeAnnotationDockerRegistryInsecure] == "true"
	if opt.CompoundAINimRequest.Spec.OCIRegistryInsecure != nil {
		imageInfo.DockerRegistryInsecure = *opt.CompoundAINimRequest.Spec.OCIRegistryInsecure
	}
	// No secret named on the spec: fall back to the operator-level docker
	// registry config and make sure a docker-config secret exists in the
	// request's namespace.
	if imageInfo.DockerConfigJSONSecretName == "" {
		var dockerRegistryConf *commonconfig.DockerRegistryConfig
		dockerRegistryConf, err = commonconfig.GetDockerRegistryConfig(ctx, func(ctx context.Context, namespace, name string) (*corev1.Secret, error) {
			secret := &corev1.Secret{}
			err := r.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, secret)
			return secret, errors.Wrap(err, "get docker registry secret")
		})
		if err != nil {
			err = errors.Wrap(err, "get docker registry")
			return
		}
		// The config's Secure flag overrides the annotation/spec-derived value.
		imageInfo.DockerRegistryInsecure = !dockerRegistryConf.Secure
		var dockerConfigSecret *corev1.Secret
		r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Making sure docker config secret %s in namespace %s", commonconsts.KubeSecretNameRegcred, opt.CompoundAINimRequest.Namespace)
		dockerConfigSecret, err = r.makeSureDockerConfigJSONSecret(ctx, opt.CompoundAINimRequest.Namespace, dockerRegistryConf)
		if err != nil {
			err = errors.Wrap(err, "make sure docker config secret")
			return
		}
		r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Docker config secret %s in namespace %s is ready", commonconsts.KubeSecretNameRegcred, opt.CompoundAINimRequest.Namespace)
		if dockerConfigSecret != nil {
			imageInfo.DockerConfigJSONSecretName = dockerConfigSecret.Name
		}
	}
	return
}
// getCompoundAINim fetches the full CompoundAINim (bento) schema identified
// by Spec.BentoTag ("repository:version") from the yatai service.
func (r *CompoundAINimRequestReconciler) getCompoundAINim(ctx context.Context, compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest) (compoundAINim *schemasv1.BentoFullSchema, err error) {
	repositoryName, _, version := xstrings.Partition(compoundAINimRequest.Spec.BentoTag, ":")
	clientPtr, _, err := r.getYataiClient(ctx)
	if err != nil {
		return nil, errors.Wrap(err, "get yatai client")
	}
	if clientPtr == nil {
		return nil, errors.New("can't get yatai client, please check yatai configuration")
	}
	yataiClient := *clientPtr
	r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "FetchCompoundAINim", "Getting compoundAINim %s from yatai service", compoundAINimRequest.Spec.BentoTag)
	compoundAINim, err = yataiClient.GetBento(ctx, repositoryName, version)
	if err != nil {
		return nil, errors.Wrap(err, "get compoundAINim")
	}
	r.Recorder.Eventf(compoundAINimRequest, corev1.EventTypeNormal, "FetchCompoundAINim", "Got compoundAINim %s from yatai service", compoundAINimRequest.Spec.BentoTag)
	return compoundAINim, nil
}
// getImageBuilderJobName returns a fresh, unique name for an image builder
// Job; xid provides the unique suffix.
func (r *CompoundAINimRequestReconciler) getImageBuilderJobName() string {
	return "yatai-compoundainim-image-builder-" + xid.New().String()
}
// getModelSeederJobName returns a fresh, unique name for a model seeder Job;
// xid provides the unique suffix.
func (r *CompoundAINimRequestReconciler) getModelSeederJobName() string {
	return "yatai-model-seeder-" + xid.New().String()
}
// getModelSeederJobLabels returns the labels applied to a model seeder Job,
// identifying the owning request, the model, and the CompoundAINim it belongs to.
func (r *CompoundAINimRequestReconciler) getModelSeederJobLabels(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest, model *nvidiacomv1alpha1.BentoModel) map[string]string {
	nimRepositoryName, _, nimVersion := xstrings.Partition(compoundAINimRequest.Spec.BentoTag, ":")
	modelRepositoryName, _, modelVersion := xstrings.Partition(model.Tag, ":")
	labels := make(map[string]string, 6)
	labels[commonconsts.KubeLabelBentoRequest] = compoundAINimRequest.Name
	labels[commonconsts.KubeLabelIsModelSeeder] = "true"
	labels[commonconsts.KubeLabelYataiModelRepository] = modelRepositoryName
	labels[commonconsts.KubeLabelYataiModel] = modelVersion
	labels[commonconsts.KubeLabelYataiBentoRepository] = nimRepositoryName
	labels[commonconsts.KubeLabelYataiBento] = nimVersion
	return labels
}
// getModelSeederPodLabels returns the labels applied to model seeder pods.
// Unlike the Job labels these also carry the image-builder marker label.
func (r *CompoundAINimRequestReconciler) getModelSeederPodLabels(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest, model *nvidiacomv1alpha1.BentoModel) map[string]string {
	nimRepositoryName, _, nimVersion := xstrings.Partition(compoundAINimRequest.Spec.BentoTag, ":")
	modelRepositoryName, _, modelVersion := xstrings.Partition(model.Tag, ":")
	labels := make(map[string]string, 7)
	labels[commonconsts.KubeLabelBentoRequest] = compoundAINimRequest.Name
	labels[commonconsts.KubeLabelIsModelSeeder] = "true"
	labels[commonconsts.KubeLabelIsBentoImageBuilder] = "true"
	labels[commonconsts.KubeLabelYataiModelRepository] = modelRepositoryName
	labels[commonconsts.KubeLabelYataiModel] = modelVersion
	labels[commonconsts.KubeLabelYataiBentoRepository] = nimRepositoryName
	labels[commonconsts.KubeLabelYataiBento] = nimVersion
	return labels
}
// getImageBuilderJobLabels returns the labels for an image builder Job,
// including whether models are baked into the image or kept separate.
func (r *CompoundAINimRequestReconciler) getImageBuilderJobLabels(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest) map[string]string {
	repositoryName, _, version := xstrings.Partition(compoundAINimRequest.Spec.BentoTag, ":")
	separateModelsValue := commonconsts.KubeLabelValueFalse
	if isSeparateModels(compoundAINimRequest) {
		separateModelsValue = commonconsts.KubeLabelValueTrue
	}
	return map[string]string{
		commonconsts.KubeLabelBentoRequest:         compoundAINimRequest.Name,
		commonconsts.KubeLabelIsBentoImageBuilder:  "true",
		commonconsts.KubeLabelYataiBentoRepository: repositoryName,
		commonconsts.KubeLabelYataiBento:           version,
		KubeLabelYataiImageBuilderSeparateModels:   separateModelsValue,
	}
}
// getImageBuilderPodLabels returns the labels applied to image builder pods,
// identifying the owning request and the CompoundAINim being built.
func (r *CompoundAINimRequestReconciler) getImageBuilderPodLabels(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest) map[string]string {
	repositoryName, _, version := xstrings.Partition(compoundAINimRequest.Spec.BentoTag, ":")
	labels := make(map[string]string, 4)
	labels[commonconsts.KubeLabelBentoRequest] = compoundAINimRequest.Name
	labels[commonconsts.KubeLabelIsBentoImageBuilder] = "true"
	labels[commonconsts.KubeLabelYataiBentoRepository] = repositoryName
	labels[commonconsts.KubeLabelYataiBento] = version
	return labels
}
// hash returns the hex-encoded MD5 digest of text. MD5 is used only to derive
// short, stable identifiers (image tags, PVC names), not for security.
func hash(text string) string {
	// nolint: gosec
	sum := md5.Sum([]byte(text))
	return hex.EncodeToString(sum[:])
}
// getModelPVCName derives a deterministic PVC name for a model from the
// JuiceFS storage class, the model namespace (when present), and the model
// tag, truncated to Kubernetes' 63-character object-name limit.
func (r *CompoundAINimRequestReconciler) getModelPVCName(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest, model *nvidiacomv1alpha1.BentoModel) string {
	storageClassName := getJuiceFSStorageClassName()
	key := fmt.Sprintf("%s:%s", storageClassName, model.Tag)
	if ns := getModelNamespace(compoundAINimRequest); ns != "" {
		key = fmt.Sprintf("%s:%s:%s", storageClassName, ns, model.Tag)
	}
	pvcName := "model-seeder-" + hash(key)
	if len(pvcName) > 63 {
		// Kubernetes object names are capped at 63 characters.
		pvcName = pvcName[:63]
	}
	return pvcName
}
// getJuiceFSModelPath returns the JuiceFS-relative directory for a model:
// "models/<ns>/<repo>/<version>" when a model namespace applies, otherwise
// "models/.shared/<repo>/<version>". Huggingface models use "all" in place of
// the version.
func (r *CompoundAINimRequestReconciler) getJuiceFSModelPath(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest, model *nvidiacomv1alpha1.BentoModel) string {
	repositoryName, _, version := xstrings.Partition(model.Tag, ":")
	ns := getModelNamespace(compoundAINimRequest)
	if isHuggingfaceModel(model) {
		version = "all"
	}
	if ns == "" {
		return fmt.Sprintf("models/.shared/%s/%s", repositoryName, version)
	}
	return fmt.Sprintf("models/%s/%s/%s", ns, repositoryName, version)
}
// isHuggingfaceModel reports whether the model is sourced from Huggingface,
// signalled by an "hf://" download URL.
func isHuggingfaceModel(model *nvidiacomv1alpha1.BentoModel) bool {
	const hfScheme = "hf://"
	return strings.HasPrefix(model.DownloadURL, hfScheme)
}
// GenerateModelPVCOption is the input to generateModelPVC.
type GenerateModelPVCOption struct {
	// CompoundAINimRequest is the request the PVC is generated for.
	CompoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest
	// Model is the model to be stored in the PVC.
	Model *nvidiacomv1alpha1.BentoModel
}
// generateModelPVC builds the PersistentVolumeClaim backing a model's JuiceFS
// storage. The claim name and "path" annotation are deterministic functions
// of the model tag, so the same model always maps to the same claim.
//nolint:nakedret
func (r *CompoundAINimRequestReconciler) generateModelPVC(opt GenerateModelPVCOption) (pvc *corev1.PersistentVolumeClaim) {
	// Fallback request size when the model's size is unknown.
	storageSize := resource.MustParse("100Gi")
	if opt.Model.Size != nil {
		storageSize = *opt.Model.Size
		// Clamp to at least 1Gi...
		minStorageSize := resource.MustParse("1Gi")
		if storageSize.Value() < minStorageSize.Value() {
			storageSize = minStorageSize
		}
		// ...then double it. NOTE(review): rationale for doubling is not
		// visible here — presumably headroom for seeding/extraction; confirm.
		storageSize.Set(storageSize.Value() * 2)
	}
	path := r.getJuiceFSModelPath(opt.CompoundAINimRequest, opt.Model)
	pvcName := r.getModelPVCName(opt.CompoundAINimRequest, opt.Model)
	pvc = &corev1.PersistentVolumeClaim{
		ObjectMeta: metav1.ObjectMeta{
			Name:      pvcName,
			Namespace: opt.CompoundAINimRequest.Namespace,
			// "path" carries the JuiceFS-relative model directory; presumably
			// consumed by the storage provisioner — confirm against the CSI setup.
			Annotations: map[string]string{
				"path": path,
			},
		},
		Spec: corev1.PersistentVolumeClaimSpec{
			AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteMany},
			Resources: corev1.VolumeResourceRequirements{
				Requests: corev1.ResourceList{
					corev1.ResourceStorage: storageSize,
				},
			},
			StorageClassName: ptr.To(getJuiceFSStorageClassName()),
		},
	}
	return
}
// GenerateModelSeederJobOption is the input to generateModelSeederJob.
type GenerateModelSeederJobOption struct {
	// CompoundAINimRequest is the request that owns the seeder Job.
	CompoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest
	// Model is the model the Job will download.
	Model *nvidiacomv1alpha1.BentoModel
}
// generateModelSeederJob builds the Job that runs a model seeder pod for the
// given model. Via the pod failure policy, a seeder container exiting with
// ModelSeederJobFailedExitCode fails the Job outright. The Job is owned by
// the request so it is garbage-collected with it.
//nolint:nakedret
func (r *CompoundAINimRequestReconciler) generateModelSeederJob(ctx context.Context, opt GenerateModelSeederJobOption) (job *batchv1.Job, err error) {
	// nolint: gosimple
	podTemplateSpec, err := r.generateModelSeederPodTemplateSpec(ctx, GenerateModelSeederPodTemplateSpecOption(opt))
	if err != nil {
		err = errors.Wrap(err, "generate model seeder pod template spec")
		return
	}
	kubeAnnotations := make(map[string]string)
	// The request-hash annotation lets the reconciler detect whether an
	// existing Job still matches the current request spec.
	hashStr, err := r.getHashStr(opt.CompoundAINimRequest)
	if err != nil {
		err = errors.Wrap(err, "failed to get hash string")
		return
	}
	kubeAnnotations[KubeAnnotationCompoundAINimRequestHash] = hashStr
	job = &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:        r.getModelSeederJobName(),
			Namespace:   opt.CompoundAINimRequest.Namespace,
			Labels:      r.getModelSeederJobLabels(opt.CompoundAINimRequest, opt.Model),
			Annotations: kubeAnnotations,
		},
		Spec: batchv1.JobSpec{
			Completions: ptr.To(int32(1)),
			Parallelism: ptr.To(int32(1)),
			// Map the seeder container's sentinel exit code to FailJob.
			PodFailurePolicy: &batchv1.PodFailurePolicy{
				Rules: []batchv1.PodFailurePolicyRule{
					{
						Action: batchv1.PodFailurePolicyActionFailJob,
						OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
							ContainerName: ptr.To(ModelSeederContainerName),
							Operator:      batchv1.PodFailurePolicyOnExitCodesOpIn,
							Values:        []int32{ModelSeederJobFailedExitCode},
						},
					},
				},
			},
			Template: *podTemplateSpec,
		},
	}
	err = ctrl.SetControllerReference(opt.CompoundAINimRequest, job, r.Scheme)
	if err != nil {
		err = errors.Wrapf(err, "set controller reference for job %s", job.Name)
		return
	}
	return
}
// GenerateModelSeederPodTemplateSpecOption is the input to
// generateModelSeederPodTemplateSpec.
type GenerateModelSeederPodTemplateSpecOption struct {
	// CompoundAINimRequest is the request that owns the seeder pod.
	CompoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest
	// Model is the model the pod will download.
	Model *nvidiacomv1alpha1.BentoModel
}
// generateModelSeederPodTemplateSpec builds the pod template for a model
// seeder Job: a single bash container that downloads one model (from
// Huggingface, S3, GCS, or an HTTP(S)/presigned URL) into a JuiceFS-backed
// PVC. The embedded script is idempotent — a ".exists" marker file (per
// revision for hf:// models) short-circuits the download. Operator-wide pod
// settings can be overlaid from the "yatai-image-builder-config" configmap.
//nolint:nakedret
func (r *CompoundAINimRequestReconciler) generateModelSeederPodTemplateSpec(ctx context.Context, opt GenerateModelSeederPodTemplateSpecOption) (pod *corev1.PodTemplateSpec, err error) {
	kubeLabels := r.getModelSeederPodLabels(opt.CompoundAINimRequest, opt.Model)
	volumes := make([]corev1.Volume, 0)
	volumeMounts := make([]corev1.VolumeMount, 0)
	// NOTE(review): never reassigned, so the envFrom branch guarded on it
	// below is currently dead code.
	yataiAPITokenSecretName := ""
	internalImages := commonconfig.GetInternalImages()
	logrus.Infof("Model seeder is using the images %v", *internalImages)
	downloaderContainerResources := corev1.ResourceRequirements{
		Limits: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("1000m"),
			corev1.ResourceMemory: resource.MustParse("3000Mi"),
		},
		Requests: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse("100m"),
			corev1.ResourceMemory: resource.MustParse("1000Mi"),
		},
	}
	downloaderContainerEnvFrom := opt.CompoundAINimRequest.Spec.DownloaderContainerEnvFrom
	if yataiAPITokenSecretName != "" {
		downloaderContainerEnvFrom = append(downloaderContainerEnvFrom, corev1.EnvFromSource{
			SecretRef: &corev1.SecretEnvSource{
				LocalObjectReference: corev1.LocalObjectReference{
					Name: yataiAPITokenSecretName,
				},
			},
		})
	}
	containers := make([]corev1.Container, 0)
	model := opt.Model
	modelRepositoryName, _, modelVersion := xstrings.Partition(model.Tag, ":")
	modelDownloadURL := model.DownloadURL
	modelDownloadHeader := ""
	// No direct download URL on the model: resolve one through the yatai
	// service — a presigned URL when supported, otherwise the API download
	// endpoint plus an auth header.
	if modelDownloadURL == "" {
		var yataiClient_ **yataiclient.YataiClient
		var yataiConf_ **commonconfig.YataiConfig
		yataiClient_, yataiConf_, err = r.getYataiClient(ctx)
		if err != nil {
			err = errors.Wrap(err, "get yatai client")
			return
		}
		if yataiClient_ == nil || yataiConf_ == nil {
			err = errors.New("can't get yatai client, please check yatai configuration")
			return
		}
		yataiClient := *yataiClient_
		yataiConf := *yataiConf_
		var model_ *schemasv1.ModelFullSchema
		r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Getting model %s from yatai service", model.Tag)
		model_, err = yataiClient.GetModel(ctx, modelRepositoryName, modelVersion)
		if err != nil {
			err = errors.Wrap(err, "get model")
			return
		}
		r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Model %s is got from yatai service", model.Tag)
		if model_.TransmissionStrategy != nil && *model_.TransmissionStrategy == modelschemas.TransmissionStrategyPresignedURL {
			var model0 *schemasv1.ModelSchema
			r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Getting presigned url for model %s from yatai service", model.Tag)
			model0, err = yataiClient.PresignModelDownloadURL(ctx, modelRepositoryName, modelVersion)
			if err != nil {
				err = errors.Wrap(err, "presign model download url")
				return
			}
			r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Presigned url for model %s is got from yatai service", model.Tag)
			modelDownloadURL = model0.PresignedDownloadUrl
		} else {
			modelDownloadURL = fmt.Sprintf("%s/api/v1/model_repositories/%s/models/%s/download", yataiConf.Endpoint, modelRepositoryName, modelVersion)
			modelDownloadHeader = fmt.Sprintf("%s: %s:%s:$%s", commonconsts.YataiApiTokenHeaderName, commonconsts.YataiImageBuilderComponentName, yataiConf.ClusterName, commonconsts.EnvYataiApiToken)
		}
	}
	// Mount point of the model PVC inside the seeder container.
	modelDirPath := "/juicefs-workspace"
	var modelSeedCommandOutput bytes.Buffer
	// Render the seeding script. NOTE(review): the "HuggingfaceModelDir"
	// parameter passed below is not referenced anywhere in this template.
	err = template.Must(template.New("script").Parse(`
set -e

mkdir -p {{.ModelDirPath}}
url="{{.ModelDownloadURL}}"

if [[ ${url} == hf://* ]]; then
	if [ -f "{{.ModelDirPath}}/{{.ModelVersion}}.exists" ]; then
		echo "Model {{.ModelDirPath}}/{{.ModelVersion}}.exists already exists, skip downloading"
		exit 0
	fi
else
	if [ -f "{{.ModelDirPath}}/.exists" ]; then
		echo "Model {{.ModelDirPath}} already exists, skip downloading"
		exit 0
	fi
fi

cleanup() {
	echo "Cleaning up..."
	rm -rf /tmp/model
	rm -f /tmp/downloaded.tar
}

trap cleanup EXIT

if [[ ${url} == hf://* ]]; then
	mkdir -p /tmp/model
	hf_url="${url:5}"
	model_id=$(echo "$hf_url" | awk -F '@' '{print $1}')
	revision=$(echo "$hf_url" | awk -F '@' '{print $2}')
	endpoint=$(echo "$hf_url" | awk -F '@' '{print $3}')
	export HF_ENDPOINT=${endpoint}

	echo "Downloading model ${model_id} (endpoint=${endpoint}, revision=${revision}) from Huggingface..."
	huggingface-cli download ${model_id} --revision ${revision} --cache-dir {{.ModelDirPath}}
else
	echo "Downloading model {{.ModelRepositoryName}}:{{.ModelVersion}} to /tmp/downloaded.tar..."
	if [[ ${url} == s3://* ]]; then
		echo "Downloading from s3..."
		aws s3 cp ${url} /tmp/downloaded.tar
	elif [[ ${url} == gs://* ]]; then
		echo "Downloading from GCS..."
		gsutil cp ${url} /tmp/downloaded.tar
	else
		curl --fail -L -H "{{.ModelDownloadHeader}}" ${url} --output /tmp/downloaded.tar --progress-bar
	fi
	cd {{.ModelDirPath}}
	echo "Extracting model tar file..."
	tar -xvf /tmp/downloaded.tar
fi

if [[ ${url} == hf://* ]]; then
	echo "Creating {{.ModelDirPath}}/{{.ModelVersion}}.exists file..."
	touch {{.ModelDirPath}}/{{.ModelVersion}}.exists
else
	echo "Creating {{.ModelDirPath}}/.exists file..."
	touch {{.ModelDirPath}}/.exists
fi

echo "Done"
`)).Execute(&modelSeedCommandOutput, map[string]interface{}{
		"ModelDirPath":        modelDirPath,
		"ModelDownloadURL":    modelDownloadURL,
		"ModelDownloadHeader": modelDownloadHeader,
		"ModelRepositoryName": modelRepositoryName,
		"ModelVersion":        modelVersion,
		"HuggingfaceModelDir": fmt.Sprintf("models--%s", strings.ReplaceAll(modelRepositoryName, "/", "--")),
	})
	if err != nil {
		err = errors.Wrap(err, "failed to generate download command")
		return
	}
	modelSeedCommand := modelSeedCommandOutput.String()
	// The model PVC doubles as the volume name and is mounted at modelDirPath.
	pvcName := r.getModelPVCName(opt.CompoundAINimRequest, model)
	volumes = append(volumes, corev1.Volume{
		Name: pvcName,
		VolumeSource: corev1.VolumeSource{
			PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
				ClaimName: pvcName,
			},
		},
	})
	containers = append(containers, corev1.Container{
		Name:  ModelSeederContainerName,
		Image: internalImages.BentoDownloader,
		Command: []string{
			"bash",
			"-c",
			modelSeedCommand,
		},
		VolumeMounts: append(volumeMounts, corev1.VolumeMount{
			Name:      pvcName,
			MountPath: modelDirPath,
		}),
		Resources: downloaderContainerResources,
		EnvFrom:   downloaderContainerEnvFrom,
		Env: []corev1.EnvVar{
			{
				// Stops the AWS CLI from querying EC2 instance metadata.
				Name:  "AWS_EC2_METADATA_DISABLED",
				Value: "true",
			},
		},
	})
	kubeAnnotations := make(map[string]string)
	kubeAnnotations[KubeAnnotationCompoundAINimRequestModelSeederHash] = opt.CompoundAINimRequest.Annotations[KubeAnnotationCompoundAINimRequestModelSeederHash]
	pod = &corev1.PodTemplateSpec{
		ObjectMeta: metav1.ObjectMeta{
			Labels:      kubeLabels,
			Annotations: kubeAnnotations,
		},
		Spec: corev1.PodSpec{
			RestartPolicy: corev1.RestartPolicyNever,
			Volumes:       volumes,
			Containers:    containers,
		},
	}
	// Optionally overlay operator-wide pod settings (extra_pod_spec) from the
	// yatai-image-builder-config configmap.
	var globalExtraPodSpec *compoundaiCommon.ExtraPodSpec
	configNamespace, err := commonconfig.GetYataiImageBuilderNamespace(ctx, func(ctx context.Context, namespace, name string) (*corev1.Secret, error) {
		secret := &corev1.Secret{}
		err := r.Get(ctx, types.NamespacedName{
			Namespace: namespace,
			Name:      name,
		}, secret)
		return secret, errors.Wrap(err, "get secret")
	})
	if err != nil {
		err = errors.Wrap(err, "failed to get Yatai image builder namespace")
		return
	}
	configCmName := "yatai-image-builder-config"
	r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateModelSeederPod", "Getting configmap %s from namespace %s", configCmName, configNamespace)
	configCm := &corev1.ConfigMap{}
	err = r.Get(ctx, types.NamespacedName{Name: configCmName, Namespace: configNamespace}, configCm)
	configCmIsNotFound := k8serrors.IsNotFound(err)
	if err != nil && !configCmIsNotFound {
		err = errors.Wrap(err, "failed to get configmap")
		return
	}
	// A missing configmap is fine — clear err so later naked returns do not
	// propagate the NotFound error.
	err = nil
	if !configCmIsNotFound {
		r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateModelSeederPod", "Configmap %s is got from namespace %s", configCmName, configNamespace)
		globalExtraPodSpec = &compoundaiCommon.ExtraPodSpec{}
		if val, ok := configCm.Data["extra_pod_spec"]; ok {
			err = yaml.Unmarshal([]byte(val), globalExtraPodSpec)
			if err != nil {
				err = errors.Wrapf(err, "failed to yaml unmarshal extra_pod_spec, please check the configmap %s in namespace %s", configCmName, configNamespace)
				return
			}
		}
	} else {
		r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateModelSeederPod", "Configmap %s is not found in namespace %s", configCmName, configNamespace)
	}
	if globalExtraPodSpec != nil {
		pod.Spec.PriorityClassName = globalExtraPodSpec.PriorityClassName
		pod.Spec.SchedulerName = globalExtraPodSpec.SchedulerName
		pod.Spec.NodeSelector = globalExtraPodSpec.NodeSelector
		pod.Spec.Affinity = globalExtraPodSpec.Affinity
		pod.Spec.Tolerations = globalExtraPodSpec.Tolerations
		pod.Spec.TopologySpreadConstraints = globalExtraPodSpec.TopologySpreadConstraints
		pod.Spec.ServiceAccountName = globalExtraPodSpec.ServiceAccountName
	}
	// Prefer co-scheduling pods of the same request onto one node.
	injectPodAffinity(&pod.Spec, opt.CompoundAINimRequest)
	return
}
// GenerateImageBuilderJobOption is the input to generateImageBuilderJob.
type GenerateImageBuilderJobOption struct {
	// ImageInfo describes the image to build and the registry to push to.
	ImageInfo            ImageInfo
	// CompoundAINimRequest is the request that owns the builder Job.
	CompoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest
}
// generateImageBuilderJob builds the Job that runs the image builder pod. Via
// the pod failure policy, a builder container exiting with
// BuilderJobFailedExitCode fails the Job outright. The Job is owned by the
// request so it is garbage-collected with it.
//nolint:nakedret
func (r *CompoundAINimRequestReconciler) generateImageBuilderJob(ctx context.Context, opt GenerateImageBuilderJobOption) (job *batchv1.Job, err error) {
	// nolint: gosimple
	podTemplateSpec, err := r.generateImageBuilderPodTemplateSpec(ctx, GenerateImageBuilderPodTemplateSpecOption(opt))
	if err != nil {
		err = errors.Wrap(err, "generate image builder pod template spec")
		return
	}
	kubeAnnotations := make(map[string]string)
	// The request-hash annotation lets the reconciler detect whether an
	// existing Job still matches the current request spec.
	hashStr, err := r.getHashStr(opt.CompoundAINimRequest)
	if err != nil {
		err = errors.Wrap(err, "failed to get hash string")
		return
	}
	kubeAnnotations[KubeAnnotationCompoundAINimRequestHash] = hashStr
	job = &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:        r.getImageBuilderJobName(),
			Namespace:   opt.CompoundAINimRequest.Namespace,
			Labels:      r.getImageBuilderJobLabels(opt.CompoundAINimRequest),
			Annotations: kubeAnnotations,
		},
		Spec: batchv1.JobSpec{
			Completions: ptr.To(int32(1)),
			Parallelism: ptr.To(int32(1)),
			// Map the builder container's sentinel exit code to FailJob.
			PodFailurePolicy: &batchv1.PodFailurePolicy{
				Rules: []batchv1.PodFailurePolicyRule{
					{
						Action: batchv1.PodFailurePolicyActionFailJob,
						OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
							ContainerName: ptr.To(BuilderContainerName),
							Operator:      batchv1.PodFailurePolicyOnExitCodesOpIn,
							Values:        []int32{BuilderJobFailedExitCode},
						},
					},
				},
			},
			Template: *podTemplateSpec,
		},
	}
	err = ctrl.SetControllerReference(opt.CompoundAINimRequest, job, r.Scheme)
	if err != nil {
		err = errors.Wrapf(err, "set controller reference for job %s", job.Name)
		return
	}
	return
}
// injectPodAffinity adds a soft (weight 100) pod-affinity preference so pods
// belonging to the same CompoundAINimRequest are co-scheduled on the same node.
func injectPodAffinity(podSpec *corev1.PodSpec, compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest) {
	affinity := podSpec.Affinity
	if affinity == nil {
		affinity = &corev1.Affinity{}
		podSpec.Affinity = affinity
	}
	if affinity.PodAffinity == nil {
		affinity.PodAffinity = &corev1.PodAffinity{}
	}
	term := corev1.WeightedPodAffinityTerm{
		Weight: 100,
		PodAffinityTerm: corev1.PodAffinityTerm{
			LabelSelector: &metav1.LabelSelector{
				MatchLabels: map[string]string{
					commonconsts.KubeLabelBentoRequest: compoundAINimRequest.Name,
				},
			},
			TopologyKey: corev1.LabelHostname,
		},
	}
	affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution = append(affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution, term)
}
// Names of the main containers in image-builder and model-seeder pods, and
// the sentinel exit codes that the Jobs' pod failure policies map to
// PodFailurePolicyActionFailJob (fail the Job outright).
const BuilderContainerName = "builder"
const BuilderJobFailedExitCode = 42
const ModelSeederContainerName = "seeder"
const ModelSeederJobFailedExitCode = 42
// GenerateImageBuilderPodTemplateSpecOption is the input to
// generateImageBuilderPodTemplateSpec.
type GenerateImageBuilderPodTemplateSpecOption struct {
	// ImageInfo describes the image to build and the registry to push to.
	ImageInfo            ImageInfo
	// CompoundAINimRequest is the request that owns the builder pod.
	CompoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest
}
//nolint:gocyclo,nakedret
func (r *CompoundAINimRequestReconciler) generateImageBuilderPodTemplateSpec(ctx context.Context, opt GenerateImageBuilderPodTemplateSpecOption) (pod *corev1.PodTemplateSpec, err error) {
compoundAINimRepositoryName, _, compoundAINimVersion := xstrings.Partition(opt.CompoundAINimRequest.Spec.BentoTag, ":")
kubeLabels := r.getImageBuilderPodLabels(opt.CompoundAINimRequest)
inClusterImageName := opt.ImageInfo.InClusterImageName
dockerConfigJSONSecretName := opt.ImageInfo.DockerConfigJSONSecretName
dockerRegistryInsecure := opt.ImageInfo.DockerRegistryInsecure
volumes := []corev1.Volume{
{
Name: "yatai",
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
},
{
Name: "workspace",
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
},
}
volumeMounts := []corev1.VolumeMount{
{
Name: "yatai",
MountPath: "/yatai",
},
{
Name: "workspace",
MountPath: "/workspace",
},
}
if dockerConfigJSONSecretName != "" {
volumes = append(volumes, corev1.Volume{
Name: dockerConfigJSONSecretName,
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: dockerConfigJSONSecretName,
Items: []corev1.KeyToPath{
{
Key: ".dockerconfigjson",
Path: "config.json",
},
},
},
},
})
volumeMounts = append(volumeMounts, corev1.VolumeMount{
Name: dockerConfigJSONSecretName,
MountPath: "/kaniko/.docker/",
})
}
var compoundAINim *schemasv1.BentoFullSchema
yataiAPITokenSecretName := ""
compoundAINimDownloadURL := opt.CompoundAINimRequest.Spec.DownloadURL
compoundAINimDownloadHeader := ""
if compoundAINimDownloadURL == "" {
var yataiClient_ **yataiclient.YataiClient
var yataiConf_ **commonconfig.YataiConfig
yataiClient_, yataiConf_, err = r.getYataiClientWithAuth(ctx, opt.CompoundAINimRequest)
if err != nil {
err = errors.Wrap(err, "get yatai client")
return
}
if yataiClient_ == nil || yataiConf_ == nil {
err = errors.New("can't get yatai client, please check yatai configuration")
return
}
yataiClient := *yataiClient_
yataiConf := *yataiConf_
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Getting compoundAINim %s from yatai service", opt.CompoundAINimRequest.Spec.BentoTag)
compoundAINim, err = yataiClient.GetBento(ctx, compoundAINimRepositoryName, compoundAINimVersion)
if err != nil {
err = errors.Wrap(err, "get compoundAINim")
return
}
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Got compoundAINim %s from yatai service", opt.CompoundAINimRequest.Spec.BentoTag)
if compoundAINim.TransmissionStrategy != nil && *compoundAINim.TransmissionStrategy == modelschemas.TransmissionStrategyPresignedURL {
var compoundAINim_ *schemasv1.BentoSchema
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Getting presigned url for compoundAINim %s from yatai service", opt.CompoundAINimRequest.Spec.BentoTag)
compoundAINim_, err = yataiClient.PresignBentoDownloadURL(ctx, compoundAINimRepositoryName, compoundAINimVersion)
if err != nil {
err = errors.Wrap(err, "presign compoundAINim download url")
return
}
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Got presigned url for compoundAINim %s from yatai service", opt.CompoundAINimRequest.Spec.BentoTag)
compoundAINimDownloadURL = compoundAINim_.PresignedDownloadUrl
} else {
compoundAINimDownloadURL = fmt.Sprintf("%s/api/v1/bento_repositories/%s/bentos/%s/download", yataiConf.Endpoint, compoundAINimRepositoryName, compoundAINimVersion)
compoundAINimDownloadHeader = fmt.Sprintf("%s: %s:%s:$%s", commonconsts.YataiApiTokenHeaderName, commonconsts.YataiImageBuilderComponentName, yataiConf.ClusterName, commonconsts.EnvYataiApiToken)
}
// nolint: gosec
yataiAPITokenSecretName = "yatai-api-token"
yataiAPITokenSecret := &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: yataiAPITokenSecretName,
Namespace: opt.CompoundAINimRequest.Namespace,
},
StringData: map[string]string{
commonconsts.EnvYataiApiToken: yataiConf.ApiToken,
},
}
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Getting secret %s in namespace %s", yataiAPITokenSecretName, opt.CompoundAINimRequest.Namespace)
_yataiAPITokenSecret := &corev1.Secret{}
err = r.Get(ctx, types.NamespacedName{Namespace: opt.CompoundAINimRequest.Namespace, Name: yataiAPITokenSecretName}, _yataiAPITokenSecret)
isNotFound := k8serrors.IsNotFound(err)
if err != nil && !isNotFound {
err = errors.Wrapf(err, "failed to get secret %s", yataiAPITokenSecretName)
return
}
if isNotFound {
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Secret %s is not found, so creating it in namespace %s", yataiAPITokenSecretName, opt.CompoundAINimRequest.Namespace)
err = r.Create(ctx, yataiAPITokenSecret)
isExists := k8serrors.IsAlreadyExists(err)
if err != nil && !isExists {
err = errors.Wrapf(err, "failed to create secret %s", yataiAPITokenSecretName)
return
}
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Secret %s is created in namespace %s", yataiAPITokenSecretName, opt.CompoundAINimRequest.Namespace)
} else {
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Secret %s is found in namespace %s, so updating it", yataiAPITokenSecretName, opt.CompoundAINimRequest.Namespace)
err = r.Update(ctx, yataiAPITokenSecret)
if err != nil {
err = errors.Wrapf(err, "failed to update secret %s", yataiAPITokenSecretName)
return
}
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Secret %s is updated in namespace %s", yataiAPITokenSecretName, opt.CompoundAINimRequest.Namespace)
}
}
internalImages := commonconfig.GetInternalImages()
logrus.Infof("Image builder is using the images %v", *internalImages)
buildEngine := getCompoundAINimImageBuildEngine()
privileged := buildEngine != CompoundAINimImageBuildEngineBuildkitRootless
compoundAINimDownloadCommandTemplate, err := template.New("downloadCommand").Parse(`
set -e
mkdir -p /workspace/buildcontext
url="{{.CompoundAINimDownloadURL}}"
echo "Downloading compoundAINim {{.CompoundAINimRepositoryName}}:{{.CompoundAINimVersion}} to /tmp/downloaded.tar..."
if [[ ${url} == s3://* ]]; then
echo "Downloading from s3..."
aws s3 cp ${url} /tmp/downloaded.tar
elif [[ ${url} == gs://* ]]; then
echo "Downloading from GCS..."
gsutil cp ${url} /tmp/downloaded.tar
else
curl --fail -L -H "{{.CompoundAINimDownloadHeader}}" ${url} --output /tmp/downloaded.tar --progress-bar
fi
cd /workspace/buildcontext
echo "Extracting compoundAINim tar file..."
tar -xvf /tmp/downloaded.tar
echo "Removing compoundAINim tar file..."
rm /tmp/downloaded.tar
{{if not .Privileged}}
echo "Changing directory permission..."
chown -R 1000:1000 /workspace
{{end}}
echo "Done"
`)
if err != nil {
err = errors.Wrap(err, "failed to parse download command template")
return
}
var compoundAINimDownloadCommandBuffer bytes.Buffer
err = compoundAINimDownloadCommandTemplate.Execute(&compoundAINimDownloadCommandBuffer, map[string]interface{}{
"CompoundAINimDownloadURL": compoundAINimDownloadURL,
"CompoundAINimDownloadHeader": compoundAINimDownloadHeader,
"CompoundAINimRepositoryName": compoundAINimRepositoryName,
"CompoundAINimVersion": compoundAINimVersion,
"Privileged": privileged,
})
if err != nil {
err = errors.Wrap(err, "failed to execute download command template")
return
}
compoundAINimDownloadCommand := compoundAINimDownloadCommandBuffer.String()
downloaderContainerResources := corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1000m"),
corev1.ResourceMemory: resource.MustParse("3000Mi"),
},
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("100m"),
corev1.ResourceMemory: resource.MustParse("1000Mi"),
},
}
downloaderContainerEnvFrom := opt.CompoundAINimRequest.Spec.DownloaderContainerEnvFrom
if yataiAPITokenSecretName != "" {
downloaderContainerEnvFrom = append(downloaderContainerEnvFrom, corev1.EnvFromSource{
SecretRef: &corev1.SecretEnvSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: yataiAPITokenSecretName,
},
},
})
}
initContainers := []corev1.Container{
{
Name: "compoundainim-downloader",
Image: internalImages.BentoDownloader,
Command: []string{
"bash",
"-c",
compoundAINimDownloadCommand,
},
VolumeMounts: volumeMounts,
Resources: downloaderContainerResources,
EnvFrom: downloaderContainerEnvFrom,
Env: []corev1.EnvVar{
{
Name: "AWS_EC2_METADATA_DISABLED",
Value: "true",
},
},
},
}
containers := make([]corev1.Container, 0)
separateModels := isSeparateModels(opt.CompoundAINimRequest)
models := opt.CompoundAINimRequest.Spec.Models
modelsSeen := map[string]struct{}{}
for _, model := range models {
modelsSeen[model.Tag] = struct{}{}
}
if compoundAINim != nil {
for _, modelTag := range compoundAINim.Manifest.Models {
if _, ok := modelsSeen[modelTag]; !ok {
models = append(models, nvidiacomv1alpha1.BentoModel{
Tag: modelTag,
})
}
}
}
for idx, model := range models {
if separateModels {
continue
}
modelRepositoryName, _, modelVersion := xstrings.Partition(model.Tag, ":")
modelDownloadURL := model.DownloadURL
modelDownloadHeader := ""
if modelDownloadURL == "" {
if compoundAINim == nil {
continue
}
var yataiClient_ **yataiclient.YataiClient
var yataiConf_ **commonconfig.YataiConfig
yataiClient_, yataiConf_, err = r.getYataiClientWithAuth(ctx, opt.CompoundAINimRequest)
if err != nil {
err = errors.Wrap(err, "get yatai client")
return
}
if yataiClient_ == nil || yataiConf_ == nil {
err = errors.New("can't get yatai client, please check yatai configuration")
return
}
yataiClient := *yataiClient_
yataiConf := *yataiConf_
var model_ *schemasv1.ModelFullSchema
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Getting model %s from yatai service", model.Tag)
model_, err = yataiClient.GetModel(ctx, modelRepositoryName, modelVersion)
if err != nil {
err = errors.Wrap(err, "get model")
return
}
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Model %s is got from yatai service", model.Tag)
if model_.TransmissionStrategy != nil && *model_.TransmissionStrategy == modelschemas.TransmissionStrategyPresignedURL {
var model0 *schemasv1.ModelSchema
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Getting presigned url for model %s from yatai service", model.Tag)
model0, err = yataiClient.PresignModelDownloadURL(ctx, modelRepositoryName, modelVersion)
if err != nil {
err = errors.Wrap(err, "presign model download url")
return
}
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Presigned url for model %s is got from yatai service", model.Tag)
modelDownloadURL = model0.PresignedDownloadUrl
} else {
modelDownloadURL = fmt.Sprintf("%s/api/v1/model_repositories/%s/models/%s/download", yataiConf.Endpoint, modelRepositoryName, modelVersion)
modelDownloadHeader = fmt.Sprintf("%s: %s:%s:$%s", commonconsts.YataiApiTokenHeaderName, commonconsts.YataiImageBuilderComponentName, yataiConf.ClusterName, commonconsts.EnvYataiApiToken)
}
}
modelRepositoryDirPath := fmt.Sprintf("/workspace/buildcontext/models/%s", modelRepositoryName)
modelDirPath := filepath.Join(modelRepositoryDirPath, modelVersion)
var modelDownloadCommandOutput bytes.Buffer
err = template.Must(template.New("script").Parse(`
set -e
mkdir -p {{.ModelDirPath}}
url="{{.ModelDownloadURL}}"
echo "Downloading model {{.ModelRepositoryName}}:{{.ModelVersion}} to /tmp/downloaded.tar..."
if [[ ${url} == s3://* ]]; then
echo "Downloading from s3..."
aws s3 cp ${url} /tmp/downloaded.tar
elif [[ ${url} == gs://* ]]; then
echo "Downloading from GCS..."
gsutil cp ${url} /tmp/downloaded.tar
else
curl --fail -L -H "{{.ModelDownloadHeader}}" ${url} --output /tmp/downloaded.tar --progress-bar
fi
cd {{.ModelDirPath}}
echo "Extracting model tar file..."
tar -xvf /tmp/downloaded.tar
echo -n '{{.ModelVersion}}' > {{.ModelRepositoryDirPath}}/latest
echo "Removing model tar file..."
rm /tmp/downloaded.tar
{{if not .Privileged}}
echo "Changing directory permission..."
chown -R 1000:1000 /workspace
{{end}}
echo "Done"
`)).Execute(&modelDownloadCommandOutput, map[string]interface{}{
"ModelDirPath": modelDirPath,
"ModelDownloadURL": modelDownloadURL,
"ModelDownloadHeader": modelDownloadHeader,
"ModelRepositoryDirPath": modelRepositoryDirPath,
"ModelRepositoryName": modelRepositoryName,
"ModelVersion": modelVersion,
"Privileged": privileged,
})
if err != nil {
err = errors.Wrap(err, "failed to generate download command")
return
}
modelDownloadCommand := modelDownloadCommandOutput.String()
initContainers = append(initContainers, corev1.Container{
Name: fmt.Sprintf("model-downloader-%d", idx),
Image: internalImages.BentoDownloader,
Command: []string{
"bash",
"-c",
modelDownloadCommand,
},
VolumeMounts: volumeMounts,
Resources: downloaderContainerResources,
EnvFrom: downloaderContainerEnvFrom,
Env: []corev1.EnvVar{
{
Name: "AWS_EC2_METADATA_DISABLED",
Value: "true",
},
},
})
}
var globalExtraPodMetadata *compoundaiCommon.ExtraPodMetadata
var globalExtraPodSpec *compoundaiCommon.ExtraPodSpec
var globalExtraContainerEnv []corev1.EnvVar
var globalDefaultImageBuilderContainerResources *corev1.ResourceRequirements
var buildArgs []string
var builderArgs []string
configNamespace, err := commonconfig.GetYataiImageBuilderNamespace(ctx, func(ctx context.Context, namespace, name string) (*corev1.Secret, error) {
secret := &corev1.Secret{}
err := r.Get(ctx, types.NamespacedName{
Namespace: namespace,
Name: name,
}, secret)
return secret, errors.Wrap(err, "get secret")
})
if err != nil {
err = errors.Wrap(err, "failed to get Yatai image builder namespace")
return
}
configCmName := "yatai-image-builder-config"
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Getting configmap %s from namespace %s", configCmName, configNamespace)
configCm := &corev1.ConfigMap{}
err = r.Get(ctx, types.NamespacedName{Name: configCmName, Namespace: configNamespace}, configCm)
configCmIsNotFound := k8serrors.IsNotFound(err)
if err != nil && !configCmIsNotFound {
err = errors.Wrap(err, "failed to get configmap")
return
}
err = nil // nolint: ineffassign
if !configCmIsNotFound {
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Configmap %s is got from namespace %s", configCmName, configNamespace)
globalExtraPodMetadata = &compoundaiCommon.ExtraPodMetadata{}
if val, ok := configCm.Data["extra_pod_metadata"]; ok {
err = yaml.Unmarshal([]byte(val), globalExtraPodMetadata)
if err != nil {
err = errors.Wrapf(err, "failed to yaml unmarshal extra_pod_metadata, please check the configmap %s in namespace %s", configCmName, configNamespace)
return
}
}
globalExtraPodSpec = &compoundaiCommon.ExtraPodSpec{}
if val, ok := configCm.Data["extra_pod_spec"]; ok {
err = yaml.Unmarshal([]byte(val), globalExtraPodSpec)
if err != nil {
err = errors.Wrapf(err, "failed to yaml unmarshal extra_pod_spec, please check the configmap %s in namespace %s", configCmName, configNamespace)
return
}
}
globalExtraContainerEnv = []corev1.EnvVar{}
if val, ok := configCm.Data["extra_container_env"]; ok {
err = yaml.Unmarshal([]byte(val), &globalExtraContainerEnv)
if err != nil {
err = errors.Wrapf(err, "failed to yaml unmarshal extra_container_env, please check the configmap %s in namespace %s", configCmName, configNamespace)
return
}
}
if val, ok := configCm.Data["default_image_builder_container_resources"]; ok {
globalDefaultImageBuilderContainerResources = &corev1.ResourceRequirements{}
err = yaml.Unmarshal([]byte(val), globalDefaultImageBuilderContainerResources)
if err != nil {
err = errors.Wrapf(err, "failed to yaml unmarshal default_image_builder_container_resources, please check the configmap %s in namespace %s", configCmName, configNamespace)
return
}
}
buildArgs = []string{}
if val, ok := configCm.Data["build_args"]; ok {
err = yaml.Unmarshal([]byte(val), &buildArgs)
if err != nil {
err = errors.Wrapf(err, "failed to yaml unmarshal build_args, please check the configmap %s in namespace %s", configCmName, configNamespace)
return
}
}
builderArgs = []string{}
if val, ok := configCm.Data["builder_args"]; ok {
err = yaml.Unmarshal([]byte(val), &builderArgs)
if err != nil {
err = errors.Wrapf(err, "failed to yaml unmarshal builder_args, please check the configmap %s in namespace %s", configCmName, configNamespace)
return
}
}
logrus.Info("passed in builder args: ", builderArgs)
} else {
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Configmap %s is not found in namespace %s", configCmName, configNamespace)
}
if buildArgs == nil {
buildArgs = make([]string, 0)
}
if opt.CompoundAINimRequest.Spec.BuildArgs != nil {
buildArgs = append(buildArgs, opt.CompoundAINimRequest.Spec.BuildArgs...)
}
dockerFilePath := "/workspace/buildcontext/env/docker/Dockerfile"
builderContainerEnvFrom := make([]corev1.EnvFromSource, 0)
builderContainerEnvs := []corev1.EnvVar{
{
Name: "DOCKER_CONFIG",
Value: "/kaniko/.docker/",
},
{
Name: "IFS",
Value: "''",
},
}
kanikoCacheRepo := os.Getenv("KANIKO_CACHE_REPO")
if kanikoCacheRepo == "" {
kanikoCacheRepo = opt.ImageInfo.DockerRegistry.BentosRepositoryURIInCluster
}
kubeAnnotations := make(map[string]string)
kubeAnnotations[KubeAnnotationCompoundAINimRequestImageBuiderHash] = opt.CompoundAINimRequest.Annotations[KubeAnnotationCompoundAINimRequestImageBuiderHash]
command := []string{
"/kaniko/executor",
}
args := []string{
"--context=/workspace/buildcontext",
"--verbosity=info",
"--image-fs-extract-retry=3",
"--cache=false",
fmt.Sprintf("--cache-repo=%s", kanikoCacheRepo),
"--compressed-caching=false",
"--compression=zstd",
"--compression-level=-7",
fmt.Sprintf("--dockerfile=%s", dockerFilePath),
fmt.Sprintf("--insecure=%v", dockerRegistryInsecure),
fmt.Sprintf("--destination=%s", inClusterImageName),
}
kanikoSnapshotMode := os.Getenv("KANIKO_SNAPSHOT_MODE")
if kanikoSnapshotMode != "" {
args = append(args, fmt.Sprintf("--snapshot-mode=%s", kanikoSnapshotMode))
}
var builderImage string
switch buildEngine {
case CompoundAINimImageBuildEngineKaniko:
builderImage = internalImages.Kaniko
if isEstargzEnabled() {
builderContainerEnvs = append(builderContainerEnvs, corev1.EnvVar{
Name: "GGCR_EXPERIMENT_ESTARGZ",
Value: "1",
})
}
case CompoundAINimImageBuildEngineBuildkit:
builderImage = internalImages.Buildkit
case CompoundAINimImageBuildEngineBuildkitRootless:
builderImage = internalImages.BuildkitRootless
default:
err = errors.Errorf("unknown compoundAINim image build engine %s", buildEngine)
return
}
isBuildkit := buildEngine == CompoundAINimImageBuildEngineBuildkit || buildEngine == CompoundAINimImageBuildEngineBuildkitRootless
if isBuildkit {
output := fmt.Sprintf("type=image,name=%s,push=true,registry.insecure=%v", inClusterImageName, dockerRegistryInsecure)
buildkitdFlags := []string{}
if !privileged {
buildkitdFlags = append(buildkitdFlags, "--oci-worker-no-process-sandbox")
}
if isEstargzEnabled() {
buildkitdFlags = append(buildkitdFlags, "--oci-worker-snapshotter=stargz")
output += ",oci-mediatypes=true,compression=estargz,force-compression=true"
}
if len(buildkitdFlags) > 0 {
builderContainerEnvs = append(builderContainerEnvs, corev1.EnvVar{
Name: "BUILDKITD_FLAGS",
Value: strings.Join(buildkitdFlags, " "),
})
}
command = []string{"buildctl-daemonless.sh"}
args = []string{
"build",
"--frontend",
"dockerfile.v0",
"--local",
"context=/workspace/buildcontext",
"--local",
fmt.Sprintf("dockerfile=%s", filepath.Dir(dockerFilePath)),
"--output",
output,
}
cacheRepo := os.Getenv("BUILDKIT_CACHE_REPO")
if cacheRepo == "" {
cacheRepo = opt.ImageInfo.DockerRegistry.BentosRepositoryURIInCluster
}
args = append(args, "--export-cache", fmt.Sprintf("type=registry,ref=%s:buildcache,mode=max,compression=zstd,ignore-error=true", cacheRepo))
args = append(args, "--import-cache", fmt.Sprintf("type=registry,ref=%s:buildcache", cacheRepo))
}
var builderContainerSecurityContext *corev1.SecurityContext
if buildEngine == CompoundAINimImageBuildEngineBuildkit {
builderContainerSecurityContext = &corev1.SecurityContext{
Privileged: ptr.To(true),
}
} else if buildEngine == CompoundAINimImageBuildEngineBuildkitRootless {
kubeAnnotations["container.apparmor.security.beta.kubernetes.io/builder"] = "unconfined"
builderContainerSecurityContext = &corev1.SecurityContext{
SeccompProfile: &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeUnconfined,
},
RunAsUser: ptr.To(int64(1000)),
RunAsGroup: ptr.To(int64(1000)),
}
}
// add build args to pass via --build-arg
for _, buildArg := range buildArgs {
quotedBuildArg := unix.SingleQuote.Quote(buildArg)
if isBuildkit {
args = append(args, "--opt", fmt.Sprintf("build-arg:%s", quotedBuildArg))
} else {
args = append(args, fmt.Sprintf("--build-arg=%s", quotedBuildArg))
}
}
// add other arguments to builder
args = append(args, builderArgs...)
logrus.Info("yatai-image-builder args: ", args)
// nolint: gosec
buildArgsSecretName := "yatai-image-builder-build-args"
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Getting secret %s from namespace %s", buildArgsSecretName, configNamespace)
buildArgsSecret := &corev1.Secret{}
err = r.Get(ctx, types.NamespacedName{Name: buildArgsSecretName, Namespace: configNamespace}, buildArgsSecret)
buildArgsSecretIsNotFound := k8serrors.IsNotFound(err)
if err != nil && !buildArgsSecretIsNotFound {
err = errors.Wrap(err, "failed to get secret")
return
}
if !buildArgsSecretIsNotFound {
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Secret %s is got from namespace %s", buildArgsSecretName, configNamespace)
if configNamespace != opt.CompoundAINimRequest.Namespace {
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Secret %s is in namespace %s, but CompoundAINimRequest is in namespace %s, so we need to copy the secret to CompoundAINimRequest namespace", buildArgsSecretName, configNamespace, opt.CompoundAINimRequest.Namespace)
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Getting secret %s in namespace %s", buildArgsSecretName, opt.CompoundAINimRequest.Namespace)
_buildArgsSecret := &corev1.Secret{}
err = r.Get(ctx, types.NamespacedName{Namespace: opt.CompoundAINimRequest.Namespace, Name: buildArgsSecretName}, _buildArgsSecret)
localBuildArgsSecretIsNotFound := k8serrors.IsNotFound(err)
if err != nil && !localBuildArgsSecretIsNotFound {
err = errors.Wrapf(err, "failed to get secret %s from namespace %s", buildArgsSecretName, opt.CompoundAINimRequest.Namespace)
return
}
if localBuildArgsSecretIsNotFound {
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Copying secret %s from namespace %s to namespace %s", buildArgsSecretName, configNamespace, opt.CompoundAINimRequest.Namespace)
err = r.Create(ctx, &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: buildArgsSecretName,
Namespace: opt.CompoundAINimRequest.Namespace,
},
Data: buildArgsSecret.Data,
})
if err != nil {
err = errors.Wrapf(err, "failed to create secret %s in namespace %s", buildArgsSecretName, opt.CompoundAINimRequest.Namespace)
return
}
} else {
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Secret %s is already in namespace %s", buildArgsSecretName, opt.CompoundAINimRequest.Namespace)
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Updating secret %s in namespace %s", buildArgsSecretName, opt.CompoundAINimRequest.Namespace)
err = r.Update(ctx, &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: buildArgsSecretName,
Namespace: opt.CompoundAINimRequest.Namespace,
},
Data: buildArgsSecret.Data,
})
if err != nil {
err = errors.Wrapf(err, "failed to update secret %s in namespace %s", buildArgsSecretName, opt.CompoundAINimRequest.Namespace)
return
}
}
}
for key := range buildArgsSecret.Data {
envName := fmt.Sprintf("BENTOML_BUILD_ARG_%s", strings.ReplaceAll(strings.ToUpper(strcase.ToKebab(key)), "-", "_"))
builderContainerEnvs = append(builderContainerEnvs, corev1.EnvVar{
Name: envName,
ValueFrom: &corev1.EnvVarSource{
SecretKeyRef: &corev1.SecretKeySelector{
LocalObjectReference: corev1.LocalObjectReference{
Name: buildArgsSecretName,
},
Key: key,
},
},
})
if isBuildkit {
args = append(args, "--opt", fmt.Sprintf("build-arg:%s=$(%s)", key, envName))
} else {
args = append(args, fmt.Sprintf("--build-arg=%s=$(%s)", key, envName))
}
}
} else {
r.Recorder.Eventf(opt.CompoundAINimRequest, corev1.EventTypeNormal, "GenerateImageBuilderPod", "Secret %s is not found in namespace %s", buildArgsSecretName, configNamespace)
}
builderContainerArgs := []string{
"-c",
fmt.Sprintf("sleep 15; %s && exit 0 || exit %d", shquot.POSIXShell(append(command, args...)), BuilderJobFailedExitCode), // TODO: remove once functionality exists to wait for istio sidecar.
}
container := corev1.Container{
Name: BuilderContainerName,
Image: builderImage,
ImagePullPolicy: corev1.PullAlways,
Command: []string{"sh"},
Args: builderContainerArgs,
VolumeMounts: volumeMounts,
Env: builderContainerEnvs,
EnvFrom: builderContainerEnvFrom,
TTY: true,
Stdin: true,
SecurityContext: builderContainerSecurityContext,
}
if globalDefaultImageBuilderContainerResources != nil {
container.Resources = *globalDefaultImageBuilderContainerResources
}
if opt.CompoundAINimRequest.Spec.ImageBuilderContainerResources != nil {
container.Resources = *opt.CompoundAINimRequest.Spec.ImageBuilderContainerResources
}
containers = append(containers, container)
pod = &corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: kubeLabels,
Annotations: kubeAnnotations,
},
Spec: corev1.PodSpec{
RestartPolicy: corev1.RestartPolicyNever,
Volumes: volumes,
InitContainers: initContainers,
Containers: containers,
},
}
if globalExtraPodMetadata != nil {
for k, v := range globalExtraPodMetadata.Annotations {
pod.Annotations[k] = v
}
for k, v := range globalExtraPodMetadata.Labels {
pod.Labels[k] = v
}
}
if opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodMetadata != nil {
for k, v := range opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodMetadata.Annotations {
pod.Annotations[k] = v
}
for k, v := range opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodMetadata.Labels {
pod.Labels[k] = v
}
}
if globalExtraPodSpec != nil {
pod.Spec.PriorityClassName = globalExtraPodSpec.PriorityClassName
pod.Spec.SchedulerName = globalExtraPodSpec.SchedulerName
pod.Spec.NodeSelector = globalExtraPodSpec.NodeSelector
pod.Spec.Affinity = globalExtraPodSpec.Affinity
pod.Spec.Tolerations = globalExtraPodSpec.Tolerations
pod.Spec.TopologySpreadConstraints = globalExtraPodSpec.TopologySpreadConstraints
pod.Spec.ServiceAccountName = globalExtraPodSpec.ServiceAccountName
}
if opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec != nil {
if opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.PriorityClassName != "" {
pod.Spec.PriorityClassName = opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.PriorityClassName
}
if opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.SchedulerName != "" {
pod.Spec.SchedulerName = opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.SchedulerName
}
if opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.NodeSelector != nil {
pod.Spec.NodeSelector = opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.NodeSelector
}
if opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.Affinity != nil {
pod.Spec.Affinity = opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.Affinity
}
if opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.Tolerations != nil {
pod.Spec.Tolerations = opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.Tolerations
}
if opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.TopologySpreadConstraints != nil {
pod.Spec.TopologySpreadConstraints = opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.TopologySpreadConstraints
}
if opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.ServiceAccountName != "" {
pod.Spec.ServiceAccountName = opt.CompoundAINimRequest.Spec.ImageBuilderExtraPodSpec.ServiceAccountName
}
}
injectPodAffinity(&pod.Spec, opt.CompoundAINimRequest)
if pod.Spec.ServiceAccountName == "" {
serviceAccounts := &corev1.ServiceAccountList{}
err = r.List(ctx, serviceAccounts, client.InNamespace(opt.CompoundAINimRequest.Namespace), client.MatchingLabels{
commonconsts.KubeLabelYataiImageBuilderPod: commonconsts.KubeLabelValueTrue,
})
if err != nil {
err = errors.Wrapf(err, "failed to list service accounts in namespace %s", opt.CompoundAINimRequest.Namespace)
return
}
if len(serviceAccounts.Items) > 0 {
pod.Spec.ServiceAccountName = serviceAccounts.Items[0].Name
} else {
pod.Spec.ServiceAccountName = "default"
}
}
for i, c := range pod.Spec.InitContainers {
env := c.Env
if globalExtraContainerEnv != nil {
env = append(env, globalExtraContainerEnv...)
}
env = append(env, opt.CompoundAINimRequest.Spec.ImageBuilderExtraContainerEnv...)
pod.Spec.InitContainers[i].Env = env
}
for i, c := range pod.Spec.Containers {
env := c.Env
if globalExtraContainerEnv != nil {
env = append(env, globalExtraContainerEnv...)
}
env = append(env, opt.CompoundAINimRequest.Spec.ImageBuilderExtraContainerEnv...)
pod.Spec.Containers[i].Env = env
}
return
}
// getHashStr computes a stable decimal-string hash over the CR's spec,
// labels, and annotations, used to detect whether a CompoundAINimRequest
// has materially changed between reconciliations.
func (r *CompoundAINimRequestReconciler) getHashStr(compoundAINimRequest *nvidiacomv1alpha1.CompoundAINimRequest) (string, error) {
	// Only these three pieces participate in the hash; other metadata
	// (status, resourceVersion, ...) must not influence it.
	hashInput := struct {
		Spec        nvidiacomv1alpha1.CompoundAINimRequestSpec
		Labels      map[string]string
		Annotations map[string]string
	}{
		Spec:        compoundAINimRequest.Spec,
		Labels:      compoundAINimRequest.Labels,
		Annotations: compoundAINimRequest.Annotations,
	}
	sum, hashErr := hashstructure.Hash(hashInput, hashstructure.FormatV2, nil)
	if hashErr != nil {
		return "", errors.Wrap(hashErr, "get compoundAINimRequest CR spec hash")
	}
	return strconv.FormatUint(sum, 10), nil
}
// getJuiceFSStorageClassName returns the storage class name to use for
// JuiceFS-backed volumes. It can be overridden with the
// JUICEFS_STORAGE_CLASS_NAME environment variable; otherwise the default
// "juicefs-sc" is returned.
func getJuiceFSStorageClassName() string {
	name := os.Getenv("JUICEFS_STORAGE_CLASS_NAME")
	if name == "" {
		name = "juicefs-sc"
	}
	return name
}
const (
	// trueStr is the canonical string form of a boolean "true", used when
	// comparing environment variable values (e.g. in SetupWithManager).
	trueStr = "true"
)
// doRegisterCompoundComponent registers this image-builder component with the
// Yatai service so it appears as a cluster component there. It is a no-op
// (returning nil) when no Yatai client can be constructed, i.e. when Yatai is
// not configured for this cluster.
func (r *CompoundAINimRequestReconciler) doRegisterCompoundComponent() error {
	logs := log.Log.WithValues("func", "doRegisterCompoundComponent")
	// Bound the whole registration round-trip; the caller retries periodically.
	ctx, cancel := context.WithTimeout(context.TODO(), time.Minute*5)
	defer cancel()

	logs.Info("getting yatai client")
	yataiClient, yataiConf, err := r.getYataiClient(ctx)
	if err != nil {
		return errors.Wrap(err, "get yatai client")
	}
	if yataiClient == nil || yataiConf == nil {
		// Yatai integration is optional; skip registration when unconfigured.
		logs.Info("can't get yatai client, skip registering")
		return nil
	}
	yataiClient_ := *yataiClient
	yataiConf_ := *yataiConf

	// Resolve the namespace this component runs in, reading secrets through the
	// controller's client.
	namespace, err := commonconfig.GetYataiImageBuilderNamespace(ctx, func(ctx context.Context, namespace, name string) (*corev1.Secret, error) {
		secret := &corev1.Secret{}
		err := r.Get(ctx, types.NamespacedName{
			Namespace: namespace,
			Name:      name,
		}, secret)
		return secret, errors.Wrap(err, "get secret")
	})
	if err != nil {
		return errors.Wrap(err, "get yatai image builder namespace")
	}

	_, err = yataiClient_.RegisterYataiComponent(ctx, yataiConf_.ClusterName, &schemasv1.RegisterYataiComponentSchema{
		Name:          modelschemas.YataiComponentNameImageBuilder,
		KubeNamespace: namespace,
		Version:       version.Version,
		SelectorLabels: map[string]string{
			"app.kubernetes.io/name": "yatai-image-builder",
		},
		Manifest: &modelschemas.YataiComponentManifestSchema{
			SelectorLabels: map[string]string{
				"app.kubernetes.io/name": "yatai-image-builder",
			},
			LatestCRDVersion: "v1alpha1",
		},
	})
	return errors.Wrap(err, "register yatai component")
}
// registerCompoundComponent registers the compound component with Yatai once
// immediately, then re-registers every five minutes, logging (but never
// propagating) failures. It never returns and is intended to run in its own
// goroutine.
func (r *CompoundAINimRequestReconciler) registerCompoundComponent() {
	logs := log.Log.WithValues("func", "registerCompoundComponent")
	if err := r.doRegisterCompoundComponent(); err != nil {
		logs.Error(err, "registerCompoundComponent")
	}
	ticker := time.NewTicker(time.Minute * 5)
	// Unreachable in practice (the range below never terminates), but keeps
	// the ticker ownership contract explicit.
	defer ticker.Stop()
	for range ticker.C {
		if err := r.doRegisterCompoundComponent(); err != nil {
			logs.Error(err, "registerCompoundComponent")
		}
	}
}
// SetupWithManager sets up the controller with the Manager: it optionally
// starts background Yatai component registration, then registers the
// reconciler for CompoundAINimRequest resources (reacting only to generation
// changes) and their owned CompoundAINim and Job objects.
func (r *CompoundAINimRequestReconciler) SetupWithManager(mgr ctrl.Manager) error {
	logs := log.Log.WithValues("func", "SetupWithManager")
	// Registration runs in the background unless explicitly disabled via env.
	if os.Getenv("DISABLE_YATAI_COMPONENT_REGISTRATION") == trueStr {
		logs.Info("yatai component registration is disabled")
	} else {
		go r.registerCompoundComponent()
	}
	controllerBuilder := ctrl.NewControllerManagedBy(mgr).
		For(&nvidiacomv1alpha1.CompoundAINimRequest{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})).
		Owns(&nvidiacomv1alpha1.CompoundAINim{}).
		Owns(&batchv1.Job{}).
		WithEventFilter(controller_common.EphemeralDeploymentEventFilter(r.Config))
	return errors.Wrap(controllerBuilder.Complete(r), "failed to setup CompoundAINimRequest controller")
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"context"
"fmt"
"path/filepath"
"runtime"
"testing"
"github.com/dynemo-ai/dynemo/deploy/compoundai/operator/api/v1alpha1"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
appsv1 "k8s.io/api/apps/v1"
autoscalingv2 "k8s.io/api/autoscaling/v2"
corev1 "k8s.io/api/core/v1"
networkingv1 "k8s.io/api/networking/v1"
k8sruntime "k8s.io/apimachinery/pkg/runtime"
admissionregistrationv1 "k8s.io/api/admissionregistration/v1"
batchv1 "k8s.io/api/batch/v1"
rbacv1 "k8s.io/api/rbac/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
vcbatchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
volcanov1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
"k8s.io/client-go/rest"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/envtest"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
//+kubebuilder:scaffold:imports
)
// These tests use Ginkgo (BDD-style Go testing framework). Refer to
// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.
// Shared fixtures for the controller test suite; created in BeforeSuite and
// released in AfterSuite.
var cfg *rest.Config
var k8sClient client.Client
var testEnv *envtest.Environment

// ctx/cancel bound the manager goroutine's lifetime across the whole suite.
var ctx context.Context
var cancel context.CancelFunc
// TestControllers is the single `go test` entry point; it hands control to
// the Ginkgo runner, which executes every spec registered in this package.
func TestControllers(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "Controller Suite")
}
// BeforeSuite boots a local envtest control plane (etcd + kube-apiserver),
// installs the project and third-party CRDs, registers every API group the
// controllers touch on a fresh scheme, and starts a controller-runtime
// manager in the background. ctx/cancel tie the manager's lifetime to the
// suite; AfterSuite performs teardown.
var _ = BeforeSuite(func() {
	logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))

	ctx, cancel = context.WithCancel(context.TODO())

	By("bootstrapping test environment")
	testEnv = &envtest.Environment{
		// Project CRDs plus third-party CRDs (Prometheus, Volcano, run.ai,
		// NVIDIA) that the controllers reference at runtime.
		CRDDirectoryPaths: []string{
			filepath.Join("..", "..", "config", "crd", "bases"),
			filepath.Join(".", "testing", "prometheus"),
			filepath.Join(".", "testing", "volcano.sh"),
			filepath.Join(".", "testing", "run.ai"),
			filepath.Join(".", "testing", "nvidia"),
		},
		ErrorIfCRDPathMissing: true,

		// The BinaryAssetsDirectory is only required if you want to run the tests directly
		// without calling the makefile target test. If not informed it will look for the
		// default path defined in controller-runtime which is /usr/local/kubebuilder/.
		// Note that you must have the required binaries set up under the bin directory to perform
		// the tests directly. When we run make test it will be set up and used automatically.
		BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s",
			fmt.Sprintf("1.29.0-%s-%s", runtime.GOOS, runtime.GOARCH)),
	}

	var err error
	// cfg is defined in this file globally.
	cfg, err = testEnv.Start()
	Expect(err).NotTo(HaveOccurred())
	Expect(cfg).NotTo(BeNil())

	scheme := k8sruntime.NewScheme()
	//+kubebuilder:scaffold:scheme

	// Register every API group used by the controllers; a missing AddToScheme
	// surfaces later as an opaque "no kind is registered" error.
	err = v1alpha1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())
	err = corev1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())
	err = autoscalingv2.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())
	err = networkingv1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())
	err = appsv1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())
	err = monitoringv1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())
	err = admissionregistrationv1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())
	err = batchv1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())
	err = rbacv1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())
	err = apiextensionsv1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())
	err = volcanov1beta1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())
	err = vcbatchv1alpha1.AddToScheme(scheme)
	Expect(err).NotTo(HaveOccurred())

	k8sClient, err = client.New(cfg, client.Options{Scheme: scheme})
	Expect(err).NotTo(HaveOccurred())
	Expect(k8sClient).NotTo(BeNil())

	k8sManager, err := ctrl.NewManager(cfg, ctrl.Options{
		Scheme: scheme,
	})
	Expect(err).NotTo(HaveOccurred())

	// Run the manager for the duration of the suite; cancel() in AfterSuite
	// stops it.
	go func() {
		defer GinkgoRecover()
		err = k8sManager.Start(ctx)
		Expect(err).NotTo(HaveOccurred())
	}()
})
// AfterSuite cancels the suite-wide context and shuts down the envtest
// control plane once every spec has finished running.
var _ = AfterSuite(func() {
	cancel()
	By("tearing down the test environment")
	Expect(testEnv.Stop()).To(Succeed())
})
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.14.0
operator.prometheus.io/version: 0.74.0
name: prometheusrules.monitoring.coreos.com
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: PrometheusRule
listKind: PrometheusRuleList
plural: prometheusrules
shortNames:
- promrule
singular: prometheusrule
scope: Namespaced
versions:
- name: v1
schema:
openAPIV3Schema:
description: PrometheusRule defines recording and alerting rules for a Prometheus
instance
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
description: Specification of desired alerting rule definitions for Prometheus.
properties:
groups:
description: Content of Prometheus rule file
items:
description: RuleGroup is a list of sequentially evaluated recording
and alerting rules.
properties:
interval:
description: Interval determines how often rules in the group
are evaluated.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
limit:
description: |-
Limit the number of alerts an alerting rule and series a recording
rule can produce.
Limit is supported starting with Prometheus >= 2.31 and Thanos Ruler >= 0.24.
type: integer
name:
description: Name of the rule group.
minLength: 1
type: string
partial_response_strategy:
description: |-
PartialResponseStrategy is only used by ThanosRuler and will
be ignored by Prometheus instances.
More info: https://github.com/thanos-io/thanos/blob/main/docs/components/rule.md#partial-response
pattern: ^(?i)(abort|warn)?$
type: string
rules:
description: List of alerting and recording rules.
items:
description: |-
Rule describes an alerting or recording rule
See Prometheus documentation: [alerting](https://www.prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) or [recording](https://www.prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) rule
properties:
alert:
description: |-
Name of the alert. Must be a valid label value.
Only one of `record` and `alert` must be set.
type: string
annotations:
additionalProperties:
type: string
description: |-
Annotations to add to each alert.
Only valid for alerting rules.
type: object
expr:
anyOf:
- type: integer
- type: string
description: PromQL expression to evaluate.
x-kubernetes-int-or-string: true
for:
description: Alerts are considered firing once they have
been returned for this long.
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
keep_firing_for:
description: KeepFiringFor defines how long an alert will
continue firing after the condition that triggered it
has cleared.
minLength: 1
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
labels:
additionalProperties:
type: string
description: Labels to add or overwrite.
type: object
record:
description: |-
Name of the time series to output to. Must be a valid metric name.
Only one of `record` and `alert` must be set.
type: string
required:
- expr
type: object
type: array
required:
- name
type: object
type: array
x-kubernetes-list-map-keys:
- name
x-kubernetes-list-type: map
type: object
required:
- spec
type: object
served: true
storage: true
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.13.0
operator.prometheus.io/version: 0.72.0
name: servicemonitors.monitoring.coreos.com
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: ServiceMonitor
listKind: ServiceMonitorList
plural: servicemonitors
shortNames:
- smon
singular: servicemonitor
scope: Namespaced
versions:
- name: v1
schema:
openAPIV3Schema:
description: ServiceMonitor defines monitoring for a set of services.
properties:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
spec:
description: Specification of desired Service selection for target discovery
by Prometheus.
properties:
attachMetadata:
description: "`attachMetadata` defines additional metadata which is
added to the discovered targets. \n It requires Prometheus >= v2.37.0."
properties:
node:
description: When set to true, Prometheus must have the `get`
permission on the `Nodes` objects.
type: boolean
type: object
endpoints:
description: List of endpoints part of this ServiceMonitor.
items:
description: Endpoint defines an endpoint serving Prometheus metrics
to be scraped by Prometheus.
properties:
authorization:
description: "`authorization` configures the Authorization header
credentials to use when scraping the target. \n Cannot be
set at the same time as `basicAuth`, or `oauth2`."
properties:
credentials:
description: Selects a key of a Secret in the namespace
that contains the credentials for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type:
description: "Defines the authentication type. The value
is case-insensitive. \n \"Basic\" is not a supported value.
\n Default: \"Bearer\""
type: string
type: object
basicAuth:
description: "`basicAuth` configures the Basic Authentication
credentials to use when scraping the target. \n Cannot be
set at the same time as `authorization`, or `oauth2`."
properties:
password:
description: '`password` specifies a key of a Secret containing
the password for authentication.'
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
username:
description: '`username` specifies a key of a Secret containing
the username for authentication.'
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
bearerTokenFile:
description: "File to read bearer token for scraping the target.
\n Deprecated: use `authorization` instead."
type: string
bearerTokenSecret:
description: "`bearerTokenSecret` specifies a key of a Secret
containing the bearer token for scraping targets. The secret
needs to be in the same namespace as the ServiceMonitor object
and readable by the Prometheus Operator. \n Deprecated: use
`authorization` instead."
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
enableHttp2:
description: '`enableHttp2` can be used to disable HTTP2 when
scraping the target.'
type: boolean
filterRunning:
description: "When true, the pods which are not running (e.g.
either in Failed or Succeeded state) are dropped during the
target discovery. \n If unset, the filtering is enabled. \n
More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase"
type: boolean
followRedirects:
description: '`followRedirects` defines whether the scrape requests
should follow HTTP 3xx redirects.'
type: boolean
honorLabels:
description: When true, `honorLabels` preserves the metric's
labels when they collide with the target's labels.
type: boolean
honorTimestamps:
description: '`honorTimestamps` controls whether Prometheus
preserves the timestamps when exposed by the target.'
type: boolean
interval:
description: "Interval at which Prometheus scrapes the metrics
from the target. \n If empty, Prometheus uses the global scrape
interval."
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
metricRelabelings:
description: '`metricRelabelings` configures the relabeling
rules to apply to the samples before ingestion.'
items:
description: "RelabelConfig allows dynamic rewriting of the
label set for targets, alerts, scraped samples and remote
write samples. \n More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config"
properties:
action:
default: replace
description: "Action to perform based on the regex matching.
\n `Uppercase` and `Lowercase` actions require Prometheus
>= v2.36.0. `DropEqual` and `KeepEqual` actions require
Prometheus >= v2.41.0. \n Default: \"Replace\""
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: "Modulus to take of the hash of the source
label values. \n Only applicable when the action is
`HashMod`."
format: int64
type: integer
regex:
description: Regular expression against which the extracted
value is matched.
type: string
replacement:
description: "Replacement value against which a Replace
action is performed if the regular expression matches.
\n Regex capture groups are available."
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: The source labels select values from existing
labels. Their content is concatenated using the configured
Separator and matched against the configured regular
expression.
items:
description: LabelName is a valid Prometheus label name
which may only contain ASCII letters, numbers, as
well as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: "Label to which the resulting string is written
in a replacement. \n It is mandatory for `Replace`,
`HashMod`, `Lowercase`, `Uppercase`, `KeepEqual` and
`DropEqual` actions. \n Regex capture groups are available."
type: string
type: object
type: array
oauth2:
description: "`oauth2` configures the OAuth2 settings to use
when scraping the target. \n It requires Prometheus >= 2.27.0.
\n Cannot be set at the same time as `authorization`, or `basicAuth`."
properties:
clientId:
description: '`clientId` specifies a key of a Secret or
ConfigMap containing the OAuth2 client''s ID.'
properties:
configMap:
description: ConfigMap containing data to use for the
targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the ConfigMap or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
clientSecret:
description: '`clientSecret` specifies a key of a Secret
containing the OAuth2 client''s secret.'
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
endpointParams:
additionalProperties:
type: string
description: '`endpointParams` configures the HTTP parameters
to append to the token URL.'
type: object
scopes:
description: '`scopes` defines the OAuth2 scopes used for
the token request.'
items:
type: string
type: array
tokenUrl:
description: '`tokenURL` configures the URL to fetch the
token from.'
minLength: 1
type: string
required:
- clientId
- clientSecret
- tokenUrl
type: object
params:
additionalProperties:
items:
type: string
type: array
description: params define optional HTTP URL parameters.
type: object
path:
description: "HTTP path from which to scrape for metrics. \n
If empty, Prometheus uses the default value (e.g. `/metrics`)."
type: string
port:
description: "Name of the Service port which this endpoint refers
to. \n It takes precedence over `targetPort`."
type: string
proxyUrl:
description: '`proxyURL` configures the HTTP Proxy URL (e.g.
"http://proxyserver:2195") to go through when scraping the
target.'
type: string
relabelings:
description: "`relabelings` configures the relabeling rules
to apply the target's metadata labels. \n The Operator automatically
adds relabelings for a few standard Kubernetes fields. \n
The original scrape job's name is available via the `__tmp_prometheus_job_name`
label. \n More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config"
items:
description: "RelabelConfig allows dynamic rewriting of the
label set for targets, alerts, scraped samples and remote
write samples. \n More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config"
properties:
action:
default: replace
description: "Action to perform based on the regex matching.
\n `Uppercase` and `Lowercase` actions require Prometheus
>= v2.36.0. `DropEqual` and `KeepEqual` actions require
Prometheus >= v2.41.0. \n Default: \"Replace\""
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: "Modulus to take of the hash of the source
label values. \n Only applicable when the action is
`HashMod`."
format: int64
type: integer
regex:
description: Regular expression against which the extracted
value is matched.
type: string
replacement:
description: "Replacement value against which a Replace
action is performed if the regular expression matches.
\n Regex capture groups are available."
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: The source labels select values from existing
labels. Their content is concatenated using the configured
Separator and matched against the configured regular
expression.
items:
description: LabelName is a valid Prometheus label name
which may only contain ASCII letters, numbers, as
well as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: "Label to which the resulting string is written
in a replacement. \n It is mandatory for `Replace`,
`HashMod`, `Lowercase`, `Uppercase`, `KeepEqual` and
`DropEqual` actions. \n Regex capture groups are available."
type: string
type: object
type: array
scheme:
description: "HTTP scheme to use for scraping. \n `http` and
`https` are the expected values unless you rewrite the `__scheme__`
label via relabeling. \n If empty, Prometheus uses the default
value `http`."
enum:
- http
- https
type: string
scrapeTimeout:
description: "Timeout after which Prometheus considers the scrape
to be failed. \n If empty, Prometheus uses the global scrape
timeout unless it is less than the target's scrape interval
value in which the latter is used."
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
targetPort:
anyOf:
- type: integer
- type: string
description: Name or number of the target port of the `Pod`
object behind the Service. The port must be specified with
the container's port property.
x-kubernetes-int-or-string: true
tlsConfig:
description: TLS configuration to use when scraping the target.
properties:
ca:
description: Certificate authority used when verifying server
certificates.
properties:
configMap:
description: ConfigMap containing data to use for the
targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the ConfigMap or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
caFile:
description: Path to the CA cert in the Prometheus container
to use for the targets.
type: string
cert:
description: Client certificate to present when doing client-authentication.
properties:
configMap:
description: ConfigMap containing data to use for the
targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the ConfigMap or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
certFile:
description: Path to the client cert file in the Prometheus
container for the targets.
type: string
insecureSkipVerify:
description: Disable target certificate validation.
type: boolean
keyFile:
description: Path to the client key file in the Prometheus
container for the targets.
type: string
keySecret:
description: Secret containing the client key file for the
targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
serverName:
description: Used to verify the hostname for the targets.
type: string
type: object
trackTimestampsStaleness:
description: "`trackTimestampsStaleness` defines whether Prometheus
tracks staleness of the metrics that have an explicit timestamp
present in scraped data. Has no effect if `honorTimestamps`
is false. \n It requires Prometheus >= v2.48.0."
type: boolean
type: object
type: array
jobLabel:
description: "`jobLabel` selects the label from the associated Kubernetes
`Service` object which will be used as the `job` label for all metrics.
\n For example if `jobLabel` is set to `foo` and the Kubernetes
`Service` object is labeled with `foo: bar`, then Prometheus adds
the `job=\"bar\"` label to all ingested metrics. \n If the value
of this field is empty or if the label doesn't exist for the given
Service, the `job` label of the metrics defaults to the name of
the associated Kubernetes `Service`."
type: string
keepDroppedTargets:
description: "Per-scrape limit on the number of targets dropped by
relabeling that will be kept in memory. 0 means no limit. \n It
requires Prometheus >= v2.47.0."
format: int64
type: integer
labelLimit:
description: "Per-scrape limit on number of labels that will be accepted
for a sample. \n It requires Prometheus >= v2.27.0."
format: int64
type: integer
labelNameLengthLimit:
description: "Per-scrape limit on length of labels name that will
be accepted for a sample. \n It requires Prometheus >= v2.27.0."
format: int64
type: integer
labelValueLengthLimit:
description: "Per-scrape limit on length of labels value that will
be accepted for a sample. \n It requires Prometheus >= v2.27.0."
format: int64
type: integer
namespaceSelector:
description: Selector to select which namespaces the Kubernetes `Endpoints`
objects are discovered from.
properties:
any:
description: Boolean describing whether all namespaces are selected
in contrast to a list restricting them.
type: boolean
matchNames:
description: List of namespace names to select from.
items:
type: string
type: array
type: object
podTargetLabels:
description: '`podTargetLabels` defines the labels which are transferred
from the associated Kubernetes `Pod` object onto the ingested metrics.'
items:
type: string
type: array
sampleLimit:
description: '`sampleLimit` defines a per-scrape limit on the number
of scraped samples that will be accepted.'
format: int64
type: integer
scrapeClass:
description: The scrape class to apply.
minLength: 1
type: string
scrapeProtocols:
description: "`scrapeProtocols` defines the protocols to negotiate
during a scrape. It tells clients the protocols supported by Prometheus
in order of preference (from most to least preferred). \n If unset,
Prometheus uses its default value. \n It requires Prometheus >=
v2.49.0."
items:
description: 'ScrapeProtocol represents a protocol used by Prometheus
for scraping metrics. Supported values are: * `OpenMetricsText0.0.1`
* `OpenMetricsText1.0.0` * `PrometheusProto` * `PrometheusText0.0.4`'
enum:
- PrometheusProto
- OpenMetricsText0.0.1
- OpenMetricsText1.0.0
- PrometheusText0.0.4
type: string
type: array
x-kubernetes-list-type: set
selector:
description: Label selector to select the Kubernetes `Endpoints` objects.
properties:
matchExpressions:
description: matchExpressions is a list of label selector requirements.
The requirements are ANDed.
items:
description: A label selector requirement is a selector that
contains values, a key, and an operator that relates the key
and values.
properties:
key:
description: key is the label key that the selector applies
to.
type: string
operator:
description: operator represents a key's relationship to
a set of values. Valid operators are In, NotIn, Exists
and DoesNotExist.
type: string
values:
description: values is an array of string values. If the
operator is In or NotIn, the values array must be non-empty.
If the operator is Exists or DoesNotExist, the values
array must be empty. This array is replaced during a strategic
merge patch.
items:
type: string
type: array
required:
- key
- operator
type: object
type: array
matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs. A single
{key,value} in the matchLabels map is equivalent to an element
of matchExpressions, whose key field is "key", the operator
is "In", and the values array contains only "value". The requirements
are ANDed.
type: object
type: object
x-kubernetes-map-type: atomic
targetLabels:
description: '`targetLabels` defines the labels which are transferred
from the associated Kubernetes `Service` object onto the ingested
metrics.'
items:
type: string
type: array
targetLimit:
description: '`targetLimit` defines a limit on the number of scraped
targets that will be accepted.'
format: int64
type: integer
required:
- selector
type: object
required:
- spec
type: object
served: true
storage: true
\ No newline at end of file
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller_common
import (
"context"
"strings"
"k8s.io/apimachinery/pkg/api/meta"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
)
// Config holds the common settings shared by the operator's controllers.
type Config struct {
	// Enable resources filtering, only the resources belonging to the given namespace will be handled.
	RestrictedNamespace string
}
// EphemeralDeploymentEventFilter returns a predicate deciding which watch
// events the controllers should process:
//
//   - If config.RestrictedNamespace is set, only events for objects in that
//     namespace pass the filter.
//   - Otherwise, events for objects whose namespace contains "ephemeral" are
//     dropped and everything else passes.
func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
	// Resolve the logger once: it is derived from context.Background() and
	// therefore identical for every event, so rebuilding it inside the
	// per-event callback is pure overhead.
	l := log.FromContext(context.Background())
	return predicate.NewPredicateFuncs(func(o client.Object) bool {
		objMeta, err := meta.Accessor(o)
		if err != nil {
			l.Error(err, "Error extracting object metadata")
			return false
		}
		if config.RestrictedNamespace != "" {
			// in case of a restricted namespace, we only want to process the events that are in the restricted namespace
			return objMeta.GetNamespace() == config.RestrictedNamespace
		}
		// in all other cases, discard the event if it is destined to an ephemeral deployment
		return !strings.Contains(objMeta.GetNamespace(), "ephemeral")
	})
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller_common
import (
"context"
"crypto/sha256"
"encoding/json"
"fmt"
"sort"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
)
const (
	// NvidiaAnnotationHashKey is the annotation key under which the operator
	// records the hash of the last applied resource spec (see GetResourceHash),
	// used to detect spec drift between reconciliations.
	NvidiaAnnotationHashKey = "nvidia.com/last-applied-hash"
)
// SyncResource reconciles the cluster state of the resource identified by
// namespacedName against the desired object:
//
//   - If the resource does not exist, it is created and desired is returned.
//   - If it exists and createOnly is true, the live object is returned
//     unmodified.
//   - Otherwise the live object is updated with desired whenever
//     IsSpecChanged reports a difference.
//
// NOTE(review): on the update path this returns `current`, i.e. the object
// as fetched BEFORE the Update call, not the server's post-update state —
// callers needing the fresh state must re-read; confirm this is intentional.
func SyncResource[T client.Object](ctx context.Context, c client.Client, desired T, namespacedName types.NamespacedName, createOnly bool) (T, error) {
	// Retrieve the GroupVersionKind (GVK) of the desired object
	gvk, err := apiutil.GVKForObject(desired, c.Scheme())
	if err != nil {
		return desired, fmt.Errorf("failed to get GVK for object: %w", err)
	}
	// Create a new instance of the object to receive the live state; using
	// the scheme keeps this generic over every registered resource type.
	obj, err := c.Scheme().New(gvk)
	if err != nil {
		return desired, fmt.Errorf("failed to create a new object for GVK %s: %w", gvk, err)
	}
	// Type assertion to ensure the object implements client.Object
	current, ok := obj.(T)
	if !ok {
		return desired, fmt.Errorf("failed to cast object to the expected type %T", desired)
	}
	// Retrieve the existing resource
	err = c.Get(ctx, namespacedName, current)
	if err != nil {
		if errors.IsNotFound(err) {
			// If the resource doesn't exist, create it
			if err := c.Create(ctx, desired); err != nil {
				return desired, fmt.Errorf("failed to create resource: %w", err)
			}
			return desired, nil
		}
		return desired, fmt.Errorf("failed to get resource: %w", err)
	}
	if createOnly {
		// Caller asked for create-or-leave-alone semantics: never update.
		return current, nil
	}
	// Check if the Spec has changed and update if necessary
	if IsSpecChanged(current, desired) {
		if err := c.Update(ctx, desired); err != nil {
			return desired, fmt.Errorf("failed to update resource: %w", err)
		}
	}
	// Return the live object fetched above (pre-update state — see NOTE in
	// the function comment).
	return current, nil
}
// GetResourceHash returns a deterministic SHA-256 hex digest for the given
// object. The object is round-tripped through JSON into a generic map,
// canonicalized with SortKeys (so that slices of maps serialize in a stable
// order), re-serialized, and hashed. Serialization failures panic, since
// they indicate a programming error rather than a runtime condition.
func GetResourceHash(obj client.Object) string {
	// Round-trip through JSON to obtain a generic map representation.
	raw, err := json.Marshal(obj)
	if err != nil {
		panic(err)
	}
	var decoded map[string]interface{}
	if err := json.Unmarshal(raw, &decoded); err != nil {
		panic(err)
	}
	// Canonicalize nested ordering, then serialize the stable form.
	canonical, err := json.Marshal(SortKeys(decoded))
	if err != nil {
		panic(err)
	}
	// Hash the canonical bytes and render as lowercase hex.
	digest := sha256.Sum256(canonical)
	return fmt.Sprintf("%x", digest)
}
// IsSpecChanged returns true if the spec has changed between the existing
// resource and the desired one, compared by hash: it hashes desired and
// checks it against the hash recorded in current's
// NvidiaAnnotationHashKey annotation.
//
// Side effect: whenever a change is detected (missing or stale hash
// annotation), the fresh hash is written into desired's annotations via
// SetAnnotations, so a subsequent Update persists it.
//
// NOTE(review): the current == nil check only catches an untyped nil
// interface; a typed nil pointer stored in the interface would pass it —
// callers appear to always supply a freshly constructed object, so this is
// preserved as-is.
func IsSpecChanged(current client.Object, desired client.Object) bool {
	if current == nil && desired != nil {
		return true
	}
	hashStr := GetResourceHash(desired)
	// Reading from a nil map is safe in Go, so no nil guard is needed for
	// the current annotations; a direct lookup replaces the previous loop
	// over every annotation.
	currentHash := current.GetAnnotations()[NvidiaAnnotationHashKey]
	if currentHash == hashStr {
		// Recorded hash matches the desired spec: nothing to do.
		return false
	}
	// Hash is absent or stale: stamp the new hash onto desired and signal
	// that an update is required.
	desiredAnnotations := desired.GetAnnotations()
	if desiredAnnotations == nil {
		desiredAnnotations = map[string]string{}
	}
	desiredAnnotations[NvidiaAnnotationHashKey] = hashStr
	desired.SetAnnotations(desiredAnnotations)
	return true
}
// SortKeys recursively canonicalizes obj so it serializes consistently:
// nested maps are rebuilt with their values processed in sorted-key order,
// and slices whose first element is a map are sorted in place — by each
// element's "name" string when both have one, otherwise by each element's
// lexicographically smallest key. Non-map, non-slice values are returned
// unchanged. Maps are copied; slices are mutated in place.
func SortKeys(obj interface{}) interface{} {
	switch typed := obj.(type) {
	case map[string]interface{}:
		// Rebuild the map, recursing into each value in key order.
		keys := make([]string, 0, len(typed))
		for key := range typed {
			keys = append(keys, key)
		}
		sort.Strings(keys)
		ordered := make(map[string]interface{}, len(typed))
		for _, key := range keys {
			ordered[key] = SortKeys(typed[key])
		}
		return ordered
	case []interface{}:
		// smallestKey yields the lexicographically smallest key of a
		// non-empty map (equivalent to sorting the keys and taking the
		// first). Inlined here so the block is self-contained.
		smallestKey := func(m map[string]interface{}) string {
			best, seeded := "", false
			for k := range m {
				if !seeded || k < best {
					best, seeded = k, true
				}
			}
			return best
		}
		if len(typed) > 0 {
			if _, headIsMap := typed[0].(map[string]interface{}); headIsMap {
				sort.SliceStable(typed, func(a, b int) bool {
					left, leftOk := typed[a].(map[string]interface{})
					right, rightOk := typed[b].(map[string]interface{})
					if !leftOk || !rightOk {
						// Mixed element kinds: keep original order.
						return false
					}
					// Prefer sorting by the "name" field when both sides have it.
					leftName, leftNamed := left["name"].(string)
					rightName, rightNamed := right["name"].(string)
					if leftNamed && rightNamed {
						return leftName < rightName
					}
					// Fall back to comparing each map's smallest key.
					if len(left) > 0 && len(right) > 0 {
						return smallestKey(left) < smallestKey(right)
					}
					// No valid comparison: preserve original order.
					return false
				})
			}
		}
		// Recurse into every element, rewriting the slice in place.
		for idx, element := range typed {
			typed[idx] = SortKeys(element)
		}
		return typed
	}
	return obj
}
// Helper function to get the first key of a map (alphabetically sorted)
// firstKey returns the lexicographically smallest key of m, or the empty
// string when m is empty. The previous implementation allocated and sorted
// the full key slice (O(n log n)) only to take its first element, and
// panicked with an index-out-of-range on an empty map; a single linear scan
// avoids both. Existing callers guard with len(m) > 0, so the empty-map
// return value only adds safety.
func firstKey(m map[string]interface{}) string {
	smallest, seeded := "", false
	for k := range m {
		if !seeded || k < smallest {
			smallest, seeded = k, true
		}
	}
	return smallest
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment