Unverified Commit 3057af00 authored by Thomas Montfort's avatar Thomas Montfort Committed by GitHub
Browse files

feat(dep-715): refactor helm directory and remove reference to cloud (#5042)

parent b5922693
......@@ -83,14 +83,14 @@ jobs:
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
cd deploy/cloud/operator
cd deploy/operator
docker build --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
- name: Tester
shell: bash
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
cd deploy/cloud/operator
cd deploy/operator
docker build --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
- name: Set up Go
uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
......@@ -102,7 +102,7 @@ jobs:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
sudo apt-get update && sudo apt-get install -y make
cd deploy/cloud/operator
cd deploy/operator
make check
- name: Build Container
id: build-image
......@@ -110,7 +110,7 @@ jobs:
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
cd deploy/cloud/operator
cd deploy/operator
docker buildx build --load \
--platform linux/${{ matrix.platform.arch }} \
--build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
......@@ -370,7 +370,6 @@ jobs:
export ISTIO_ENABLED=true
export ISTIO_GATEWAY=istio-system/ingress-alb
export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}
# Install dynamo env secrets
kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
......@@ -378,7 +377,7 @@ jobs:
kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE}
# Install helm dependencies
helm repo add bitnami https://charts.bitnami.com/bitnami
cd deploy/cloud/helm/platform/
cd deploy/helm/charts/platform/
helm dep build .
# Install platform with namespace restriction for single profile testing
helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
......
......@@ -746,7 +746,6 @@ jobs:
export ISTIO_ENABLED=true
export ISTIO_GATEWAY=istio-system/ingress-alb
export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}
# Install dynamo env secrets
kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
# Create docker pull secret for operator image
......@@ -756,7 +755,7 @@ jobs:
docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-operator-amd64 || echo "Operator image not found, will use SHA-based tag"
# Install helm dependencies
helm repo add bitnami https://charts.bitnami.com/bitnami
cd deploy/cloud/helm/platform/
cd deploy/helm/charts/platform/
helm dep build .
# Install platform with namespace restriction
helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
......
......@@ -34,7 +34,7 @@ golang-base:
operator-src:
FROM +golang-base
COPY ./deploy/cloud/operator /artifacts/operator
COPY ./deploy/operator /artifacts/operator
SAVE ARTIFACT /artifacts/operator
......@@ -172,15 +172,15 @@ dynamo-base-docker:
############### ALL TARGETS ##############################
all-test:
BUILD ./deploy/cloud/operator+test
BUILD ./deploy/operator+test
all-docker:
ARG DOCKER_SERVER=my-registry
ARG IMAGE_TAG=latest
BUILD ./deploy/cloud/operator+docker --DOCKER_SERVER=$DOCKER_SERVER --IMAGE_TAG=$IMAGE_TAG
BUILD ./deploy/operator+docker --DOCKER_SERVER=$DOCKER_SERVER --IMAGE_TAG=$IMAGE_TAG
all-lint:
BUILD ./deploy/cloud/operator+lint
BUILD ./deploy/operator+lint
all:
BUILD +all-test
......
......@@ -86,5 +86,5 @@ addopts = [
"--mypy", # This flag enables mypy type checking during pytest runs
"--ignore-glob=*model.py",
"--ignore-glob=*_inc.py",
"--ignore-glob=deploy/cloud/api-store/*",
"--ignore-glob=deploy/api-store/*",
]
......@@ -14,14 +14,16 @@ Welcome to the Dynamo Deploy project! This guide will help you get started with
The deploy directory contains several key components:
```
deploy/
├── cloud/ # Cloud deployment platform
│ ├── helm/ # Cloud platform Helm charts
│ └── operator/ # Kubernetes operator (Go)
├── helm/ # Manual deployment Helm charts
├── metrics/ # Monitoring and observability
├── sdk/ # Python scripts
└── inference-gateway/ # Gateway components
├── discovery # How to use Dynamo kubernetes discovery backend
├── helm
│ └── charts
│ ├── crds # Dynamo CRD helm chart
│ ├── platform # Dynamo platform helm chart
├── inference-gateway # Dynamo integration with inference gateway
├── observability # Observability tools for Dynamo k8s
├── operator # Source code for the Dynamo operator
├── pre-deployment # Pre-deployment scripts to check your k8s cluster meets the requirements for deploying Dynamo
└── utils # Utilities and manifests for Dynamo benchmarking and profiling workflows
```
## Development Environment
......@@ -46,13 +48,13 @@ deploy/
commit -S
```
- Every time you modify `deploy/cloud/helm/crds/templates/*.yaml`, please bump up the version of the CRD helm chart in
1. deploy/cloud/helm/platform/components/operator/Chart.yaml
2. deploy/cloud/helm/platform/Chart.yaml
- Every time you modify `deploy/helm/charts/crds/templates/*.yaml`, please bump up the version of the CRD helm chart in
1. deploy/helm/charts/platform/components/operator/Chart.yaml
2. deploy/helm/charts/platform/Chart.yaml
then
```bash
deploy/cloud/helm/platform
cd deploy/helm/charts/platform
helm dependency update
```
......@@ -116,7 +118,7 @@ Once you have an MR up and standard checks pass trigger the integration tests by
**Go Tests (Operator):**
```bash
cd deploy/cloud/operator
cd deploy/operator
go test ./... -v
go test -race ./...
```
......@@ -136,7 +138,7 @@ pytest tests/serve/test_dynamo_serve.py::test_serve_deployment[agg] -v
**Operator Integration Tests:**
```bash
cd deploy/cloud/operator
cd deploy/operator
make test-e2e
```
......
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Dynamo Kubernetes Platform Helm Charts
There are two Helm charts available for the Dynamo Kubernetes Platform:
- [platform](platform/README.md) - This chart installs the complete Dynamo Kubernetes Platform, including the Dynamo Operator, NATS, etcd, Grove, and Kai Scheduler.
- [crds](crds/README.md) - This chart installs the CRDs for the Dynamo Kubernetes Platform.
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Namespace manifest for the cai-hannahz namespace; the labels below are
# presumably consumed by cluster-side automation — TODO confirm.
apiVersion: v1
kind: Namespace
metadata:
name: cai-hannahz
labels:
nscleanup/enabled: 'false' # controls automated cleanup; 'false' presumably opts this namespace out — TODO confirm
nvcr-imagepull: enabled # adds nvcr imagepull secret
gitlab-imagepull: enabled # adds gitlab imagepull secret
istio-injection: 'false'
\ No newline at end of file
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Deletes every Dynamo custom resource instance and then the Dynamo CRDs
# themselves. Destructive: requires an interactive "y" confirmation.
set -euo pipefail
trap 'echo "Error at line $LINENO. Exiting."' ERR

# -r keeps backslashes in the answer literal instead of treating them as escapes.
read -r -p "Are you sure you want to delete ALL Dynamo CRDs and their instances? (y/N): " confirm
if [[ "$confirm" != "y" ]]; then
  echo "Aborting."
  exit 1
fi

# Step 1: Get all CRDs with the prefix
# The kubectl call is kept outside the filter pipeline so a real kubectl failure
# still aborts the script, while "grep matched nothing" (exit status 1, which
# pipefail would otherwise turn into an immediate exit) falls through to the
# friendly "not found" message below.
ALL_CRDS="$(kubectl get crds -o name)"
DYNAMO_CRDS="$(grep 'nvidia.com' <<<"${ALL_CRDS}" | grep 'dynamo' | cut -d'/' -f2 || true)"
if [ -z "${DYNAMO_CRDS}" ]; then
  echo "Dynamo CRDs not found"
  exit 1
fi

# Step 2: Delete all custom resource instances for each CRD
for CRD in ${DYNAMO_CRDS}; do
  SCOPE=$(kubectl get crd "${CRD}" -o jsonpath='{.spec.scope}')
  if [ "$SCOPE" == "Namespaced" ]; then
    echo "Deleting all namespaced instances of ${CRD}..."
    # Delete by type across all namespaces. Piping `get -o name` into
    # `kubectl delete` loses the namespace of each object, so only objects in
    # the current namespace would be found.
    kubectl delete "${CRD}" --all --all-namespaces --wait=false
  else
    echo "Skipping cluster-scoped CRD: ${CRD}"
  fi
done

# Step 3: Wait for the Operator to handle finalizer removal
echo "Waiting for Dynamo Operator to handle the finalizer removal (30 seconds)..."
sleep 30

# Step 4: Verify all Custom Resources have been removed
# This only lists what is left so the operator can inspect it; it does not
# block the CRD deletion below.
for CRD in ${DYNAMO_CRDS}; do
  echo "Checking instances of ${CRD}"
  kubectl get "${CRD}" --all-namespaces -o name
done

# Step 5: Delete the CRDs themselves
echo "Deleting CRDs..."
for CRD in ${DYNAMO_CRDS}; do
  echo "Deleting CRD: ${CRD}..."
  kubectl delete crd "${CRD}"
done
......@@ -15,67 +15,9 @@ See the License for the specific language governing permissions and
limitations under the License.
-->
# Manual Helm Deployment
# Dynamo Kubernetes Helm Charts
This directory contains Helm charts for manually deploying Dynamo inference graphs to Kubernetes.
This approach allows you to install Dynamo directly using a DynamoGraphDeploymentCRD values file, which is useful for quick deployments or testing specific configurations.
### Prerequisites
- Helm 3.0+
- Kubernetes 1.16+
- ETCD v3.5+ (without auth)
- NATS v2.10+ (with jetstream enabled)
- Grove v0.1.0+ (optional if deploying using Grove)
### Basic Installation
Here is how you would install a VLLM inference backend example.
```bash
helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./examples/backends/vllm/deploy/agg.yaml
```
### Installation using Grove
Same example as above, but using Grove PodCliqueSet resources.
```bash
helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./examples/backends/vllm/deploy/agg.yaml --set deploymentType=grove
```
### Customizable Properties
You can override the default configuration by setting the following properties:
```bash
helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud \
-f ./examples/backends/vllm/deploy/agg.yaml \
--set "imagePullSecrets[0].name=docker-secret-1" \
--set etcdAddr="my-etcd-service:2379" \
--set natsAddr="nats://my-nats-service:4222"
```
#### Available Properties
| Property | Description | Example |
|----------|-------------|---------|
| `imagePullSecrets` | Array of image pull secrets for accessing private registries | `imagePullSecrets[0].name=docker-secret-1` |
| `etcdAddr` | Address of the etcd service | `dynamo-platform-etcd:2379` |
| `natsAddr` | Address of the NATS messaging service | `nats://dynamo-platform-nats:4222` |
| `deploymentType` | Type of deployment to use. Can be `basic` or `grove`. If not specified, `basic` is used. | `deploymentType=grove` |
## Feature Support Comparison
The following table shows which deployment features are supported by the **Helm chart installation** versus the **Operator path**:
| Feature | Helm Chart | Operator | Description |
|---------|------------|----------|-------------|
| **Singlenode** (k8sDeployments) | ✅ Supported | ✅ Supported | Single-node deployments using standard Kubernetes Deployments |
| **Singlenode** (Grove PodCliqueSet) | ✅ Supported | ✅ Supported | Single-node deployments using Grove PodCliqueSet resources |
| **Multinode** (Grove PodCliqueSet and LWS) | ❌ Not Supported | ✅ Supported | Multi-node deployments requiring Grove PodCliqueSet and LeaderWorkerSet (LWS) |
**Key Differences:**
- **Helm Chart**: Best for simple single-node deployments and quick testing. Supports both basic Kubernetes deployments and Grove PodCliqueSet resources.
- **Operator**: Required for advanced multi-node deployments. Provides full feature support including complex distributed inference configurations.
There are two Helm charts available for the Dynamo Kubernetes Platform:
- [platform](./charts/platform/README.md) - This chart installs the complete Dynamo Kubernetes Platform, including the Dynamo Operator, NATS, etcd, Grove, and Kai Scheduler.
- [crds](./charts/crds/README.md) - This chart installs the CRDs for the Dynamo Kubernetes Platform.
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: v2
name: dynamo-graph
description: A Helm chart to deploy a Dynamo graph on Kubernetes
type: application
version: 0.8.0
appVersion: 0.8.0
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Basic (non-Grove) deployment path: rendered when deploymentType is empty or
# explicitly set to "basic". Emits one apps/v1 Deployment per entry in
# .Values.spec.services, named <release>-<lowercased service name>.
{{- if or (not .Values.deploymentType) (eq .Values.deploymentType "basic") -}}
{{- range $serviceName, $serviceSpec := .Values.spec.services }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ $.Release.Name }}-{{ $serviceName | lower }}
labels:
app: {{ $.Release.Name }}-{{ $serviceName | lower }}
spec:
replicas: {{ $serviceSpec.replicas }}
selector:
matchLabels:
app: {{ $.Release.Name }}-{{ $serviceName | lower }}
template:
metadata:
labels:
app: {{ $.Release.Name }}-{{ $serviceName | lower }}
spec:
{{- if $.Values.imagePullSecrets }}
imagePullSecrets:
{{ $.Values.imagePullSecrets | toYaml | nindent 8 }}
{{- end }}
containers:
- name: {{ $.Release.Name }}-{{ $serviceName | lower }}
image: {{ $serviceSpec.extraPodSpec.mainContainer.image }}
{{- if $serviceSpec.extraPodSpec.mainContainer.workingDir }}
workingDir: {{ $serviceSpec.extraPodSpec.mainContainer.workingDir }}
{{- end }}
# Command/args defaulting: an explicit mainContainer.command always wins.
# Without one, frontend components run python3 (args default to
# "-m dynamo.frontend"); every other component runs /bin/sh -c, in which case
# mainContainer.args is mandatory and rendering fails if it is missing.
{{- if $serviceSpec.extraPodSpec.mainContainer.command }}
command:
{{- $serviceSpec.extraPodSpec.mainContainer.command | toYaml | nindent 8 }}
{{- else }}
{{- if $serviceSpec.componentType | eq "frontend" }}
command:
- python3
{{- else }}
command:
- /bin/sh
- -c
{{- if not $serviceSpec.extraPodSpec.mainContainer.args }}
{{- fail (printf "spec.services[%s].extraPodSpec.mainContainer.args must be set for non-frontend components" $serviceName) }}
{{- end }}
{{- end }}
{{- end }}
{{- if $serviceSpec.extraPodSpec.mainContainer.args }}
args:
{{- $serviceSpec.extraPodSpec.mainContainer.args | toYaml | nindent 8 }}
{{- else if $serviceSpec.componentType | eq "frontend" }}
args:
- -m
- dynamo.frontend
{{- end }}
# Resources: the block is emitted only when requests or limits are set. The
# "gpu" count is written under the resource name given by "gpuType", falling
# back to nvidia.com/gpu when gpuType is not provided.
{{- if $serviceSpec.resources }}
{{- $hasResources := false }}
{{- if or $serviceSpec.resources.requests $serviceSpec.resources.limits }}
{{- $hasResources = true }}
{{- end }}
{{- if $hasResources }}
resources:
{{- if $serviceSpec.resources.requests }}
{{- $requestsGpuResourceName := "nvidia.com/gpu" }}
{{- if $serviceSpec.resources.requests.gpuType }}
{{- $requestsGpuResourceName = $serviceSpec.resources.requests.gpuType }}
{{- end }}
requests:
{{- if $serviceSpec.resources.requests.cpu }}
cpu: "{{ $serviceSpec.resources.requests.cpu }}"
{{- end }}
{{- if $serviceSpec.resources.requests.memory }}
memory: "{{ $serviceSpec.resources.requests.memory }}"
{{- end }}
{{- if $serviceSpec.resources.requests.gpu }}
{{ $requestsGpuResourceName }}: "{{ $serviceSpec.resources.requests.gpu }}"
{{- end }}
{{- end }}
{{- if $serviceSpec.resources.limits }}
{{- $limitsGpuResourceName := "nvidia.com/gpu" }}
{{- if $serviceSpec.resources.limits.gpuType }}
{{- $limitsGpuResourceName = $serviceSpec.resources.limits.gpuType }}
{{- end }}
limits:
{{- if $serviceSpec.resources.limits.cpu }}
cpu: "{{ $serviceSpec.resources.limits.cpu }}"
{{- end }}
{{- if $serviceSpec.resources.limits.memory }}
memory: "{{ $serviceSpec.resources.limits.memory }}"
{{- end }}
{{- if $serviceSpec.resources.limits.gpu }}
{{ $limitsGpuResourceName }}: "{{ $serviceSpec.resources.limits.gpu }}"
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- if $serviceSpec.envFromSecret }}
envFrom:
- secretRef:
name: {{ $serviceSpec.envFromSecret }}
{{- end }}
# Environment: chart-level values (dynamoNamespace, etcdAddr, natsAddr) are
# exported to every container when set. Frontends additionally get the HTTP
# port (default 8000); workers get the system port (default 9090) plus the
# system/health settings.
env:
{{- if $.Values.dynamoNamespace }}
- name: DYN_NAMESPACE
value: {{ $.Values.dynamoNamespace }}
{{- end }}
{{- if $.Values.etcdAddr }}
- name: ETCD_ENDPOINTS
value: "{{ $.Values.etcdAddr }}"
{{- end }}
{{- if $.Values.natsAddr }}
- name: NATS_SERVER
value: "{{ $.Values.natsAddr }}"
{{- end }}
{{- if $serviceSpec.componentType | eq "frontend" }}
- name: DYNAMO_PORT
value: "{{ $.Values.dynamoPort | default 8000 }}"
- name: DYN_HTTP_PORT
value: "{{ $.Values.dynamoPort | default 8000 }}"
{{- else if $serviceSpec.componentType | eq "worker" }}
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_PORT
value: "{{ $.Values.dynamoSystemPort | default 9090 }}"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
{{- end }}
{{- if $serviceSpec.componentType | eq "frontend" }}
ports:
- name: http
containerPort: {{ $.Values.dynamoPort | default 8000 }}
protocol: TCP
{{- else if $serviceSpec.componentType | eq "worker" }}
ports:
- name: system
containerPort: {{ $.Values.dynamoSystemPort | default 9090 }}
protocol: TCP
{{- end }}
# Probes are rendered only for frontend and worker components. A probe supplied
# on the service spec replaces the defaults below in full.
# NOTE(review): the enclosing condition already restricts componentType to
# frontend or worker, so the trailing else branches (/healthz and /readyz on
# port "health") look unreachable — confirm before relying on them.
{{- if and $serviceSpec.componentType (or (eq $serviceSpec.componentType "frontend") (eq $serviceSpec.componentType "worker")) }}
livenessProbe:
{{- if $serviceSpec.livenessProbe }}
{{ $serviceSpec.livenessProbe | toYaml | nindent 10 }}
{{- else }}
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 10
successThreshold: 1
{{- if $serviceSpec.componentType | eq "frontend" }}
httpGet:
path: /health
port: http
{{- else if $serviceSpec.componentType | eq "worker" }}
httpGet:
path: /live
port: system
{{- else }}
httpGet:
path: /healthz
port: health
scheme: HTTP
{{- end }}
{{- end }}
readinessProbe:
{{- if $serviceSpec.readinessProbe }}
{{ $serviceSpec.readinessProbe | toYaml | nindent 10 }}
{{- else }}
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 10
successThreshold: 1
{{- if $serviceSpec.componentType | eq "frontend" }}
exec:
command:
- /bin/sh
- -c
- curl -s http://localhost:${DYNAMO_PORT}/health | jq -e ".status == \"healthy\""
{{- else if $serviceSpec.componentType | eq "worker" }}
httpGet:
path: /health
port: system
{{- else }}
httpGet:
path: /readyz
port: health
scheme: HTTP
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Grove deployment path: rendered only when deploymentType is "grove". Emits a
# single grove.io/v1alpha1 PodGangSet named after the release, with one clique
# per entry in .Values.spec.services.
{{- if eq .Values.deploymentType "grove" }}
---
apiVersion: grove.io/v1alpha1
kind: PodGangSet
metadata:
name: {{ $.Release.Name }}
labels:
app: {{ $.Release.Name }}
spec:
replicas: 1
template:
terminationDelay: 1h
cliques:
{{- range $serviceName, $serviceSpec := .Values.spec.services }}
- name: {{ $serviceName | lower }}
spec:
roleName: {{ $serviceName | lower }}
replicas: {{ $serviceSpec.replicas }}
podSpec:
{{- if $.Values.imagePullSecrets }}
imagePullSecrets:
{{ $.Values.imagePullSecrets | toYaml | nindent 12 }}
{{- end }}
containers:
- name: main
image: {{ $serviceSpec.extraPodSpec.mainContainer.image }}
# Resources: the block is emitted only when requests or limits are set. The
# "gpu" count is written under the resource name given by "gpuType", falling
# back to nvidia.com/gpu when gpuType is not provided.
{{- if $serviceSpec.resources }}
{{- $hasResources := false }}
{{- if or $serviceSpec.resources.requests $serviceSpec.resources.limits }}
{{- $hasResources = true }}
{{- end }}
{{- if $hasResources }}
resources:
{{- if $serviceSpec.resources.requests }}
{{- $requestsGpuResourceName := "nvidia.com/gpu" }}
{{- if $serviceSpec.resources.requests.gpuType }}
{{- $requestsGpuResourceName = $serviceSpec.resources.requests.gpuType }}
{{- end }}
requests:
{{- if $serviceSpec.resources.requests.cpu }}
cpu: "{{ $serviceSpec.resources.requests.cpu }}"
{{- end }}
{{- if $serviceSpec.resources.requests.memory }}
memory: "{{ $serviceSpec.resources.requests.memory }}"
{{- end }}
{{- if $serviceSpec.resources.requests.gpu }}
{{ $requestsGpuResourceName }}: "{{ $serviceSpec.resources.requests.gpu }}"
{{- end }}
{{- end }}
{{- if $serviceSpec.resources.limits }}
{{- $limitsGpuResourceName := "nvidia.com/gpu" }}
{{- if $serviceSpec.resources.limits.gpuType }}
{{- $limitsGpuResourceName = $serviceSpec.resources.limits.gpuType }}
{{- end }}
limits:
{{- if $serviceSpec.resources.limits.cpu }}
cpu: "{{ $serviceSpec.resources.limits.cpu }}"
{{- end }}
{{- if $serviceSpec.resources.limits.memory }}
memory: "{{ $serviceSpec.resources.limits.memory }}"
{{- end }}
{{- if $serviceSpec.resources.limits.gpu }}
{{ $limitsGpuResourceName }}: "{{ $serviceSpec.resources.limits.gpu }}"
{{- end }}
{{- end }}
{{- end }}
{{- end }}
# NOTE(review): workingDir is rendered without a guard, so an unset value
# produces an empty workingDir key — confirm this is intended.
workingDir: {{ $serviceSpec.extraPodSpec.mainContainer.workingDir }}
# Command/args defaulting: an explicit mainContainer.command always wins.
# Without one, frontend components run python3 (args default to
# "-m dynamo.frontend"); every other component runs /bin/sh -c, in which case
# mainContainer.args is mandatory and rendering fails if it is missing.
{{- if $serviceSpec.extraPodSpec.mainContainer.command }}
command:
{{- $serviceSpec.extraPodSpec.mainContainer.command | toYaml | nindent 14 }}
{{- else }}
{{- if $serviceSpec.componentType | eq "frontend" }}
command:
- python3
{{- else }}
command:
- /bin/sh
- -c
{{- if not $serviceSpec.extraPodSpec.mainContainer.args }}
{{- fail (printf "spec.services[%s].extraPodSpec.mainContainer.args must be set for non-frontend components" $serviceName) }}
{{- end }}
{{- end }}
{{- end }}
{{- if $serviceSpec.extraPodSpec.mainContainer.args }}
args:
{{- $serviceSpec.extraPodSpec.mainContainer.args | toYaml | nindent 14 }}
{{- else if $serviceSpec.componentType | eq "frontend" }}
args:
- -m
- dynamo.frontend
{{- end }}
# Environment: chart-level values (dynamoNamespace, etcdAddr, natsAddr) are
# exported to every container when set. Frontends additionally get the HTTP
# port (default 8000); workers get the system port (default 9090) plus the
# system/health settings.
env:
{{- if $.Values.dynamoNamespace }}
- name: DYN_NAMESPACE
value: {{ $.Values.dynamoNamespace }}
{{- end }}
{{- if $.Values.etcdAddr }}
- name: ETCD_ENDPOINTS
value: "{{ $.Values.etcdAddr }}"
{{- end }}
{{- if $.Values.natsAddr }}
- name: NATS_SERVER
value: "{{ $.Values.natsAddr }}"
{{- end }}
{{- if $serviceSpec.componentType | eq "frontend" }}
- name: DYNAMO_PORT
value: "{{ $.Values.dynamoPort | default 8000 }}"
- name: DYN_HTTP_PORT
value: "{{ $.Values.dynamoPort | default 8000 }}"
{{- else if $serviceSpec.componentType | eq "worker" }}
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_PORT
value: "{{ $.Values.dynamoSystemPort | default 9090 }}"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
{{- end }}
{{- if $serviceSpec.envFromSecret }}
envFrom:
- secretRef:
name: {{ $serviceSpec.envFromSecret }}
{{- end }}
{{- if $serviceSpec.componentType | eq "frontend" }}
ports:
- name: http
containerPort: {{ $.Values.dynamoPort | default 8000 }}
protocol: TCP
{{- else if $serviceSpec.componentType | eq "worker" }}
ports:
- name: system
containerPort: {{ $.Values.dynamoSystemPort | default 9090 }}
protocol: TCP
{{- end }}
# Probes are rendered only for frontend and worker components. A probe supplied
# on the service spec replaces the defaults below in full.
# NOTE(review): user-supplied probes are indented with nindent 10, whereas
# command/args of this same container use nindent 14 — verify the rendered
# nesting of overridden probes.
# NOTE(review): the enclosing condition already restricts componentType to
# frontend or worker, so the trailing else branches (/healthz and /readyz on
# port "health") look unreachable — confirm before relying on them.
{{- if and $serviceSpec.componentType (or (eq $serviceSpec.componentType "frontend") (eq $serviceSpec.componentType "worker")) }}
livenessProbe:
{{- if $serviceSpec.livenessProbe }}
{{ $serviceSpec.livenessProbe | toYaml | nindent 10 }}
{{- else }}
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 10
successThreshold: 1
{{- if $serviceSpec.componentType | eq "frontend" }}
httpGet:
path: /health
port: http
{{- else if $serviceSpec.componentType | eq "worker" }}
httpGet:
path: /live
port: system
{{- else }}
httpGet:
path: /healthz
port: health
scheme: HTTP
{{- end }}
{{- end }}
readinessProbe:
{{- if $serviceSpec.readinessProbe }}
{{ $serviceSpec.readinessProbe | toYaml | nindent 10 }}
{{- else }}
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 10
successThreshold: 1
{{- if $serviceSpec.componentType | eq "frontend" }}
exec:
command:
- /bin/sh
- -c
- curl -s http://localhost:${DYNAMO_PORT}/health | jq -e ".status == \"healthy\""
{{- else if $serviceSpec.componentType | eq "worker" }}
httpGet:
path: /health
port: system
{{- else }}
httpGet:
path: /readyz
port: health
scheme: HTTP
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Exposes every frontend component through a ClusterIP Service named
# <release>-<lowercased service name>. Non-frontend components get no Service.
{{- range $serviceName, $serviceSpec := .Values.spec.services }}
{{- if eq $serviceSpec.componentType "frontend" }}
---
apiVersion: v1
kind: Service
metadata:
  name: {{ $.Release.Name }}-{{ $serviceName | lower }}
  labels:
    app: {{ $.Release.Name }}-{{ $serviceName | lower }}
spec:
  selector:
    app: {{ $.Release.Name }}-{{ $serviceName | lower }}
  ports:
    - port: {{ $serviceSpec.port | default 8000 }}
      # Target the frontend container's named port rather than repeating a
      # number: the container listens on the chart-level dynamoPort value, which
      # is independent of the per-service "port" exposed here, so a numeric
      # targetPort derived from "port" can point at a port nothing listens on.
      targetPort: http
      protocol: TCP
  type: ClusterIP
{{- end }}
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Image pull secrets added to the pod spec of every rendered service.
imagePullSecrets:
- name: docker-imagepullsecret
# NATS address; exported to containers as the NATS_SERVER environment variable.
natsAddr: nats://dynamo-platform-nats:4222
# etcd address; exported to containers as the ETCD_ENDPOINTS environment variable.
etcdAddr: dynamo-platform-etcd:2379
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment