Unverified Commit 3057af00 authored by Thomas Montfort's avatar Thomas Montfort Committed by GitHub
Browse files

feat(dep-715): refactor helm directory and remove reference to cloud (#5042)

parent b5922693
...@@ -83,14 +83,14 @@ jobs: ...@@ -83,14 +83,14 @@ jobs:
env: env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: | run: |
cd deploy/cloud/operator cd deploy/operator
docker build --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ . docker build --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
- name: Tester - name: Tester
shell: bash shell: bash
env: env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: | run: |
cd deploy/cloud/operator cd deploy/operator
docker build --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ . docker build --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
- name: Set up Go - name: Set up Go
uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0 uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
...@@ -102,7 +102,7 @@ jobs: ...@@ -102,7 +102,7 @@ jobs:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: | run: |
sudo apt-get update && sudo apt-get install -y make sudo apt-get update && sudo apt-get install -y make
cd deploy/cloud/operator cd deploy/operator
make check make check
- name: Build Container - name: Build Container
id: build-image id: build-image
...@@ -110,7 +110,7 @@ jobs: ...@@ -110,7 +110,7 @@ jobs:
env: env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: | run: |
cd deploy/cloud/operator cd deploy/operator
docker buildx build --load \ docker buildx build --load \
--platform linux/${{ matrix.platform.arch }} \ --platform linux/${{ matrix.platform.arch }} \
--build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \ --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
...@@ -370,7 +370,6 @@ jobs: ...@@ -370,7 +370,6 @@ jobs:
export ISTIO_ENABLED=true export ISTIO_ENABLED=true
export ISTIO_GATEWAY=istio-system/ingress-alb export ISTIO_GATEWAY=istio-system/ingress-alb
export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}
# Install dynamo env secrets # Install dynamo env secrets
kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
...@@ -378,7 +377,7 @@ jobs: ...@@ -378,7 +377,7 @@ jobs:
kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE} kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE}
# Install helm dependencies # Install helm dependencies
helm repo add bitnami https://charts.bitnami.com/bitnami helm repo add bitnami https://charts.bitnami.com/bitnami
cd deploy/cloud/helm/platform/ cd deploy/helm/charts/platform/
helm dep build . helm dep build .
# Install platform with namespace restriction for single profile testing # Install platform with namespace restriction for single profile testing
helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \ helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
......
...@@ -746,7 +746,6 @@ jobs: ...@@ -746,7 +746,6 @@ jobs:
export ISTIO_ENABLED=true export ISTIO_ENABLED=true
export ISTIO_GATEWAY=istio-system/ingress-alb export ISTIO_GATEWAY=istio-system/ingress-alb
export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}
# Install dynamo env secrets # Install dynamo env secrets
kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
# Create docker pull secret for operator image # Create docker pull secret for operator image
...@@ -756,7 +755,7 @@ jobs: ...@@ -756,7 +755,7 @@ jobs:
docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-operator-amd64 || echo "Operator image not found, will use SHA-based tag" docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-operator-amd64 || echo "Operator image not found, will use SHA-based tag"
# Install helm dependencies # Install helm dependencies
helm repo add bitnami https://charts.bitnami.com/bitnami helm repo add bitnami https://charts.bitnami.com/bitnami
cd deploy/cloud/helm/platform/ cd deploy/helm/charts/platform/
helm dep build . helm dep build .
# Install platform with namespace restriction # Install platform with namespace restriction
helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \ helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
......
...@@ -34,7 +34,7 @@ golang-base: ...@@ -34,7 +34,7 @@ golang-base:
operator-src: operator-src:
FROM +golang-base FROM +golang-base
COPY ./deploy/cloud/operator /artifacts/operator COPY ./deploy/operator /artifacts/operator
SAVE ARTIFACT /artifacts/operator SAVE ARTIFACT /artifacts/operator
...@@ -172,15 +172,15 @@ dynamo-base-docker: ...@@ -172,15 +172,15 @@ dynamo-base-docker:
############### ALL TARGETS ############################## ############### ALL TARGETS ##############################
all-test: all-test:
BUILD ./deploy/cloud/operator+test BUILD ./deploy/operator+test
all-docker: all-docker:
ARG DOCKER_SERVER=my-registry ARG DOCKER_SERVER=my-registry
ARG IMAGE_TAG=latest ARG IMAGE_TAG=latest
BUILD ./deploy/cloud/operator+docker --DOCKER_SERVER=$DOCKER_SERVER --IMAGE_TAG=$IMAGE_TAG BUILD ./deploy/operator+docker --DOCKER_SERVER=$DOCKER_SERVER --IMAGE_TAG=$IMAGE_TAG
all-lint: all-lint:
BUILD ./deploy/cloud/operator+lint BUILD ./deploy/operator+lint
all: all:
BUILD +all-test BUILD +all-test
......
...@@ -86,5 +86,5 @@ addopts = [ ...@@ -86,5 +86,5 @@ addopts = [
"--mypy", # This flag enables mypy type checking during pytest runs "--mypy", # This flag enables mypy type checking during pytest runs
"--ignore-glob=*model.py", "--ignore-glob=*model.py",
"--ignore-glob=*_inc.py", "--ignore-glob=*_inc.py",
"--ignore-glob=deploy/cloud/api-store/*", "--ignore-glob=deploy/api-store/*",
] ]
...@@ -14,14 +14,16 @@ Welcome to the Dynamo Deploy project! This guide will help you get started with ...@@ -14,14 +14,16 @@ Welcome to the Dynamo Deploy project! This guide will help you get started with
The deploy directory contains several key components: The deploy directory contains several key components:
``` ```
deploy/ ├── discovery # How to use Dynamo kubernetes discovery backend
├── cloud/ # Cloud deployment platform ├── helm
│ ├── helm/ # Cloud platform Helm charts │ └── charts
│ └── operator/ # Kubernetes operator (Go) │ ├── crds # Dynamo CRD helm chart
├── helm/ # Manual deployment Helm charts │ ├── platform # Dynamo platform helm chart
├── metrics/ # Monitoring and observability ├── inference-gateway # Dynamo integration with inference gateway
├── sdk/ # Python scripts ├── observability # Observability tools for Dynamo k8s
└── inference-gateway/ # Gateway components ├── operator # Source code for the Dynamo operator
├── pre-deployment # Pre-deployment scripts to check your k8s cluster meets the requirements for deploying Dynamo
└── utils # Utilities and manifests for Dynamo benchmarking and profiling workflows
``` ```
## Development Environment ## Development Environment
...@@ -46,13 +48,13 @@ deploy/ ...@@ -46,13 +48,13 @@ deploy/
commit -S commit -S
``` ```
- Every time you modify `deploy/cloud/helm/crds/templates/*.yaml`, please bump up the version of the CRD helm chart in - Every time you modify `deploy/helm/charts/crds/templates/*.yaml`, please bump up the version of the CRD helm chart in
1. deploy/cloud/helm/platform/components/operator/Chart.yaml 1. deploy/helm/charts/platform/components/operator/Chart.yaml
2. deploy/cloud/helm/platform/Chart.yaml 2. deploy/helm/charts/platform/Chart.yaml
then then
```bash ```bash
deploy/cloud/helm/platform deploy/helm/charts/platform
helm dependency update helm dependency update
``` ```
...@@ -116,7 +118,7 @@ Once you have an MR up and standard checks pass trigger the integration tests by ...@@ -116,7 +118,7 @@ Once you have an MR up and standard checks pass trigger the integration tests by
**Go Tests (Operator):** **Go Tests (Operator):**
```bash ```bash
cd deploy/cloud/operator cd deploy/operator
go test ./... -v go test ./... -v
go test -race ./... go test -race ./...
``` ```
...@@ -136,7 +138,7 @@ pytest tests/serve/test_dynamo_serve.py::test_serve_deployment[agg] -v ...@@ -136,7 +138,7 @@ pytest tests/serve/test_dynamo_serve.py::test_serve_deployment[agg] -v
**Operator Integration Tests:** **Operator Integration Tests:**
```bash ```bash
cd deploy/cloud/operator cd deploy/operator
make test-e2e make test-e2e
``` ```
......
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Dynamo Kubernetes Platform Helm Charts
There are two Helm charts available for the Dynamo Kubernetes Platform:
- [platform](platform/README.md) - This chart installs the complete Dynamo Kubernetes Platform, including the Dynamo Operator, NATS, etcd, Grove, and Kai Scheduler.
- [crds](crds/README.md) - This chart installs the CRDs for the Dynamo Kubernetes Platform.
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Namespace manifest carrying the labels that drive cleanup, image-pull-secret
# injection and Istio sidecar injection for this namespace.
# NOTE(review): the namespace name looks user-specific — confirm before reuse.
apiVersion: v1
kind: Namespace
metadata:
  name: cai-hannahz
  labels:
    # Automated-cleanup switch; presumably 'false' opts this namespace OUT of
    # cleanup — confirm against the nscleanup controller.
    nscleanup/enabled: 'false'
    # Requests the nvcr image pull secret.
    nvcr-imagepull: 'enabled'
    # Requests the gitlab image pull secret.
    gitlab-imagepull: 'enabled'
    # Label value is the string 'false' for Istio injection.
    istio-injection: 'false'
\ No newline at end of file
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Deletes every Dynamo custom resource instance and then the Dynamo CRDs
# themselves. Destructive: asks for confirmation first.
set -euo pipefail
trap 'echo "Error at line $LINENO. Exiting."' ERR

# -r keeps backslashes in the reply literal.
read -r -p "Are you sure you want to delete ALL Dynamo CRDs and their instances? (y/N): " confirm
if [[ "$confirm" != "y" ]]; then
  echo "Aborting."
  exit 1
fi

# Step 1: Get all CRDs with the prefix
# grep exits non-zero when nothing matches; combined with `set -e` and
# `pipefail` that would abort right here, before the "not found" message below
# could ever be printed. Tolerate the no-match case explicitly; a kubectl
# failure still fails the pipeline.
DYNAMO_CRDS="$(kubectl get crds -o name \
  | { grep 'nvidia.com' || true; } \
  | { grep 'dynamo' || true; } \
  | cut -d'/' -f2)"
if [ -z "${DYNAMO_CRDS}" ]; then
  echo "Dynamo CRDs not found"
  exit 1
fi

# Step 2: Delete all custom resource instances for each CRD
for CRD in ${DYNAMO_CRDS}; do
  SCOPE=$(kubectl get crd "${CRD}" -o jsonpath='{.spec.scope}')
  if [ "$SCOPE" = "Namespaced" ]; then
    echo "Deleting all namespaced instances of ${CRD}..."
    # `kubectl get -o name` drops the namespace, so piping its output into
    # `kubectl delete` would only act on the current namespace. Let kubectl
    # select the instances across all namespaces itself.
    kubectl delete "${CRD}" --all --all-namespaces --wait=false
  else
    echo "Skipping cluster-scoped CRD: ${CRD}"
  fi
done

# Step 3: Wait for the Operator to handle finalizer removal
echo "Waiting for Dynamo Operator to handle the finalizer removal (30 seconds)..."
sleep 30

# Step 4: Verify all Custom Resources have been removed
for CRD in ${DYNAMO_CRDS}; do
  echo "Checking instances of ${CRD}"
  REMAINING="$(kubectl get "${CRD}" --all-namespaces -o name)"
  if [ -n "${REMAINING}" ]; then
    # Best-effort report only: the CRD deletion below is still attempted.
    echo "${REMAINING}"
    echo "Warning: instances of ${CRD} still exist; deleting the CRD may block until they are gone."
  fi
done

# Step 5: Delete the CRDs themselves
echo "Deleting CRDs..."
for CRD in ${DYNAMO_CRDS}; do
  echo "Deleting CRD: ${CRD}..."
  kubectl delete crd "${CRD}"
done
...@@ -15,67 +15,9 @@ See the License for the specific language governing permissions and ...@@ -15,67 +15,9 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
--> -->
# Manual Helm Deployment # Dynamo Kubernetes Helm Charts
This directory contains Helm charts for manually deploying Dynamo inference graphs to Kubernetes. There are two Helm charts available for the Dynamo Kubernetes Platform:
This approach allows you to install Dynamo directly using a DynamoGraphDeploymentCRD values file, which is useful for quick deployments or testing specific configurations.
### Prerequisites
- Helm 3.0+
- Kubernetes 1.16+
- ETCD v3.5+ (without auth)
- NATS v2.10+ (with jetstream enabled)
- Grove v0.1.0+ (optional if deploying using Grove)
### Basic Installation
Here is how you would install a VLLM inference backend example.
```bash
helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./examples/backends/vllm/deploy/agg.yaml
```
### Installation using Grove
Same example as above, but using Grove PodCliqueSet resources.
```bash
helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./examples/backends/vllm/deploy/agg.yaml --set deploymentType=grove
```
### Customizable Properties
You can override the default configuration by setting the following properties:
```bash
helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud \
-f ./examples/backends/vllm/deploy/agg.yaml \
--set "imagePullSecrets[0].name=docker-secret-1" \
--set etcdAddr="my-etcd-service:2379" \
--set natsAddr="nats://my-nats-service:4222"
```
#### Available Properties
| Property | Description | Example |
|----------|-------------|---------|
| `imagePullSecrets` | Array of image pull secrets for accessing private registries | `imagePullSecrets[0].name=docker-secret-1` |
| `etcdAddr` | Address of the etcd service | `dynamo-platform-etcd:2379` |
| `natsAddr` | Address of the NATS messaging service | `nats://dynamo-platform-nats:4222` |
| `deploymentType` | Type of deployment to use. Can be `basic` or `grove`. If not specified, `basic` is used. | `deploymentType=grove` |
## Feature Support Comparison
The following table shows which deployment features are supported by the **Helm chart installation** versus the **Operator path**:
| Feature | Helm Chart | Operator | Description |
|---------|------------|----------|-------------|
| **Singlenode** (k8sDeployments) | ✅ Supported | ✅ Supported | Single-node deployments using standard Kubernetes Deployments |
| **Singlenode** (Grove PodCliqueSet) | ✅ Supported | ✅ Supported | Single-node deployments using Grove PodCliqueSet resources |
| **Multinode** (Grove PodCliqueSet and LWS) | ❌ Not Supported | ✅ Supported | Multi-node deployments requiring Grove PodCliqueSet and LeaderWorkerSet (LWS) |
**Key Differences:**
- **Helm Chart**: Best for simple single-node deployments and quick testing. Supports both basic Kubernetes deployments and Grove PodCliqueSet resources.
- **Operator**: Required for advanced multi-node deployments. Provides full feature support including complex distributed inference configurations.
- [platform](./charts/platform/README.md) - This chart installs the complete Dynamo Kubernetes Platform, including the Dynamo Operator, NATS, etcd, Grove, and Kai Scheduler.
- [crds](./charts/crds/README.md) - This chart installs the CRDs for the Dynamo Kubernetes Platform.
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Chart metadata for the dynamo-graph application chart.
apiVersion: v2
type: application
name: dynamo-graph
description: 'A Helm chart to deploy a Dynamo graph on Kubernetes'
# Both versions are quoted so they are always read as strings.
version: '0.8.0'
appVersion: '0.8.0'
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Template: renders one apps/v1 Deployment per entry in .Values.spec.services.
# if deploymentType is empty, or explicitly set to basic, use basic as default
{{- if or (not .Values.deploymentType) (eq .Values.deploymentType "basic") -}}
{{- range $serviceName, $serviceSpec := .Values.spec.services }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ $.Release.Name }}-{{ $serviceName | lower }}
labels:
app: {{ $.Release.Name }}-{{ $serviceName | lower }}
spec:
replicas: {{ $serviceSpec.replicas }}
selector:
matchLabels:
app: {{ $.Release.Name }}-{{ $serviceName | lower }}
template:
metadata:
labels:
app: {{ $.Release.Name }}-{{ $serviceName | lower }}
spec:
# Optional image pull secrets, taken from the chart-wide values.
{{- if $.Values.imagePullSecrets }}
imagePullSecrets:
{{ $.Values.imagePullSecrets | toYaml | nindent 8 }}
{{- end }}
containers:
- name: {{ $.Release.Name }}-{{ $serviceName | lower }}
image: {{ $serviceSpec.extraPodSpec.mainContainer.image }}
{{- if $serviceSpec.extraPodSpec.mainContainer.workingDir }}
workingDir: {{ $serviceSpec.extraPodSpec.mainContainer.workingDir }}
{{- end }}
# Command: an explicit mainContainer.command wins; otherwise a frontend runs
# python3 and every other component runs /bin/sh -c.
{{- if $serviceSpec.extraPodSpec.mainContainer.command }}
command:
{{- $serviceSpec.extraPodSpec.mainContainer.command | toYaml | nindent 8 }}
{{- else }}
{{- if $serviceSpec.componentType | eq "frontend" }}
command:
- python3
{{- else }}
command:
- /bin/sh
- -c
# The /bin/sh -c default needs args to run, so rendering fails without them.
{{- if not $serviceSpec.extraPodSpec.mainContainer.args }}
{{- fail (printf "spec.services[%s].extraPodSpec.mainContainer.args must be set for non-frontend components" $serviceName) }}
{{- end }}
{{- end }}
{{- end }}
# Args: explicit args win; a frontend without args falls back to
# -m dynamo.frontend.
{{- if $serviceSpec.extraPodSpec.mainContainer.args }}
args:
{{- $serviceSpec.extraPodSpec.mainContainer.args | toYaml | nindent 8 }}
{{- else if $serviceSpec.componentType | eq "frontend" }}
args:
- -m
- dynamo.frontend
{{- end }}
# Resources: rendered only when requests or limits are set. The gpu count is
# emitted under the key named by gpuType, defaulting to nvidia.com/gpu.
{{- if $serviceSpec.resources }}
{{- $hasResources := false }}
{{- if or $serviceSpec.resources.requests $serviceSpec.resources.limits }}
{{- $hasResources = true }}
{{- end }}
{{- if $hasResources }}
resources:
{{- if $serviceSpec.resources.requests }}
{{- $requestsGpuResourceName := "nvidia.com/gpu" }}
{{- if $serviceSpec.resources.requests.gpuType }}
{{- $requestsGpuResourceName = $serviceSpec.resources.requests.gpuType }}
{{- end }}
requests:
{{- if $serviceSpec.resources.requests.cpu }}
cpu: "{{ $serviceSpec.resources.requests.cpu }}"
{{- end }}
{{- if $serviceSpec.resources.requests.memory }}
memory: "{{ $serviceSpec.resources.requests.memory }}"
{{- end }}
{{- if $serviceSpec.resources.requests.gpu }}
{{ $requestsGpuResourceName }}: "{{ $serviceSpec.resources.requests.gpu }}"
{{- end }}
{{- end }}
{{- if $serviceSpec.resources.limits }}
{{- $limitsGpuResourceName := "nvidia.com/gpu" }}
{{- if $serviceSpec.resources.limits.gpuType }}
{{- $limitsGpuResourceName = $serviceSpec.resources.limits.gpuType }}
{{- end }}
limits:
{{- if $serviceSpec.resources.limits.cpu }}
cpu: "{{ $serviceSpec.resources.limits.cpu }}"
{{- end }}
{{- if $serviceSpec.resources.limits.memory }}
memory: "{{ $serviceSpec.resources.limits.memory }}"
{{- end }}
{{- if $serviceSpec.resources.limits.gpu }}
{{ $limitsGpuResourceName }}: "{{ $serviceSpec.resources.limits.gpu }}"
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- if $serviceSpec.envFromSecret }}
envFrom:
- secretRef:
name: {{ $serviceSpec.envFromSecret }}
{{- end }}
# Environment: chart-wide namespace/etcd/NATS settings first, then the
# component-specific port variables.
# NOTE(review): DYN_NAMESPACE is rendered unquoted, unlike the other values in
# this list; a value YAML reads as a non-string would likely be rejected as an
# env value — confirm and consider quoting.
env:
{{- if $.Values.dynamoNamespace }}
- name: DYN_NAMESPACE
value: {{ $.Values.dynamoNamespace }}
{{- end }}
{{- if $.Values.etcdAddr }}
- name: ETCD_ENDPOINTS
value: "{{ $.Values.etcdAddr }}"
{{- end }}
{{- if $.Values.natsAddr }}
- name: NATS_SERVER
value: "{{ $.Values.natsAddr }}"
{{- end }}
{{- if $serviceSpec.componentType | eq "frontend" }}
- name: DYNAMO_PORT
value: "{{ $.Values.dynamoPort | default 8000 }}"
- name: DYN_HTTP_PORT
value: "{{ $.Values.dynamoPort | default 8000 }}"
{{- else if $serviceSpec.componentType | eq "worker" }}
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_PORT
value: "{{ $.Values.dynamoSystemPort | default 9090 }}"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
{{- end }}
# Ports: frontends expose the port named http, workers the port named system;
# the default probes below refer to these names.
{{- if $serviceSpec.componentType | eq "frontend" }}
ports:
- name: http
containerPort: {{ $.Values.dynamoPort | default 8000 }}
protocol: TCP
{{- else if $serviceSpec.componentType | eq "worker" }}
ports:
- name: system
containerPort: {{ $.Values.dynamoSystemPort | default 9090 }}
protocol: TCP
{{- end }}
# Probes are rendered only for frontend and worker components. A probe given
# in the service spec replaces the default wholesale.
# NOTE(review): given the guard on the next line, the final else branches below
# (/healthz and /readyz on the port named health) can never render.
{{- if and $serviceSpec.componentType (or (eq $serviceSpec.componentType "frontend") (eq $serviceSpec.componentType "worker")) }}
livenessProbe:
{{- if $serviceSpec.livenessProbe }}
{{ $serviceSpec.livenessProbe | toYaml | nindent 10 }}
{{- else }}
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 10
successThreshold: 1
{{- if $serviceSpec.componentType | eq "frontend" }}
httpGet:
path: /health
port: http
{{- else if $serviceSpec.componentType | eq "worker" }}
httpGet:
path: /live
port: system
{{- else }}
httpGet:
path: /healthz
port: health
scheme: HTTP
{{- end }}
{{- end }}
readinessProbe:
{{- if $serviceSpec.readinessProbe }}
{{ $serviceSpec.readinessProbe | toYaml | nindent 10 }}
{{- else }}
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 10
successThreshold: 1
{{- if $serviceSpec.componentType | eq "frontend" }}
# The default frontend readiness check shells out to curl and jq.
exec:
command:
- /bin/sh
- -c
- curl -s http://localhost:${DYNAMO_PORT}/health | jq -e ".status == \"healthy\""
{{- else if $serviceSpec.componentType | eq "worker" }}
httpGet:
path: /health
port: system
{{- else }}
httpGet:
path: /readyz
port: health
scheme: HTTP
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Template: when deploymentType is grove, renders a single grove.io/v1alpha1
# PodGangSet with one clique per entry in .Values.spec.services.
# NOTE(review): the chart README speaks of Grove PodCliqueSet resources while
# this template emits kind PodGangSet — confirm which kind the installed Grove
# CRDs expect.
{{- if eq .Values.deploymentType "grove" }}
---
apiVersion: grove.io/v1alpha1
kind: PodGangSet
metadata:
name: {{ $.Release.Name }}
labels:
app: {{ $.Release.Name }}
spec:
replicas: 1
template:
terminationDelay: 1h
cliques:
{{- range $serviceName, $serviceSpec := .Values.spec.services }}
- name: {{ $serviceName | lower }}
spec:
roleName: {{ $serviceName | lower }}
replicas: {{ $serviceSpec.replicas }}
podSpec:
{{- if $.Values.imagePullSecrets }}
imagePullSecrets:
{{ $.Values.imagePullSecrets | toYaml | nindent 12 }}
{{- end }}
containers:
- name: main
image: {{ $serviceSpec.extraPodSpec.mainContainer.image }}
# Resources: rendered only when requests or limits are set. The gpu count is
# emitted under the key named by gpuType, defaulting to nvidia.com/gpu.
{{- if $serviceSpec.resources }}
{{- $hasResources := false }}
{{- if or $serviceSpec.resources.requests $serviceSpec.resources.limits }}
{{- $hasResources = true }}
{{- end }}
{{- if $hasResources }}
resources:
{{- if $serviceSpec.resources.requests }}
{{- $requestsGpuResourceName := "nvidia.com/gpu" }}
{{- if $serviceSpec.resources.requests.gpuType }}
{{- $requestsGpuResourceName = $serviceSpec.resources.requests.gpuType }}
{{- end }}
requests:
{{- if $serviceSpec.resources.requests.cpu }}
cpu: "{{ $serviceSpec.resources.requests.cpu }}"
{{- end }}
{{- if $serviceSpec.resources.requests.memory }}
memory: "{{ $serviceSpec.resources.requests.memory }}"
{{- end }}
{{- if $serviceSpec.resources.requests.gpu }}
{{ $requestsGpuResourceName }}: "{{ $serviceSpec.resources.requests.gpu }}"
{{- end }}
{{- end }}
{{- if $serviceSpec.resources.limits }}
{{- $limitsGpuResourceName := "nvidia.com/gpu" }}
{{- if $serviceSpec.resources.limits.gpuType }}
{{- $limitsGpuResourceName = $serviceSpec.resources.limits.gpuType }}
{{- end }}
limits:
{{- if $serviceSpec.resources.limits.cpu }}
cpu: "{{ $serviceSpec.resources.limits.cpu }}"
{{- end }}
{{- if $serviceSpec.resources.limits.memory }}
memory: "{{ $serviceSpec.resources.limits.memory }}"
{{- end }}
{{- if $serviceSpec.resources.limits.gpu }}
{{ $limitsGpuResourceName }}: "{{ $serviceSpec.resources.limits.gpu }}"
{{- end }}
{{- end }}
{{- end }}
{{- end }}
# NOTE(review): workingDir is rendered unconditionally here, whereas the
# Deployment template of this chart guards it with an if; when unset this
# renders a workingDir key with no value — confirm that is acceptable.
workingDir: {{ $serviceSpec.extraPodSpec.mainContainer.workingDir }}
# Command: an explicit mainContainer.command wins; otherwise a frontend runs
# python3 and every other component runs /bin/sh -c.
{{- if $serviceSpec.extraPodSpec.mainContainer.command }}
command:
{{- $serviceSpec.extraPodSpec.mainContainer.command | toYaml | nindent 14 }}
{{- else }}
{{- if $serviceSpec.componentType | eq "frontend" }}
command:
- python3
{{- else }}
command:
- /bin/sh
- -c
# The /bin/sh -c default needs args to run, so rendering fails without them.
{{- if not $serviceSpec.extraPodSpec.mainContainer.args }}
{{- fail (printf "spec.services[%s].extraPodSpec.mainContainer.args must be set for non-frontend components" $serviceName) }}
{{- end }}
{{- end }}
{{- end }}
# Args: explicit args win; a frontend without args falls back to
# -m dynamo.frontend.
{{- if $serviceSpec.extraPodSpec.mainContainer.args }}
args:
{{- $serviceSpec.extraPodSpec.mainContainer.args | toYaml | nindent 14 }}
{{- else if $serviceSpec.componentType | eq "frontend" }}
args:
- -m
- dynamo.frontend
{{- end }}
# Environment: chart-wide namespace/etcd/NATS settings first, then the
# component-specific port variables.
# NOTE(review): DYN_NAMESPACE is rendered unquoted, unlike the other values in
# this list — confirm and consider quoting.
env:
{{- if $.Values.dynamoNamespace }}
- name: DYN_NAMESPACE
value: {{ $.Values.dynamoNamespace }}
{{- end }}
{{- if $.Values.etcdAddr }}
- name: ETCD_ENDPOINTS
value: "{{ $.Values.etcdAddr }}"
{{- end }}
{{- if $.Values.natsAddr }}
- name: NATS_SERVER
value: "{{ $.Values.natsAddr }}"
{{- end }}
{{- if $serviceSpec.componentType | eq "frontend" }}
- name: DYNAMO_PORT
value: "{{ $.Values.dynamoPort | default 8000 }}"
- name: DYN_HTTP_PORT
value: "{{ $.Values.dynamoPort | default 8000 }}"
{{- else if $serviceSpec.componentType | eq "worker" }}
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_PORT
value: "{{ $.Values.dynamoSystemPort | default 9090 }}"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
{{- end }}
{{- if $serviceSpec.envFromSecret }}
envFrom:
- secretRef:
name: {{ $serviceSpec.envFromSecret }}
{{- end }}
# Ports: frontends expose the port named http, workers the port named system;
# the default probes below refer to these names.
{{- if $serviceSpec.componentType | eq "frontend" }}
ports:
- name: http
containerPort: {{ $.Values.dynamoPort | default 8000 }}
protocol: TCP
{{- else if $serviceSpec.componentType | eq "worker" }}
ports:
- name: system
containerPort: {{ $.Values.dynamoSystemPort | default 9090 }}
protocol: TCP
{{- end }}
# Probes are rendered only for frontend and worker components. A probe given
# in the service spec replaces the default wholesale.
# NOTE(review): custom probes are re-indented with nindent 10, but this
# template uses nindent 12 and nindent 14 for shallower or sibling fields, so
# an overridden probe likely renders at the wrong depth — confirm and align.
# NOTE(review): given the guard on the next line, the final else branches below
# (/healthz and /readyz on the port named health) can never render.
{{- if and $serviceSpec.componentType (or (eq $serviceSpec.componentType "frontend") (eq $serviceSpec.componentType "worker")) }}
livenessProbe:
{{- if $serviceSpec.livenessProbe }}
{{ $serviceSpec.livenessProbe | toYaml | nindent 10 }}
{{- else }}
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 10
successThreshold: 1
{{- if $serviceSpec.componentType | eq "frontend" }}
httpGet:
path: /health
port: http
{{- else if $serviceSpec.componentType | eq "worker" }}
httpGet:
path: /live
port: system
{{- else }}
httpGet:
path: /healthz
port: health
scheme: HTTP
{{- end }}
{{- end }}
readinessProbe:
{{- if $serviceSpec.readinessProbe }}
{{ $serviceSpec.readinessProbe | toYaml | nindent 10 }}
{{- else }}
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 5
failureThreshold: 10
successThreshold: 1
{{- if $serviceSpec.componentType | eq "frontend" }}
# The default frontend readiness check shells out to curl and jq.
exec:
command:
- /bin/sh
- -c
- curl -s http://localhost:${DYNAMO_PORT}/health | jq -e ".status == \"healthy\""
{{- else if $serviceSpec.componentType | eq "worker" }}
httpGet:
path: /health
port: system
{{- else }}
httpGet:
path: /readyz
port: health
scheme: HTTP
{{- end }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Template: renders one ClusterIP Service per frontend component, selecting the
# pods by the same app label the chart's Deployment template puts on them.
{{- range $serviceName, $serviceSpec := .Values.spec.services }}
{{- if eq $serviceSpec.componentType "frontend" }}
---
apiVersion: v1
kind: Service
metadata:
  name: {{ $.Release.Name }}-{{ $serviceName | lower }}
  labels:
    app: {{ $.Release.Name }}-{{ $serviceName | lower }}
spec:
  selector:
    app: {{ $.Release.Name }}-{{ $serviceName | lower }}
  ports:
    - port: {{ $serviceSpec.port | default 8000 }}
      # Target the frontend container port by its name. The container listens
      # on .Values.dynamoPort, not on the per-service port, so a numeric
      # targetPort derived from the per-service port breaks routing whenever
      # the two values differ.
      targetPort: http
      protocol: TCP
  type: ClusterIP
{{- end }}
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Default values for the dynamo-graph chart.

# Address of the etcd service, exported to containers as ETCD_ENDPOINTS.
etcdAddr: 'dynamo-platform-etcd:2379'

# Address of the NATS service, exported to containers as NATS_SERVER.
natsAddr: 'nats://dynamo-platform-nats:4222'

# Image pull secrets attached to every rendered pod spec.
imagePullSecrets:
  - name: 'docker-imagepullsecret'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment