"lib/runtime/vscode:/vscode.git/clone" did not exist on "edda76b4ebdb855ca8a38c1955b0e52b267c9f32"
Unverified Commit 729b5fd5 authored by julienmancuso's avatar julienmancuso Committed by GitHub
Browse files

feat: automatically setup and inject prometheus configuration (#2912)


Signed-off-by: default avatarJulien Mancuso <jmancuso@nvidia.com>
parent 93208162
......@@ -53,9 +53,6 @@ spec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
workingDir: /workspace/components/planner/src/dynamo/planner
ports:
- name: metrics
containerPort: 9085
command:
- /bin/sh
- -c
......@@ -66,7 +63,6 @@ spec:
--backend=vllm
--adjustment-interval=60
--profile-results-dir=/workspace/profiling_results
--prometheus-port=9085
Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
dynamoNamespace: vllm-disagg-planner
componentType: frontend
......
......@@ -84,9 +84,11 @@ The Dynamo Platform Helm chart deploys the complete Dynamo Cloud infrastructure
| dynamo-operator.dynamo.istio.gateway | string | `nil` | Istio gateway name for routing |
| dynamo-operator.dynamo.ingressHostSuffix | string | `""` | Host suffix for generated ingress hostnames |
| dynamo-operator.dynamo.virtualServiceSupportsHTTPS | bool | `false` | Whether VirtualServices should support HTTPS routing |
| dynamo-operator.dynamo.metrics.prometheusEndpoint | string | `""` | Endpoint that services can use to retrieve metrics. If set, dynamo operator will automatically inject the PROMETHEUS_ENDPOINT environment variable into services it manages. Users can override the value of the PROMETHEUS_ENDPOINT environment variable by modifying the corresponding deployment's environment variables |
| grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide |
| kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide |
| etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance |
| etcd.image.repository | string | `"bitnamilegacy/etcd"` | following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog, we need to use the legacy repository until we migrate to the new "secure" repository |
| nats.enabled | bool | `true` | Whether to enable NATS deployment, disable if you want to use an external NATS instance |
### NATS Configuration
......
......@@ -107,6 +107,9 @@ spec:
{{- if .Values.modelExpressURL }}
- --model-express-url={{ .Values.modelExpressURL }}
{{- end }}
{{- if .Values.dynamo.metrics.prometheusEndpoint }}
- --prometheus-endpoint={{ .Values.dynamo.metrics.prometheusEndpoint }}
{{- end }}
command:
- /manager
env:
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" }}
# PodMonitors for Dynamo pods. They are rendered only when the cluster exposes
# the monitoring.coreos.com/v1 API. One monitor is emitted per component type
# (frontend, worker, planner); each matches pods carrying both the
# nvidia.com/metrics-enabled and nvidia.com/dynamo-component-type labels.
# When namespaceRestriction.enabled is set, pod discovery is limited to the
# release namespace; otherwise pods in any namespace are matched.

# Frontend pods: scraped on the container port named "http".
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: dynamo-frontend
spec:
  selector:
    matchLabels:
      nvidia.com/metrics-enabled: "true"
      nvidia.com/dynamo-component-type: frontend
  namespaceSelector:
    {{- if .Values.namespaceRestriction.enabled }}
    matchNames:
      - {{ .Release.Namespace }}
    {{- else }}
    any: true
    {{- end }}
  podMetricsEndpoints:
    - port: http
      path: /metrics
      interval: 30s
---
# Worker pods: scraped on the container port named "system".
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: dynamo-worker
spec:
  selector:
    matchLabels:
      nvidia.com/metrics-enabled: "true"
      nvidia.com/dynamo-component-type: worker
  namespaceSelector:
    {{- if .Values.namespaceRestriction.enabled }}
    matchNames:
      - {{ .Release.Namespace }}
    {{- else }}
    any: true
    {{- end }}
  podMetricsEndpoints:
    - port: system
      path: /metrics
      interval: 30s
---
# Planner pods: scraped on the container port named "metrics".
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: dynamo-planner
spec:
  selector:
    matchLabels:
      nvidia.com/metrics-enabled: "true"
      nvidia.com/dynamo-component-type: planner
  namespaceSelector:
    {{- if .Values.namespaceRestriction.enabled }}
    matchNames:
      - {{ .Release.Namespace }}
    {{- else }}
    any: true
    {{- end }}
  podMetricsEndpoints:
    - port: metrics
      path: /metrics
      interval: 30s
{{- end }}
\ No newline at end of file
......@@ -75,9 +75,6 @@ controllerManager:
annotations: {}
dynamo:
imageBuilder:
serviceAccount:
annotations: {}
components:
serviceAccount:
annotations: {}
......@@ -99,6 +96,9 @@ dynamo:
existingSecretName: ''
secure: true
metrics:
prometheusEndpoint: ""
#imagePullSecrets: []
kubernetesClusterDomain: cluster.local
......
......@@ -111,6 +111,11 @@ dynamo-operator:
# -- Whether VirtualServices should support HTTPS routing
virtualServiceSupportsHTTPS: false
# Metrics configuration
metrics:
# -- Endpoint that services can use to retrieve metrics. If set, dynamo operator will automatically inject the PROMETHEUS_ENDPOINT environment variable into services it manages. Users can override the value of the PROMETHEUS_ENDPOINT environment variable by modifying the corresponding deployment's environment variables
prometheusEndpoint: ""
# Grove component - distributed inference orchestration
grove:
......@@ -130,8 +135,7 @@ etcd:
enabled: true
image:
# -- following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog
# -- we need to use the legacy repository until we migrate to the new "secure" repository
# -- following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog, we need to use the legacy repository until we migrate to the new "secure" repository
repository: bitnamilegacy/etcd
tag: 3.5.18-debian-12-r5
......
......@@ -57,7 +57,7 @@ ensure-yq:
fi
.PHONY: manifests
manifests: controller-gen ensure-yq generate-api-docs ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
manifests: controller-gen ensure-yq ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
# Use a large maxDescLen to ensure all field comments are included as OpenAPI descriptions
$(CONTROLLER_GEN) rbac:roleName=manager-role crd:maxDescLen=100000 webhook paths="./..." output:crd:artifacts:config=config/crd/bases
echo "Removing name from mainContainer required fields"
......
......@@ -132,6 +132,7 @@ func main() {
var ingressHostSuffix string
var groveTerminationDelay time.Duration
var modelExpressURL string
var prometheusEndpoint string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
......@@ -161,6 +162,8 @@ func main() {
"The termination delay for Grove PodGangSets")
flag.StringVar(&modelExpressURL, "model-express-url", "",
"URL of the Model Express server to inject into all pods")
flag.StringVar(&prometheusEndpoint, "prometheus-endpoint", "",
"URL of the Prometheus endpoint to use for metrics")
opts := zap.Options{
Development: true,
}
......@@ -196,7 +199,8 @@ func main() {
IngressControllerTLSSecret: ingressControllerTLSSecretName,
IngressHostSuffix: ingressHostSuffix,
},
ModelExpressURL: modelExpressURL,
ModelExpressURL: modelExpressURL,
PrometheusEndpoint: prometheusEndpoint,
}
mainCtx := ctrl.SetupSignalHandler()
......
......@@ -16,6 +16,9 @@ const (
DynamoServicePortName = "http"
DynamoContainerPortName = "http"
DynamoPlannerMetricsPort = 9085
DynamoMetricsPortName = "metrics"
DynamoSystemPort = 9090
DynamoSystemPortName = "system"
......
......@@ -58,6 +58,8 @@ type Config struct {
IngressConfig IngressConfig
// ModelExpressURL is the URL of the Model Express server to inject into all pods
ModelExpressURL string
// PrometheusEndpoint is the URL of the Prometheus endpoint to use for metrics
PrometheusEndpoint string
}
type IngressConfig struct {
......
......@@ -6,6 +6,8 @@
package dynamo
import (
"fmt"
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
)
......@@ -21,6 +23,19 @@ func NewPlannerDefaults() *PlannerDefaults {
// GetBaseContainer builds the planner's base container. It starts from the
// common container for the given component context, replaces its port list
// with a single TCP metrics port (DynamoMetricsPortName /
// DynamoPlannerMetricsPort), and appends a PROMETHEUS_PORT environment
// variable carrying that same port number. The returned error is always nil.
func (p *PlannerDefaults) GetBaseContainer(context ComponentContext) (corev1.Container, error) {
	base := p.getCommonContainer(context)

	// The planner exposes exactly one port: its metrics endpoint.
	metricsPort := corev1.ContainerPort{
		Name:          commonconsts.DynamoMetricsPortName,
		ContainerPort: int32(commonconsts.DynamoPlannerMetricsPort),
		Protocol:      corev1.ProtocolTCP,
	}
	base.Ports = []corev1.ContainerPort{metricsPort}

	// Keep the env var value in sync with the declared container port.
	portEnv := corev1.EnvVar{
		Name:  "PROMETHEUS_PORT",
		Value: fmt.Sprintf("%d", commonconsts.DynamoPlannerMetricsPort),
	}
	base.Env = append(base.Env, portEnv)

	return base, nil
}
......
......@@ -6,8 +6,10 @@
package dynamo
import (
"fmt"
"testing"
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1"
)
......@@ -46,10 +48,14 @@ func TestPlannerDefaults_GetBaseContainer(t *testing.T) {
"/bin/sh",
"-c",
},
Ports: []corev1.ContainerPort{
{Name: commonconsts.DynamoMetricsPortName, ContainerPort: commonconsts.DynamoPlannerMetricsPort, Protocol: corev1.ProtocolTCP},
},
Env: []corev1.EnvVar{
{Name: "DYN_NAMESPACE", Value: "dynamo-namespace"},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "name"},
{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "namespace"},
{Name: "PROMETHEUS_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoPlannerMetricsPort)},
},
},
},
......
......@@ -660,26 +660,35 @@ func isWorkerComponent(componentType string) bool {
// addStandardEnvVars adds the standard environment variables that are common to both Grove and Controller
func addStandardEnvVars(container *corev1.Container, controllerConfig controller_common.Config) {
standardEnvVars := []corev1.EnvVar{}
if controllerConfig.NatsAddress != "" {
container.Env = append(container.Env, corev1.EnvVar{
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "NATS_SERVER",
Value: controllerConfig.NatsAddress,
})
}
if controllerConfig.EtcdAddress != "" {
container.Env = append(container.Env, corev1.EnvVar{
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "ETCD_ENDPOINTS",
Value: controllerConfig.EtcdAddress,
})
}
if controllerConfig.ModelExpressURL != "" {
container.Env = append(container.Env, corev1.EnvVar{
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "MODEL_EXPRESS_URL",
Value: controllerConfig.ModelExpressURL,
})
}
if controllerConfig.PrometheusEndpoint != "" {
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "PROMETHEUS_ENDPOINT",
Value: controllerConfig.PrometheusEndpoint,
})
}
// merge the env vars to allow users to override the standard env vars
container.Env = MergeEnvs(standardEnvVars, container.Env)
}
// GenerateBasePodSpec creates a basic PodSpec with common logic shared between controller and grove
......
......@@ -1071,6 +1071,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Grove: controller_common.GroveConfig{
TerminationDelay: 15 * time.Minute,
},
PrometheusEndpoint: "http://localhost:9090",
},
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
......@@ -1159,7 +1160,8 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
},
"Planner": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
Replicas: &[]int32{2}[0],
Replicas: &[]int32{2}[0],
ComponentType: commonconsts.ComponentTypePlanner,
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "2",
......@@ -1348,6 +1350,10 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: "MODEL_EXPRESS_URL",
Value: "model-express-url",
},
{
Name: "PROMETHEUS_ENDPOINT",
Value: "http://localhost:9090",
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -1384,6 +1390,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner,
},
Annotations: map[string]string{},
Spec: grovev1alpha1.PodCliqueSpec{
......@@ -1410,8 +1417,10 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
},
},
},
ServiceAccountName: commonconsts.PlannerServiceAccountName,
TerminationGracePeriodSeconds: ptr.To(int64(60)),
RestartPolicy: corev1.RestartPolicyAlways,
Containers: []corev1.Container{
{
Name: "main",
......@@ -1483,6 +1492,14 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: "MODEL_EXPRESS_URL",
Value: "model-express-url",
},
{
Name: "PROMETHEUS_ENDPOINT",
Value: "http://localhost:9090",
},
{
Name: "PROMETHEUS_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoPlannerMetricsPort),
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -1505,6 +1522,13 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
MountPath: commonconsts.DefaultSharedMemoryMountPath,
},
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoMetricsPortName,
ContainerPort: int32(commonconsts.DynamoPlannerMetricsPort),
},
},
},
},
},
......@@ -1653,7 +1677,8 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
},
"Planner": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
Replicas: &[]int32{2}[0],
ComponentType: commonconsts.ComponentTypePlanner,
Replicas: &[]int32{2}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "2",
......@@ -2146,6 +2171,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner,
},
Annotations: map[string]string{},
Spec: grovev1alpha1.PodCliqueSpec{
......@@ -2154,6 +2180,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
MinAvailable: ptr.To(int32(1)),
PodSpec: corev1.PodSpec{
TerminationGracePeriodSeconds: ptr.To(int64(60)),
ServiceAccountName: commonconsts.PlannerServiceAccountName,
RestartPolicy: corev1.RestartPolicyAlways,
Volumes: []corev1.Volume{
{
......@@ -2241,6 +2268,10 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: "DYN_PARENT_DGD_K8S_NAMESPACE",
Value: "test-namespace",
},
{
Name: "PROMETHEUS_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoPlannerMetricsPort),
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......@@ -2263,6 +2294,13 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
MountPath: commonconsts.DefaultSharedMemoryMountPath,
},
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoMetricsPortName,
ContainerPort: int32(commonconsts.DynamoPlannerMetricsPort),
},
},
},
},
},
......@@ -2435,7 +2473,8 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
},
"Planner": {
DynamoComponentDeploymentSharedSpec: v1alpha1.DynamoComponentDeploymentSharedSpec{
Replicas: &[]int32{2}[0],
ComponentType: commonconsts.ComponentTypePlanner,
Replicas: &[]int32{2}[0],
Resources: &common.Resources{
Requests: &common.ResourceItem{
CPU: "2",
......@@ -2916,6 +2955,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner,
},
Annotations: map[string]string{},
Spec: grovev1alpha1.PodCliqueSpec{
......@@ -2924,6 +2964,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
MinAvailable: ptr.To(int32(1)),
PodSpec: corev1.PodSpec{
TerminationGracePeriodSeconds: ptr.To(int64(60)),
ServiceAccountName: commonconsts.PlannerServiceAccountName,
Volumes: []corev1.Volume{
{
Name: "planner-pvc",
......@@ -2982,6 +3023,13 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
},
},
},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoMetricsPortName,
ContainerPort: int32(commonconsts.DynamoPlannerMetricsPort),
},
},
Env: []corev1.EnvVar{
{
Name: "DYNAMO_POD_GANG_SET_REPLICAS",
......@@ -3011,6 +3059,10 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: "DYN_PARENT_DGD_K8S_NAMESPACE",
Value: "test-namespace",
},
{
Name: "PROMETHEUS_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoPlannerMetricsPort),
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
......
......@@ -63,9 +63,8 @@ helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${REL
helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz --namespace default
# 3. Install Platform
kubectl create namespace ${NAMESPACE}
helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz
helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE}
helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace
```
> [!TIP]
......
......@@ -6,9 +6,6 @@ This guide provides a walkthrough for collecting and visualizing metrics from Dy
## Prerequisites
### Install Dynamo Operator
Before setting up metrics collection, you'll need to have the Dynamo operator installed in your cluster. Follow our [Installation Guide](../dynamo_deploy/installation_guide.md) for detailed instructions on deploying the Dynamo operator.
### Install kube-prometheus-stack
If you don't have an existing Prometheus setup, you'll likely want to install the kube-prometheus-stack. This is a collection of Kubernetes manifests that includes the Prometheus Operator, Prometheus, Grafana, and other monitoring components in a pre-configured setup. The stack introduces custom resources that make it easy to deploy and manage monitoring in Kubernetes:
......@@ -20,8 +17,8 @@ For a basic installation:
```bash
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
# Values allow podmnonitors to be picked up that are outside of the kube-prometheus-stack helm release
helm install prometheus prometheus-community/kube-prometheus-stack \
# Values allow PodMonitors to be picked up that are outside of the kube-prometheus-stack helm release
helm install prometheus -n monitoring --create-namespace prometheus-community/kube-prometheus-stack \
--set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false \
--set prometheus.prometheusSpec.podMonitorNamespaceSelector="{}" \
--set prometheus.prometheusSpec.probeNamespaceSelector="{}"
......@@ -30,6 +27,16 @@ helm install prometheus prometheus-community/kube-prometheus-stack \
> [!Note]
> The commands below assume you installed the kube-prometheus-stack using the method listed above. Depending on how your monitoring stack is configured, you may need to adjust the `kubectl` commands that follow in this document (e.g., by changing namespace or Service names).
### Install Dynamo Operator
Before setting up metrics collection, you'll need to have the Dynamo operator installed in your cluster. Follow our [Installation Guide](../dynamo_deploy/installation_guide.md) for detailed instructions on deploying the Dynamo operator.
Make sure to set the `dynamo-operator.dynamo.metrics.prometheusEndpoint` value to the endpoint of the Prometheus instance you installed in the previous step.
```bash
helm install dynamo-platform ... \
  --set dynamo-operator.dynamo.metrics.prometheusEndpoint=http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090
```
### DCGM Metrics Collection (Optional)
GPU utilization metrics are collected and exported to Prometheus via dcgm-exporter. The Dynamo Grafana dashboard includes a panel for GPU utilization related to your Dynamo deployment. For that panel to be populated, you need to ensure that the dcgm-exporter is running in your cluster. To check if the dcgm-exporter is running, please run the following command:
......@@ -86,7 +93,7 @@ For more information about validating the deployment, see the [vLLM README](../.
### Create PodMonitors
The Prometheus Operator uses PodMonitor resources to automatically discover and scrape metrics from pods. To enable this discovery, the Dynamo operator automatically adds these labels to all pods:
The Prometheus Operator uses PodMonitor resources to automatically discover and scrape metrics from pods. To enable this discovery, the Dynamo operator automatically creates the PodMonitor resources and adds these labels to all pods:
- `nvidia.com/metrics-enabled: "true"` - Enables metrics collection
- `nvidia.com/dynamo-component-type: "frontend|worker|planner"` - Identifies the component type
......@@ -102,85 +109,6 @@ spec:
# …
```
Let's create two monitors - one for each component type:
First, create the frontend PodMonitor:
```yaml
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-frontend-metrics
namespace: $NAMESPACE
spec:
selector:
matchLabels:
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "frontend"
podMetricsEndpoints:
- port: http
path: /metrics
interval: 2s
namespaceSelector:
matchNames:
- $NAMESPACE
```
Then, create the worker PodMonitor:
```yaml
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-worker-metrics
namespace: $NAMESPACE
spec:
selector:
matchLabels:
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "worker"
podMetricsEndpoints:
- port: system
path: /metrics
interval: 2s
namespaceSelector:
matchNames:
- $NAMESPACE
```
If you are using planner, you can also create a PodMonitor for the planner:
```yaml
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-planner-metrics
namespace: $NAMESPACE
spec:
selector:
matchLabels:
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "planner"
podMetricsEndpoints:
- port: metrics
path: /metrics
interval: 2s
namespaceSelector:
matchNames:
- $NAMESPACE
```
Apply the PodMonitors:
```bash
pushd deploy/metrics/k8s
# envsubst replaces ${NAMESPACE} with the actual namespace value
envsubst < frontend-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
envsubst < worker-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
envsubst < planner-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
popd
```
This will cause Prometheus to be re-configured to scrape metrics from the pods of your DynamoGraphDeployment.
### Configure Grafana Dashboard
Apply the Dynamo dashboard configuration to populate Grafana with the Dynamo dashboard:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment