Unverified Commit 5375af2c authored by mohammedabdulwahhab's avatar mohammedabdulwahhab Committed by GitHub
Browse files

docs: exporting metrics in k8s (dep-302) (#2271)

parent 77b53c41
......@@ -12,8 +12,8 @@ const (
DynamoServicePortName = "http"
DynamoContainerPortName = "http"
DynamoHealthPort = 5000
DynamoHealthPortName = "health"
DynamoSystemPort = 9090
DynamoSystemPortName = "system"
EnvDynamoServicePort = "DYNAMO_PORT"
......@@ -24,6 +24,7 @@ const (
KubeLabelDynamoComponent = "nvidia.com/dynamo-component"
KubeLabelDynamoNamespace = "nvidia.com/dynamo-namespace"
KubeLabelDynamoDeploymentTargetType = "nvidia.com/dynamo-deployment-target-type"
KubeLabelDynamoComponentType = "nvidia.com/dynamo-component-type"
KubeLabelValueFalse = "false"
KubeLabelValueTrue = "true"
......@@ -36,9 +37,14 @@ const (
ComponentTypePlanner = "planner"
ComponentTypeMain = "main"
ComponentTypeWorker = "worker"
PlannerServiceAccountName = "planner-serviceaccount"
DefaultIngressSuffix = "local"
DefaultGroveTerminationDelay = 15 * time.Minute
// Metrics related constants
KubeAnnotationEnableMetrics = "nvidia.com/enable-metrics" // User-provided annotation to control metrics
KubeLabelMetricsEnabled = "nvidia.com/metrics-enabled" // Controller-managed label for pod selection
)
......@@ -1208,6 +1208,25 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
podLabels[commonconsts.KubeLabelDynamoDeploymentTargetType] = DeploymentTargetTypeDebug
}
// Convert user-provided metrics annotation into controller-managed label
// By default (no annotation), metrics are enabled
metricsAnnotationValue := ""
if opt.dynamoComponentDeployment.Spec.Annotations != nil {
metricsAnnotationValue = opt.dynamoComponentDeployment.Spec.Annotations[commonconsts.KubeAnnotationEnableMetrics]
}
switch metricsAnnotationValue {
case commonconsts.KubeLabelValueFalse:
// Explicitly disabled, don't add the label
default:
// Any other value (including empty) enables metrics
podLabels[commonconsts.KubeLabelMetricsEnabled] = commonconsts.KubeLabelValueTrue
}
// Add component type label if specified
if opt.dynamoComponentDeployment.Spec.ComponentType != "" {
podLabels[commonconsts.KubeLabelDynamoComponentType] = opt.dynamoComponentDeployment.Spec.ComponentType
}
podAnnotations := make(map[string]string)
kubeName := r.getKubeName(opt.dynamoComponentDeployment, opt.isStealingTrafficDebugModeEnabled)
......@@ -1345,48 +1364,17 @@ func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx contex
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(containerPort), // nolint: gosec
},
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoHealthPortName,
ContainerPort: int32(commonconsts.DynamoHealthPort),
},
},
SecurityContext: mainContainerSecurityContext,
}
// Set default probes if none are provided
if livenessProbe == nil {
container.LivenessProbe = &corev1.Probe{
// TODO: Initial delay and other probe settings should be read off sdk, these are default settings that should cover vllm / hello-world
InitialDelaySeconds: 60, // 1 minute
PeriodSeconds: 60, // Check every 1 minute
TimeoutSeconds: 5, // 5 second timeout
FailureThreshold: 10, // Allow 10 failures before declaring unhealthy
SuccessThreshold: 1, // Need 1 success to be considered healthy
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/healthz",
Port: intstr.FromString(commonconsts.DynamoHealthPortName),
},
},
}
}
if readinessProbe == nil {
container.ReadinessProbe = &corev1.Probe{
// TODO: Initial delay and other probe settings should be read off sdk, these are default settings that should cover vllm / hello-world
InitialDelaySeconds: 60, // 1 minute
PeriodSeconds: 60, // Check every 1 minute
TimeoutSeconds: 5, // 5 second timeout
FailureThreshold: 10, // Allow 10 failures before declaring not ready
SuccessThreshold: 1, // Need 1 success to be considered ready
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: "/readyz",
Port: intstr.FromString(commonconsts.DynamoHealthPortName),
},
},
}
// Add system port for worker components
if opt.dynamoComponentDeployment.Spec.ComponentType == commonconsts.ComponentTypeWorker {
container.Ports = append(container.Ports, corev1.ContainerPort{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoSystemPortName,
ContainerPort: int32(commonconsts.DynamoSystemPort),
})
}
if opt.dynamoComponentDeployment.Spec.EnvFromSecret != nil {
......
......@@ -922,9 +922,6 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
{
Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoServicePortName, ContainerPort: commonconsts.DynamoServicePort,
},
{
Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoHealthPortName, ContainerPort: commonconsts.DynamoHealthPort,
},
},
TTY: true,
Stdin: true,
......@@ -967,9 +964,11 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
Env: []corev1.EnvVar{{Name: "DYNAMO_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort)}, {Name: "TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC", Value: "test_value_from_dynamo_component_deployment_spec"}, {Name: "TEST_ENV_FROM_EXTRA_POD_SPEC", Value: "test_value_from_extra_pod_spec"}},
VolumeMounts: []corev1.VolumeMount{{Name: "shared-memory", MountPath: "/dev/shm"}},
Ports: []corev1.ContainerPort{{Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoServicePortName, ContainerPort: commonconsts.DynamoServicePort}, {
Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoHealthPortName, ContainerPort: commonconsts.DynamoHealthPort,
}},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoServicePortName, ContainerPort: commonconsts.DynamoServicePort,
},
},
TTY: true,
Stdin: true,
Resources: corev1.ResourceRequirements{
......
......@@ -165,6 +165,17 @@ func GenerateDynamoComponentsDeployments(ctx context.Context, parentDynamoGraphD
deployment.Labels = labels
labels[commonconsts.KubeLabelDynamoComponent] = componentName
labels[commonconsts.KubeLabelDynamoNamespace] = dynamoNamespace
// Propagate metrics annotation from parent deployment if present
if parentDynamoGraphDeployment.Annotations != nil {
if val, exists := parentDynamoGraphDeployment.Annotations[commonconsts.KubeAnnotationEnableMetrics]; exists {
if deployment.Spec.Annotations == nil {
deployment.Spec.Annotations = make(map[string]string)
}
deployment.Spec.Annotations[commonconsts.KubeAnnotationEnableMetrics] = val
}
}
if component.ComponentType == commonconsts.ComponentTypePlanner {
// ensure that the extraPodSpec is not nil
if deployment.Spec.ExtraPodSpec == nil {
......@@ -337,13 +348,18 @@ func GenerateGrovePodGangSet(ctx context.Context, dynamoDeployment *v1alpha1.Dyn
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoHealthPortName,
ContainerPort: int32(commonconsts.DynamoHealthPort),
},
},
}
// Add system port for worker components
if component.ComponentType == commonconsts.ComponentTypeWorker {
container.Ports = append(container.Ports, corev1.ContainerPort{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoSystemPortName,
ContainerPort: int32(commonconsts.DynamoSystemPort),
})
}
resourcesConfig, err := controller_common.GetResourcesConfig(component.Resources)
if err != nil {
return nil, fmt.Errorf("failed to get resources config: %w", err)
......@@ -415,6 +431,32 @@ func GenerateGrovePodGangSet(ctx context.Context, dynamoDeployment *v1alpha1.Dyn
},
},
})
// Add metrics labels if not disabled
cliqueIndex := len(gangSet.Spec.Template.Cliques) - 1
labels := gangSet.Spec.Template.Cliques[cliqueIndex].Labels
// Convert user-provided metrics annotation into controller-managed label
// By default (no annotation), metrics are enabled
metricsAnnotationValue := ""
if dynamoDeployment.Annotations != nil {
metricsAnnotationValue = dynamoDeployment.Annotations[commonconsts.KubeAnnotationEnableMetrics]
}
switch metricsAnnotationValue {
case commonconsts.KubeLabelValueFalse:
// Explicitly disabled, don't add the label
default:
// Any other value (including empty) enables metrics
labels[commonconsts.KubeLabelMetricsEnabled] = commonconsts.KubeLabelValueTrue
}
// Add component type label if specified
if component.ComponentType != "" {
labels[commonconsts.KubeLabelDynamoComponentType] = component.ComponentType
}
gangSet.Spec.Template.Cliques[cliqueIndex].Labels = labels
if component.PVC != nil {
cliqueIndex := len(gangSet.Spec.Template.Cliques) - 1
gangSet.Spec.Template.Cliques[cliqueIndex].Spec.PodSpec.Volumes = append(gangSet.Spec.Template.Cliques[cliqueIndex].Spec.PodSpec.Volumes, corev1.Volume{
......
......@@ -1282,6 +1282,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: "frontend",
Labels: map[string]string{
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
},
Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "frontend",
......@@ -1365,11 +1366,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoHealthPortName,
ContainerPort: int32(commonconsts.DynamoHealthPort),
},
},
},
},
......@@ -1380,6 +1376,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: "planner",
Labels: map[string]string{
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
},
Spec: grovev1alpha1.PodCliqueSpec{
RoleName: "planner",
......@@ -1479,11 +1476,6 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Name: commonconsts.DynamoContainerPortName,
ContainerPort: int32(commonconsts.DynamoServicePort),
},
{
Protocol: corev1.ProtocolTCP,
Name: commonconsts.DynamoHealthPortName,
ContainerPort: int32(commonconsts.DynamoHealthPort),
},
},
},
},
......
# Dynamo Metrics Collection on Kubernetes
For detailed documentation on collecting and visualizing metrics on Kubernetes, see [docs/guides/deploy/k8s_metrics.md](../../../docs/guides/deploy/k8s_metrics.md).
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# PodMonitor for the Prometheus Operator that scrapes Dynamo frontend pods.
# ${NAMESPACE} is a placeholder; substitute it (for example with envsubst)
# before applying this manifest.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-frontend-metrics
namespace: ${NAMESPACE}
spec:
selector:
matchLabels:
# Select only pods that carry the metrics-enabled label and are labeled as
# frontend components.
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "frontend"
podMetricsEndpoints:
# Scrape /metrics on the container port named "http" every 2 seconds.
- port: http
path: /metrics
interval: 2s
namespaceSelector:
# Restrict pod discovery to the namespace substituted for ${NAMESPACE}.
matchNames:
- ${NAMESPACE}
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# PodMonitor for the Prometheus Operator that scrapes Dynamo worker pods.
# ${NAMESPACE} is a placeholder; substitute it (for example with envsubst)
# before applying this manifest.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-worker-metrics
namespace: ${NAMESPACE}
spec:
selector:
matchLabels:
# Select only pods that carry the metrics-enabled label and are labeled as
# worker components.
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "worker"
podMetricsEndpoints:
# Scrape /metrics on the container port named "system" every 2 seconds.
- port: system
path: /metrics
interval: 2s
namespaceSelector:
# Restrict pod discovery to the namespace substituted for ${NAMESPACE}.
matchNames:
- ${NAMESPACE}
# Dynamo Metrics Collection on Kubernetes
## Overview
This guide provides a walkthrough for collecting and visualizing metrics from Dynamo components using the Prometheus Operator stack. The Prometheus Operator provides a powerful and flexible way to configure monitoring for Kubernetes applications through custom resources like PodMonitors, making it easy to automatically discover and scrape metrics from Dynamo components.
## Prerequisites
### Install Dynamo Operator
Before setting up metrics collection, you'll need to have the Dynamo operator installed in your cluster. Follow our [Quickstart Guide](../dynamo_deploy/quickstart.md) for detailed instructions on deploying the Dynamo operator.
### Install Prometheus Operator
If you don't have an existing Prometheus setup, you'll need to install the Prometheus Operator. The Prometheus Operator introduces custom resources that make it easy to deploy and manage Prometheus monitoring in Kubernetes:
- `PodMonitor`: Automatically discovers and scrapes metrics from pods based on label selectors
- `ServiceMonitor`: Similar to PodMonitor but works with Services
- `PrometheusRule`: Defines alerting and recording rules
For a basic installation:
```bash
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
helm install prometheus prometheus-community/kube-prometheus-stack
```
## Deploy a DynamoGraphDeployment
Let's start by deploying a simple vLLM aggregated deployment:
```bash
export NAMESPACE=dynamo # namespace where dynamo operator is installed
pushd components/backends/vllm/deploy
kubectl apply -f agg.yaml -n $NAMESPACE
popd
```
This will create two components:
- A Frontend component exposing metrics on its HTTP port
- A Worker component exposing metrics on its system port
Both components expose a `/metrics` endpoint following the OpenMetrics format, but with different metrics appropriate to their roles. For more details, see the following:
- Deployment configuration: See the [vLLM README](../../../../components/backends/vllm/README.md)
- Available metrics: See the [metrics guide](../metrics.md)
### Validate the Deployment
Let's send some test requests to populate metrics:
```bash
curl localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}
],
"stream": true,
"max_tokens": 30
}'
```
For more information about validating the deployment, see the [vLLM README](../../../../components/backends/vllm/README.md).
## Set Up Metrics Collection
### Create PodMonitors
The Prometheus Operator uses PodMonitor resources to automatically discover and scrape metrics from pods. To enable this discovery, the Dynamo operator automatically adds these labels to the pods it creates:
- `nvidia.com/metrics-enabled: "true"` - Enables metrics collection (added by default unless you opt out, as described in the note below)
- `nvidia.com/dynamo-component-type: "<type>"` - Identifies the component type, for example `frontend` or `worker` (added only when the component specifies a type)
> **Note**: You can opt specific deployments out of metrics collection by adding this annotation to your DynamoGraphDeployment:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: my-deployment
annotations:
nvidia.com/enable-metrics: "false"
spec:
# …
```
Let's create two monitors - one for each component type:
First, create the frontend PodMonitor:
```yaml
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-frontend-metrics
namespace: dynamo
spec:
selector:
matchLabels:
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "frontend"
podMetricsEndpoints:
- port: http
path: /metrics
interval: 2s
namespaceSelector:
matchNames:
- dynamo
```
Then, create the worker PodMonitor:
```yaml
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-worker-metrics
namespace: dynamo
spec:
selector:
matchLabels:
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "worker"
podMetricsEndpoints:
- port: system
path: /metrics
interval: 2s
namespaceSelector:
matchNames:
- dynamo
```
Apply the PodMonitors:
```bash
pushd deploy/metrics/k8s
# envsubst replaces ${NAMESPACE} with the actual namespace value
envsubst < frontend-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
envsubst < worker-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
popd
```
This will cause Prometheus to be re-configured to scrape metrics from the pods of your DynamoGraphDeployment.
### Configure Grafana Dashboard
Apply the Dynamo dashboard configuration to populate Grafana with the Dynamo dashboard:
```bash
pushd deploy/metrics/k8s
kubectl apply -n monitoring -f resources/grafana-dynamo-dashboard-configmap.yaml
popd
```
The dashboard is embedded in the ConfigMap. Because the ConfigMap is labeled with `grafana_dashboard: "1"`, Grafana will discover it and add it to its list of available dashboards. The dashboard includes panels for:
- Frontend request rates
- Time to first token
- Inter-token latency
- Request duration
- Input/Output sequence lengths
- GPU utilization
## Viewing the Metrics
### In Prometheus
```bash
kubectl port-forward svc/prometheus-operated 9090:9090
```
Visit http://localhost:9090 and try these example queries:
- `dynamo_frontend_requests_total`
- `dynamo_frontend_time_to_first_token_seconds_bucket`
![Prometheus UI showing Dynamo metrics](../../images/prometheus-k8s.png)
### In Grafana
```bash
kubectl port-forward svc/grafana 3000:80
```
Visit http://localhost:3000 and find the Dynamo dashboard under General.
![Grafana dashboard showing Dynamo metrics](../../images/grafana-k8s.png)
......@@ -31,9 +31,7 @@ Dynamo automatically exposes metrics with the `dynamo_` name prefixes. It also a
**Specialized Component Metrics**: Components can also expose additional metrics specific to their functionality. For example, a `preprocessor` component exposes metrics with the `dynamo_preprocessor_*` prefix. See the [Available Metrics section](../../deploy/metrics/README.md#available-metrics) for details on specialized component metrics.
## Coming Soon
**Kubernetes Integration**: Comprehensive Kubernetes deployment and monitoring information will be available soon, including Helm charts, Kubernetes-native metrics collection, and cluster-wide observability solutions.
**Kubernetes Integration**: For comprehensive Kubernetes deployment and monitoring setup, see the [Kubernetes Metrics Guide](deploy/k8s_metrics.md). This includes Prometheus Operator setup, metrics collection configuration, and visualization in Grafana.
## Metrics Hierarchy
......
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment