Unverified Commit 53a609e5 authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

feat: add dynamo operator observability (#5543)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent 1032076d
......@@ -22,6 +22,8 @@ import (
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
......@@ -118,13 +120,17 @@ func (h *DynamoComponentDeploymentHandler) ValidateDelete(ctx context.Context, o
}
// RegisterWithManager registers the webhook with the manager.
// The handler is automatically wrapped with LeaseAwareValidator to add namespace exclusion logic.
// The handler is automatically wrapped with LeaseAwareValidator to add namespace exclusion logic
// and ObservedValidator to add metrics collection.
func (h *DynamoComponentDeploymentHandler) RegisterWithManager(mgr manager.Manager) error {
// Wrap the handler with lease-aware logic for cluster-wide coordination
validator := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())
leaseAwareValidator := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())
// Wrap with metrics collection
observedValidator := observability.NewObservedValidator(leaseAwareValidator, consts.ResourceTypeDynamoComponentDeployment)
webhook := admission.
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoComponentDeployment{}, validator).
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoComponentDeployment{}, observedValidator).
WithRecoverPanic(true)
mgr.GetWebhookServer().Register(dynamoComponentDeploymentWebhookPath, webhook)
return nil
......
......@@ -22,6 +22,8 @@ import (
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook"
authenticationv1 "k8s.io/api/authentication/v1"
"k8s.io/apimachinery/pkg/runtime"
......@@ -134,13 +136,17 @@ func (h *DynamoGraphDeploymentHandler) ValidateDelete(ctx context.Context, obj r
}
// RegisterWithManager registers the webhook with the manager.
// The handler is automatically wrapped with LeaseAwareValidator to add namespace exclusion logic.
// The handler is automatically wrapped with LeaseAwareValidator to add namespace exclusion logic
// and ObservedValidator to add metrics collection.
func (h *DynamoGraphDeploymentHandler) RegisterWithManager(mgr manager.Manager) error {
// Wrap the handler with lease-aware logic for cluster-wide coordination
validator := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())
leaseAwareValidator := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())
// Wrap with metrics collection
observedValidator := observability.NewObservedValidator(leaseAwareValidator, consts.ResourceTypeDynamoGraphDeployment)
webhook := admission.
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeployment{}, validator).
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeployment{}, observedValidator).
WithRecoverPanic(true)
mgr.GetWebhookServer().Register(dynamoGraphDeploymentWebhookPath, webhook)
return nil
......
......@@ -22,6 +22,8 @@ import (
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
......@@ -126,10 +128,13 @@ func (h *DynamoGraphDeploymentRequestHandler) ValidateDelete(ctx context.Context
// The handler is automatically wrapped with LeaseAwareValidator to add namespace exclusion logic
// and ObservedValidator to add metrics collection.
func (h *DynamoGraphDeploymentRequestHandler) RegisterWithManager(mgr manager.Manager) error {
// Wrap the handler with lease-aware logic for cluster-wide coordination
validator := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())
leaseAwareValidator := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())
// Wrap with metrics collection
observedValidator := observability.NewObservedValidator(leaseAwareValidator, consts.ResourceTypeDynamoGraphDeploymentRequest)
webhook := admission.
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}, validator).
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}, observedValidator).
WithRecoverPanic(true)
mgr.GetWebhookServer().Register(dynamoGraphDeploymentRequestWebhookPath, webhook)
return nil
......
......@@ -22,6 +22,8 @@ import (
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/observability"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/operator/internal/webhook"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
......@@ -121,10 +123,13 @@ func (h *DynamoModelHandler) ValidateDelete(ctx context.Context, obj runtime.Obj
// The handler is automatically wrapped with LeaseAwareValidator to add namespace exclusion logic
// and ObservedValidator to add metrics collection.
func (h *DynamoModelHandler) RegisterWithManager(mgr manager.Manager) error {
// Wrap the handler with lease-aware logic for cluster-wide coordination
validator := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())
leaseAwareValidator := internalwebhook.NewLeaseAwareValidator(h, internalwebhook.GetExcludedNamespaces())
// Wrap with metrics collection
observedValidator := observability.NewObservedValidator(leaseAwareValidator, consts.ResourceTypeDynamoModel)
webhook := admission.
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoModel{}, validator).
WithCustomValidator(mgr.GetScheme(), &nvidiacomv1alpha1.DynamoModel{}, observedValidator).
WithRecoverPanic(true)
mgr.GetWebhookServer().Register(dynamoModelWebhookPath, webhook)
return nil
......
......@@ -6,3 +6,4 @@ Observability
Metrics <../kubernetes/observability/metrics>
Logging <../kubernetes/observability/logging>
Operator Metrics <../kubernetes/observability/operator-metrics>
......@@ -130,6 +130,32 @@ For complete documentation on webhooks, certificate management, and troubleshoot
**📖 [Webhooks Guide](./webhooks.md)**
## Observability
The Dynamo Operator provides comprehensive observability through Prometheus metrics and Grafana dashboards. This allows you to monitor:
- **Controller Performance**: Reconciliation loop duration, success rates, and error rates by resource type
- **Webhook Activity**: Validation performance, admission rates, and denial patterns
- **Resource Inventory**: Current count of managed resources by state and namespace
- **Operational Health**: Success rates and health indicators for controllers and webhooks
### Metrics Collection
Metrics are automatically exposed on the operator's `/metrics` endpoint (port 8443 by default) and collected by Prometheus via a ServiceMonitor. The ServiceMonitor is automatically created when you install the operator via Helm (controlled by `metricsService.enabled`, which defaults to `true`).
### Grafana Dashboard
A pre-built Grafana dashboard is available for visualizing operator metrics. The dashboard includes:
- **Reconciliation Metrics**: Rate, duration (P95), and errors by resource type
- **Webhook Metrics**: Request rate, duration (P95), and denials by resource type and operation
- **Resource Inventory**: Count of managed resources by state and namespace, filterable by resource type
- **Operational Health**: Success rate gauges for controllers and webhooks
For complete setup instructions and metrics reference, see:
**📖 [Operator Metrics Guide](./observability/operator-metrics.md)**
## Installation
### Quick Install with Helm
......
......@@ -172,3 +172,9 @@ Visit http://localhost:3000 and log in with the credentials captured above.
Once logged in, find the Dynamo dashboard under General.
![Grafana dashboard showing Dynamo metrics](../../images/grafana-k8s.png)
## Operator Metrics
> **Note:** The metrics described above are for Dynamo **applications** (frontends, workers). The Dynamo **Operator** itself also exposes metrics for monitoring controller reconciliation, webhook validation, and resource inventory.
>
> See the **[Operator Metrics Guide](operator-metrics.md)** for details on operator-specific metrics and the operator dashboard.
# Dynamo Operator Metrics
## Overview
The Dynamo Operator exposes Prometheus metrics for monitoring its own health and performance. These metrics are separate from application metrics (frontend/worker) and provide visibility into:
- **Controller Reconciliation**: How efficiently controllers process DynamoGraphDeployments, DynamoComponentDeployments, DynamoModels, DynamoGraphDeploymentRequests, and DynamoGraphDeploymentScalingAdapters
- **Webhook Validation**: Performance and outcomes of admission webhook requests
- **Resource Inventory**: Current count of managed resources by state and namespace
## Prerequisites
The operator metrics feature requires the same monitoring infrastructure as application metrics. For detailed setup instructions, see the [Kubernetes Metrics Guide](./metrics.md#prerequisites).
**Quick checklist:**
- ✅ kube-prometheus-stack installed (for ServiceMonitor support)
- ✅ Prometheus and Grafana running
- ✅ Dynamo Operator installed via Helm
## Metrics Collection
### ServiceMonitor
Operator metrics are automatically collected via a ServiceMonitor, which is created by the Helm chart when `metricsService.enabled: true` (default).
**Unlike application metrics** (which use PodMonitor), the operator uses ServiceMonitor and requires no manual RBAC configuration. The operator's kube-rbac-proxy sidecar is configured with `--ignore-paths=/metrics` to allow Prometheus access.
To verify the ServiceMonitor is created:
```bash
kubectl get servicemonitor -n dynamo-system
```
### Disabling Metrics Collection
To disable operator metrics collection:
```bash
helm upgrade dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz \
--namespace dynamo-system \
--set dynamo-operator.metricsService.enabled=false
```
## Available Metrics
All metrics use the `dynamo_operator` namespace prefix.
### Reconciliation Metrics
| Metric | Type | Labels | Description |
|--------|------|--------|-------------|
| `dynamo_operator_reconcile_duration_seconds` | Histogram | `resource_type`, `namespace`, `result` | Duration of reconciliation loops |
| `dynamo_operator_reconcile_total` | Counter | `resource_type`, `namespace`, `result` | Total number of reconciliations |
| `dynamo_operator_reconcile_errors_total` | Counter | `resource_type`, `namespace`, `error_type` | Total reconciliation errors by type |
**Labels:**
- `resource_type`: `DynamoGraphDeployment`, `DynamoComponentDeployment`, `DynamoModel`, `DynamoGraphDeploymentRequest`, `DynamoGraphDeploymentScalingAdapter`
- `namespace`: Target namespace of the resource
- `result`: `success`, `error`, `requeue`
- `error_type`: `not_found`, `already_exists`, `conflict`, `validation`, `bad_request`, `unauthorized`, `forbidden`, `timeout`, `server_timeout`, `unavailable`, `rate_limited`, `internal`
### Webhook Metrics
| Metric | Type | Labels | Description |
|--------|------|--------|-------------|
| `dynamo_operator_webhook_duration_seconds` | Histogram | `resource_type`, `operation` | Duration of webhook validation requests |
| `dynamo_operator_webhook_requests_total` | Counter | `resource_type`, `operation`, `result` | Total webhook admission requests |
| `dynamo_operator_webhook_denials_total` | Counter | `resource_type`, `operation`, `reason` | Total webhook denials with reasons |
**Labels:**
- `resource_type`: Same as reconciliation metrics
- `operation`: `CREATE`, `UPDATE`, `DELETE`
- `result`: `allowed`, `denied`
- `reason`: Validation failure reason (e.g., `immutable_field_changed`, `invalid_config`)
### Resource Inventory Metrics
| Metric | Type | Labels | Description |
|--------|------|--------|-------------|
| `dynamo_operator_resources_total` | Gauge | `resource_type`, `namespace`, `status` | Current count of resources by state |
**Labels:**
- `resource_type`: `DynamoGraphDeployment`, `DynamoComponentDeployment`, `DynamoModel`, `DynamoGraphDeploymentRequest`, `DynamoGraphDeploymentScalingAdapter`
- `namespace`: Resource namespace
- `status`: Resource state derived from each CRD's status. Common values:
- `"ready"` - Resource is healthy and operational (DCD, DM, DGDSA)
- `"not_ready"` - Resource exists but is not operational (DCD, DM, DGDSA)
- `"unknown"` - State cannot be determined (default for empty status)
- DGD uses: `"pending"`, `"successful"`, `"failed"` from `.status.state`
- DGDR uses: `"Pending"`, `"Profiling"`, `"Deploying"`, `"Ready"`, `"DeploymentDeleted"`, `"Failed"` from `.status.state`
## Example Queries
### Reconciliation Performance
```promql
# P95 reconciliation duration by resource type
histogram_quantile(0.95,
sum by (resource_type, le) (
rate(dynamo_operator_reconcile_duration_seconds_bucket[5m])
)
)
# Reconciliation rate by result
sum by (resource_type, result) (
rate(dynamo_operator_reconcile_total[5m])
)
# Error rate by type
sum by (resource_type, error_type) (
rate(dynamo_operator_reconcile_errors_total[5m])
)
```
### Webhook Performance
```promql
# Webhook P95 latency
histogram_quantile(0.95,
sum by (resource_type, le) (
rate(dynamo_operator_webhook_duration_seconds_bucket[5m])
)
)
# Webhook denial rate
sum by (resource_type, operation, reason) (
rate(dynamo_operator_webhook_denials_total[5m])
)
```
### Resource Inventory
```promql
# Total resources by type and state
sum by (resource_type, status) (
dynamo_operator_resources_total
)
# DynamoGraphDeployments by state
sum by (status) (
dynamo_operator_resources_total{resource_type="DynamoGraphDeployment"}
)
# All resources by namespace and state
sum by (resource_type, namespace, status) (
dynamo_operator_resources_total
)
```
## Grafana Dashboard
A pre-built Grafana dashboard is available for visualizing operator metrics.
### Dashboard Sections
1. **Reconciliation Metrics** (3 panels)
- Reconciliation rate by resource type and result
- P95 reconciliation duration
- Reconciliation errors by type
2. **Webhook Metrics** (3 panels)
- Webhook request rate by operation
- P95 webhook duration
- Webhook denials by reason
3. **Resource Inventory** (2 panels)
- Resource inventory timeline by state and namespace (filterable by resource type)
- Current resource count by state (filterable by resource type)
4. **Operational Health** (2 panels)
- Reconciliation success rate gauges
- Webhook admission success rate gauges
### Deploying the Dashboard
```bash
kubectl apply -f deploy/observability/k8s/grafana-operator-dashboard-configmap.yaml
```
The dashboard will automatically appear in Grafana (assuming you have the Grafana dashboard sidecar configured, which is included in kube-prometheus-stack).
### Finding the Dashboard
1. Port-forward to Grafana (if needed):
```bash
kubectl port-forward svc/prometheus-grafana 3000:80 -n monitoring
```
2. Log in to Grafana at http://localhost:3000
3. Navigate to **Dashboards** → Search for **"Dynamo Operator"**
### Dashboard Filters
The dashboard includes two filter variables:
- **Namespace**: View metrics across all namespaces or filter by specific ones (multi-select)
- **Resource Type**: Filter all panels by resource type or select "All" to see aggregated metrics across all CRDs (single select)
When "All" is selected for Resource Type, every panel shows data for all five managed CRDs, distinguished by the `resource_type` label.
## Accessing Metrics Directly
For instructions on accessing Prometheus and Grafana, see the [Kubernetes Metrics Guide](./metrics.md#viewing-the-metrics).
Once you have access to Prometheus, you can query operator metrics directly:
```bash
# Port-forward to Prometheus
kubectl port-forward svc/prometheus-kube-prometheus-prometheus 9090:9090 -n monitoring
# Visit http://localhost:9090 and try queries like:
# - dynamo_operator_reconcile_total
# - dynamo_operator_webhook_requests_total
# - dynamo_operator_resources_total
```
## Troubleshooting
### Metrics Not Appearing in Prometheus
1. **Check ServiceMonitor exists:**
```bash
kubectl get servicemonitor -n dynamo-system | grep operator
```
2. **Check ServiceMonitor is discovered by Prometheus:**
- Go to Prometheus UI → Status → Targets
- Look for `serviceMonitor/dynamo-system/dynamo-platform-dynamo-operator-operator`
- Should show state: `UP`
3. **Check Prometheus selector configuration:**
```bash
kubectl get prometheus -n monitoring -o yaml | grep serviceMonitorSelector
```
Ensure `serviceMonitorSelectorNilUsesHelmValues: false` was set during kube-prometheus-stack installation.
### Dashboard Not Appearing in Grafana
1. **Check ConfigMap is created:**
```bash
kubectl get configmap -n monitoring grafana-operator-dashboard
```
2. **Check ConfigMap has the label:**
```bash
kubectl get configmap -n monitoring grafana-operator-dashboard -o jsonpath='{.metadata.labels.grafana_dashboard}'
```
Should return `1` (kubectl's jsonpath output prints the label value without quotes).
3. **Check Grafana dashboard sidecar configuration:**
```bash
kubectl get deployment -n monitoring prometheus-grafana -o yaml | grep -A 5 sidecar
```
The sidecar should be configured to watch for `grafana_dashboard: "1"` label.
4. **Restart Grafana pod** to force dashboard refresh:
```bash
kubectl rollout restart deployment/prometheus-grafana -n monitoring
```
## Related Documentation
- [Kubernetes Metrics Guide](./metrics.md) - Application metrics for frontends and workers
- [Dynamo Operator Guide](../dynamo_operator.md) - Operator architecture and deployment modes
- [Operator Webhooks](../webhooks.md) - Webhook validation details
......@@ -37,6 +37,7 @@ For detailed setup instructions and configuration, see [Prometheus + Grafana Set
| Guide | Description | Environment Variables to Control |
|-------|-------------|----------------------------------|
| [Metrics](metrics.md) | Available metrics reference | `DYN_SYSTEM_PORT`† |
| [Operator Metrics (Kubernetes)](../kubernetes/observability/operator-metrics.md) | Operator controller and webhook metrics for Kubernetes | N/A (configured via Helm) |
| [Health Checks](health-checks.md) | Component health monitoring and readiness probes | `DYN_SYSTEM_PORT`†, `DYN_SYSTEM_STARTING_HEALTH_STATUS`, `DYN_SYSTEM_HEALTH_PATH`, `DYN_SYSTEM_LIVE_PATH`, `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` |
| [Tracing](tracing.md) | Distributed tracing with OpenTelemetry and Tempo | `DYN_LOGGING_JSONL`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`†, `OTEL_SERVICE_NAME`† |
| [Logging](logging.md) | Structured logging configuration | `DYN_LOGGING_JSONL`†, `DYN_LOG`, `DYN_LOG_USE_LOCAL_TZ`, `DYN_LOGGING_CONFIG_PATH`, `OTEL_SERVICE_NAME`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`† |
......@@ -53,6 +54,8 @@ For detailed setup instructions and configuration, see [Prometheus + Grafana Set
For Kubernetes-specific setup and configuration, see [docs/kubernetes/observability/](../kubernetes/observability/).
**Operator Metrics**: The Dynamo Operator running in Kubernetes exposes its own set of metrics for monitoring controller reconciliation, webhook validation, and resource inventory. See the [Operator Metrics Guide](../kubernetes/observability/operator-metrics.md).
---
## Topology
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment