Unverified Commit 700d345d authored by mohammedabdulwahhab's avatar mohammedabdulwahhab Committed by GitHub
Browse files

feat: add reference setup for dynamo logging in k8s with loki (#2699)


Signed-off-by: default avatarmohammedabdulwahhab <furkhan324@berkeley.edu>
Co-authored-by: default avatarcoderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent 44ecfda5
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Aggregated SGLang DynamoGraphDeployment (one frontend, one worker) with
# structured JSONL logging switched on via DYN_LOGGING_JSONL.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sglang-agg
spec:
  # Environment set at the deployment level; "1" enables JSONL log output.
  envs:
    - name: DYN_LOGGING_JSONL
      value: "1"
  services:
    # Frontend service.
    Frontend:
      dynamoNamespace: sglang-agg
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          # Placeholder image reference; replace with your own registry/tag.
          image: my-registry/sglang-runtime:my-tag
    # Single-GPU SGLang worker.
    SGLangDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: sglang-agg
      componentType: worker
      replicas: 1
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          # Placeholder image reference; replace with your own registry/tag.
          image: my-registry/sglang-runtime:my-tag
          workingDir: /workspace/components/backends/sglang
          command: ["/bin/sh", "-c"]
          args:
            # Folded scalar: the lines below are joined into one shell command.
            - >-
              python3 -m dynamo.sglang
              --model-path Qwen/Qwen3-0.6B
              --served-model-name Qwen/Qwen3-0.6B
              --page-size 16
              --tp 1
              --trust-remote-code
              --skip-tokenizer-init
\ No newline at end of file
# Dynamo Logging on Kubernetes
For detailed documentation on collecting and visualizing logs on Kubernetes, see [docs/guides/dynamo_deploy/logging.md](../../docs/guides/dynamo_deploy/logging.md).
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {},
"overrides": []
},
"gridPos": {
"h": 21,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"dedupStrategy": "none",
"enableInfiniteScrolling": false,
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"pluginVersion": "12.1.0",
"targets": [
{
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"direction": "backward",
"editorMode": "builder",
"expr": "{namespace=~\"$namespace\", nvidia_com_dynamo_graph_deployment_name=~\"$dynamographdeployment\", nvidia_com_dynamo_component_type=~\"$component\"} |~ \"(?i)$search\" |~ \"(?i)$trace_id\"",
"queryType": "range",
"refId": "A"
}
],
"title": "DynamoGraph Logs",
"type": "logs"
}
],
"preload": false,
"schemaVersion": 41,
"tags": ["dynamograph", "logs"],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "Loki",
"value": "Loki"
},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "datasource",
"options": [],
"query": "loki",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"current": {
"selected": true,
"text": ["All"],
"value": ["$__all"]
},
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"definition": "label_values(namespace)",
"hide": 0,
"includeAll": true,
"label": "Namespace",
"multi": true,
"name": "namespace",
"options": [],
"query": "label_values(namespace)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {
"selected": true,
"text": ["All"],
"value": ["$__all"]
},
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"definition": "label_values(nvidia_com_dynamo_graph_deployment_name)",
"hide": 0,
"includeAll": true,
"label": "DynamoGraph Deployment",
"multi": true,
"name": "dynamographdeployment",
"options": [],
"query": "label_values(nvidia_com_dynamo_graph_deployment_name)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {
"selected": true,
"text": ["All"],
"value": ["$__all"]
},
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"definition": "label_values(nvidia_com_dynamo_component_type)",
"hide": 0,
"includeAll": true,
"label": "Component",
"multi": true,
"name": "component",
"options": [],
"query": "label_values(nvidia_com_dynamo_component_type)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {
"selected": true,
"text": "",
"value": ""
},
"label": "Trace ID",
"name": "trace_id",
"options": [
{
"selected": true,
"text": "",
"value": ""
}
],
"query": "",
"type": "textbox"
},
{
"current": {
"selected": true,
"text": "",
"value": ""
},
"label": "Search",
"name": "search",
"options": [
{
"selected": true,
"text": "",
"value": ""
}
],
"query": "",
"type": "textbox"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "DynamoGraph Logs",
"description": "Dashboard for viewing DynamoGraph deployment logs across components and namespaces",
"version": 1
}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
kind: ConfigMap
metadata:
name: dynamo-logs-dashboard
labels:
grafana_dashboard: "1" # This label is important for the Grafana sidecar
data:
dynamo-logs.json: |-
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {},
"overrides": []
},
"gridPos": {
"h": 21,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"dedupStrategy": "none",
"enableInfiniteScrolling": false,
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"pluginVersion": "12.1.0",
"targets": [
{
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"direction": "backward",
"editorMode": "builder",
"expr": "{namespace=~\"$namespace\", nvidia_com_dynamo_graph_deployment_name=~\"$dynamographdeployment\", nvidia_com_dynamo_component_type=~\"$component\"} |~ \"(?i)$search\" |~ \"(?i)$trace_id\"",
"queryType": "range",
"refId": "A"
}
],
"title": "DynamoGraph Logs",
"type": "logs"
}
],
"preload": false,
"schemaVersion": 41,
"tags": ["dynamograph", "logs"],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "Loki",
"value": "Loki"
},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "datasource",
"options": [],
"query": "loki",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"current": {
"selected": true,
"text": ["All"],
"value": ["$__all"]
},
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"definition": "label_values(namespace)",
"hide": 0,
"includeAll": true,
"label": "Namespace",
"multi": true,
"name": "namespace",
"options": [],
"query": "label_values(namespace)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {
"selected": true,
"text": ["All"],
"value": ["$__all"]
},
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"definition": "label_values(nvidia_com_dynamo_graph_deployment_name)",
"hide": 0,
"includeAll": true,
"label": "DynamoGraph Deployment",
"multi": true,
"name": "dynamographdeployment",
"options": [],
"query": "label_values(nvidia_com_dynamo_graph_deployment_name)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {
"selected": true,
"text": ["All"],
"value": ["$__all"]
},
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"definition": "label_values(nvidia_com_dynamo_component_type)",
"hide": 0,
"includeAll": true,
"label": "Component",
"multi": true,
"name": "component",
"options": [],
"query": "label_values(nvidia_com_dynamo_component_type)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {
"selected": true,
"text": "",
"value": ""
},
"label": "Trace ID",
"name": "trace_id",
"options": [
{
"selected": true,
"text": "",
"value": ""
}
],
"query": "",
"type": "textbox"
},
{
"current": {
"selected": true,
"text": "",
"value": ""
},
"label": "Search",
"name": "search",
"options": [
{
"selected": true,
"text": "",
"value": ""
}
],
"query": "",
"type": "textbox"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "DynamoGraph Logs",
"description": "Dashboard for viewing DynamoGraph deployment logs across components and namespaces",
"version": 1
}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# ConfigMap carrying a Grafana datasource provisioning file for Loki.
# $MONITORING_NAMESPACE is a placeholder to be filled in (e.g. with envsubst)
# before the manifest is applied.
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-datasource
  labels:
    # The Grafana sidecar relies on this label to pick up the datasource.
    grafana_datasource: "1"
data:
  loki-datasource.yaml: |-
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        url: http://loki-gateway.$MONITORING_NAMESPACE.svc.cluster.local
        jsonData:
          maxLines: 1000
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Helm values for the Grafana k8s-monitoring chart: collect pod logs from the
# Dynamo namespace and push them to Loki. $MONITORING_NAMESPACE and
# $DYNAMO_NAMESPACE are placeholders that must be substituted (e.g. with
# envsubst) before this file is passed to helm.
cluster:
  name: dynamo-log-collector

# Where collected logs are shipped.
destinations:
  - name: loki
    type: loki
    url: http://loki-gateway.$MONITORING_NAMESPACE.svc.cluster.local/loki/api/v1/push

nodeLogs:
  enabled: false

podLogs:
  enabled: true
  # Read logs through the Kubernetes API rather than from files on the node.
  gatherMethod: kubernetesApi
  collector: alloy-logs
  # Loki label name -> Kubernetes pod label it is populated from.
  labels:
    app_kubernetes_io_name: app.kubernetes.io/name
    nvidia_com_dynamo_component_type: nvidia.com/dynamo-component-type
    nvidia_com_dynamo_graph_deployment_name: nvidia.com/dynamo-graph-deployment-name
  labelsToKeep:
    - "app_kubernetes_io_name"
    - "container"
    - "instance"
    - "job"
    - "level"
    - "namespace"
    - "service_name"
    - "service_namespace"
    - "deployment_environment"
    - "deployment_environment_name"
    - "nvidia_com_dynamo_component_type"
    - "nvidia_com_dynamo_graph_deployment_name"
  structuredMetadata:
    pod: pod  # Set structured metadata "pod" from label "pod"
  namespaces:
    # Quoted so the substituted value is always a string, even when the
    # namespace is all digits or the variable expands to nothing.
    - "$DYNAMO_NAMESPACE"

# Collectors
alloy-singleton:
  enabled: false
alloy-metrics:
  enabled: false
alloy-logs:
  enabled: true
  alloy:
    # Host log directories are not mounted; logs come from the Kubernetes API
    # (see podLogs.gatherMethod).
    mounts:
      varlog: false
      dockercontainers: false
    clustering:
      enabled: true
alloy-profiles:
  enabled: false
alloy-receiver:
  enabled: false
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Helm values for the grafana/loki chart: a single-binary Loki backed by the
# chart's bundled MinIO, intended for development and testing.
loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  schemaConfig:
    configs:
      # Quoted so every YAML loader keeps the date as a string instead of
      # resolving it to a timestamp.
      - from: "2024-04-01"
        store: tsdb
        object_store: s3
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  ingester:
    chunk_encoding: snappy
  tracing:
    enabled: true
  pattern_ingester:
    enabled: true
  limits_config:
    allow_structured_metadata: true
    volume_enabled: true
  ruler:
    enable_api: true
  querier:
    # Default is 4, if you have enough memory and CPU you can increase, reduce if OOMing
    max_concurrent: 4

# Bundled MinIO used as the object store.
minio:
  enabled: true

deploymentMode: SingleBinary

singleBinary:
  replicas: 1
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 2Gi
  extraEnv:
    # Keep a little bit lower than memory limits
    - name: GOMEMLIMIT
      value: 3750MiB

chunksCache:
  # default is 500MB, with limited memory keep this smaller
  writebackSizeLimit: 10MB

# Zero out replica counts of other deployment modes
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0

ingester:
  replicas: 0
querier:
  replicas: 0
queryFrontend:
  replicas: 0
queryScheduler:
  replicas: 0
distributor:
  replicas: 0
compactor:
  replicas: 0
indexGateway:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0
# Dynamo Metrics Collection on Kubernetes
For detailed documentation on collecting and visualizing metrics on Kubernetes, see [docs/guides/dynamo_deploy/k8s_metrics.md](../../../docs/guides/dynamo_deploy/k8s_metrics.md).
For detailed documentation on collecting and visualizing metrics on Kubernetes, see [docs/guides/dynamo_deploy/metrics.md](../../../docs/guides/dynamo_deploy/metrics.md).
......@@ -151,7 +151,7 @@ kubectl get pods -n ${NAMESPACE}
- [TensorRT-LLM Deployments](../../../components/backends/trtllm/deploy/README.md)
3. **Optional:**
- [Set up Prometheus & Grafana](k8s_metrics.md)
- [Set up Prometheus & Grafana](metrics.md)
- [SLA Planner Deployment Guide](sla_planner_deployment.md) (for advanced SLA-aware scheduling and autoscaling)
## Troubleshooting
......
# Log Aggregation in Dynamo on Kubernetes
This guide demonstrates how to set up logging for Dynamo in Kubernetes using Grafana Loki and Alloy. This setup provides a simple reference logging setup that can be followed in Kubernetes clusters including Minikube and MicroK8s.
> [!Note]
> This setup is intended for development and testing purposes. For production environments, please refer to the official documentation for high-availability configurations.
## Components Overview
- **[Grafana Loki](https://grafana.com/oss/loki/)**: Fast and cost-effective Kubernetes-native log aggregation system.
- **[Grafana Alloy](https://grafana.com/oss/alloy/)**: OpenTelemetry collector that replaces Promtail, gathering logs, metrics and traces from Kubernetes pods.
- **[Grafana](https://grafana.com/grafana/)**: Visualization platform for querying and exploring logs.
## Prerequisites
### 1. Dynamo Cloud Kubernetes Operator
This guide assumes you have installed Dynamo Cloud Kubernetes Operator. For more information, see [Dynamo Cloud Operator](./README.md).
### 2. Kube-prometheus
While this guide does not use Prometheus, it assumes Grafana is already installed as part of the kube-prometheus-stack Helm chart. For more information, see [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack).
### 3. Environment Variables
Set the following environment variables:
- `MONITORING_NAMESPACE`: The namespace where Loki is installed
- `DYNAMO_NAMESPACE`: The namespace where Dynamo Cloud Operator is installed
```bash
export MONITORING_NAMESPACE=monitoring
export DYNAMO_NAMESPACE=dynamo-cloud
```
## Installation Steps
### 1. Install Loki
First, we'll install Loki in single binary mode, which is ideal for testing and development:
```bash
# Add the Grafana Helm repository
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
# Install Loki
helm install --values deploy/logging/values/loki-values.yaml loki grafana/loki -n $MONITORING_NAMESPACE
```
Our configuration (`loki-values.yaml`) sets up Loki in a simple configuration that is suitable for testing and development. It uses a local MinIO for storage. The installation pods can be viewed with:
```bash
kubectl get pods -n $MONITORING_NAMESPACE -l app.kubernetes.io/name=loki
```
### 2. Install Grafana Alloy
Next, install the Grafana Alloy collector to gather logs from your Kubernetes cluster and forward them to Loki. Here we use the Helm chart `k8s-monitoring` provided by Grafana to install the collector:
```bash
# Generate a custom values file with the namespace information
envsubst < deploy/logging/values/alloy-values.yaml > alloy-custom-values.yaml
# Install the collector
helm install --values alloy-custom-values.yaml alloy grafana/k8s-monitoring -n $MONITORING_NAMESPACE
```
The values file (`alloy-values.yaml`) includes the following configurations for the collector:
- Destination to forward logs to Loki
- Namespace to collect logs from
- Pod labels to be mapped to Loki labels
- Collection method (kubernetesApi or tailing `/var/log/containers/`)
```yaml
destinations:
- name: loki
type: loki
url: http://loki-gateway.$MONITORING_NAMESPACE.svc.cluster.local/loki/api/v1/push
podLogs:
enabled: true
gatherMethod: kubernetesApi # collect logs from the kubernetes api, rather than /var/log/containers/; friendly for testing and development
collector: alloy-logs
labels:
app_kubernetes_io_name: app.kubernetes.io/name
nvidia_com_dynamo_component_type: nvidia.com/dynamo-component-type
nvidia_com_dynamo_graph_deployment_name: nvidia.com/dynamo-graph-deployment-name
labelsToKeep:
- "app_kubernetes_io_name"
- "container"
- "instance"
- "job"
- "level"
- "namespace"
- "service_name"
- "service_namespace"
- "deployment_environment"
- "deployment_environment_name"
- "nvidia_com_dynamo_component_type" # extract this label from the dynamo graph deployment
- "nvidia_com_dynamo_graph_deployment_name" # extract this label from the dynamo graph deployment
namespaces:
- $DYNAMO_NAMESPACE
```
### 3. Configure Grafana with the Loki datasource and Dynamo Logs dashboard
We will be viewing the logs associated with our DynamoGraphDeployment in Grafana. To do this, we need to configure Grafana with the Loki datasource and Dynamo Logs dashboard.
Since we are using Grafana with the Prometheus Operator, we can simply apply the following ConfigMaps to quickly achieve this configuration.
```bash
# Configure Grafana with the Loki datasource
envsubst < deploy/logging/grafana/loki-datasource.yaml | kubectl apply -n $MONITORING_NAMESPACE -f -
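# Configure Grafana with the Dynamo Logs dashboard.
# Apply this file directly (no envsubst): it contains Grafana template variables such as
# $datasource and $namespace, which envsubst would replace with empty strings.
kubectl apply -n $MONITORING_NAMESPACE -f deploy/logging/grafana/logging-dashboard.yaml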
```
> [!Note]
> If using Grafana installed without the Prometheus Operator, you can manually import the Loki datasource and Dynamo Logs dashboard using the Grafana UI.
### 4. Deploy a DynamoGraphDeployment with JSONL Logging
At this point, we should have everything in place to collect and view logs in our Grafana instance. All that is left is to deploy a DynamoGraphDeployment to collect logs from.
To enable structured logs in a DynamoGraphDeployment, we need to set the `DYN_LOGGING_JSONL` environment variable to `1`. This is done for us in the `agg_logging.yaml` setup for the SGLang backend. We can now deploy the DynamoGraphDeployment with:
```bash
kubectl apply -n $DYNAMO_NAMESPACE -f components/backends/sglang/deploy/agg_logging.yaml
```
Send a few chat completion requests to generate structured logs from the frontend and worker pods of the DynamoGraphDeployment. We are now all set to view the logs in Grafana.
## Viewing Logs in Grafana
Port-forward the Grafana service to access the UI:
```bash
kubectl port-forward svc/prometheus-grafana 3000:80 -n $MONITORING_NAMESPACE
```
If everything is working, under Home > Dashboards > DynamoGraph Logs, you should see a dashboard that can be used to view the logs associated with your DynamoGraphDeployments.
The dashboard enables filtering by namespace, DynamoGraphDeployment, and component type (e.g., frontend, worker), and also provides free-text search and trace ID filters.
\ No newline at end of file
......@@ -31,7 +31,7 @@ Dynamo automatically exposes metrics with the `dynamo_` name prefixes. It also a
**Specialized Component Metrics**: Components can also expose additional metrics specific to their functionality. For example, a `preprocessor` component exposes metrics with the `dynamo_preprocessor_*` prefix. See the [Available Metrics section](../../deploy/metrics/README.md#available-metrics) for details on specialized component metrics.
**Kubernetes Integration**: For comprehensive Kubernetes deployment and monitoring setup, see the [Kubernetes Metrics Guide](dynamo_deploy/k8s_metrics.md). This includes Prometheus Operator setup, metrics collection configuration, and visualization in Grafana.
**Kubernetes Integration**: For comprehensive Kubernetes deployment and monitoring setup, see the [Kubernetes Metrics Guide](dynamo_deploy/metrics.md). This includes Prometheus Operator setup, metrics collection configuration, and visualization in Grafana.
## Metrics Hierarchy
......
......@@ -52,7 +52,8 @@ Quickstart
Quickstart (K8s) <../guides/dynamo_deploy/dynamo_cloud.md>
Dynamo Operator <../guides/dynamo_deploy/dynamo_operator.md>
Metrics <../guides/dynamo_deploy/k8s_metrics.md>
Metrics <../guides/dynamo_deploy/metrics.md>
Logging <../guides/dynamo_deploy/logging.md>
Multinode <../guides/dynamo_deploy/multinode-deployment.md>
Minikube Setup <../guides/dynamo_deploy/minikube.md>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment