Unverified Commit afccc9d4 authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

refactor: consolidate Observability files (e.g. OTEL docker-compose, md files) (#4173)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent 3577b5c1
......@@ -101,9 +101,8 @@ To coordinate across a data center, Dynamo relies on etcd and NATS. To run Dynam
To quickly setup etcd & NATS, you can also run:
```
```bash
# At the root of the repository:
# Edit deploy/docker-compose.yml to comment out "runtime: nvidia" of the dcgm-exporter service if the nvidia container runtime isn't deployed or to be used.
docker compose -f deploy/docker-compose.yml up -d
```
......
......@@ -204,7 +204,7 @@ def setup_prometheus_registry(
SGLang uses multiprocess architecture where metrics are stored in shared memory.
MultiProcessCollector aggregates metrics from all worker processes. The Prometheus
registry collects sglang:* metrics which are exposed via the metrics server endpoint
(typically port 8081) when DYN_SYSTEM_ENABLED=true.
(set DYN_SYSTEM_PORT to a positive value to enable, e.g., DYN_SYSTEM_PORT=8081).
Args:
engine: The SGLang engine instance.
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# IMPORT NOTE: Make sure this is in sync with lib/runtime/docker-compose.yml
# Bare minimum infrastructure services for Dynamo.
# For observability (metrics, tracing, dashboards), use docker-observability.yml
networks:
server:
driver: bridge
monitoring:
driver: bridge
# Note that the images are pinned to specific versions to avoid breaking changes.
services:
nats-server:
image: nats:2.11.4
......@@ -31,7 +18,6 @@ services:
- 8222:8222 # the endpoints include /varz, /healthz, ...
networks:
- server
- monitoring
etcd-server:
image: bitnamilegacy/etcd:3.6.1
......@@ -42,108 +28,3 @@ services:
- 2380:2380
networks:
- server
- monitoring
# All the services below are part of the metrics profile and monitoring network.
# The exporter translates from /varz and other stats to Prometheus metrics
nats-prometheus-exporter:
image: natsio/prometheus-nats-exporter:0.17.3
command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"]
ports:
- 7777:7777
networks:
- monitoring
profiles: [metrics]
depends_on:
- nats-server
# DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
# dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
dcgm-exporter:
image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
ports:
# Expose dcgm-exporter on port 9401 both inside and outside the container
# to avoid conflicts with other dcgm-exporter instances in distributed environments.
# To access DCGM metrics:
# Outside the container: curl http://localhost:9401/metrics (or the host IP)
# Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
- 9401:9401
cap_add:
- SYS_ADMIN
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
environment:
# dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES
- NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
- DCGM_EXPORTER_LISTEN=:9401
runtime: nvidia # Specify the NVIDIA runtime
networks:
- monitoring
# To access Prometheus from another machine, you may need to disable te firewall on your host. On Ubuntu:
# sudo ufw allow 9090/tcp
prometheus:
image: prom/prometheus:v3.4.1
container_name: prometheus
volumes:
- ./metrics/prometheus.yml:/etc/prometheus/prometheus.yml
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
# These provide the web console functionality
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
restart: unless-stopped
# Example to pull from the /query endpoint:
# {__name__=~"DCGM.*", job="dcgm-exporter"}
networks:
- monitoring
ports:
- "9090:9090"
profiles: [metrics]
extra_hosts:
- "host.docker.internal:host-gateway"
depends_on:
- dcgm-exporter
- nats-prometheus-exporter
- etcd-server
# grafana connects to prometheus via the /query endpoint.
# Default credentials are dynamo/dynamo.
# To access Grafana from another machine, you may need to disable te firewall on your host. On Ubuntu:
# sudo ufw allow 3001/tcp
grafana:
image: grafana/grafana-enterprise:12.0.1
container_name: grafana
volumes:
- ./metrics/grafana_dashboards:/etc/grafana/provisioning/dashboards
- ./metrics/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
environment:
- GF_SERVER_HTTP_PORT=3001
# do not make it admin/admin, because you will be prompted to change the password every time
- GF_SECURITY_ADMIN_USER=dynamo
- GF_SECURITY_ADMIN_PASSWORD=dynamo
- GF_USERS_ALLOW_SIGN_UP=false
- GF_INSTALL_PLUGINS=grafana-piechart-panel
# Default min interval is 5s, but can be configured lower
- GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
# Disable password change requirement
- GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
- GF_SECURITY_ADMIN_PASSWORD_POLICY=false
- GF_AUTH_DISABLE_LOGIN_FORM=false
- GF_AUTH_DISABLE_SIGNOUT_MENU=false
restart: unless-stopped
ports:
- "3001:3001"
networks:
- monitoring
profiles: [metrics]
depends_on:
- prometheus
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Observability stack for Dynamo: metrics, tracing, and visualization.
# Requires deploy/docker-compose.yml to be running for NATS and etcd connectivity.
#
# Usage:
# docker compose -f deploy/docker-observability.yml up -d
version: '3.8'
networks:
server:
external: true
name: deploy_server
volumes:
grafana-data:
tempo-data:
services:
# DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
# dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
dcgm-exporter:
image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
ports:
# Expose dcgm-exporter on port 9401 both inside and outside the container
# to avoid conflicts with other dcgm-exporter instances in distributed environments.
# To access DCGM metrics:
# Outside the container: curl http://localhost:9401/metrics (or the host IP)
# Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
- 9401:9401
cap_add:
- SYS_ADMIN
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
environment:
# dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES
- NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
- DCGM_EXPORTER_LISTEN=:9401
runtime: nvidia # Specify the NVIDIA runtime
networks:
- server
# The exporter translates from /varz and other stats to Prometheus metrics
nats-prometheus-exporter:
image: natsio/prometheus-nats-exporter:0.17.3
command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"]
ports:
- 7777:7777
networks:
- server
# To access Prometheus from another machine, you may need to disable te firewall on your host. On Ubuntu:
# sudo ufw allow 9090/tcp
prometheus:
image: prom/prometheus:v3.4.1
container_name: prometheus
volumes:
- ./observability/prometheus.yml:/etc/prometheus/prometheus.yml
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
# These provide the web console functionality
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
restart: unless-stopped
# Example to pull from the /query endpoint:
# {__name__=~"DCGM.*", job="dcgm-exporter"}
ports:
- "9090:9090"
networks:
- server
extra_hosts:
- "host.docker.internal:host-gateway"
depends_on:
- dcgm-exporter
- nats-prometheus-exporter
# Tempo - Distributed tracing backend
tempo:
image: grafana/tempo:2.8.2
command: [ "-config.file=/etc/tempo.yaml" ]
user: root
volumes:
- ./observability/tempo.yaml:/etc/tempo.yaml
- tempo-data:/tmp/tempo
ports:
- "3200:3200" # Tempo HTTP
- "4317:4317" # OTLP gRPC receiver (accessible from host)
- "4318:4318" # OTLP HTTP receiver (accessible from host)
networks:
- server
# Grafana - Visualization and dashboards
# Supports both Prometheus (metrics) and Tempo (tracing) datasources
# Default credentials: dynamo/dynamo
# To access Grafana from another machine, you may need to disable te firewall on your host. On Ubuntu:
# sudo ufw allow 3000/tcp
grafana:
image: grafana/grafana:12.2.0
container_name: grafana
volumes:
- grafana-data:/var/lib/grafana
- ./observability/grafana_dashboards:/etc/grafana/provisioning/dashboards
- ./observability/grafana-datasources.yml:/etc/grafana/provisioning/datasources/prometheus.yml
- ./observability/tempo-datasource.yml:/etc/grafana/provisioning/datasources/tempo.yml
environment:
- GF_SERVER_HTTP_PORT=3000
# do not make it admin/admin, because you will be prompted to change the password every time
- GF_SECURITY_ADMIN_USER=dynamo
- GF_SECURITY_ADMIN_PASSWORD=dynamo
- GF_USERS_ALLOW_SIGN_UP=false
- GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
- GF_INSTALL_PLUGINS=grafana-piechart-panel
# Default min interval is 5s, but can be configured lower
- GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
# Disable password change requirement
- GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
- GF_SECURITY_ADMIN_PASSWORD_POLICY=false
- GF_AUTH_DISABLE_LOGIN_FORM=false
- GF_AUTH_DISABLE_SIGNOUT_MENU=false
restart: unless-stopped
ports:
- "3000:3000"
networks:
- server
depends_on:
- prometheus
- tempo
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-frontend-metrics
namespace: ${NAMESPACE}
spec:
selector:
matchLabels:
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "frontend"
podMetricsEndpoints:
- port: http
path: /metrics
interval: 5s
relabelings:
- action: replace
sourceLabels:
- __meta_kubernetes_pod_label_nvidia_com_dynamo_namespace
targetLabel: dynamo_namespace
namespaceSelector:
matchNames:
- ${NAMESPACE}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-planner-metrics
namespace: $NAMESPACE
spec:
selector:
matchLabels:
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "planner"
podMetricsEndpoints:
- port: metrics
path: /metrics
interval: 5s
namespaceSelector:
matchNames:
- $NAMESPACE
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-worker-metrics
namespace: ${NAMESPACE}
spec:
selector:
matchLabels:
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "worker"
podMetricsEndpoints:
- port: system
path: /metrics
interval: 5s
namespaceSelector:
matchNames:
- ${NAMESPACE}
# Dynamo Observability
For detailed documentation on Observability (Prometheus metrics, tracing, and logging), please refer to [docs/observability/](../../docs/observability/).
# Example Grafana Dashboards
This directory contains example Grafana dashboards for Dynamo observability. These are starter files that you can use as references for building your own custom dashboards.
- `dynamo.json` - General Dynamo dashboard showing software and hardware metrics
- `dcgm-metrics.json` - GPU metrics dashboard using DCGM exporter data
- `kvbm.json` - KV Block Manager metrics dashboard
- `temp-loki.json` - Logging dashboard for Loki integration
- `dashboard-providers.yml` - Configuration file for dashboard provisioning
For setup instructions and usage, see [Observability Documentation](../../../docs/observability/).
......@@ -15,19 +15,7 @@
}
]
},
"copyright": [
"SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.",
"SPDX-License-Identifier: Apache-2.0",
"Licensed under the Apache License, Version 2.0 (the \"License\");",
"you may not use this file except in compliance with the License.",
"You may obtain a copy of the License at",
"http://www.apache.org/licenses/LICENSE-2.0",
"Unless required by applicable law or agreed to in writing, software",
"distributed under the License is distributed on an \"AS IS\" BASIS,",
"WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.",
"See the License for the specific language governing permissions and",
"limitations under the License."
],
"_copyright": "SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
......
......@@ -1020,7 +1020,7 @@
},
"timepicker": {},
"timezone": "browser",
"title": "Dynamo Dashboard",
"title": "Dynamo Dashboard (generic)",
"uid": "97ae8df9-138a-4f7a-9b0f-635b77d818fe",
"version": 1
}
\ No newline at end of file
......@@ -1002,7 +1002,7 @@ data:
},
"timepicker": {},
"timezone": "browser",
"title": "Dynamo Dashboard",
"title": "Dynamo Dashboard (generic)",
"uid": "dynamo-dashboard",
"version": 1
}
# Dynamo Logging on Kubernetes
For detailed documentation on collecting and visualizing logs on Kubernetes, see [docs/kubernetes/observability/logging.md](../../docs/kubernetes/observability/logging.md).
For detailed documentation on collecting and visualizing logs on Kubernetes, see [docs/kubernetes/observability/logging.md](../../../../docs/kubernetes/observability/logging.md).
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"links": [],
"panels": [
{
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"fieldConfig": {
"defaults": {},
"overrides": []
},
"gridPos": {
"h": 21,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"dedupStrategy": "none",
"enableInfiniteScrolling": false,
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": false,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"pluginVersion": "12.1.0",
"targets": [
{
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"direction": "backward",
"editorMode": "builder",
"expr": "{namespace=~\"$namespace\", nvidia_com_dynamo_graph_deployment_name=~\"$dynamographdeployment\", nvidia_com_dynamo_component_type=~\"$component\"} |= \"$search\" |= \"$trace_id\"",
"queryType": "range",
"refId": "A"
}
],
"title": "DynamoGraph Logs",
"type": "logs"
}
],
"preload": false,
"schemaVersion": 41,
"tags": ["dynamograph", "logs"],
"templating": {
"list": [
{
"current": {
"selected": true,
"text": "Loki",
"value": "Loki"
},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "datasource",
"options": [],
"query": "loki",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": ["All"],
"value": ["$__all"]
},
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"definition": "label_values(namespace)",
"hide": 0,
"includeAll": true,
"label": "Namespace",
"multi": true,
"name": "namespace",
"options": [],
"query": "label_values(namespace)",
"refresh": 1,
"regex": ".+",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": ["All"],
"value": ["$__all"]
},
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"definition": "label_values(nvidia_com_dynamo_graph_deployment_name)",
"hide": 0,
"includeAll": true,
"label": "DynamoGraph Deployment",
"multi": true,
"name": "dynamographdeployment",
"options": [],
"query": "label_values(nvidia_com_dynamo_graph_deployment_name)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": ["All"],
"value": ["$__all"]
},
"datasource": {
"type": "loki",
"uid": "$datasource"
},
"definition": "label_values(nvidia_com_dynamo_component_type)",
"hide": 0,
"includeAll": true,
"label": "Component",
"multi": true,
"name": "component",
"options": [],
"query": "label_values(nvidia_com_dynamo_component_type)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {
"selected": true,
"text": "",
"value": ""
},
"label": "Trace ID",
"name": "trace_id",
"options": [
{
"selected": true,
"text": "",
"value": ""
}
],
"query": "",
"type": "textbox"
},
{
"current": {
"selected": true,
"text": "",
"value": ""
},
"label": "Search",
"name": "search",
"options": [
{
"selected": true,
"text": "",
"value": ""
}
],
"query": "",
"type": "textbox"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "DynamoGraph Logs",
"description": "Dashboard for viewing DynamoGraph deployment logs across components and namespaces",
"version": 1
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment