# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Observability stack for Dynamo: metrics, tracing, and visualization.
# Requires deploy/docker-compose.yml to be running for NATS and etcd connectivity.
#
# Usage:
#   docker compose -f deploy/docker-observability.yml up -d

# Compose file format version.
version: '3.8'

networks:
  # Attach to an already-existing network named deploy_server instead of
  # creating a new one (external: true).
  server:
    external: true
    name: deploy_server

# Named volumes holding persistent data for Grafana, Tempo, and Loki.
volumes:
  grafana-data:
  tempo-data:
  loki-data:

services:
  # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
  # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
  # Requires NVIDIA GPU and runtime. Enable with:
  #   docker compose --profile nvidia -f deploy/docker-observability.yml up -d
  dcgm-exporter:
    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
    # Only started when the "nvidia" profile is selected.
    profiles:
      - nvidia
    ports:
      # Expose dcgm-exporter on port 9401 both inside and outside the container
      # to avoid conflicts with other dcgm-exporter instances in distributed environments.
      # To access DCGM metrics:
      # Outside the container: curl http://localhost:9401/metrics (or the host IP)
      # Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
      - "9401:9401"
    cap_add:
      - SYS_ADMIN
    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # dcgm-exporter reads NVIDIA_VISIBLE_DEVICES, while the host usually sets
      # CUDA_VISIBLE_DEVICES; forward the latter and fall back to "all" when unset.
      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
      # Must match the container-side port published above.
      - DCGM_EXPORTER_LISTEN=:9401
    runtime: nvidia  # Specify the NVIDIA runtime
    networks:
      - server

  # The exporter translates from /varz and other stats to Prometheus metrics
  nats-prometheus-exporter:
    image: natsio/prometheus-nats-exporter:0.17.3
    # Exporter flags followed by the NATS monitoring URL to read from
    # (the nats-server host on port 8222).
    command:
      - "-varz"
      - "-connz"
      - "-routez"
      - "-subz"
      - "-gatewayz"
      - "-leafz"
      - "-jsz=all"
      - "http://nats-server:8222"
    ports:
      - "7777:7777"
    networks:
      - server

  # To access Prometheus from another machine, you may need to open port 9090 in
  # your host firewall. On Ubuntu:
  # sudo ufw allow 9090/tcp
  prometheus:
    image: prom/prometheus:v3.4.1
    container_name: prometheus
    volumes:
      # Scrape configuration, mounted from the repository.
      - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # These provide the web console functionality
      # NOTE(review): confirm the prom/prometheus v3 image still ships files at these paths.
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
    restart: unless-stopped
    # Example to pull from the /query endpoint:
    # {__name__=~"DCGM.*", job="dcgm-exporter"}
    ports:
      - "9090:9090"
    networks:
      - server
    # Make the Docker host resolvable as host.docker.internal from inside the container.
    extra_hosts:
      - "host.docker.internal:host-gateway"
    depends_on:
      - nats-prometheus-exporter

  # Loki - Log aggregation backend
  loki:
    image: grafana/loki:3.5.0
    command: ["-config.file=/etc/loki.yaml"]
    # NOTE(review): presumably runs as root so it can write to the loki-data volume - confirm.
    user: root
    volumes:
      # Configuration file from the repository, plus the named data volume.
      - ./observability/loki.yaml:/etc/loki.yaml
      - loki-data:/loki
    ports:
      - "3100:3100"   # Loki HTTP API (push/query)
    networks:
      - server

  # Tempo - Distributed tracing backend
  # Note: OTLP ports (4317/4318) are internal-only; the OTel Collector handles host-facing ingestion.
  tempo:
    image: grafana/tempo:2.8.2
    command: ["-config.file=/etc/tempo.yaml"]
    # NOTE(review): presumably runs as root so it can write to the tempo-data volume - confirm.
    user: root
    volumes:
      # Configuration file from the repository, plus the named data volume.
      - ./observability/tempo.yaml:/etc/tempo.yaml
      - tempo-data:/tmp/tempo
    ports:
      - "3200:3200"   # Tempo HTTP (query API)
    networks:
      - server

  # OpenTelemetry Collector - Unified ingestion point for traces and logs
  # Dynamo services send OTLP to localhost:4317 (gRPC) or localhost:4318 (HTTP).
  # The collector routes traces to Tempo and logs to Loki.
  otel-collector:
    image: otel/opentelemetry-collector:0.120.0
    command: ["--config=/etc/otel-collector.yaml"]
    volumes:
      # Collector pipeline configuration, mounted from the repository.
      - ./observability/otel-collector.yaml:/etc/otel-collector.yaml
    ports:
      - "4317:4317"   # OTLP gRPC receiver (accessible from host)
      - "4318:4318"   # OTLP HTTP receiver (accessible from host)
    networks:
      - server
    # Start the trace and log backends before the collector.
    depends_on:
      - tempo
      - loki

  # Grafana - Visualization and dashboards
  # Provisions Prometheus (metrics), Tempo (tracing), and Loki (logs) datasources
  # Default credentials: dynamo/dynamo
  # To access Grafana from another machine, you may need to open port 3000 in
  # your host firewall. On Ubuntu:
  # sudo ufw allow 3000/tcp
  grafana:
    image: grafana/grafana:12.2.0
    container_name: grafana
    volumes:
      # Persistent Grafana state.
      - grafana-data:/var/lib/grafana
      # Provisioned dashboards and one datasource definition per backend.
      - ./observability/grafana_dashboards:/etc/grafana/provisioning/dashboards
      - ./observability/grafana-datasources.yml:/etc/grafana/provisioning/datasources/prometheus.yml
      - ./observability/tempo-datasource.yml:/etc/grafana/provisioning/datasources/tempo.yml
      - ./observability/loki-datasource.yml:/etc/grafana/provisioning/datasources/loki.yml
    environment:
      - GF_SERVER_HTTP_PORT=3000
      # do not make it admin/admin, because you will be prompted to change the password every time
      - GF_SECURITY_ADMIN_USER=dynamo
      - GF_SECURITY_ADMIN_PASSWORD=dynamo
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      # Default min interval is 5s, but can be configured lower
      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
      # Security/auth toggles, all explicitly set to false.
      # NOTE(review): originally labelled "Disable password change requirement";
      # confirm each variable is recognised by this Grafana version and has that effect.
      - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
      - GF_SECURITY_ADMIN_PASSWORD_POLICY=false
      - GF_AUTH_DISABLE_LOGIN_FORM=false
      - GF_AUTH_DISABLE_SIGNOUT_MENU=false
    restart: unless-stopped
    ports:
      - "3000:3000"
    networks:
      - server
    # Start every datasource backend before Grafana.
    depends_on:
      - prometheus
      - tempo
      - loki