# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

networks:
  # Network attached to the metrics stack (exporters, Prometheus, Grafana)
  # as well as to the core servers so they can be scraped.
  monitoring:
    driver: bridge
  # Network attached to the core servers (nats-server and etcd-server).
  server:
    driver: bridge

# Images are pinned to specific versions to avoid breaking changes.
services:
  nats-server:
    image: nats:2.11.4
    # -js enables JetStream, --trace turns on protocol tracing, and
    # -m 8222 serves the HTTP monitoring endpoints on port 8222.
    command: ["-js", "--trace", "-m", "8222"]
    # Port mappings are quoted so they are always read as strings.
    ports:
      - "4222:4222"
      - "6222:6222"
      - "8222:8222"  # the endpoints include /varz, /healthz, ...
    networks:
      - server
      - monitoring

  etcd-server:
    image: bitnami/etcd:3.6.1
    environment:
      # Bitnami image switch that lets etcd run without authentication;
      # intended for local/development use.
      - ALLOW_NONE_AUTHENTICATION=yes
    # Port mappings are quoted so they are always read as strings.
    ports:
      - "2379:2379"  # this port exposes the /metrics endpoint
      - "2380:2380"
    networks:
      - server
      - monitoring

  # All the services below are part of the metrics profile and monitoring network.

  # The exporter translates from /varz and other stats to Prometheus metrics
  nats-prometheus-exporter:
    image: natsio/prometheus-nats-exporter:0.17.3
    # One flag per stats endpoint to export; the final argument is the
    # NATS monitoring URL to scrape (nats-server's port 8222).
    command:
      - '-varz'
      - '-connz'
      - '-routez'
      - '-subz'
      - '-gatewayz'
      - '-leafz'
      - '-jsz=all'
      - 'http://nats-server:8222'
    ports:
      - '7777:7777'
    networks:
      - monitoring
    # Only started when the "metrics" profile is enabled.
    profiles:
      - metrics
    depends_on:
      - nats-server

  dcgm-exporter:
    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
    ports:
      - "9401:9400"  # Remap from 9400 to 9401 to avoid conflict with an existing dcgm-exporter (on dlcluster)
    cap_add:
      - SYS_ADMIN
    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES.
      # Falls back to "all" when CUDA_VISIBLE_DEVICES is unset or empty.
      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
    runtime: nvidia  # Specify the NVIDIA runtime
    networks:
      - monitoring
    # Part of the metrics profile like the other monitoring services, so a
    # plain `docker compose up` does not start a GPU-dependent container.
    profiles: [metrics]

  prometheus:
    image: prom/prometheus:v3.4.1
    container_name: prometheus
    volumes:
      # Scrape configuration is supplied from the file next to this compose file.
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # These provide the web console functionality
      # NOTE(review): confirm these console paths exist in the v3 image.
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      # Enables the HTTP reload/shutdown endpoints (/-/reload, /-/quit)
      - '--web.enable-lifecycle'
    restart: unless-stopped
    # Example to pull from the /query endpoint:
    # {__name__=~"DCGM.*", job="dcgm-exporter"}
    ports:
      - "9090:9090"
    networks:
      - monitoring
    profiles: [metrics]
    # Start the scrape targets before Prometheus itself.
    depends_on:
      - dcgm-exporter
      - nats-prometheus-exporter
      - etcd-server

  # Grafana connects to Prometheus via the /query endpoint.
  # Default credentials are dynamo/dynamo.
  grafana:
    image: grafana/grafana-enterprise:12.0.1
    container_name: grafana
    volumes:
      # Provision the dashboard, the datasource and the dashboard provider
      # from files that sit next to this compose file.
      - ./grafana.json:/etc/grafana/provisioning/dashboards/llm-worker-dashboard.json
      - ./grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
      - ./grafana-dashboard-providers.yml:/etc/grafana/provisioning/dashboards/dashboard-providers.yml
    environment:
      # Port 3000 is already used by "dynamo serve", so use 3001
      - GF_SERVER_HTTP_PORT=3001
      # Development-only admin credentials; change them before exposing
      # Grafana beyond a local machine.
      - GF_SECURITY_ADMIN_USER=dynamo
      - GF_SECURITY_ADMIN_PASSWORD=dynamo
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      # Default min interval is 5s, but can be configured lower
      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
    restart: unless-stopped
    ports:
      # Matches GF_SERVER_HTTP_PORT above on both host and container side.
      - "3001:3001"
    networks:
      - monitoring
    profiles: [metrics]
    depends_on:
      - prometheus