# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# IMPORTANT NOTE: Make sure this is in sync with lib/runtime/docker-compose.yml
networks:
  # Joined by every service defined in this file, including the
  # metrics-profile exporters, Prometheus and Grafana.
  monitoring:
    driver: bridge
  # Joined only by nats-server and etcd-server.
  server:
    driver: bridge

# Note that the images are pinned to specific versions to avoid breaking changes.
services:
  # NATS message broker. Part of the default (no-profile) service set.
  nats-server:
    image: nats:2.11.4
    # -js enables JetStream, --trace enables protocol trace logging, and
    # -m 8222 serves the HTTP monitoring endpoints on the port published below.
    command: ["-js", "--trace", "-m", "8222"]
    # Port mappings are quoted so they are always read as strings.
    ports:
      - "4222:4222"
      - "6222:6222"
      - "8222:8222"  # the endpoints include /varz, /healthz, ...
    networks:
      - server
      - monitoring

  # etcd key-value store. Part of the default (no-profile) service set.
  etcd-server:
    image: bitnamilegacy/etcd:3.6.1
    environment:
      # NOTE(review): presumably lets the Bitnami image start without a root
      # password; suitable for local development only - confirm.
      - ALLOW_NONE_AUTHENTICATION=yes
    # Port mappings are quoted so they are always read as strings.
    ports:
      - "2379:2379"  # this port exposes the /metrics endpoint
      - "2380:2380"
    networks:
      - server
      - monitoring

  # All the services below are part of the metrics profile and monitoring network.

  # The exporter translates from /varz and other stats to Prometheus metrics
  nats-prometheus-exporter:
    image: natsio/prometheus-nats-exporter:0.17.3
    # Only started when the "metrics" profile is enabled.
    profiles:
      - metrics
    depends_on:
      - nats-server
    networks:
      - monitoring
    ports:
      - "7777:7777"
    # Each flag selects one NATS monitoring endpoint to export; the last
    # argument is the monitoring URL of the nats-server service.
    command:
      - "-varz"
      - "-connz"
      - "-routez"
      - "-subz"
      - "-gatewayz"
      - "-leafz"
      - "-jsz=all"
      - "http://nats-server:8222"

  # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
  # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
  dcgm-exporter:
    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
    ports:
      # Expose dcgm-exporter on port 9401 both inside and outside the container
      # to avoid conflicts with other dcgm-exporter instances in distributed environments.
      # To access DCGM metrics:
      # Outside the container: curl http://localhost:9401/metrics (or the host IP)
      # Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
      - "9401:9401"
    cap_add:
      - SYS_ADMIN
    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES
      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
      # Must match the container side of the port mapping above.
      - DCGM_EXPORTER_LISTEN=:9401
    runtime: nvidia  # Specify the NVIDIA runtime
    networks:
      - monitoring
    # Keep this GPU-only service out of the default `docker compose up`, like the
    # other monitoring services; it starts only with `--profile metrics`.
    profiles: [metrics]

  # To access Prometheus from another machine, you may need to disable the firewall on your host. On Ubuntu:
  # sudo ufw allow 9090/tcp
  prometheus:
    image: prom/prometheus:v3.4.1
    container_name: prometheus
    volumes:
      # Scrape configuration is bind-mounted from the repository.
      - ./metrics/prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # These provide the web console functionality
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      # Enables the HTTP reload/shutdown endpoints.
      - '--web.enable-lifecycle'
    restart: unless-stopped
    # Example to pull from the /query endpoint:
    # {__name__=~"DCGM.*", job="dcgm-exporter"}
    networks:
      - monitoring
    ports:
      - "9090:9090"
    profiles: [metrics]
    # NOTE(review): presumably lets Prometheus scrape targets running directly
    # on the Docker host via host.docker.internal - confirm against prometheus.yml.
    extra_hosts:
      - "host.docker.internal:host-gateway"
    depends_on:
      - dcgm-exporter
      - nats-prometheus-exporter
      - etcd-server

  # grafana connects to prometheus via the /query endpoint.
  # Default credentials are dynamo/dynamo.
  # To access Grafana from another machine, you may need to disable the firewall on your host. On Ubuntu:
  # sudo ufw allow 3001/tcp
  grafana:
    image: grafana/grafana-enterprise:12.0.1
    container_name: grafana
    volumes:
      # Dashboards and the datasource definition are provisioned from the repository.
      - ./metrics/grafana_dashboards:/etc/grafana/provisioning/dashboards
      - ./metrics/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
    environment:
      # Must match the container side of the port mapping below.
      - GF_SERVER_HTTP_PORT=3001
      # do not make it admin/admin, because you will be prompted to change the password every time
      - GF_SECURITY_ADMIN_USER=dynamo
      - GF_SECURITY_ADMIN_PASSWORD=dynamo
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      # Default min interval is 5s, but can be configured lower
      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
      # Disable password change requirement
      # NOTE(review): all four settings below are set to false; confirm each is
      # a recognized Grafana option and is actually needed.
      - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
      - GF_SECURITY_ADMIN_PASSWORD_POLICY=false
      - GF_AUTH_DISABLE_LOGIN_FORM=false
      - GF_AUTH_DISABLE_SIGNOUT_MENU=false
    restart: unless-stopped
    ports:
      - "3001:3001"
    networks:
      - monitoring
    profiles: [metrics]
    depends_on:
      - prometheus