docker-compose.yml 5.09 KB
Newer Older
1
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Ryan Olson's avatar
Ryan Olson committed
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

16
# IMPORTANT NOTE: Make sure this is in sync with lib/runtime/docker-compose.yml
17
18
19
20
21
22
23
networks:
  # Joined by every service in this file, including the exporters,
  # Prometheus and Grafana.
  monitoring:
    driver: bridge
  # Joined only by nats-server and etcd-server.
  server:
    driver: bridge

# Note that the images are pinned to specific versions to avoid breaking changes.
Ryan Olson's avatar
Ryan Olson committed
24
25
services:
  # NATS server. It is attached to both networks so that the exporter on the
  # monitoring network can reach its HTTP monitoring port.
  nats-server:
    image: nats:2.11.4
    # "-m 8222" selects the HTTP monitoring port published below.
    command:
      - "-js"
      - "--trace"
      - "-m"
      - "8222"
    ports:
      - "4222:4222"
      - "6222:6222"
      - "8222:8222"  # the endpoints include /varz, /healthz, ...
    networks:
      - server
      - monitoring
Ryan Olson's avatar
Ryan Olson committed
35
36

  # etcd server. Prometheus lists it in depends_on, and port 2379 serves the
  # /metrics endpoint.
  etcd-server:
    image: bitnami/etcd:3.6.1
    environment:
      # NOTE(review): presumably lets etcd run without authentication;
      # confirm this file is only used for local development.
      - ALLOW_NONE_AUTHENTICATION=yes
    ports:
      - "2379:2379"  # this port exposes the /metrics endpoint
      - "2380:2380"
    networks:
      - server
      - monitoring

  # All the services below are part of the metrics profile and monitoring network.

  # The exporter translates from /varz and other stats to Prometheus metrics
  nats-prometheus-exporter:
    image: natsio/prometheus-nats-exporter:0.17.3
    # Flags passed to the exporter, followed by the URL of the nats-server
    # monitoring port it reads from.
    command:
      - "-varz"
      - "-connz"
      - "-routez"
      - "-subz"
      - "-gatewayz"
      - "-leafz"
      - "-jsz=all"
      - "http://nats-server:8222"
    ports:
      - "7777:7777"
    networks:
      - monitoring
    profiles: [metrics]
    depends_on:
      - nats-server

61
62
  # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
  # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
63
64
65
  dcgm-exporter:
    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
    ports:
      # Remap from 9400 to 9401 (public port) to avoid conflict with an existing dcgm-exporter
      # on dlcluster. To access dcgm:
      # Outside the container: curl http://localhost:9401/metrics
      # Inside the container (container-to-container): curl http://dcgm-exporter:9400/metrics
      - "9401:9400"
    cap_add:
      - SYS_ADMIN
    # Reserve all NVIDIA GPUs for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES.
      # Falls back to "all" when CUDA_VISIBLE_DEVICES is unset or empty.
      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
    runtime: nvidia  # Specify the NVIDIA runtime
    networks:
      - monitoring
    # Like the other monitoring services, only start with the metrics profile,
    # so that a plain `docker compose up` does not require the NVIDIA runtime.
    profiles: [metrics]
86

87
88
  # To access Prometheus from another machine, you may need to open port 9090 in the firewall on your host.
  # On Ubuntu: sudo ufw allow 9090/tcp
89
  # Prometheus server; configuration is bind-mounted from ./prometheus.yml.
  prometheus:
    image: prom/prometheus:v3.4.1
    container_name: prometheus
    restart: unless-stopped
    profiles: [metrics]
    depends_on:
      - dcgm-exporter
      - nats-prometheus-exporter
      - etcd-server
    # Example to pull from the /query endpoint:
    # {__name__=~"DCGM.*", job="dcgm-exporter"}
    networks:
      - monitoring
    ports:
      - "9090:9090"
    # Makes host.docker.internal resolve to the host gateway from inside the
    # container.
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # These provide the web console functionality
      # NOTE(review): confirm the v3.x image still ships these directories.
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
115

116
117
  # grafana connects to prometheus via the /query endpoint.
  # Default credentials are dynamo/dynamo.
118
119
  # To access Grafana from another machine, you may need to open port 3001 in the firewall on your host.
  # On Ubuntu: sudo ufw allow 3001/tcp
120
  # Grafana; dashboards and the datasource definition are provisioned from
  # the bind mounts below.
  grafana:
    image: grafana/grafana-enterprise:12.0.1
    container_name: grafana
    restart: unless-stopped
    profiles: [metrics]
    depends_on:
      - prometheus
    networks:
      - monitoring
    ports:
      - "3001:3001"
    volumes:
      - ./grafana_dashboards:/etc/grafana/provisioning/dashboards
      - ./grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
    environment:
      # Port 3000 is already used by "dynamo serve", so use 3001
      - GF_SERVER_HTTP_PORT=3001
      # do not make it admin/admin, because you will be prompted to change the password every time
      - GF_SECURITY_ADMIN_USER=dynamo
      - GF_SECURITY_ADMIN_PASSWORD=dynamo
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      # Default min interval is 5s, but can be configured lower
      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
      # Disable password change requirement
      # NOTE(review): all four values below are "false"; confirm they have
      # the intended effect.
      - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
      - GF_SECURITY_ADMIN_PASSWORD_POLICY=false
      - GF_AUTH_DISABLE_LOGIN_FORM=false
      - GF_AUTH_DISABLE_SIGNOUT_MENU=false