"lib/llm/vscode:/vscode.git/clone" did not exist on "a110abfbe52facf7790ac86aebdfe8333fe56d7c"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# User-defined bridge networks that the services in this file attach to.
# Keys are listed alphabetically.
networks:
  # Joined by every service defined in this file.
  monitoring:
    driver: bridge
  # Joined by nats-server and etcd-server only.
  server:
    driver: bridge

# Note that the images are pinned to specific versions to avoid breaking changes.
services:
  # NATS server, attached to both the server and monitoring networks.
  nats-server:
    image: nats:2.11.4
    # -js enables JetStream, --trace enables protocol trace logging, and
    # -m 8222 serves the HTTP monitoring endpoints on port 8222.
    command: ["-js", "--trace", "-m", "8222"]
    # Port mappings are quoted so every YAML parser reads them as strings.
    ports:
      - "4222:4222"
      - "6222:6222"
      - "8222:8222"  # the endpoints include /varz, /healthz, ...
    networks:
      - server
      - monitoring

  # etcd server, attached to both the server and monitoring networks.
  etcd-server:
    image: bitnami/etcd:3.6.1
    environment:
      # Bitnami image setting that lets etcd run without authentication.
      # NOTE(review): acceptable for local development only; confirm before
      # reusing this file elsewhere.
      - ALLOW_NONE_AUTHENTICATION=yes
    # Port mappings are quoted so every YAML parser reads them as strings.
    ports:
      - "2379:2379"  # this port exposes the /metrics endpoint
      - "2380:2380"
    networks:
      - server
      - monitoring

  # All the services below are part of the metrics profile and monitoring network.

  # The exporter translates from /varz and other stats to Prometheus metrics
  nats-prometheus-exporter:
    image: natsio/prometheus-nats-exporter:0.17.3
    # One flag per NATS monitoring endpoint to export; the last argument is
    # the URL of the nats-server monitoring port.
    command:
      - "-varz"
      - "-connz"
      - "-routez"
      - "-subz"
      - "-gatewayz"
      - "-leafz"
      - "-jsz=all"
      - "http://nats-server:8222"
    # Port mapping is quoted so every YAML parser reads it as a string.
    ports:
      - "7777:7777"
    networks:
      - monitoring
    # Only started when the "metrics" profile is enabled.
    profiles: [metrics]
    depends_on:
      - nats-server

  # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
  # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
  dcgm-exporter:
    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
    ports:
      # Remap from 9400 to 9401 (public port) to avoid conflict with an existing dcgm-exporter
      # on dlcluster. To access dcgm:
      # Outside the container: curl http://localhost:9401/metrics
      # Inside the container (container-to-container): curl http://dcgm-exporter:9400/metrics
      - "9401:9400"
    cap_add:
      - SYS_ADMIN
    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES.
      # Forward the host's CUDA_VISIBLE_DEVICES, falling back to "all" when it is unset or empty.
      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
    runtime: nvidia  # Specify the NVIDIA runtime
    networks:
      - monitoring
    # Start only with the "metrics" profile, like the other monitoring-only
    # services. Without this, a plain `docker compose up` also starts this
    # container, which needs the NVIDIA runtime and a GPU.
    profiles: [metrics]

  # Prometheus server, configured by the bind-mounted ./prometheus.yml.
  prometheus:
    image: prom/prometheus:v3.4.1
    container_name: prometheus
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # These provide the web console functionality
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      # Enables the HTTP lifecycle endpoints (/-/reload and /-/quit).
      - '--web.enable-lifecycle'
    restart: unless-stopped
    # Example to pull from the /query endpoint:
    # {__name__=~"DCGM.*", job="dcgm-exporter"}
    ports:
      - "9090:9090"
    networks:
      - monitoring
    # Only started when the "metrics" profile is enabled.
    profiles: [metrics]
    depends_on:
      - dcgm-exporter
      - nats-prometheus-exporter
      - etcd-server

  # grafana connects to prometheus via the /query endpoint.
  # Default credentials are dynamo/dynamo.
  grafana:
    image: grafana/grafana-enterprise:12.0.1
    container_name: grafana
    # Dashboards, datasources and the dashboard provider config are
    # bind-mounted from this directory into Grafana's provisioning tree.
    volumes:
      - ./grafana.json:/etc/grafana/provisioning/dashboards/llm-worker-dashboard.json
      - ./grafana-dcgm-dashboard.json:/etc/grafana/provisioning/dashboards/dcgm-dashboard.json
      - ./grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
      - ./grafana-dashboard-providers.yml:/etc/grafana/provisioning/dashboards/dashboard-providers.yml
    environment:
      # Port 3000 is already used by "dynamo serve", so use 3001
      - GF_SERVER_HTTP_PORT=3001
      # Admin login for this local stack.
      - GF_SECURITY_ADMIN_USER=dynamo
      - GF_SECURITY_ADMIN_PASSWORD=dynamo
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      # Default min interval is 5s, but can be configured lower
      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
    restart: unless-stopped
    ports:
      # Container side must match GF_SERVER_HTTP_PORT.
      - "3001:3001"
    networks:
      - monitoring
    # Only started when the "metrics" profile is enabled.
    profiles: [metrics]
    depends_on:
      - prometheus