# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# IMPORTANT NOTE: Make sure this is in sync with lib/runtime/docker-compose.yml

# Two bridge networks:
#   server     - joined by nats-server and etcd-server only.
#   monitoring - joined by every service, so the exporters and Prometheus can
#                reach the endpoints they scrape.
networks:
  server:
    driver: bridge
  monitoring:
    driver: bridge

# Note that the images are pinned to specific versions to avoid breaking changes.
services:
  # NATS with JetStream (-js), protocol tracing (--trace) and the HTTP
  # monitoring endpoint on port 8222 (-m 8222).
  nats-server:
    image: nats:2.11.4
    command: ["-js", "--trace", "-m", "8222"]
    ports:
      - "4222:4222"
      - "6222:6222"
      - "8222:8222"  # the endpoints include /varz, /healthz, ...
    networks:
      - server
      - monitoring

  etcd-server:
    image: bitnamilegacy/etcd:3.6.1
    environment:
      - ALLOW_NONE_AUTHENTICATION=yes
    ports:
      - "2379:2379"  # this port exposes the /metrics endpoint
      - "2380:2380"
    networks:
      - server
      - monitoring

  # All the services below are part of the metrics profile and monitoring network.

  # The exporter translates from /varz and other stats to Prometheus metrics
  nats-prometheus-exporter:
    image: natsio/prometheus-nats-exporter:0.17.3
    command:
      - "-varz"
      - "-connz"
      - "-routez"
      - "-subz"
      - "-gatewayz"
      - "-leafz"
      - "-jsz=all"
      - "http://nats-server:8222"
    ports:
      - "7777:7777"
    networks:
      - monitoring
    profiles: [metrics]
    depends_on:
      - nats-server

  # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
  # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
  dcgm-exporter:
    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
    ports:
      # Expose dcgm-exporter on port 9401 both inside and outside the container
      # to avoid conflicts with other dcgm-exporter instances in distributed environments.
      # To access DCGM metrics:
      #   Outside the container: curl http://localhost:9401/metrics (or the host IP)
      #   Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
      - "9401:9401"
    cap_add:
      - SYS_ADMIN
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES
      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
      - DCGM_EXPORTER_LISTEN=:9401
    runtime: nvidia  # Specify the NVIDIA runtime
    networks:
      - monitoring
    # Kept in the metrics profile like the other monitoring services: prometheus
    # (metrics profile) depends on it, and because it requests the nvidia runtime
    # and a GPU reservation it should not be started by a plain `docker compose up`.
    profiles: [metrics]

  # To access Prometheus from another machine, you may need to open the port in
  # your host firewall. On Ubuntu:
  #   sudo ufw allow 9090/tcp
  prometheus:
    image: prom/prometheus:v3.4.1
    container_name: prometheus
    volumes:
      - ./metrics/prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # These provide the web console functionality
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
    restart: unless-stopped
    # Example to pull from the /query endpoint:
    # {__name__=~"DCGM.*", job="dcgm-exporter"}
    networks:
      - monitoring
    ports:
      - "9090:9090"
    profiles: [metrics]
    extra_hosts:
      - "host.docker.internal:host-gateway"
    depends_on:
      - dcgm-exporter
      - nats-prometheus-exporter
      - etcd-server

  # grafana connects to prometheus via the /query endpoint.
  # Default credentials are dynamo/dynamo.
  # To access Grafana from another machine, you may need to open the port in
  # your host firewall. On Ubuntu:
  #   sudo ufw allow 3001/tcp
  grafana:
    image: grafana/grafana-enterprise:12.0.1
    container_name: grafana
    volumes:
      - ./metrics/grafana_dashboards:/etc/grafana/provisioning/dashboards
      - ./metrics/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
    environment:
      - GF_SERVER_HTTP_PORT=3001
      # do not make it admin/admin, because you will be prompted to change the password every time
      - GF_SECURITY_ADMIN_USER=dynamo
      - GF_SECURITY_ADMIN_PASSWORD=dynamo
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      # Default min interval is 5s, but can be configured lower
      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
      # Disable password change requirement
      - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
      - GF_SECURITY_ADMIN_PASSWORD_POLICY=false
      - GF_AUTH_DISABLE_LOGIN_FORM=false
      - GF_AUTH_DISABLE_SIGNOUT_MENU=false
    restart: unless-stopped
    ports:
      - "3001:3001"
    networks:
      - monitoring
    profiles: [metrics]
    depends_on:
      - prometheus