"deploy/vscode:/vscode.git/clone" did not exist on "c92a422b6275673ceda7556c5585293050e6ef68"
Unverified commit 54c21168, authored by Keiven C and committed by GitHub
Browse files

fix: Prometheus to pull from dcgm-exporter:9400 instead of 9401 (#1707)


Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent dfbd741d
......@@ -62,6 +62,7 @@ hf-hub = { version = "0.4.2", default-features = false, features = ["tokio", "ru
humantime = { version = "2.2.0" }
libc = { version = "0.2" }
oneshot = { version = "0.1.11", features = ["std", "async"] }
opentelemetry = { version = "0.27" }
prometheus = { version = "0.14" }
rand = { version = "0.9.0" }
serde = { version = "1", features = ["derive"] }
......
......@@ -57,10 +57,16 @@ services:
depends_on:
- nats-server
# DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
# dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
dcgm-exporter:
image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
ports:
- 9401:9400 # Remap from 9400 to 9401 to avoid conflict with an existing dcgm-exporter (on dlcluster)
# Remap from 9400 to 9401 (public port) to avoid conflict with an existing dcgm-exporter
# on dlcluster. To access dcgm:
# Outside the container: curl http://localhost:9401/metrics
# Inside the container (container-to-container): curl http://dcgm-exporter:9400/metrics
- 9401:9400
cap_add:
- SYS_ADMIN
deploy:
......
......@@ -31,7 +31,7 @@ scrape_configs:
- job_name: 'dcgm-exporter'
scrape_interval: 5s
static_configs:
- targets: ['dcgm-exporter:9401'] # on the "monitoring" network
- targets: ['dcgm-exporter:9400'] # on the "monitoring" network
# Uncomment to see its own Prometheus metrics
# - job_name: 'prometheus'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment