Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
0d6cae85
Unverified
Commit
0d6cae85
authored
Jul 02, 2025
by
sanshang-nv
Committed by
GitHub
Jul 01, 2025
Browse files
feat: add grafana dcgm dashboard config file (#1701)
parent
d4676f8a
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
917 additions
and
0 deletions
+917
-0
deploy/metrics/docker-compose.yml
deploy/metrics/docker-compose.yml
+1
-0
deploy/metrics/grafana-dcgm-dashboard.json
deploy/metrics/grafana-dcgm-dashboard.json
+916
-0
No files found.
deploy/metrics/docker-compose.yml
View file @
0d6cae85
...
...
@@ -115,6 +115,7 @@ services:
container_name
:
grafana
volumes
:
-
./grafana.json:/etc/grafana/provisioning/dashboards/llm-worker-dashboard.json
-
./grafana-dcgm-dashboard.json:/etc/grafana/provisioning/dashboards/dcgm-dashboard.json
-
./grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
-
./grafana-dashboard-providers.yml:/etc/grafana/provisioning/dashboards/dashboard-providers.yml
environment
:
...
...
deploy/metrics/grafana-dcgm-dashboard.json
0 → 100644
View file @
0d6cae85
{
"annotations"
:
{
"list"
:
[
{
"builtIn"
:
1
,
"datasource"
:
{
"type"
:
"grafana"
,
"uid"
:
"-- Grafana --"
},
"enable"
:
true
,
"hide"
:
true
,
"iconColor"
:
"rgba(0, 211, 255, 1)"
,
"name"
:
"Annotations & Alerts"
,
"type"
:
"dashboard"
}
]
},
"copyright"
:
[
"SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved."
,
"SPDX-License-Identifier: Apache-2.0"
,
"Licensed under the Apache License, Version 2.0 (the \"License\");"
,
"you may not use this file except in compliance with the License."
,
"You may obtain a copy of the License at"
,
"http://www.apache.org/licenses/LICENSE-2.0"
,
"Unless required by applicable law or agreed to in writing, software"
,
"distributed under the License is distributed on an \"AS IS\" BASIS,"
,
"WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
,
"See the License for the specific language governing permissions and"
,
"limitations under the License."
],
"editable"
:
true
,
"fiscalYearStartMonth"
:
0
,
"graphTooltip"
:
0
,
"id"
:
2
,
"links"
:
[],
"liveNow"
:
false
,
"panels"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"red"
,
"value"
:
80
}
]
},
"unit"
:
"percent"
,
"min"
:
0
,
"max"
:
100
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
0
,
"y"
:
0
},
"id"
:
1
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"GPU Utilization"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"DCGM_FI_DEV_GPU_UTIL"
,
"legendFormat"
:
"GPU {{gpu}} ({{modelName}})"
,
"range"
:
true
,
"refId"
:
"A"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"red"
,
"value"
:
80
}
]
},
"unit"
:
"bytes"
,
"min"
:
0
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
12
,
"y"
:
0
},
"id"
:
2
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"GPU Memory Usage"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"DCGM_FI_DEV_FB_USED * 1024 * 1024"
,
"legendFormat"
:
"GPU {{gpu}} Used"
,
"range"
:
true
,
"refId"
:
"A"
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"DCGM_FI_DEV_FB_FREE * 1024 * 1024"
,
"legendFormat"
:
"GPU {{gpu}} Free"
,
"range"
:
true
,
"refId"
:
"B"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"yellow"
,
"value"
:
70
},
{
"color"
:
"red"
,
"value"
:
85
}
]
},
"unit"
:
"celsius"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
0
,
"y"
:
8
},
"id"
:
3
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"GPU Temperature"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"DCGM_FI_DEV_GPU_TEMP"
,
"legendFormat"
:
"GPU {{gpu}} Temp"
,
"range"
:
true
,
"refId"
:
"A"
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"DCGM_FI_DEV_MEMORY_TEMP"
,
"legendFormat"
:
"GPU {{gpu}} Memory Temp"
,
"range"
:
true
,
"refId"
:
"B"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
}
]
},
"unit"
:
"watt"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
12
,
"y"
:
8
},
"id"
:
4
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"GPU Power Usage"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"DCGM_FI_DEV_POWER_USAGE"
,
"legendFormat"
:
"GPU {{gpu}} Power"
,
"range"
:
true
,
"refId"
:
"A"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
}
]
},
"unit"
:
"hertz"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
0
,
"y"
:
16
},
"id"
:
5
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"GPU Clock Speeds"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"DCGM_FI_DEV_SM_CLOCK * 1000000"
,
"legendFormat"
:
"GPU {{gpu}} SM Clock"
,
"range"
:
true
,
"refId"
:
"A"
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"DCGM_FI_DEV_MEM_CLOCK * 1000000"
,
"legendFormat"
:
"GPU {{gpu}} Memory Clock"
,
"range"
:
true
,
"refId"
:
"B"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
}
]
},
"unit"
:
"percent"
,
"min"
:
0
,
"max"
:
100
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
12
,
"y"
:
16
},
"id"
:
6
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"GPU Engine Activity"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"DCGM_FI_PROF_GR_ENGINE_ACTIVE * 100"
,
"legendFormat"
:
"GPU {{gpu}} Graphics Engine"
,
"range"
:
true
,
"refId"
:
"A"
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"DCGM_FI_PROF_PIPE_TENSOR_ACTIVE * 100"
,
"legendFormat"
:
"GPU {{gpu}} Tensor Core"
,
"range"
:
true
,
"refId"
:
"B"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
}
]
},
"unit"
:
"binBps"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
0
,
"y"
:
24
},
"id"
:
7
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"PCIe Bandwidth"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"rate(DCGM_FI_PROF_PCIE_RX_BYTES[10s])"
,
"legendFormat"
:
"GPU {{gpu}} PCIe RX"
,
"range"
:
true
,
"refId"
:
"A"
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"rate(DCGM_FI_PROF_PCIE_TX_BYTES[10s])"
,
"legendFormat"
:
"GPU {{gpu}} PCIe TX"
,
"range"
:
true
,
"refId"
:
"B"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"thresholds"
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"yellow"
,
"value"
:
50
},
{
"color"
:
"red"
,
"value"
:
80
}
]
},
"unit"
:
"percent"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
6
,
"x"
:
12
,
"y"
:
24
},
"id"
:
8
,
"options"
:
{
"orientation"
:
"auto"
,
"reduceOptions"
:
{
"calcs"
:
[
"lastNotNull"
],
"fields"
:
""
,
"values"
:
false
},
"showThresholdLabels"
:
false
,
"showThresholdMarkers"
:
true
},
"pluginVersion"
:
"10.0.0"
,
"title"
:
"Average GPU Utilization"
,
"type"
:
"gauge"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"avg(DCGM_FI_DEV_GPU_UTIL)"
,
"legendFormat"
:
"__auto"
,
"range"
:
true
,
"refId"
:
"A"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"thresholds"
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"yellow"
,
"value"
:
70
},
{
"color"
:
"red"
,
"value"
:
85
}
]
},
"unit"
:
"celsius"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
6
,
"x"
:
18
,
"y"
:
24
},
"id"
:
9
,
"options"
:
{
"orientation"
:
"auto"
,
"reduceOptions"
:
{
"calcs"
:
[
"lastNotNull"
],
"fields"
:
""
,
"values"
:
false
},
"showThresholdLabels"
:
false
,
"showThresholdMarkers"
:
true
},
"pluginVersion"
:
"10.0.0"
,
"title"
:
"Max GPU Temperature"
,
"type"
:
"gauge"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"max(DCGM_FI_DEV_GPU_TEMP)"
,
"legendFormat"
:
"__auto"
,
"range"
:
true
,
"refId"
:
"A"
}
]
}
],
"refresh"
:
"5s"
,
"schemaVersion"
:
36
,
"style"
:
"dark"
,
"tags"
:
[
"dcgm"
,
"gpu"
,
"nvidia"
],
"templating"
:
{
"list"
:
[]
},
"time"
:
{
"from"
:
"now-30m"
,
"to"
:
"now"
},
"timepicker"
:
{},
"timezone"
:
""
,
"title"
:
"DCGM GPU Monitoring Dashboard"
,
"uid"
:
"dcgm-dashboard"
,
"version"
:
1
,
"weekStart"
:
""
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment