Unverified Commit 14321c8f authored by Hongkuan Zhou, committed by GitHub
Browse files

feat: add a grafana dashboard for planner (#4815)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent 065f466e
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-planner-dashboard
namespace: monitoring
labels:
grafana_dashboard: "1"
data:
planner-dashboard.json: |-
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Dynamo Planner metrics dashboard - Worker counts, observed/predicted metrics, and correction factors",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 100,
"panels": [],
"title": "🖥️ Worker Counts & GPU Usage",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Current number of prefill workers",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#6E40AA",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 4,
"x": 0,
"y": 1
},
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:num_p_workers{namespace=~\"$namespace\"}",
"legendFormat": "Prefill Workers",
"range": true,
"refId": "A"
}
],
"title": "Prefill Workers",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Current number of decode workers",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#1FA8C9",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 4,
"x": 4,
"y": 1
},
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:num_d_workers{namespace=~\"$namespace\"}",
"legendFormat": "Decode Workers",
"range": true,
"refId": "A"
}
],
"title": "Decode Workers",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Cumulative GPU hours used since planner start",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#76B900",
"value": null
}
]
},
"unit": "h"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 4,
"x": 8,
"y": 1
},
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:gpu_hours{namespace=~\"$namespace\"}",
"legendFormat": "GPU Hours",
"range": true,
"refId": "A"
}
],
"title": "Cumulative GPU Hours",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Worker count history over time",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Workers",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "stepAfter",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"decimals": 0,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Prefill Workers"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#6E40AA",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Decode Workers"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#1FA8C9",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 5,
"w": 12,
"x": 12,
"y": 1
},
"id": 4,
"options": {
"legend": {
"calcs": ["lastNotNull", "mean"],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:num_p_workers{namespace=~\"$namespace\"}",
"legendFormat": "Prefill Workers",
"range": true,
"refId": "A"
},
{
"editorMode": "code",
"expr": "planner:num_d_workers{namespace=~\"$namespace\"}",
"legendFormat": "Decode Workers",
"range": true,
"refId": "B"
}
],
"title": "Worker Count History",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 6
},
"id": 101,
"panels": [],
"title": "📊 Observed Metrics",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Observed time to first token and inter-token latency",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Latency (ms)",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "ms"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "TTFT"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#FF6B6B",
"mode": "fixed"
}
},
{
"id": "custom.axisPlacement",
"value": "left"
},
{
"id": "custom.axisLabel",
"value": "TTFT (ms)"
}
]
},
{
"matcher": {
"id": "byName",
"options": "ITL"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#4ECDC4",
"mode": "fixed"
}
},
{
"id": "custom.axisPlacement",
"value": "right"
},
{
"id": "custom.axisLabel",
"value": "ITL (ms)"
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 7
},
"id": 10,
"options": {
"legend": {
"calcs": ["lastNotNull", "mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:observed_ttft{namespace=~\"$namespace\"}",
"legendFormat": "TTFT",
"range": true,
"refId": "A"
},
{
"editorMode": "code",
"expr": "planner:observed_itl{namespace=~\"$namespace\"}",
"legendFormat": "ITL",
"range": true,
"refId": "B"
}
],
"title": "Observed Latency (TTFT & ITL)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Observed request rate and duration",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Request Rate"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#F9A825",
"mode": "fixed"
}
},
{
"id": "unit",
"value": "reqps"
},
{
"id": "custom.axisPlacement",
"value": "left"
}
]
},
{
"matcher": {
"id": "byName",
"options": "Request Duration"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#AB47BC",
"mode": "fixed"
}
},
{
"id": "unit",
"value": "s"
},
{
"id": "custom.axisPlacement",
"value": "right"
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 8,
"x": 8,
"y": 7
},
"id": 11,
"options": {
"legend": {
"calcs": ["lastNotNull", "mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:observed_request_rate{namespace=~\"$namespace\"}",
"legendFormat": "Request Rate",
"range": true,
"refId": "A"
},
{
"editorMode": "code",
"expr": "planner:observed_request_duration{namespace=~\"$namespace\"}",
"legendFormat": "Request Duration",
"range": true,
"refId": "B"
}
],
"title": "Observed Request Rate & Duration",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Observed input and output sequence lengths",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Tokens",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "ISL"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#26A69A",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "OSL"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#5C6BC0",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 8,
"x": 16,
"y": 7
},
"id": 12,
"options": {
"legend": {
"calcs": ["lastNotNull", "mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:observed_isl{namespace=~\"$namespace\"}",
"legendFormat": "ISL",
"range": true,
"refId": "A"
},
{
"editorMode": "code",
"expr": "planner:observed_osl{namespace=~\"$namespace\"}",
"legendFormat": "OSL",
"range": true,
"refId": "B"
}
],
"title": "Observed Sequence Lengths (ISL & OSL)",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 14
},
"id": 102,
"panels": [],
"title": "🔮 Predicted Metrics",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Predicted request rate",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Request Rate",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineStyle": {
"dash": [10, 10],
"fill": "dash"
},
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "reqps"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Predicted Request Rate"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#FFB74D",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 15
},
"id": 20,
"options": {
"legend": {
"calcs": ["lastNotNull", "mean"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:predicted_request_rate{namespace=~\"$namespace\"}",
"legendFormat": "Predicted Request Rate",
"range": true,
"refId": "A"
}
],
"title": "Predicted Request Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Predicted input and output sequence lengths",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Tokens",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineStyle": {
"dash": [10, 10],
"fill": "dash"
},
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Predicted ISL"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#80CBC4",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Predicted OSL"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#9FA8DA",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 8,
"x": 8,
"y": 15
},
"id": 22,
"options": {
"legend": {
"calcs": ["lastNotNull", "mean"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:predicted_isl{namespace=~\"$namespace\"}",
"legendFormat": "Predicted ISL",
"range": true,
"refId": "A"
},
{
"editorMode": "code",
"expr": "planner:predicted_osl{namespace=~\"$namespace\"}",
"legendFormat": "Predicted OSL",
"range": true,
"refId": "B"
}
],
"title": "Predicted Sequence Lengths (ISL & OSL)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Predicted number of prefill and decode replicas",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Replicas",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "stepAfter",
"lineStyle": {
"dash": [10, 10],
"fill": "dash"
},
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"decimals": 0,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Predicted Prefill"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#B388FF",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Predicted Decode"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#64B5F6",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 8,
"x": 16,
"y": 15
},
"id": 21,
"options": {
"legend": {
"calcs": ["lastNotNull", "mean", "max"],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:predicted_num_p{namespace=~\"$namespace\"}",
"legendFormat": "Predicted Prefill",
"range": true,
"refId": "A"
},
{
"editorMode": "code",
"expr": "planner:predicted_num_d{namespace=~\"$namespace\"}",
"legendFormat": "Predicted Decode",
"range": true,
"refId": "B"
}
],
"title": "Predicted Replica Counts",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 22
},
"id": 103,
"panels": [],
"title": "⚙️ Correction Factors",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Current prefill correction factor (TTFT observed / TTFT expected)",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 3,
"mappings": [],
"max": 2,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"value": null
},
{
"color": "#FF9830",
"value": 1.2
},
{
"color": "#F2495C",
"value": 1.5
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 4,
"x": 0,
"y": 23
},
"id": 30,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:p_correction_factor{namespace=~\"$namespace\"}",
"legendFormat": "Prefill CF",
"range": true,
"refId": "A"
}
],
"title": "Prefill Correction Factor",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Current decode correction factor (ITL observed / ITL expected)",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 3,
"mappings": [],
"max": 2,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#73BF69",
"value": null
},
{
"color": "#FF9830",
"value": 1.2
},
{
"color": "#F2495C",
"value": 1.5
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 4,
"x": 4,
"y": 23
},
"id": 31,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:d_correction_factor{namespace=~\"$namespace\"}",
"legendFormat": "Decode CF",
"range": true,
"refId": "A"
}
],
"title": "Decode Correction Factor",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"description": "Correction factor history over time. Values close to 1.0 indicate accurate predictions.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "Factor",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "line+area"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "transparent",
"value": null
},
{
"color": "rgba(255, 152, 48, 0.1)",
"value": 1.2
},
{
"color": "rgba(242, 73, 92, 0.1)",
"value": 1.5
}
]
},
"unit": "none"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "Prefill CF"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#CE93D8",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "Decode CF"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "#81D4FA",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 5,
"w": 16,
"x": 8,
"y": 23
},
"id": 32,
"options": {
"legend": {
"calcs": ["lastNotNull", "mean", "min", "max"],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "12.0.1",
"targets": [
{
"editorMode": "code",
"expr": "planner:p_correction_factor{namespace=~\"$namespace\"}",
"legendFormat": "Prefill CF",
"range": true,
"refId": "A"
},
{
"editorMode": "code",
"expr": "planner:d_correction_factor{namespace=~\"$namespace\"}",
"legendFormat": "Decode CF",
"range": true,
"refId": "B"
}
],
"title": "Correction Factor History",
"type": "timeseries"
}
],
"refresh": "",
"schemaVersion": 41,
"tags": ["dynamo", "planner"],
"templating": {
"list": [
{
"current": {
"text": "default",
"value": "default"
},
"label": "Data source",
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"current": {
"selected": true,
"text": ["All"],
"value": ["$__all"]
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"definition": "label_values(planner:num_p_workers, namespace)",
"hide": 0,
"includeAll": true,
"label": "Namespace",
"multi": true,
"name": "namespace",
"options": [],
"query": "label_values(planner:num_p_workers, namespace)",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h"]
},
"timezone": "browser",
"title": "Dynamo Planner Dashboard",
"uid": "dynamo-planner-dashboard",
"version": 1
}
......@@ -179,6 +179,25 @@ kubectl port-forward svc/trtllm-disagg-frontend 8000:8000 -n $NAMESPACE
curl http://localhost:8000/v1/models
```
### Step 5 (Optional): Access the Planner Grafana Dashboard
To monitor the SLA Planner's decision-making in real time, deploy the Planner Grafana dashboard:
```bash
kubectl apply -n monitoring -f deploy/observability/k8s/grafana-planner-dashboard-configmap.yaml
```
Follow the instructions in [Dynamo Metrics Collection on Kubernetes](../kubernetes/observability/metrics.md) to access the Grafana UI and select the **Dynamo Planner Dashboard**.
The dashboard displays:
- **Worker Counts & GPU Usage**: Current prefill/decode worker counts and cumulative GPU hours
- **Observed Metrics**: Real-time TTFT, ITL, request rate, request duration, and sequence lengths from Prometheus
- **Predicted Metrics**: Planner's load predictions and recommended replica counts
- **Correction Factors**: How the planner adjusts predictions based on observed vs expected performance
> [!TIP]
> Use the **Namespace** dropdown at the top of the dashboard to filter metrics for your specific deployment namespace.
## DGDR Configuration Details
### Required Fields
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment