"vllm/vscode:/vscode.git/clone" did not exist on "9d1c50a5ac8726f4af0d4a4e85ad4d26a674ad26"
Unverified Commit ca674098 authored by Ziqi Fan's avatar Ziqi Fan Committed by GitHub
Browse files

feat: add KVBM host to disk metrics | clean up dashboard (#3534)


Signed-off-by: default avatarZiqi Fan <ziqif@nvidia.com>
parent b954a249
......@@ -19,7 +19,7 @@
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 6,
"id": 1,
"links": [],
"panels": [
{
......@@ -209,7 +209,7 @@
"x": 0,
"y": 10
},
"id": 2,
"id": 3,
"options": {
"legend": {
"calcs": [],
......@@ -228,7 +228,7 @@
{
"disableTextWrap": false,
"editorMode": "code",
"expr": "kvbm_offload_requests",
"expr": "kvbm_offload_blocks_d2h",
"fullMetaSearch": false,
"includeNullMetadata": true,
"legendFormat": "__auto",
......@@ -237,7 +237,7 @@
"useBackend": false
}
],
"title": "Offload Requests",
"title": "Offload Blocks - Device to Host",
"type": "timeseries"
},
{
......@@ -305,7 +305,7 @@
"x": 12,
"y": 10
},
"id": 3,
"id": 11,
"options": {
"legend": {
"calcs": [],
......@@ -324,7 +324,7 @@
{
"disableTextWrap": false,
"editorMode": "code",
"expr": "kvbm_offload_blocks_d2h",
"expr": "kvbm_offload_blocks_h2d",
"fullMetaSearch": false,
"includeNullMetadata": true,
"legendFormat": "__auto",
......@@ -333,7 +333,7 @@
"useBackend": false
}
],
"title": "Offload Blocks - Device to Host",
"title": "Offload Blocks - Host to Disk",
"type": "timeseries"
},
{
......@@ -342,7 +342,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 26
"y": 18
},
"id": 6,
"panels": [],
......@@ -412,103 +412,7 @@
"h": 8,
"w": 12,
"x": 0,
"y": 27
},
"id": 9,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.0.1",
"targets": [
{
"disableTextWrap": false,
"editorMode": "code",
"expr": "kvbm_onboard_requests",
"fullMetaSearch": false,
"includeNullMetadata": true,
"legendFormat": "__auto",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Onboard Requests",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 27
"y": 19
},
"id": 4,
"options": {
......@@ -603,8 +507,8 @@
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 35
"x": 12,
"y": 19
},
"id": 8,
"options": {
......@@ -639,7 +543,7 @@
}
],
"preload": false,
"refresh": "auto",
"refresh": "5s",
"schemaVersion": 41,
"tags": [],
"templating": {
......@@ -653,5 +557,5 @@
"timezone": "browser",
"title": "KVBM Dashboard",
"uid": "3f679257-70a5-402c-92b4-05382337b548",
"version": 7
"version": 14
}
\ No newline at end of file
......@@ -83,13 +83,13 @@ python3 -m dynamo.frontend --http-port 8000 &
# [DYNAMO] To serve an LLM model with dynamo
python3 -m dynamo.trtllm \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--extra-engine-args /tmp/kvbm_llm_api_config.yaml &
# make a call to LLM
curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
......@@ -104,7 +104,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
Alternatively, you can use "trtllm-serve" with KVBM by replacing the two [DYNAMO] commands above with the following:
```bash
trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
trtllm-serve Qwen/Qwen3-0.6B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
```
## Enable and View KVBM Metrics
......@@ -118,8 +118,8 @@ docker compose -f deploy/docker-compose.yml --profile metrics up -d
# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
DYN_KVBM_METRICS=true \
python3 -m dynamo.trtllm \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--extra-engine-args /tmp/kvbm_llm_api_config.yaml &
# optional: only needed if a firewall blocks the KVBM metrics port used to expose prometheus metrics
......@@ -138,7 +138,7 @@ git clone https://github.com/LMCache/LMBenchmark.git
# we are passing model, endpoint, output file prefix and qps to the sh script.
cd LMBenchmark/synthetic-multi-round-qa
./long_input_short_output_run.sh \
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
"Qwen/Qwen3-0.6B" \
"http://localhost:8000" \
"benchmark_kvbm" \
1
......@@ -160,5 +160,5 @@ kv_cache_config:
EOF
# run trtllm-serve for the baseline for comparison
trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml &
trtllm-serve Qwen/Qwen3-0.6B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml &
```
......@@ -85,7 +85,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
Alternatively, you can run `vllm serve` directly to use KVBM for aggregated serving:
```bash
vllm serve --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}' deepseek-ai/DeepSeek-R1-Distill-Llama-8B
vllm serve --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}' Qwen/Qwen3-0.6B
```
## Enable and View KVBM Metrics
......@@ -97,9 +97,11 @@ docker compose -f deploy/docker-compose.yml --profile metrics up -d
# set env var DYN_KVBM_METRICS to true when launching via dynamo
# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
# NOTE: update launch/disagg_kvbm.sh or launch/disagg_kvbm_2p2d.sh as needed
DYN_KVBM_METRICS=true \
python -m dynamo.vllm \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model Qwen/Qwen3-0.6B \
--enforce-eager \
--connector kvbm &
# optional: only needed if a firewall blocks the KVBM metrics port used to expose prometheus metrics
......@@ -118,7 +120,7 @@ git clone https://github.com/LMCache/LMBenchmark.git
# we are passing model, endpoint, output file prefix and qps to the sh script.
cd LMBenchmark/synthetic-multi-round-qa
./long_input_short_output_run.sh \
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
"Qwen/Qwen3-0.6B" \
"http://localhost:8000" \
"benchmark_kvbm" \
1
......@@ -129,4 +131,4 @@ More details about how to use LMBenchmark could be found [here](https://github.c
`NOTE`: If metrics are enabled as described in the section above, you can observe KV offloading and KV onboarding in the Grafana dashboard.
To compare, you can run `vllm serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B` to turn KVBM off as the baseline.
To compare, you can run `vllm serve Qwen/Qwen3-0.6B` to turn KVBM off as the baseline.
......@@ -240,6 +240,7 @@ pub struct BlockManagerBuilder {
leader: Option<distributed::KvbmLeader>,
page_size: usize,
disable_device_pool: bool,
kvbm_metrics: Option<dynamo_llm::block_manager::metrics_kvbm::KvbmMetrics>,
}
impl BlockManagerBuilder {
......@@ -266,6 +267,13 @@ impl BlockManagerBuilder {
self.disable_device_pool = yes;
self
}
/// Sets the KVBM metrics handle on this builder.
///
/// The handle is stored as `Some(metrics)` in the builder's `kvbm_metrics`
/// field, replacing any value set earlier, and the builder is returned by
/// value so calls can be chained.
pub fn kvbm_metrics(
    self,
    metrics: dynamo_llm::block_manager::metrics_kvbm::KvbmMetrics,
) -> Self {
    let mut builder = self;
    // `replace` hands back the previous value; it is dropped right here,
    // exactly as a plain assignment would drop it.
    let _previous = builder.kvbm_metrics.replace(metrics);
    builder
}
/// Async build (call from an async context).
pub async fn build(self) -> Result<BlockManager> {
......@@ -325,7 +333,11 @@ impl BlockManagerBuilder {
);
}
let config = config.build()?;
let mut config_builder = config;
if let Some(kvbm_metrics) = self.kvbm_metrics {
config_builder = config_builder.kvbm_metrics(Some(kvbm_metrics));
}
let config = config_builder.build()?;
let resources =
DistributedLeaderWorkerResources::new(Some(leader_inner), cancel_token.child_token())?;
......
......@@ -129,6 +129,7 @@ impl KvConnectorLeader {
.leader(leader_py)
.page_size(page_size)
.disable_device_pool(false)
.kvbm_metrics(kvbm_metrics_clone.clone())
.build()
.await
{
......
......@@ -145,6 +145,7 @@ impl KvConnectorLeaderRecorder {
.leader(leader_py)
.page_size(page_size)
.disable_device_pool(false)
.kvbm_metrics(kvbm_metrics_clone.clone())
.build()
.await
{
......
......@@ -1277,7 +1277,6 @@ async fn process_offload_request(
leader: &Arc<KvbmLeader>,
kvbm_metrics: KvbmMetrics,
) -> anyhow::Result<()> {
kvbm_metrics.offload_requests.inc();
kvbm_metrics
.offload_blocks_d2h
.inc_by(offload_req.block_ids.len() as u64);
......@@ -1389,7 +1388,6 @@ async fn process_onboard_request(
leader: &Arc<KvbmLeader>,
kvbm_metrics: KvbmMetrics,
) -> anyhow::Result<()> {
kvbm_metrics.onboard_requests.inc();
if onboard_req.src_blocks.storage_pool() == BlockTransferPool::Host {
kvbm_metrics
.onboard_blocks_h2d
......
......@@ -105,6 +105,7 @@ impl KvConnectorLeader {
.leader(leader_py)
.page_size(page_size)
.disable_device_pool(false)
.kvbm_metrics(kvbm_metrics_clone.clone())
.build()
.await
{
......
......@@ -93,7 +93,7 @@ impl WriteToStrategy<DeviceStorage> for PinnedStorage {
impl WriteToStrategy<DiskStorage> for DeviceStorage {
#[inline(always)]
fn write_to_strategy() -> TransferStrategy {
TransferStrategy::Nixl(NixlTransfer::Read)
TransferStrategy::Nixl(NixlTransfer::Write)
}
}
......
......@@ -199,6 +199,10 @@ pub struct KvBlockManagerConfig {
/// Channel to reset the block manager to a specific cache level
#[builder(default)]
pub block_reset_channel: Option<BlockResetChannel>,
/// Optional KVBM-level metrics for tracking offload/onboard operations
#[builder(default)]
pub kvbm_metrics: Option<crate::block_manager::metrics_kvbm::KvbmMetrics>,
}
impl KvBlockManagerConfig {
......
......@@ -4,8 +4,8 @@
use axum::Router;
use dynamo_runtime::metrics::prometheus_names::{
kvbm::{
MATCHED_TOKENS, OFFLOAD_BLOCKS_D2H, OFFLOAD_REQUESTS, ONBOARD_BLOCKS_D2D,
ONBOARD_BLOCKS_H2D, ONBOARD_REQUESTS,
MATCHED_TOKENS, OFFLOAD_BLOCKS_D2H, OFFLOAD_BLOCKS_H2D, ONBOARD_BLOCKS_D2D,
ONBOARD_BLOCKS_H2D,
},
sanitize_prometheus_name,
};
......@@ -17,14 +17,11 @@ use crate::http::service::{RouteDoc, metrics::router};
#[derive(Clone, Debug)]
pub struct KvbmMetrics {
// number of offload requests
pub offload_requests: IntCounter,
// number of blocks offloaded from device to host
pub offload_blocks_d2h: IntCounter,
// number of onboard requests
pub onboard_requests: IntCounter,
// number of blocks offloaded from host to disk
pub offload_blocks_h2d: IntCounter,
// number of blocks onboarded from host to device
pub onboard_blocks_h2d: IntCounter,
......@@ -43,9 +40,6 @@ impl KvbmMetrics {
/// Non-blocking: the HTTP server runs on a background task.
pub fn new(mr: &KvbmMetricsRegistry, create_endpoint: bool, metrics_port: u16) -> Self {
// 1) register kvbm metrics
let offload_requests = mr
.create_intcounter(OFFLOAD_REQUESTS, "The number of offload requests", &[])
.unwrap();
let offload_blocks_d2h = mr
.create_intcounter(
OFFLOAD_BLOCKS_D2H,
......@@ -53,8 +47,12 @@ impl KvbmMetrics {
&[],
)
.unwrap();
let onboard_requests = mr
.create_intcounter(ONBOARD_REQUESTS, "The number of onboard requests", &[])
let offload_blocks_h2d = mr
.create_intcounter(
OFFLOAD_BLOCKS_H2D,
"The number of offload blocks from host to disk",
&[],
)
.unwrap();
let onboard_blocks_h2d = mr
.create_intcounter(
......@@ -77,9 +75,8 @@ impl KvbmMetrics {
// early return if no endpoint is needed
if !create_endpoint {
return Self {
offload_requests,
offload_blocks_d2h,
onboard_requests,
offload_blocks_h2d,
onboard_blocks_h2d,
onboard_blocks_d2d,
matched_tokens,
......@@ -131,9 +128,8 @@ impl KvbmMetrics {
}
Self {
offload_requests,
offload_blocks_d2h,
onboard_requests,
offload_blocks_h2d,
onboard_blocks_h2d,
onboard_blocks_d2d,
matched_tokens,
......
......@@ -80,6 +80,8 @@ pub struct OffloadManagerConfig {
pub metrics: Arc<BlockManagerMetrics>,
pub cancellation_token: CancellationToken,
pub model_config: KvManagerModelConfig,
/// Optional KVBM-level metrics for tracking offload/onboard operations
pub kvbm_metrics: Option<crate::block_manager::metrics_kvbm::KvbmMetrics>,
}
/// The offload manager handles all block transfers between different cache levels.
......@@ -101,6 +103,9 @@ pub struct OffloadManager<Locality: LocalityProvider, Metadata: BlockMetadata> {
/// An incrementing counter for offloaded blocks. Within the same priority, blocks with lower tick values are processed first.
tick: Arc<AtomicU64>,
/// Optional KVBM-level metrics for tracking offload/onboard operations
kvbm_metrics: Option<crate::block_manager::metrics_kvbm::KvbmMetrics>,
}
impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
......@@ -129,6 +134,7 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
host_onboard_tx,
disk_onboard_tx,
tick: Arc::new(AtomicU64::new(0)),
kvbm_metrics: config.kvbm_metrics.clone(),
});
let cuda_ctx = Cuda::device_or_create(0)?;
......@@ -485,6 +491,11 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
key,
};
// Track metrics if available
if let Some(ref kvbm_metrics) = self.kvbm_metrics {
kvbm_metrics.offload_blocks_d2h.inc();
}
self.device_offload_tx.send(request).unwrap();
} else if let Some(host_block) =
any_block.downcast_ref::<ImmutableBlock<PinnedStorage, Locality, Metadata>>()
......@@ -500,6 +511,11 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
key,
};
// Track metrics if available
if let Some(ref kvbm_metrics) = self.kvbm_metrics {
kvbm_metrics.offload_blocks_h2d.inc();
}
self.host_offload_tx.send(request).unwrap();
}
......
......@@ -158,6 +158,7 @@ impl<R: LogicalResources, Metadata: BlockMetadata>
metrics: resources.metrics.clone(),
cancellation_token: resources.cancellation_token.clone(),
model_config,
kvbm_metrics: resources.config.kvbm_metrics.clone(),
};
let offload_manager = OffloadManager::new(
......@@ -280,6 +281,7 @@ impl<Metadata: BlockMetadata> KvBlockManagerState<locality::Local, Metadata> {
metrics: resources.metrics.clone(),
cancellation_token: resources.cancellation_token.clone(),
model_config,
kvbm_metrics: resources.config.kvbm_metrics.clone(),
};
let offload_manager = OffloadManager::new(
......
......@@ -320,14 +320,11 @@ pub mod distributed_runtime {
/// KVBM
pub mod kvbm {
/// The number of offload requests
pub const OFFLOAD_REQUESTS: &str = "offload_requests";
/// The number of offload blocks from device to host
pub const OFFLOAD_BLOCKS_D2H: &str = "offload_blocks_d2h";
/// The number of onboard requests
pub const ONBOARD_REQUESTS: &str = "onboard_requests";
/// The number of offload blocks from host to disk
pub const OFFLOAD_BLOCKS_H2D: &str = "offload_blocks_h2d";
/// The number of onboard blocks from host to device
pub const ONBOARD_BLOCKS_H2D: &str = "onboard_blocks_h2d";
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment