Unverified Commit ca674098 authored by Ziqi Fan's avatar Ziqi Fan Committed by GitHub
Browse files

feat: add KVBM host to disk metrics | clean up dashboard (#3534)


Signed-off-by: default avatarZiqi Fan <ziqif@nvidia.com>
parent b954a249
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
"editable": true, "editable": true,
"fiscalYearStartMonth": 0, "fiscalYearStartMonth": 0,
"graphTooltip": 0, "graphTooltip": 0,
"id": 6, "id": 1,
"links": [], "links": [],
"panels": [ "panels": [
{ {
...@@ -209,7 +209,7 @@ ...@@ -209,7 +209,7 @@
"x": 0, "x": 0,
"y": 10 "y": 10
}, },
"id": 2, "id": 3,
"options": { "options": {
"legend": { "legend": {
"calcs": [], "calcs": [],
...@@ -228,7 +228,7 @@ ...@@ -228,7 +228,7 @@
{ {
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
"expr": "kvbm_offload_requests", "expr": "kvbm_offload_blocks_d2h",
"fullMetaSearch": false, "fullMetaSearch": false,
"includeNullMetadata": true, "includeNullMetadata": true,
"legendFormat": "__auto", "legendFormat": "__auto",
...@@ -237,7 +237,7 @@ ...@@ -237,7 +237,7 @@
"useBackend": false "useBackend": false
} }
], ],
"title": "Offload Requests", "title": "Offload Blocks - Device to Host",
"type": "timeseries" "type": "timeseries"
}, },
{ {
...@@ -305,7 +305,7 @@ ...@@ -305,7 +305,7 @@
"x": 12, "x": 12,
"y": 10 "y": 10
}, },
"id": 3, "id": 11,
"options": { "options": {
"legend": { "legend": {
"calcs": [], "calcs": [],
...@@ -324,7 +324,7 @@ ...@@ -324,7 +324,7 @@
{ {
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
"expr": "kvbm_offload_blocks_d2h", "expr": "kvbm_offload_blocks_h2d",
"fullMetaSearch": false, "fullMetaSearch": false,
"includeNullMetadata": true, "includeNullMetadata": true,
"legendFormat": "__auto", "legendFormat": "__auto",
...@@ -333,7 +333,7 @@ ...@@ -333,7 +333,7 @@
"useBackend": false "useBackend": false
} }
], ],
"title": "Offload Blocks - Device to Host", "title": "Offload Blocks - Host to Disk",
"type": "timeseries" "type": "timeseries"
}, },
{ {
...@@ -342,7 +342,7 @@ ...@@ -342,7 +342,7 @@
"h": 1, "h": 1,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 26 "y": 18
}, },
"id": 6, "id": 6,
"panels": [], "panels": [],
...@@ -412,103 +412,7 @@ ...@@ -412,103 +412,7 @@
"h": 8, "h": 8,
"w": 12, "w": 12,
"x": 0, "x": 0,
"y": 27 "y": 19
},
"id": 9,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "single",
"sort": "none"
}
},
"pluginVersion": "12.0.1",
"targets": [
{
"disableTextWrap": false,
"editorMode": "code",
"expr": "kvbm_onboard_requests",
"fullMetaSearch": false,
"includeNullMetadata": true,
"legendFormat": "__auto",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Onboard Requests",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 27
}, },
"id": 4, "id": 4,
"options": { "options": {
...@@ -603,8 +507,8 @@ ...@@ -603,8 +507,8 @@
"gridPos": { "gridPos": {
"h": 8, "h": 8,
"w": 12, "w": 12,
"x": 0, "x": 12,
"y": 35 "y": 19
}, },
"id": 8, "id": 8,
"options": { "options": {
...@@ -639,7 +543,7 @@ ...@@ -639,7 +543,7 @@
} }
], ],
"preload": false, "preload": false,
"refresh": "auto", "refresh": "5s",
"schemaVersion": 41, "schemaVersion": 41,
"tags": [], "tags": [],
"templating": { "templating": {
...@@ -653,5 +557,5 @@ ...@@ -653,5 +557,5 @@
"timezone": "browser", "timezone": "browser",
"title": "KVBM Dashboard", "title": "KVBM Dashboard",
"uid": "3f679257-70a5-402c-92b4-05382337b548", "uid": "3f679257-70a5-402c-92b4-05382337b548",
"version": 7 "version": 14
} }
\ No newline at end of file
...@@ -83,13 +83,13 @@ python3 -m dynamo.frontend --http-port 8000 & ...@@ -83,13 +83,13 @@ python3 -m dynamo.frontend --http-port 8000 &
# [DYNAMO] To serve an LLM model with dynamo # [DYNAMO] To serve an LLM model with dynamo
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name Qwen/Qwen3-0.6B \
--extra-engine-args /tmp/kvbm_llm_api_config.yaml & --extra-engine-args /tmp/kvbm_llm_api_config.yaml &
# make a call to LLM # make a call to LLM
curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "model": "Qwen/Qwen3-0.6B",
"messages": [ "messages": [
{ {
"role": "user", "role": "user",
...@@ -104,7 +104,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" ...@@ -104,7 +104,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
Alternatively, can use "trtllm-serve" with KVBM by replacing the above two [DYNAMO] cmds with below: Alternatively, can use "trtllm-serve" with KVBM by replacing the above two [DYNAMO] cmds with below:
```bash ```bash
trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml trtllm-serve Qwen/Qwen3-0.6B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
``` ```
## Enable and View KVBM Metrics ## Enable and View KVBM Metrics
...@@ -118,8 +118,8 @@ docker compose -f deploy/docker-compose.yml --profile metrics up -d ...@@ -118,8 +118,8 @@ docker compose -f deploy/docker-compose.yml --profile metrics up -d
# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880). # Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
DYN_KVBM_METRICS=true \ DYN_KVBM_METRICS=true \
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name Qwen/Qwen3-0.6B \
--extra-engine-args /tmp/kvbm_llm_api_config.yaml & --extra-engine-args /tmp/kvbm_llm_api_config.yaml &
# optional if firewall blocks KVBM metrics ports to send prometheus metrics # optional if firewall blocks KVBM metrics ports to send prometheus metrics
...@@ -138,7 +138,7 @@ git clone https://github.com/LMCache/LMBenchmark.git ...@@ -138,7 +138,7 @@ git clone https://github.com/LMCache/LMBenchmark.git
# we are passing model, endpoint, output file prefix and qps to the sh script. # we are passing model, endpoint, output file prefix and qps to the sh script.
cd LMBenchmark/synthetic-multi-round-qa cd LMBenchmark/synthetic-multi-round-qa
./long_input_short_output_run.sh \ ./long_input_short_output_run.sh \
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \ "Qwen/Qwen3-0.6B" \
"http://localhost:8000" \ "http://localhost:8000" \
"benchmark_kvbm" \ "benchmark_kvbm" \
1 1
...@@ -160,5 +160,5 @@ kv_cache_config: ...@@ -160,5 +160,5 @@ kv_cache_config:
EOF EOF
# run trtllm-serve for the baseline for comparison # run trtllm-serve for the baseline for comparison
trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml & trtllm-serve Qwen/Qwen3-0.6B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml &
``` ```
...@@ -85,7 +85,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" ...@@ -85,7 +85,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
Alternatively, can use `vllm serve` directly to use KVBM for aggregated serving: Alternatively, can use `vllm serve` directly to use KVBM for aggregated serving:
```bash ```bash
vllm serve --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}' deepseek-ai/DeepSeek-R1-Distill-Llama-8B vllm serve --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}' Qwen/Qwen3-0.6B
``` ```
## Enable and View KVBM Metrics ## Enable and View KVBM Metrics
...@@ -97,9 +97,11 @@ docker compose -f deploy/docker-compose.yml --profile metrics up -d ...@@ -97,9 +97,11 @@ docker compose -f deploy/docker-compose.yml --profile metrics up -d
# set env var DYN_KVBM_METRICS to true, when launch via dynamo # set env var DYN_KVBM_METRICS to true, when launch via dynamo
# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880). # Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
# NOTE: update launch/disagg_kvbm.sh or launch/disagg_kvbm_2p2d.sh as needed
DYN_KVBM_METRICS=true \ DYN_KVBM_METRICS=true \
python -m dynamo.vllm \ python -m dynamo.vllm \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model Qwen/Qwen3-0.6B \
--enforce-eager \
--connector kvbm & --connector kvbm &
# optional if firewall blocks KVBM metrics ports to send prometheus metrics # optional if firewall blocks KVBM metrics ports to send prometheus metrics
...@@ -118,7 +120,7 @@ git clone https://github.com/LMCache/LMBenchmark.git ...@@ -118,7 +120,7 @@ git clone https://github.com/LMCache/LMBenchmark.git
# we are passing model, endpoint, output file prefix and qps to the sh script. # we are passing model, endpoint, output file prefix and qps to the sh script.
cd LMBenchmark/synthetic-multi-round-qa cd LMBenchmark/synthetic-multi-round-qa
./long_input_short_output_run.sh \ ./long_input_short_output_run.sh \
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \ "Qwen/Qwen3-0.6B" \
"http://localhost:8000" \ "http://localhost:8000" \
"benchmark_kvbm" \ "benchmark_kvbm" \
1 1
...@@ -129,4 +131,4 @@ More details about how to use LMBenchmark could be found [here](https://github.c ...@@ -129,4 +131,4 @@ More details about how to use LMBenchmark could be found [here](https://github.c
`NOTE`: if metrics are enabled as mentioned in the above section, you can observe KV offloading, and KV onboarding in the grafana dashboard. `NOTE`: if metrics are enabled as mentioned in the above section, you can observe KV offloading, and KV onboarding in the grafana dashboard.
To compare, you can run `vllm serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B` to turn KVBM off as the baseline. To compare, you can run `vllm serve Qwen/Qwen3-0.6B` to turn KVBM off as the baseline.
...@@ -240,6 +240,7 @@ pub struct BlockManagerBuilder { ...@@ -240,6 +240,7 @@ pub struct BlockManagerBuilder {
leader: Option<distributed::KvbmLeader>, leader: Option<distributed::KvbmLeader>,
page_size: usize, page_size: usize,
disable_device_pool: bool, disable_device_pool: bool,
kvbm_metrics: Option<dynamo_llm::block_manager::metrics_kvbm::KvbmMetrics>,
} }
impl BlockManagerBuilder { impl BlockManagerBuilder {
...@@ -266,6 +267,13 @@ impl BlockManagerBuilder { ...@@ -266,6 +267,13 @@ impl BlockManagerBuilder {
self.disable_device_pool = yes; self.disable_device_pool = yes;
self self
} }
pub fn kvbm_metrics(
mut self,
metrics: dynamo_llm::block_manager::metrics_kvbm::KvbmMetrics,
) -> Self {
self.kvbm_metrics = Some(metrics);
self
}
/// Async build (call from an async context). /// Async build (call from an async context).
pub async fn build(self) -> Result<BlockManager> { pub async fn build(self) -> Result<BlockManager> {
...@@ -325,7 +333,11 @@ impl BlockManagerBuilder { ...@@ -325,7 +333,11 @@ impl BlockManagerBuilder {
); );
} }
let config = config.build()?; let mut config_builder = config;
if let Some(kvbm_metrics) = self.kvbm_metrics {
config_builder = config_builder.kvbm_metrics(Some(kvbm_metrics));
}
let config = config_builder.build()?;
let resources = let resources =
DistributedLeaderWorkerResources::new(Some(leader_inner), cancel_token.child_token())?; DistributedLeaderWorkerResources::new(Some(leader_inner), cancel_token.child_token())?;
......
...@@ -129,6 +129,7 @@ impl KvConnectorLeader { ...@@ -129,6 +129,7 @@ impl KvConnectorLeader {
.leader(leader_py) .leader(leader_py)
.page_size(page_size) .page_size(page_size)
.disable_device_pool(false) .disable_device_pool(false)
.kvbm_metrics(kvbm_metrics_clone.clone())
.build() .build()
.await .await
{ {
......
...@@ -145,6 +145,7 @@ impl KvConnectorLeaderRecorder { ...@@ -145,6 +145,7 @@ impl KvConnectorLeaderRecorder {
.leader(leader_py) .leader(leader_py)
.page_size(page_size) .page_size(page_size)
.disable_device_pool(false) .disable_device_pool(false)
.kvbm_metrics(kvbm_metrics_clone.clone())
.build() .build()
.await .await
{ {
......
...@@ -1277,7 +1277,6 @@ async fn process_offload_request( ...@@ -1277,7 +1277,6 @@ async fn process_offload_request(
leader: &Arc<KvbmLeader>, leader: &Arc<KvbmLeader>,
kvbm_metrics: KvbmMetrics, kvbm_metrics: KvbmMetrics,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
kvbm_metrics.offload_requests.inc();
kvbm_metrics kvbm_metrics
.offload_blocks_d2h .offload_blocks_d2h
.inc_by(offload_req.block_ids.len() as u64); .inc_by(offload_req.block_ids.len() as u64);
...@@ -1389,7 +1388,6 @@ async fn process_onboard_request( ...@@ -1389,7 +1388,6 @@ async fn process_onboard_request(
leader: &Arc<KvbmLeader>, leader: &Arc<KvbmLeader>,
kvbm_metrics: KvbmMetrics, kvbm_metrics: KvbmMetrics,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
kvbm_metrics.onboard_requests.inc();
if onboard_req.src_blocks.storage_pool() == BlockTransferPool::Host { if onboard_req.src_blocks.storage_pool() == BlockTransferPool::Host {
kvbm_metrics kvbm_metrics
.onboard_blocks_h2d .onboard_blocks_h2d
......
...@@ -105,6 +105,7 @@ impl KvConnectorLeader { ...@@ -105,6 +105,7 @@ impl KvConnectorLeader {
.leader(leader_py) .leader(leader_py)
.page_size(page_size) .page_size(page_size)
.disable_device_pool(false) .disable_device_pool(false)
.kvbm_metrics(kvbm_metrics_clone.clone())
.build() .build()
.await .await
{ {
......
...@@ -93,7 +93,7 @@ impl WriteToStrategy<DeviceStorage> for PinnedStorage { ...@@ -93,7 +93,7 @@ impl WriteToStrategy<DeviceStorage> for PinnedStorage {
impl WriteToStrategy<DiskStorage> for DeviceStorage { impl WriteToStrategy<DiskStorage> for DeviceStorage {
#[inline(always)] #[inline(always)]
fn write_to_strategy() -> TransferStrategy { fn write_to_strategy() -> TransferStrategy {
TransferStrategy::Nixl(NixlTransfer::Read) TransferStrategy::Nixl(NixlTransfer::Write)
} }
} }
......
...@@ -199,6 +199,10 @@ pub struct KvBlockManagerConfig { ...@@ -199,6 +199,10 @@ pub struct KvBlockManagerConfig {
/// Channel to reset the block manager to a specific cache level /// Channel to reset the block manager to a specific cache level
#[builder(default)] #[builder(default)]
pub block_reset_channel: Option<BlockResetChannel>, pub block_reset_channel: Option<BlockResetChannel>,
/// Optional KVBM-level metrics for tracking offload/onboard operations
#[builder(default)]
pub kvbm_metrics: Option<crate::block_manager::metrics_kvbm::KvbmMetrics>,
} }
impl KvBlockManagerConfig { impl KvBlockManagerConfig {
......
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
use axum::Router; use axum::Router;
use dynamo_runtime::metrics::prometheus_names::{ use dynamo_runtime::metrics::prometheus_names::{
kvbm::{ kvbm::{
MATCHED_TOKENS, OFFLOAD_BLOCKS_D2H, OFFLOAD_REQUESTS, ONBOARD_BLOCKS_D2D, MATCHED_TOKENS, OFFLOAD_BLOCKS_D2H, OFFLOAD_BLOCKS_H2D, ONBOARD_BLOCKS_D2D,
ONBOARD_BLOCKS_H2D, ONBOARD_REQUESTS, ONBOARD_BLOCKS_H2D,
}, },
sanitize_prometheus_name, sanitize_prometheus_name,
}; };
...@@ -17,14 +17,11 @@ use crate::http::service::{RouteDoc, metrics::router}; ...@@ -17,14 +17,11 @@ use crate::http::service::{RouteDoc, metrics::router};
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct KvbmMetrics { pub struct KvbmMetrics {
// number of offload requests
pub offload_requests: IntCounter,
// number of blocks offloaded from device to host // number of blocks offloaded from device to host
pub offload_blocks_d2h: IntCounter, pub offload_blocks_d2h: IntCounter,
// number of onboard requests // number of blocks offloaded from host to disk
pub onboard_requests: IntCounter, pub offload_blocks_h2d: IntCounter,
// number of blocks onboarded from host to device // number of blocks onboarded from host to device
pub onboard_blocks_h2d: IntCounter, pub onboard_blocks_h2d: IntCounter,
...@@ -43,9 +40,6 @@ impl KvbmMetrics { ...@@ -43,9 +40,6 @@ impl KvbmMetrics {
/// Non-blocking: the HTTP server runs on a background task. /// Non-blocking: the HTTP server runs on a background task.
pub fn new(mr: &KvbmMetricsRegistry, create_endpoint: bool, metrics_port: u16) -> Self { pub fn new(mr: &KvbmMetricsRegistry, create_endpoint: bool, metrics_port: u16) -> Self {
// 1) register kvbm metrics // 1) register kvbm metrics
let offload_requests = mr
.create_intcounter(OFFLOAD_REQUESTS, "The number of offload requests", &[])
.unwrap();
let offload_blocks_d2h = mr let offload_blocks_d2h = mr
.create_intcounter( .create_intcounter(
OFFLOAD_BLOCKS_D2H, OFFLOAD_BLOCKS_D2H,
...@@ -53,8 +47,12 @@ impl KvbmMetrics { ...@@ -53,8 +47,12 @@ impl KvbmMetrics {
&[], &[],
) )
.unwrap(); .unwrap();
let onboard_requests = mr let offload_blocks_h2d = mr
.create_intcounter(ONBOARD_REQUESTS, "The number of onboard requests", &[]) .create_intcounter(
OFFLOAD_BLOCKS_H2D,
"The number of offload blocks from host to disk",
&[],
)
.unwrap(); .unwrap();
let onboard_blocks_h2d = mr let onboard_blocks_h2d = mr
.create_intcounter( .create_intcounter(
...@@ -77,9 +75,8 @@ impl KvbmMetrics { ...@@ -77,9 +75,8 @@ impl KvbmMetrics {
// early return if no endpoint is needed // early return if no endpoint is needed
if !create_endpoint { if !create_endpoint {
return Self { return Self {
offload_requests,
offload_blocks_d2h, offload_blocks_d2h,
onboard_requests, offload_blocks_h2d,
onboard_blocks_h2d, onboard_blocks_h2d,
onboard_blocks_d2d, onboard_blocks_d2d,
matched_tokens, matched_tokens,
...@@ -131,9 +128,8 @@ impl KvbmMetrics { ...@@ -131,9 +128,8 @@ impl KvbmMetrics {
} }
Self { Self {
offload_requests,
offload_blocks_d2h, offload_blocks_d2h,
onboard_requests, offload_blocks_h2d,
onboard_blocks_h2d, onboard_blocks_h2d,
onboard_blocks_d2d, onboard_blocks_d2d,
matched_tokens, matched_tokens,
......
...@@ -80,6 +80,8 @@ pub struct OffloadManagerConfig { ...@@ -80,6 +80,8 @@ pub struct OffloadManagerConfig {
pub metrics: Arc<BlockManagerMetrics>, pub metrics: Arc<BlockManagerMetrics>,
pub cancellation_token: CancellationToken, pub cancellation_token: CancellationToken,
pub model_config: KvManagerModelConfig, pub model_config: KvManagerModelConfig,
/// Optional KVBM-level metrics for tracking offload/onboard operations
pub kvbm_metrics: Option<crate::block_manager::metrics_kvbm::KvbmMetrics>,
} }
/// The offload manager handles all block transfers between different cache levels. /// The offload manager handles all block transfers between different cache levels.
...@@ -101,6 +103,9 @@ pub struct OffloadManager<Locality: LocalityProvider, Metadata: BlockMetadata> { ...@@ -101,6 +103,9 @@ pub struct OffloadManager<Locality: LocalityProvider, Metadata: BlockMetadata> {
/// An incrementing counter for offloaded blocks. Within the same priority, blocks with lower tick values are processed first. /// An incrementing counter for offloaded blocks. Within the same priority, blocks with lower tick values are processed first.
tick: Arc<AtomicU64>, tick: Arc<AtomicU64>,
/// Optional KVBM-level metrics for tracking offload/onboard operations
kvbm_metrics: Option<crate::block_manager::metrics_kvbm::KvbmMetrics>,
} }
impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata> impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
...@@ -129,6 +134,7 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata> ...@@ -129,6 +134,7 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
host_onboard_tx, host_onboard_tx,
disk_onboard_tx, disk_onboard_tx,
tick: Arc::new(AtomicU64::new(0)), tick: Arc::new(AtomicU64::new(0)),
kvbm_metrics: config.kvbm_metrics.clone(),
}); });
let cuda_ctx = Cuda::device_or_create(0)?; let cuda_ctx = Cuda::device_or_create(0)?;
...@@ -485,6 +491,11 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata> ...@@ -485,6 +491,11 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
key, key,
}; };
// Track metrics if available
if let Some(ref kvbm_metrics) = self.kvbm_metrics {
kvbm_metrics.offload_blocks_d2h.inc();
}
self.device_offload_tx.send(request).unwrap(); self.device_offload_tx.send(request).unwrap();
} else if let Some(host_block) = } else if let Some(host_block) =
any_block.downcast_ref::<ImmutableBlock<PinnedStorage, Locality, Metadata>>() any_block.downcast_ref::<ImmutableBlock<PinnedStorage, Locality, Metadata>>()
...@@ -500,6 +511,11 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata> ...@@ -500,6 +511,11 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
key, key,
}; };
// Track metrics if available
if let Some(ref kvbm_metrics) = self.kvbm_metrics {
kvbm_metrics.offload_blocks_h2d.inc();
}
self.host_offload_tx.send(request).unwrap(); self.host_offload_tx.send(request).unwrap();
} }
......
...@@ -158,6 +158,7 @@ impl<R: LogicalResources, Metadata: BlockMetadata> ...@@ -158,6 +158,7 @@ impl<R: LogicalResources, Metadata: BlockMetadata>
metrics: resources.metrics.clone(), metrics: resources.metrics.clone(),
cancellation_token: resources.cancellation_token.clone(), cancellation_token: resources.cancellation_token.clone(),
model_config, model_config,
kvbm_metrics: resources.config.kvbm_metrics.clone(),
}; };
let offload_manager = OffloadManager::new( let offload_manager = OffloadManager::new(
...@@ -280,6 +281,7 @@ impl<Metadata: BlockMetadata> KvBlockManagerState<locality::Local, Metadata> { ...@@ -280,6 +281,7 @@ impl<Metadata: BlockMetadata> KvBlockManagerState<locality::Local, Metadata> {
metrics: resources.metrics.clone(), metrics: resources.metrics.clone(),
cancellation_token: resources.cancellation_token.clone(), cancellation_token: resources.cancellation_token.clone(),
model_config, model_config,
kvbm_metrics: resources.config.kvbm_metrics.clone(),
}; };
let offload_manager = OffloadManager::new( let offload_manager = OffloadManager::new(
......
...@@ -320,14 +320,11 @@ pub mod distributed_runtime { ...@@ -320,14 +320,11 @@ pub mod distributed_runtime {
/// KVBM /// KVBM
pub mod kvbm { pub mod kvbm {
/// The number of offload requests
pub const OFFLOAD_REQUESTS: &str = "offload_requests";
/// The number of offload blocks from device to host /// The number of offload blocks from device to host
pub const OFFLOAD_BLOCKS_D2H: &str = "offload_blocks_d2h"; pub const OFFLOAD_BLOCKS_D2H: &str = "offload_blocks_d2h";
/// The number of onboard requests /// The number of offload blocks from host to disk
pub const ONBOARD_REQUESTS: &str = "onboard_requests"; pub const OFFLOAD_BLOCKS_H2D: &str = "offload_blocks_h2d";
/// The number of onboard blocks from host to device /// The number of onboard blocks from host to device
pub const ONBOARD_BLOCKS_H2D: &str = "onboard_blocks_h2d"; pub const ONBOARD_BLOCKS_H2D: &str = "onboard_blocks_h2d";
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment