Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
ca674098
Unverified
Commit
ca674098
authored
Oct 10, 2025
by
Ziqi Fan
Committed by
GitHub
Oct 10, 2025
Browse files
feat: add KVBM host to disk metrics | clean up dashboard (#3534)
Signed-off-by:
Ziqi Fan
<
ziqif@nvidia.com
>
parent
b954a249
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
81 additions
and
147 deletions
+81
-147
deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json
...oy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json
+14
-110
docs/guides/run_kvbm_in_trtllm.md
docs/guides/run_kvbm_in_trtllm.md
+8
-8
docs/guides/run_kvbm_in_vllm.md
docs/guides/run_kvbm_in_vllm.md
+6
-4
lib/bindings/python/rust/llm/block_manager.rs
lib/bindings/python/rust/llm/block_manager.rs
+13
-1
lib/bindings/python/rust/llm/block_manager/vllm/connector/leader.rs
...gs/python/rust/llm/block_manager/vllm/connector/leader.rs
+1
-0
lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/recorder.rs
.../rust/llm/block_manager/vllm/connector/leader/recorder.rs
+1
-0
lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs
...thon/rust/llm/block_manager/vllm/connector/leader/slot.rs
+0
-2
lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_leader.rs
...on/rust/llm/block_manager/vllm/connector/trtllm_leader.rs
+1
-0
lib/llm/src/block_manager/block/transfer/strategy.rs
lib/llm/src/block_manager/block/transfer/strategy.rs
+1
-1
lib/llm/src/block_manager/config.rs
lib/llm/src/block_manager/config.rs
+4
-0
lib/llm/src/block_manager/metrics_kvbm.rs
lib/llm/src/block_manager/metrics_kvbm.rs
+12
-16
lib/llm/src/block_manager/offload.rs
lib/llm/src/block_manager/offload.rs
+16
-0
lib/llm/src/block_manager/state.rs
lib/llm/src/block_manager/state.rs
+2
-0
lib/runtime/src/metrics/prometheus_names.rs
lib/runtime/src/metrics/prometheus_names.rs
+2
-5
No files found.
deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json
View file @
ca674098
...
...
@@ -19,7 +19,7 @@
"editable"
:
true
,
"fiscalYearStartMonth"
:
0
,
"graphTooltip"
:
0
,
"id"
:
6
,
"id"
:
1
,
"links"
:
[],
"panels"
:
[
{
...
...
@@ -209,7 +209,7 @@
"x"
:
0
,
"y"
:
10
},
"id"
:
2
,
"id"
:
3
,
"options"
:
{
"legend"
:
{
"calcs"
:
[],
...
...
@@ -228,7 +228,7 @@
{
"disableTextWrap"
:
false
,
"editorMode"
:
"code"
,
"expr"
:
"kvbm_offload_
requests
"
,
"expr"
:
"kvbm_offload_
blocks_d2h
"
,
"fullMetaSearch"
:
false
,
"includeNullMetadata"
:
true
,
"legendFormat"
:
"__auto"
,
...
...
@@ -237,7 +237,7 @@
"useBackend"
:
false
}
],
"title"
:
"Offload
Reque
st
s
"
,
"title"
:
"Offload
Blocks - Device to Ho
st"
,
"type"
:
"timeseries"
},
{
...
...
@@ -305,7 +305,7 @@
"x"
:
12
,
"y"
:
10
},
"id"
:
3
,
"id"
:
11
,
"options"
:
{
"legend"
:
{
"calcs"
:
[],
...
...
@@ -324,7 +324,7 @@
{
"disableTextWrap"
:
false
,
"editorMode"
:
"code"
,
"expr"
:
"kvbm_offload_blocks_
d2h
"
,
"expr"
:
"kvbm_offload_blocks_
h2d
"
,
"fullMetaSearch"
:
false
,
"includeNullMetadata"
:
true
,
"legendFormat"
:
"__auto"
,
...
...
@@ -333,7 +333,7 @@
"useBackend"
:
false
}
],
"title"
:
"Offload Blocks -
Device to Host
"
,
"title"
:
"Offload Blocks -
Host to Disk
"
,
"type"
:
"timeseries"
},
{
...
...
@@ -342,7 +342,7 @@
"h"
:
1
,
"w"
:
24
,
"x"
:
0
,
"y"
:
26
"y"
:
18
},
"id"
:
6
,
"panels"
:
[],
...
...
@@ -412,103 +412,7 @@
"h"
:
8
,
"w"
:
12
,
"x"
:
0
,
"y"
:
27
},
"id"
:
9
,
"options"
:
{
"legend"
:
{
"calcs"
:
[],
"displayMode"
:
"list"
,
"placement"
:
"bottom"
,
"showLegend"
:
true
},
"tooltip"
:
{
"hideZeros"
:
false
,
"mode"
:
"single"
,
"sort"
:
"none"
}
},
"pluginVersion"
:
"12.0.1"
,
"targets"
:
[
{
"disableTextWrap"
:
false
,
"editorMode"
:
"code"
,
"expr"
:
"kvbm_onboard_requests"
,
"fullMetaSearch"
:
false
,
"includeNullMetadata"
:
true
,
"legendFormat"
:
"__auto"
,
"range"
:
true
,
"refId"
:
"A"
,
"useBackend"
:
false
}
],
"title"
:
"Onboard Requests"
,
"type"
:
"timeseries"
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"P1809F7CD0C75ACF3"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisBorderShow"
:
false
,
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"barWidthFactor"
:
0.6
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
0
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"insertNulls"
:
false
,
"lineInterpolation"
:
"linear"
,
"lineWidth"
:
1
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"auto"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
},
{
"color"
:
"red"
,
"value"
:
80
}
]
}
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
12
,
"y"
:
27
"y"
:
19
},
"id"
:
4
,
"options"
:
{
...
...
@@ -603,8 +507,8 @@
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
0
,
"y"
:
35
"x"
:
12
,
"y"
:
19
},
"id"
:
8
,
"options"
:
{
...
...
@@ -639,7 +543,7 @@
}
],
"preload"
:
false
,
"refresh"
:
"
auto
"
,
"refresh"
:
"
5s
"
,
"schemaVersion"
:
41
,
"tags"
:
[],
"templating"
:
{
...
...
@@ -653,5 +557,5 @@
"timezone"
:
"browser"
,
"title"
:
"KVBM Dashboard"
,
"uid"
:
"3f679257-70a5-402c-92b4-05382337b548"
,
"version"
:
7
"version"
:
14
}
\ No newline at end of file
docs/guides/run_kvbm_in_trtllm.md
View file @
ca674098
...
...
@@ -83,13 +83,13 @@ python3 -m dynamo.frontend --http-port 8000 &
# [DYNAMO] To serve an LLM model with dynamo
python3
-m
dynamo.trtllm
\
--model-path
deepseek-ai/DeepSeek-R1-Distill-Llama-8
B
\
--served-model-name
deepseek-ai/DeepSeek-R1-Distill-Llama-8
B
\
--model-path
Qwen/Qwen3-0.6
B
\
--served-model-name
Qwen/Qwen3-0.6
B
\
--extra-engine-args
/tmp/kvbm_llm_api_config.yaml &
# make a call to LLM
curl localhost:8000/v1/chat/completions
-H
"Content-Type: application/json"
-d
'{
"model": "
deepseek-ai/DeepSeek-R1-Distill-Llama-8
B",
"model": "
Qwen/Qwen3-0.6
B",
"messages": [
{
"role": "user",
...
...
@@ -104,7 +104,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
Alternatively, can use "trtllm-serve" with KVBM by replacing the above two [DYNAMO] cmds with below:
```
bash
trtllm-serve
deepseek-ai/DeepSeek-R1-Distill-Llama-8
B
--host
localhost
--port
8000
--backend
pytorch
--extra_llm_api_options
/tmp/kvbm_llm_api_config.yaml
trtllm-serve
Qwen/Qwen3-0.6
B
--host
localhost
--port
8000
--backend
pytorch
--extra_llm_api_options
/tmp/kvbm_llm_api_config.yaml
```
## Enable and View KVBM Metrics
...
...
@@ -118,8 +118,8 @@ docker compose -f deploy/docker-compose.yml --profile metrics up -d
# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
DYN_KVBM_METRICS
=
true
\
python3
-m
dynamo.trtllm
\
--model-path
deepseek-ai/DeepSeek-R1-Distill-Llama-8
B
\
--served-model-name
deepseek-ai/DeepSeek-R1-Distill-Llama-8
B
\
--model-path
Qwen/Qwen3-0.6
B
\
--served-model-name
Qwen/Qwen3-0.6
B
\
--extra-engine-args
/tmp/kvbm_llm_api_config.yaml &
# optional if firewall blocks KVBM metrics ports to send prometheus metrics
...
...
@@ -138,7 +138,7 @@ git clone https://github.com/LMCache/LMBenchmark.git
# we are passing model, endpoint, output file prefix and qps to the sh script.
cd
LMBenchmark/synthetic-multi-round-qa
./long_input_short_output_run.sh
\
"
deepseek-ai/DeepSeek-R1-Distill-Llama-8
B"
\
"
Qwen/Qwen3-0.6
B"
\
"http://localhost:8000"
\
"benchmark_kvbm"
\
1
...
...
@@ -160,5 +160,5 @@ kv_cache_config:
EOF
# run trtllm-serve for the baseline for comparison
trtllm-serve
deepseek-ai/DeepSeek-R1-Distill-Llama-8
B
--host
localhost
--port
8000
--backend
pytorch
--extra_llm_api_options
/tmp/llm_api_config.yaml &
trtllm-serve
Qwen/Qwen3-0.6
B
--host
localhost
--port
8000
--backend
pytorch
--extra_llm_api_options
/tmp/llm_api_config.yaml &
```
docs/guides/run_kvbm_in_vllm.md
View file @
ca674098
...
...
@@ -85,7 +85,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
Alternatively, can use
`vllm serve`
directly to use KVBM for aggregated serving:
```
bash
vllm serve
--kv-transfer-config
'{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}'
deepseek-ai/DeepSeek-R1-Distill-Llama-8
B
vllm serve
--kv-transfer-config
'{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}'
Qwen/Qwen3-0.6
B
```
## Enable and View KVBM Metrics
...
...
@@ -97,9 +97,11 @@ docker compose -f deploy/docker-compose.yml --profile metrics up -d
# set env var DYN_KVBM_METRICS to true, when launch via dynamo
# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
# NOTE: update launch/disagg_kvbm.sh or launch/disagg_kvbm_2p2d.sh as needed
DYN_KVBM_METRICS
=
true
\
python
-m
dynamo.vllm
\
--model
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
\
--model
Qwen/Qwen3-0.6B
\
--enforce-eager
\
--connector
kvbm &
# optional if firewall blocks KVBM metrics ports to send prometheus metrics
...
...
@@ -118,7 +120,7 @@ git clone https://github.com/LMCache/LMBenchmark.git
# we are passing model, endpoint, output file prefix and qps to the sh script.
cd
LMBenchmark/synthetic-multi-round-qa
./long_input_short_output_run.sh
\
"
deepseek-ai/DeepSeek-R1-Distill-Llama-8
B"
\
"
Qwen/Qwen3-0.6
B"
\
"http://localhost:8000"
\
"benchmark_kvbm"
\
1
...
...
@@ -129,4 +131,4 @@ More details about how to use LMBenchmark could be found [here](https://github.c
`NOTE`
: if metrics are enabled as mentioned in the above section, you can observe KV offloading, and KV onboarding in the grafana dashboard.
To compare, you can run
`vllm serve
deepseek-ai/DeepSeek-R1-Distill-Llama-8
B`
to turn KVBM off as the baseline.
To compare, you can run
`vllm serve
Qwen/Qwen3-0.6
B`
to turn KVBM off as the baseline.
lib/bindings/python/rust/llm/block_manager.rs
View file @
ca674098
...
...
@@ -240,6 +240,7 @@ pub struct BlockManagerBuilder {
leader
:
Option
<
distributed
::
KvbmLeader
>
,
page_size
:
usize
,
disable_device_pool
:
bool
,
kvbm_metrics
:
Option
<
dynamo_llm
::
block_manager
::
metrics_kvbm
::
KvbmMetrics
>
,
}
impl
BlockManagerBuilder
{
...
...
@@ -266,6 +267,13 @@ impl BlockManagerBuilder {
self
.disable_device_pool
=
yes
;
self
}
pub
fn
kvbm_metrics
(
mut
self
,
metrics
:
dynamo_llm
::
block_manager
::
metrics_kvbm
::
KvbmMetrics
,
)
->
Self
{
self
.kvbm_metrics
=
Some
(
metrics
);
self
}
/// Async build (call from an async context).
pub
async
fn
build
(
self
)
->
Result
<
BlockManager
>
{
...
...
@@ -325,7 +333,11 @@ impl BlockManagerBuilder {
);
}
let
config
=
config
.build
()
?
;
let
mut
config_builder
=
config
;
if
let
Some
(
kvbm_metrics
)
=
self
.kvbm_metrics
{
config_builder
=
config_builder
.kvbm_metrics
(
Some
(
kvbm_metrics
));
}
let
config
=
config_builder
.build
()
?
;
let
resources
=
DistributedLeaderWorkerResources
::
new
(
Some
(
leader_inner
),
cancel_token
.child_token
())
?
;
...
...
lib/bindings/python/rust/llm/block_manager/vllm/connector/leader.rs
View file @
ca674098
...
...
@@ -129,6 +129,7 @@ impl KvConnectorLeader {
.leader
(
leader_py
)
.page_size
(
page_size
)
.disable_device_pool
(
false
)
.kvbm_metrics
(
kvbm_metrics_clone
.clone
())
.build
()
.await
{
...
...
lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/recorder.rs
View file @
ca674098
...
...
@@ -145,6 +145,7 @@ impl KvConnectorLeaderRecorder {
.leader
(
leader_py
)
.page_size
(
page_size
)
.disable_device_pool
(
false
)
.kvbm_metrics
(
kvbm_metrics_clone
.clone
())
.build
()
.await
{
...
...
lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs
View file @
ca674098
...
...
@@ -1277,7 +1277,6 @@ async fn process_offload_request(
leader
:
&
Arc
<
KvbmLeader
>
,
kvbm_metrics
:
KvbmMetrics
,
)
->
anyhow
::
Result
<
()
>
{
kvbm_metrics
.offload_requests
.inc
();
kvbm_metrics
.offload_blocks_d2h
.inc_by
(
offload_req
.block_ids
.len
()
as
u64
);
...
...
@@ -1389,7 +1388,6 @@ async fn process_onboard_request(
leader
:
&
Arc
<
KvbmLeader
>
,
kvbm_metrics
:
KvbmMetrics
,
)
->
anyhow
::
Result
<
()
>
{
kvbm_metrics
.onboard_requests
.inc
();
if
onboard_req
.src_blocks
.storage_pool
()
==
BlockTransferPool
::
Host
{
kvbm_metrics
.onboard_blocks_h2d
...
...
lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_leader.rs
View file @
ca674098
...
...
@@ -105,6 +105,7 @@ impl KvConnectorLeader {
.leader
(
leader_py
)
.page_size
(
page_size
)
.disable_device_pool
(
false
)
.kvbm_metrics
(
kvbm_metrics_clone
.clone
())
.build
()
.await
{
...
...
lib/llm/src/block_manager/block/transfer/strategy.rs
View file @
ca674098
...
...
@@ -93,7 +93,7 @@ impl WriteToStrategy<DeviceStorage> for PinnedStorage {
impl
WriteToStrategy
<
DiskStorage
>
for
DeviceStorage
{
#[inline(always)]
fn
write_to_strategy
()
->
TransferStrategy
{
TransferStrategy
::
Nixl
(
NixlTransfer
::
Read
)
TransferStrategy
::
Nixl
(
NixlTransfer
::
Write
)
}
}
...
...
lib/llm/src/block_manager/config.rs
View file @
ca674098
...
...
@@ -199,6 +199,10 @@ pub struct KvBlockManagerConfig {
/// Channel to reset the block manager to a specific cache level
#[builder(default)]
pub
block_reset_channel
:
Option
<
BlockResetChannel
>
,
/// Optional KVBM-level metrics for tracking offload/onboard operations
#[builder(default)]
pub
kvbm_metrics
:
Option
<
crate
::
block_manager
::
metrics_kvbm
::
KvbmMetrics
>
,
}
impl
KvBlockManagerConfig
{
...
...
lib/llm/src/block_manager/metrics_kvbm.rs
View file @
ca674098
...
...
@@ -4,8 +4,8 @@
use
axum
::
Router
;
use
dynamo_runtime
::
metrics
::
prometheus_names
::{
kvbm
::{
MATCHED_TOKENS
,
OFFLOAD_BLOCKS_D2H
,
OFFLOAD_
REQUESTS
,
ONBOARD_BLOCKS_D2D
,
ONBOARD_BLOCKS_H2D
,
ONBOARD_REQUESTS
,
MATCHED_TOKENS
,
OFFLOAD_BLOCKS_D2H
,
OFFLOAD_
BLOCKS_H2D
,
ONBOARD_BLOCKS_D2D
,
ONBOARD_BLOCKS_H2D
,
},
sanitize_prometheus_name
,
};
...
...
@@ -17,14 +17,11 @@ use crate::http::service::{RouteDoc, metrics::router};
#[derive(Clone,
Debug)]
pub
struct
KvbmMetrics
{
// number of offload requests
pub
offload_requests
:
IntCounter
,
// number of blocks offloaded from device to host
pub
offload_blocks_d2h
:
IntCounter
,
// number of
onboard requests
pub
o
nb
oa
r
d_
requests
:
IntCounter
,
// number of
blocks offloaded from host to disk
pub
o
ffl
oad_
blocks_h2d
:
IntCounter
,
// number of blocks onboarded from host to device
pub
onboard_blocks_h2d
:
IntCounter
,
...
...
@@ -43,9 +40,6 @@ impl KvbmMetrics {
/// Non-blocking: the HTTP server runs on a background task.
pub
fn
new
(
mr
:
&
KvbmMetricsRegistry
,
create_endpoint
:
bool
,
metrics_port
:
u16
)
->
Self
{
// 1) register kvbm metrics
let
offload_requests
=
mr
.create_intcounter
(
OFFLOAD_REQUESTS
,
"The number of offload requests"
,
&
[])
.unwrap
();
let
offload_blocks_d2h
=
mr
.create_intcounter
(
OFFLOAD_BLOCKS_D2H
,
...
...
@@ -53,8 +47,12 @@ impl KvbmMetrics {
&
[],
)
.unwrap
();
let
onboard_requests
=
mr
.create_intcounter
(
ONBOARD_REQUESTS
,
"The number of onboard requests"
,
&
[])
let
offload_blocks_h2d
=
mr
.create_intcounter
(
OFFLOAD_BLOCKS_H2D
,
"The number of offload blocks from host to disk"
,
&
[],
)
.unwrap
();
let
onboard_blocks_h2d
=
mr
.create_intcounter
(
...
...
@@ -77,9 +75,8 @@ impl KvbmMetrics {
// early return if no endpoint is needed
if
!
create_endpoint
{
return
Self
{
offload_requests
,
offload_blocks_d2h
,
o
nb
oa
r
d_
requests
,
o
ffl
oad_
blocks_h2d
,
onboard_blocks_h2d
,
onboard_blocks_d2d
,
matched_tokens
,
...
...
@@ -131,9 +128,8 @@ impl KvbmMetrics {
}
Self
{
offload_requests
,
offload_blocks_d2h
,
o
nb
oa
r
d_
requests
,
o
ffl
oad_
blocks_h2d
,
onboard_blocks_h2d
,
onboard_blocks_d2d
,
matched_tokens
,
...
...
lib/llm/src/block_manager/offload.rs
View file @
ca674098
...
...
@@ -80,6 +80,8 @@ pub struct OffloadManagerConfig {
pub
metrics
:
Arc
<
BlockManagerMetrics
>
,
pub
cancellation_token
:
CancellationToken
,
pub
model_config
:
KvManagerModelConfig
,
/// Optional KVBM-level metrics for tracking offload/onboard operations
pub
kvbm_metrics
:
Option
<
crate
::
block_manager
::
metrics_kvbm
::
KvbmMetrics
>
,
}
/// The offload manager handles all block transfers between different cache levels.
...
...
@@ -101,6 +103,9 @@ pub struct OffloadManager<Locality: LocalityProvider, Metadata: BlockMetadata> {
/// An incrementing counter for offloaded blocks. Within the same priority, blocks with lower tick values are processed first.
tick
:
Arc
<
AtomicU64
>
,
/// Optional KVBM-level metrics for tracking offload/onboard operations
kvbm_metrics
:
Option
<
crate
::
block_manager
::
metrics_kvbm
::
KvbmMetrics
>
,
}
impl
<
Locality
:
LocalityProvider
+
'static
,
Metadata
:
BlockMetadata
>
...
...
@@ -129,6 +134,7 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
host_onboard_tx
,
disk_onboard_tx
,
tick
:
Arc
::
new
(
AtomicU64
::
new
(
0
)),
kvbm_metrics
:
config
.kvbm_metrics
.clone
(),
});
let
cuda_ctx
=
Cuda
::
device_or_create
(
0
)
?
;
...
...
@@ -485,6 +491,11 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
key
,
};
// Track metrics if available
if
let
Some
(
ref
kvbm_metrics
)
=
self
.kvbm_metrics
{
kvbm_metrics
.offload_blocks_d2h
.inc
();
}
self
.device_offload_tx
.send
(
request
)
.unwrap
();
}
else
if
let
Some
(
host_block
)
=
any_block
.downcast_ref
::
<
ImmutableBlock
<
PinnedStorage
,
Locality
,
Metadata
>>
()
...
...
@@ -500,6 +511,11 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
key
,
};
// Track metrics if available
if
let
Some
(
ref
kvbm_metrics
)
=
self
.kvbm_metrics
{
kvbm_metrics
.offload_blocks_h2d
.inc
();
}
self
.host_offload_tx
.send
(
request
)
.unwrap
();
}
...
...
lib/llm/src/block_manager/state.rs
View file @
ca674098
...
...
@@ -158,6 +158,7 @@ impl<R: LogicalResources, Metadata: BlockMetadata>
metrics
:
resources
.metrics
.clone
(),
cancellation_token
:
resources
.cancellation_token
.clone
(),
model_config
,
kvbm_metrics
:
resources
.config.kvbm_metrics
.clone
(),
};
let
offload_manager
=
OffloadManager
::
new
(
...
...
@@ -280,6 +281,7 @@ impl<Metadata: BlockMetadata> KvBlockManagerState<locality::Local, Metadata> {
metrics
:
resources
.metrics
.clone
(),
cancellation_token
:
resources
.cancellation_token
.clone
(),
model_config
,
kvbm_metrics
:
resources
.config.kvbm_metrics
.clone
(),
};
let
offload_manager
=
OffloadManager
::
new
(
...
...
lib/runtime/src/metrics/prometheus_names.rs
View file @
ca674098
...
...
@@ -320,14 +320,11 @@ pub mod distributed_runtime {
/// KVBM
pub
mod
kvbm
{
/// The number of offload requests
pub
const
OFFLOAD_REQUESTS
:
&
str
=
"offload_requests"
;
/// The number of offload blocks from device to host
pub
const
OFFLOAD_BLOCKS_D2H
:
&
str
=
"offload_blocks_d2h"
;
/// The number of o
nb
oa
r
d
requests
pub
const
O
NB
OA
R
D_
REQUESTS
:
&
str
=
"o
nb
oa
r
d_
requests
"
;
/// The number of o
ffl
oad
blocks from host to disk
pub
const
O
FFL
OAD_
BLOCKS_H2D
:
&
str
=
"o
ffl
oad_
blocks_h2d
"
;
/// The number of onboard blocks from host to device
pub
const
ONBOARD_BLOCKS_H2D
:
&
str
=
"onboard_blocks_h2d"
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment