Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
4224e57d
Unverified
Commit
4224e57d
authored
Oct 10, 2025
by
Ziqi Fan
Committed by
GitHub
Oct 11, 2025
Browse files
feat: rm the old KVBM metrics | update G2 to G3 metrics collection (#3561)
Signed-off-by:
Ziqi Fan
<
ziqif@nvidia.com
>
parent
55e458d8
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
71 additions
and
232 deletions
+71
-232
docs/guides/run_kvbm_in_vllm.md
docs/guides/run_kvbm_in_vllm.md
+1
-1
lib/bindings/python/rust/llm/block_manager.rs
lib/bindings/python/rust/llm/block_manager.rs
+45
-21
lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs
...thon/rust/llm/block_manager/vllm/connector/leader/slot.rs
+4
-4
lib/llm/src/block_manager.rs
lib/llm/src/block_manager.rs
+0
-1
lib/llm/src/block_manager/metrics.rs
lib/llm/src/block_manager/metrics.rs
+0
-70
lib/llm/src/block_manager/offload.rs
lib/llm/src/block_manager/offload.rs
+16
-47
lib/llm/src/block_manager/offload/pending.rs
lib/llm/src/block_manager/offload/pending.rs
+2
-31
lib/llm/src/block_manager/pool.rs
lib/llm/src/block_manager/pool.rs
+0
-2
lib/llm/src/block_manager/pool/managed.rs
lib/llm/src/block_manager/pool/managed.rs
+2
-27
lib/llm/src/block_manager/pool/managed/state.rs
lib/llm/src/block_manager/pool/managed/state.rs
+0
-17
lib/llm/src/block_manager/state.rs
lib/llm/src/block_manager/state.rs
+1
-8
lib/llm/src/block_manager/state/resources.rs
lib/llm/src/block_manager/state/resources.rs
+0
-3
No files found.
docs/guides/run_kvbm_in_vllm.md
View file @
4224e57d
...
...
@@ -102,7 +102,7 @@ DYN_KVBM_METRICS=true \
python
-m
dynamo.vllm
\
--model
Qwen/Qwen3-0.6B
\
--enforce-eager
\
--connector
kvbm
&
--connector
kvbm
# Optional: if a firewall blocks the KVBM metrics port, open it so Prometheus metrics can be sent
sudo
ufw allow 6880/tcp
...
...
lib/bindings/python/rust/llm/block_manager.rs
View file @
4224e57d
...
...
@@ -13,6 +13,34 @@ use pyo3::PyResult;
use
std
::
time
::
Duration
;
use
tokio_util
::
sync
::
CancellationToken
;
/// Builds the optional frequency-based filter used for disk offloads.
///
/// The filter is skipped when the `DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER`
/// environment variable is exactly `"true"` or `"1"`. An unset variable, an
/// unreadable value, or any other value leaves the filter enabled.
///
/// # Arguments
/// * `cancel_token` - parent token; a child token derived from it is handed to the filter.
/// * `runtime` - Tokio runtime handle; a clone of it is handed to the filter.
///
/// # Returns
/// `Ok(None)` when the filter is disabled, otherwise `Ok(Some(_))` holding the
/// newly created `FrequencyFilter` behind an `Arc`.
///
/// # Errors
/// Propagates any error returned by `FrequencyFilter::new`.
fn create_disk_offload_filter(
    cancel_token: &CancellationToken,
    runtime: &tokio::runtime::Handle,
) -> Result<Option<Arc<FrequencyFilter>>> {
    // Only the literal values "true" and "1" switch the filter off.
    let filter_disabled = matches!(
        std::env::var("DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER").as_deref(),
        Ok("true" | "1")
    );
    if filter_disabled {
        return Ok(None);
    }

    // TODO: These values seem plausible for most use cases, but we need to figure out a better way to configure them.
    let filter = FrequencyFilter::new(
        2,
        Duration::from_secs(600),
        1_000_000,
        cancel_token.child_token(),
        runtime.clone(),
    )?;

    Ok(Some(Arc::new(filter)))
}
mod
controller
;
mod
distributed
;
...
...
@@ -104,23 +132,11 @@ impl BlockManager {
.logical
(
Some
(
BlockParallelismStrategy
::
LeaderWorkerSharded
));
if
leader
.num_disk_blocks
()
>
0
{
// Check if disk offload filter is disabled via environment variable
let
disable_filter
=
std
::
env
::
var
(
"DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER"
)
.map
(|
v
|
v
==
"true"
||
v
==
"1"
)
.unwrap_or
(
false
);
if
!
disable_filter
{
// TODO: These values seem plausible for most use cases, but we need to figure out a better way to configure them.
let
frequency_filter
=
FrequencyFilter
::
new
(
2
,
Duration
::
from_secs
(
600
),
1e6
as
usize
,
cancel_token
.child_token
(),
rt
.inner
()
.runtime
()
.primary
()
.clone
(),
)
.map_err
(
to_pyerr
)
?
;
host_layout_config
=
host_layout_config
.offload_filter
(
Some
(
Arc
::
new
(
frequency_filter
)));
if
let
Some
(
filter
)
=
create_disk_offload_filter
(
&
cancel_token
,
&
rt
.inner
()
.runtime
()
.primary
())
.map_err
(
to_pyerr
)
?
{
host_layout_config
=
host_layout_config
.offload_filter
(
Some
(
filter
));
}
}
...
...
@@ -316,12 +332,20 @@ impl BlockManagerBuilder {
}
if
leader_inner
.num_host_blocks
()
>
0
{
config
=
config
.
host_layout
(
let
mut
host_layout
_config
=
dynamo_llm
::
block_manager
::
KvManagerLayoutConfig
::
builder
()
.num_blocks
(
leader_inner
.num_host_blocks
())
.logical
(
Some
(
BlockParallelismStrategy
::
LeaderWorkerSharded
))
.build
()
?
,
);
.logical
(
Some
(
BlockParallelismStrategy
::
LeaderWorkerSharded
));
if
leader_inner
.num_disk_blocks
()
>
0
{
if
let
Some
(
filter
)
=
create_disk_offload_filter
(
&
cancel_token
,
&
drt
.inner
()
.runtime
()
.primary
())
?
{
host_layout_config
=
host_layout_config
.offload_filter
(
Some
(
filter
));
}
}
config
=
config
.host_layout
(
host_layout_config
.build
()
?
);
}
if
leader_inner
.num_disk_blocks
()
>
0
{
...
...
lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs
View file @
4224e57d
...
...
@@ -1277,10 +1277,6 @@ async fn process_offload_request(
leader
:
&
Arc
<
KvbmLeader
>
,
kvbm_metrics
:
KvbmMetrics
,
)
->
anyhow
::
Result
<
()
>
{
kvbm_metrics
.offload_blocks_d2h
.inc_by
(
offload_req
.block_ids
.len
()
as
u64
);
let
request_id
=
&
offload_req
.request_id
;
let
operation_id
=
&
offload_req
.operation_id
;
...
...
@@ -1367,6 +1363,10 @@ async fn process_offload_request(
"offload - stage 4 complete"
);
kvbm_metrics
.offload_blocks_d2h
.inc_by
(
blocks_to_register
.len
()
as
u64
);
// 5. Register the mutable blocks
let
immutable_blocks
=
block_manager
.host
()
...
...
lib/llm/src/block_manager.rs
View file @
4224e57d
...
...
@@ -15,7 +15,6 @@ pub mod connector;
pub
mod
distributed
;
pub
mod
events
;
pub
mod
layout
;
pub
mod
metrics
;
pub
mod
metrics_kvbm
;
pub
mod
offload
;
pub
mod
pool
;
...
...
lib/llm/src/block_manager/metrics.rs
deleted
100644 → 0
View file @
55e458d8
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use
anyhow
::
Result
;
use
prometheus
::{
IntCounterVec
,
IntGaugeVec
,
Opts
,
Registry
,
core
::{
AtomicI64
,
AtomicU64
,
GenericCounter
,
GenericGauge
},
register_int_counter_vec_with_registry
,
register_int_gauge_vec_with_registry
,
};
use
std
::
sync
::
Arc
;
/// Prometheus metric families shared by the block-manager pools.
///
/// Both families carry the labels `pool` and `metric_type`.
pub struct BlockManagerMetrics {
    /// Gauge family named `gauges` (namespace `dynamo`, subsystem `kvbm`).
    gauges: IntGaugeVec,
    /// Counter family named `pools` (namespace `dynamo`, subsystem `kvbm`).
    counters: IntCounterVec,
}

impl BlockManagerMetrics {
    /// Creates the gauge and counter families and registers them with
    /// `metrics_registry`.
    ///
    /// # Errors
    /// Returns the registration error if either family cannot be registered.
    /// The gauge family is registered first, so a failure there returns before
    /// the counter family is touched.
    pub fn new(metrics_registry: &Arc<Registry>) -> Result<Arc<Self>> {
        // Label names common to both metric families.
        const LABELS: [&str; 2] = ["pool", "metric_type"];

        // Every family lives under the same namespace and subsystem.
        let kvbm_opts = |name: &str, help: &str| {
            Opts::new(name, help).namespace("dynamo").subsystem("kvbm")
        };

        let gauges = register_int_gauge_vec_with_registry!(
            kvbm_opts("gauges", "Gauges for the pools"),
            &LABELS,
            metrics_registry
        )?;

        let counters = register_int_counter_vec_with_registry!(
            kvbm_opts("pools", "Counters for the pools"),
            &LABELS,
            metrics_registry
        )?;

        Ok(Arc::new(Self { gauges, counters }))
    }

    /// Returns a [`PoolMetrics`] view that uses `group` as the `pool` label
    /// and shares this metrics instance.
    pub fn pool(self: &Arc<Self>, group: &str) -> Arc<PoolMetrics> {
        PoolMetrics::new(self, group)
    }
}
/// Per-pool view onto the shared [`BlockManagerMetrics`] families.
///
/// Every metric obtained through this type uses `group` as its `pool` label.
pub struct PoolMetrics {
    /// Shared metric families the lookups are performed on.
    block_manager_metrics: Arc<BlockManagerMetrics>,
    /// Value used for the `pool` label.
    group: String,
}

impl PoolMetrics {
    /// Creates a view for `group` that shares `block_manager_metrics`.
    pub fn new(block_manager_metrics: &Arc<BlockManagerMetrics>, group: &str) -> Arc<Self> {
        Arc::new(Self {
            block_manager_metrics: Arc::clone(block_manager_metrics),
            group: group.to_owned(),
        })
    }

    /// Returns the gauge labelled with this pool's group and `metric_type`.
    pub fn gauge(&self, metric_type: &str) -> GenericGauge<AtomicI64> {
        // Pass the borrowed strings straight through; no temporary `String`
        // is needed just to hand out a label value.
        self.block_manager_metrics
            .gauges
            .with_label_values(&[self.group.as_str(), metric_type])
    }

    /// Returns the counter labelled with this pool's group and `metric_type`.
    pub fn counter(&self, metric_type: &str) -> GenericCounter<AtomicU64> {
        // Same label order as the gauge family: pool first, then metric type.
        self.block_manager_metrics
            .counters
            .with_label_values(&[self.group.as_str(), metric_type])
    }
}
lib/llm/src/block_manager/offload.rs
View file @
4224e57d
...
...
@@ -37,7 +37,6 @@ use super::block::{
locality
::
LocalityProvider
,
transfer
::{
PoolConfig
,
TransferContext
},
};
use
super
::
metrics
::{
BlockManagerMetrics
,
PoolMetrics
};
use
super
::
pool
::{
BlockPool
,
BlockPoolError
};
use
super
::
storage
::{
Cuda
,
Storage
};
use
super
::{
DeviceStorage
,
DiskStorage
,
KvManagerModelConfig
,
PinnedStorage
};
...
...
@@ -77,7 +76,6 @@ pub const MAX_TRANSFER_BATCH_SIZE: usize = 16;
pub
struct
OffloadManagerConfig
{
pub
nixl_agent
:
Arc
<
Option
<
NixlAgent
>>
,
pub
async_rt_handle
:
Handle
,
pub
metrics
:
Arc
<
BlockManagerMetrics
>
,
pub
cancellation_token
:
CancellationToken
,
pub
model_config
:
KvManagerModelConfig
,
/// Optional KVBM-level metrics for tracking offload/onboard operations
...
...
@@ -103,9 +101,6 @@ pub struct OffloadManager<Locality: LocalityProvider, Metadata: BlockMetadata> {
/// An incrementing counter for offloaded blocks. Within the same priority, blocks with lower tick values are processed first.
tick
:
Arc
<
AtomicU64
>
,
/// Optional KVBM-level metrics for tracking offload/onboard operations
kvbm_metrics
:
Option
<
crate
::
block_manager
::
metrics_kvbm
::
KvbmMetrics
>
,
}
impl
<
Locality
:
LocalityProvider
+
'static
,
Metadata
:
BlockMetadata
>
...
...
@@ -134,7 +129,6 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
host_onboard_tx
,
disk_onboard_tx
,
tick
:
Arc
::
new
(
AtomicU64
::
new
(
0
)),
kvbm_metrics
:
config
.kvbm_metrics
.clone
(),
});
let
cuda_ctx
=
Cuda
::
device_or_create
(
0
)
?
;
...
...
@@ -155,10 +149,6 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
Some
(
pool_config
),
));
let
device_metrics
=
config
.metrics
.pool
(
"device"
);
let
host_metrics
=
config
.metrics
.pool
(
"host"
);
let
disk_metrics
=
config
.metrics
.pool
(
"disk"
);
// Device -> Host offload
let
device_to_host_task
=
OffloadManager
::
offload_worker
(
this
.device
.clone
(),
...
...
@@ -170,15 +160,16 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
MAX_CONCURRENT_TRANSFERS
,
&
config
.async_rt_handle
,
config
.cancellation_token
.clone
(),
device_metrics
.clone
(),
"offload_bw"
.to_string
(),
)
?
,
MAX_TRANSFER_BATCH_SIZE
,
&
config
.async_rt_handle
,
config
.cancellation_token
.clone
(),
)),
filters
.device
.clone
(),
device_metrics
.clone
(),
config
.kvbm_metrics
.as_ref
()
.map
(|
m
|
m
.offload_blocks_d2h
.clone
()),
config
.cancellation_token
.clone
(),
);
CriticalTaskExecutionHandle
::
new_with_runtime
(
...
...
@@ -207,15 +198,16 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
MAX_CONCURRENT_TRANSFERS
,
&
config
.async_rt_handle
,
config
.cancellation_token
.clone
(),
host_metrics
.clone
(),
"offload_bw"
.to_string
(),
)
?
,
MAX_TRANSFER_BATCH_SIZE
,
&
config
.async_rt_handle
,
config
.cancellation_token
.clone
(),
)),
filters
.host
.clone
(),
host_metrics
.clone
(),
config
.kvbm_metrics
.as_ref
()
.map
(|
m
|
m
.offload_blocks_h2d
.clone
()),
config
.cancellation_token
.clone
(),
);
CriticalTaskExecutionHandle
::
new_with_runtime
(
...
...
@@ -237,14 +229,11 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
MAX_CONCURRENT_TRANSFERS
,
&
config
.async_rt_handle
,
config
.cancellation_token
.clone
(),
host_metrics
.clone
(),
"onboard_bw"
.to_string
(),
)
?
,
MAX_TRANSFER_BATCH_SIZE
,
&
config
.async_rt_handle
,
config
.cancellation_token
.clone
(),
)),
host_metrics
.clone
(),
config
.cancellation_token
.clone
(),
);
CriticalTaskExecutionHandle
::
new_with_runtime
(
...
...
@@ -266,14 +255,11 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
MAX_CONCURRENT_TRANSFERS
,
&
config
.async_rt_handle
,
config
.cancellation_token
.clone
(),
disk_metrics
.clone
(),
"onboard_bw"
.to_string
(),
)
?
,
MAX_TRANSFER_BATCH_SIZE
,
&
config
.async_rt_handle
,
config
.cancellation_token
.clone
(),
)),
disk_metrics
.clone
(),
config
.cancellation_token
.clone
(),
);
CriticalTaskExecutionHandle
::
new_with_runtime
(
...
...
@@ -293,7 +279,7 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
mut
offload_rx
:
mpsc
::
UnboundedReceiver
<
OffloadRequest
<
Source
,
Locality
,
Metadata
>>
,
transfer_manager
:
Arc
<
dyn
TransferManager
<
Source
,
Target
,
Locality
,
Metadata
>>
,
offload_filter
:
Option
<
Arc
<
dyn
OffloadFilter
>>
,
pool
_metric
s
:
Arc
<
PoolMetrics
>
,
offload
_metric
:
Option
<
prometheus
::
IntCounter
>
,
cancellation_token
:
CancellationToken
,
)
->
Result
<
()
>
{
if
source_pool
.is_none
()
||
target_pool
.is_none
()
{
...
...
@@ -315,7 +301,6 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
match
offload_rx
.try_recv
()
{
Ok
(
request
)
=>
{
queue
.insert
(
request
);
pool_metrics
.gauge
(
"offload_queue_size"
)
.inc
();
}
Err
(
TryRecvError
::
Empty
)
=>
{
break
;
...
...
@@ -326,7 +311,6 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
// If there is a request, process it.
if
let
Some
(
request
)
=
queue
.pop_first
()
{
pool_metrics
.gauge
(
"offload_queue_size"
)
.dec
();
// Try to upgrade the block to a strong reference.
let
block
=
match
request
.block
.upgrade
()
{
Some
(
block
)
=>
Some
(
ImmutableBlock
::
new
(
block
)),
...
...
@@ -368,11 +352,16 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
};
if
let
Some
(
target_block
)
=
target_block
{
pool_metrics
.counter
(
"offload_processed"
)
.inc
();
tracing
::
debug!
(
"Offloading block with sequence hash {} to target pool."
,
request
.sequence_hash
);
// Track the offload metric if available
if
let
Some
(
ref
metric
)
=
offload_metric
{
metric
.inc
();
}
transfer_manager
.enqueue_transfer
(
PendingTransfer
::
new
(
vec!
[
block
],
...
...
@@ -389,7 +378,6 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
_
=
cancellation_token
.cancelled
()
=>
return
Ok
(()),
Some
(
request
)
=
offload_rx
.recv
()
=>
{
queue
.insert
(
request
);
pool_metrics
.gauge
(
"offload_queue_size"
)
.inc
();
}
}
}
...
...
@@ -401,7 +389,6 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
target_pool
:
Option
<
Arc
<
dyn
BlockPool
<
Target
,
Locality
,
Metadata
>>>
,
mut
onboard_rx
:
mpsc
::
UnboundedReceiver
<
OnboardRequest
<
Source
,
Target
,
Locality
,
Metadata
>>
,
transfer_manager
:
Arc
<
dyn
TransferManager
<
Source
,
Target
,
Locality
,
Metadata
>>
,
pool_metrics
:
Arc
<
PoolMetrics
>
,
cancellation_token
:
CancellationToken
,
)
->
Result
<
()
>
{
if
source_pool
.is_none
()
||
target_pool
.is_none
()
{
...
...
@@ -414,10 +401,6 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
_
=
cancellation_token
.cancelled
()
=>
return
Ok
::
<
(),
anyhow
::
Error
>
(()),
Some
(
request
)
=
onboard_rx
.recv
()
=>
{
pool_metrics
.gauge
(
"onboard_queue_size"
)
.set
(
onboard_rx
.len
()
as
i64
);
// Try to allocate blocks on the device.
let
target_blocks
=
if
let
Some
(
targets
)
=
request
.targets
{
targets
...
...
@@ -431,10 +414,6 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
}
};
pool_metrics
.counter
(
"onboard_processed"
)
.inc_by
(
request
.blocks
.len
()
as
u64
);
tracing
::
debug!
(
"Onboarding {} blocks to target pool."
,
request
.blocks
.len
());
transfer_manager
...
...
@@ -491,11 +470,6 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
key
,
};
// Track metrics if available
if
let
Some
(
ref
kvbm_metrics
)
=
self
.kvbm_metrics
{
kvbm_metrics
.offload_blocks_d2h
.inc
();
}
self
.device_offload_tx
.send
(
request
)
.unwrap
();
}
else
if
let
Some
(
host_block
)
=
any_block
.downcast_ref
::
<
ImmutableBlock
<
PinnedStorage
,
Locality
,
Metadata
>>
()
...
...
@@ -511,11 +485,6 @@ impl<Locality: LocalityProvider + 'static, Metadata: BlockMetadata>
key
,
};
// Track metrics if available
if
let
Some
(
ref
kvbm_metrics
)
=
self
.kvbm_metrics
{
kvbm_metrics
.offload_blocks_h2d
.inc
();
}
self
.host_offload_tx
.send
(
request
)
.unwrap
();
}
...
...
@@ -833,9 +802,9 @@ mod tests {
let
config
=
OffloadManagerConfig
{
nixl_agent
:
agent_arc
,
async_rt_handle
,
metrics
:
BlockManagerMetrics
::
new
(
&
Arc
::
new
(
Registry
::
new
()))
?
,
cancellation_token
:
CancellationToken
::
new
(),
model_config
:
minimal_config
,
kvbm_metrics
:
None
,
};
let
manager
=
OffloadManager
::
new
(
...
...
lib/llm/src/block_manager/offload/pending.rs
View file @
4224e57d
...
...
@@ -30,7 +30,6 @@ use nixl_sys::NixlDescriptor;
use
std
::
marker
::
PhantomData
;
use
std
::
pin
::
Pin
;
use
std
::
sync
::
Arc
;
use
std
::
time
::{
Duration
,
Instant
};
use
tokio
::
runtime
::
Handle
;
use
tokio
::
sync
::{
mpsc
,
oneshot
};
use
tokio_util
::
sync
::
CancellationToken
;
...
...
@@ -41,7 +40,6 @@ use crate::block_manager::block::{
locality
::
LocalityProvider
,
transfer
::{
TransferContext
,
WriteTo
,
WriteToStrategy
},
};
use
crate
::
block_manager
::
metrics
::
PoolMetrics
;
use
crate
::
block_manager
::
pool
::{
BlockPool
,
BlockPoolError
};
use
crate
::
block_manager
::
storage
::{
Local
,
Storage
};
...
...
@@ -53,8 +51,6 @@ use super::BlockResult;
use
dynamo_runtime
::
utils
::
task
::
CriticalTaskExecutionHandle
;
const
BLOCKS_BW_MIN_PUBLISH_INTERVAL_MS
:
u64
=
50
;
/// Manage a set of pending transfers.
pub
struct
PendingTransfer
<
Source
:
Storage
,
...
...
@@ -164,10 +160,6 @@ struct TransferCompletionManager<
Locality
:
LocalityProvider
,
Metadata
:
BlockMetadata
,
>
{
pool_metrics
:
Arc
<
PoolMetrics
>
,
transfer_type
:
String
,
last_publish_time
:
Option
<
Instant
>
,
transfer_start
:
Instant
,
num_blocks_transferred
:
usize
,
_
phantom
:
PhantomData
<
(
Source
,
Target
,
Locality
,
Metadata
)
>
,
}
...
...
@@ -175,12 +167,8 @@ struct TransferCompletionManager<
impl
<
Source
:
Storage
,
Target
:
Storage
,
Locality
:
LocalityProvider
,
Metadata
:
BlockMetadata
>
TransferCompletionManager
<
Source
,
Target
,
Locality
,
Metadata
>
{
pub
fn
new
(
pool_metrics
:
Arc
<
PoolMetrics
>
,
transfer_type
:
String
)
->
Self
{
pub
fn
new
()
->
Self
{
Self
{
pool_metrics
,
transfer_type
,
last_publish_time
:
None
,
transfer_start
:
Instant
::
now
(),
num_blocks_transferred
:
0
,
_
phantom
:
PhantomData
,
}
...
...
@@ -192,20 +180,6 @@ impl<Source: Storage, Target: Storage, Locality: LocalityProvider, Metadata: Blo
)
->
Result
<
()
>
{
self
.num_blocks_transferred
+=
pending_transfer
.sources
.len
();
let
should_publish
=
self
.last_publish_time
.is_none_or
(|
last_publish_time
|
{
last_publish_time
.elapsed
()
>
Duration
::
from_millis
(
BLOCKS_BW_MIN_PUBLISH_INTERVAL_MS
)
});
if
should_publish
{
self
.last_publish_time
=
Some
(
Instant
::
now
());
let
duration
=
self
.transfer_start
.elapsed
();
let
blocks_per_sec
=
self
.num_blocks_transferred
as
f64
/
duration
.as_secs_f64
();
self
.pool_metrics
.gauge
(
self
.transfer_type
.as_str
())
.set
(
blocks_per_sec
as
i64
);
}
match
pending_transfer
.handle_complete
()
.await
{
Ok
(
_
)
=>
{}
Err
(
e
)
=>
{
...
...
@@ -245,13 +219,10 @@ impl<Source: Storage, Target: Storage, Locality: LocalityProvider, Metadata: Blo
max_concurrent_transfers
:
usize
,
runtime
:
&
Handle
,
cancellation_token
:
CancellationToken
,
pool_metrics
:
Arc
<
PoolMetrics
>
,
transfer_type
:
String
,
)
->
Result
<
Self
>
{
let
(
futures_tx
,
mut
futures_rx
)
=
mpsc
::
channel
(
1
);
let
mut
completion_manager
=
TransferCompletionManager
::
new
(
pool_metrics
.clone
(),
transfer_type
.clone
());
let
mut
completion_manager
=
TransferCompletionManager
::
new
();
CriticalTaskExecutionHandle
::
new_with_runtime
(
move
|
cancel_token
|
async
move
{
...
...
lib/llm/src/block_manager/pool.rs
View file @
4224e57d
...
...
@@ -15,7 +15,6 @@ use super::block::{
private
,
registry
::
BlockRegistry
,
};
use
super
::
events
::{
EventManager
,
NullEventManager
};
use
super
::
metrics
::{
BlockManagerMetrics
,
PoolMetrics
};
use
super
::
storage
::
Storage
;
use
crate
::
block_manager
::
CacheLevel
;
...
...
@@ -23,7 +22,6 @@ use crate::block_manager::block::locality::LocalityProvider;
use
crate
::
tokens
::{
SequenceHash
,
TokenBlock
};
use
async_trait
::
async_trait
;
use
prometheus
::
Registry
;
use
std
::
sync
::
atomic
::{
AtomicU64
,
Ordering
};
use
std
::{
collections
::{
BTreeSet
,
HashMap
,
VecDeque
},
...
...
lib/llm/src/block_manager/pool/managed.rs
View file @
4224e57d
...
...
@@ -72,11 +72,6 @@ pub struct ManagedBlockPoolArgs<S: Storage, L: LocalityProvider, M: BlockMetadat
#[builder(default
=
"Handle::current()"
)]
async_runtime
:
Handle
,
#[builder(
default
=
"BlockManagerMetrics::new(&Arc::new(Registry::new())).unwrap().pool(
\"
pool
\"
)"
)]
pool_metrics
:
Arc
<
PoolMetrics
>
,
#[builder(default
=
"BlockRegistrationDuplicationSetting::Disabled"
)]
default_duplication_setting
:
BlockRegistrationDuplicationSetting
,
}
...
...
@@ -90,7 +85,6 @@ impl<S: Storage, L: LocalityProvider, M: BlockMetadata> ManagedBlockPoolArgsBuil
blocks
,
global_registry
,
async_runtime
,
metrics
,
default_duplication_setting
,
)
=
args
.dissolve
();
...
...
@@ -101,7 +95,6 @@ impl<S: Storage, L: LocalityProvider, M: BlockMetadata> ManagedBlockPoolArgsBuil
blocks
,
global_registry
,
async_runtime
,
metrics
,
default_duplication_setting
,
);
...
...
@@ -183,7 +176,6 @@ impl<S: Storage, L: LocalityProvider, M: BlockMetadata> ManagedBlockPool<S, L, M
blocks
:
Vec
<
Block
<
S
,
L
,
M
>>
,
global_registry
:
GlobalRegistry
,
async_runtime
:
Handle
,
metrics
:
Arc
<
PoolMetrics
>
,
default_duplication_setting
:
BlockRegistrationDuplicationSetting
,
)
->
Self
{
let
(
pool
,
progress_engine
)
=
Self
::
with_progress_engine
(
...
...
@@ -192,7 +184,6 @@ impl<S: Storage, L: LocalityProvider, M: BlockMetadata> ManagedBlockPool<S, L, M
blocks
,
global_registry
,
async_runtime
,
metrics
,
default_duplication_setting
,
);
...
...
@@ -237,7 +228,6 @@ impl<S: Storage, L: LocalityProvider, M: BlockMetadata> ManagedBlockPool<S, L, M
blocks
:
Vec
<
Block
<
S
,
L
,
M
>>
,
global_registry
:
GlobalRegistry
,
async_runtime
:
Handle
,
metrics
:
Arc
<
PoolMetrics
>
,
default_duplication_setting
:
BlockRegistrationDuplicationSetting
,
)
->
(
Self
,
ProgressEngine
<
S
,
L
,
M
>
)
{
let
(
priority_tx
,
priority_rx
)
=
tokio
::
sync
::
mpsc
::
unbounded_channel
();
...
...
@@ -251,7 +241,6 @@ impl<S: Storage, L: LocalityProvider, M: BlockMetadata> ManagedBlockPool<S, L, M
blocks
,
global_registry
,
async_runtime
,
metrics
,
);
let
available_blocks_counter
=
progress_engine
.available_blocks_counter
.clone
();
...
...
@@ -504,7 +493,6 @@ struct ProgressEngine<S: Storage, L: LocalityProvider, M: BlockMetadata> {
cancel_token
:
CancellationToken
,
state
:
State
<
S
,
L
,
M
>
,
return_rx
:
tokio
::
sync
::
mpsc
::
UnboundedReceiver
<
Block
<
S
,
L
,
M
>>
,
metrics
:
Arc
<
PoolMetrics
>
,
available_blocks_counter
:
Arc
<
AtomicU64
>
,
total_blocks_counter
:
Arc
<
AtomicU64
>
,
}
...
...
@@ -515,7 +503,6 @@ pub struct State<S: Storage, L: LocalityProvider, M: BlockMetadata> {
registry
:
BlockRegistry
,
return_tx
:
tokio
::
sync
::
mpsc
::
UnboundedSender
<
Block
<
S
,
L
,
M
>>
,
event_manager
:
Arc
<
dyn
EventManager
>
,
metrics
:
Arc
<
PoolMetrics
>
,
}
impl
<
S
:
Storage
,
L
:
LocalityProvider
+
'static
,
M
:
BlockMetadata
>
ProgressEngine
<
S
,
L
,
M
>
{
...
...
@@ -528,16 +515,10 @@ impl<S: Storage, L: LocalityProvider + 'static, M: BlockMetadata> ProgressEngine
blocks
:
Vec
<
Block
<
S
,
L
,
M
>>
,
global_registry
:
GlobalRegistry
,
async_runtime
:
Handle
,
metrics
:
Arc
<
PoolMetrics
>
,
)
->
Self
{
let
(
return_tx
,
return_rx
)
=
tokio
::
sync
::
mpsc
::
unbounded_channel
();
let
mut
state
=
State
::
<
S
,
L
,
M
>
::
new
(
event_manager
,
return_tx
,
global_registry
,
async_runtime
,
metrics
.clone
(),
);
let
mut
state
=
State
::
<
S
,
L
,
M
>
::
new
(
event_manager
,
return_tx
,
global_registry
,
async_runtime
);
let
count
=
blocks
.len
();
...
...
@@ -553,7 +534,6 @@ impl<S: Storage, L: LocalityProvider + 'static, M: BlockMetadata> ProgressEngine
cancel_token
,
state
,
return_rx
,
metrics
,
available_blocks_counter
,
total_blocks_counter
,
}
...
...
@@ -564,17 +544,14 @@ impl<S: Storage, L: LocalityProvider + 'static, M: BlockMetadata> ProgressEngine
biased
;
Some
(
priority_req
)
=
self
.priority_rx
.recv
(),
if
!
self
.priority_rx
.is_closed
()
=>
{
self
.metrics
.gauge
(
"priority_request_queue_size"
)
.set
(
self
.priority_rx
.len
()
as
i64
);
self
.state
.handle_priority_request
(
priority_req
,
&
mut
self
.return_rx
)
.await
;
}
Some
(
req
)
=
self
.ctrl_rx
.recv
(),
if
!
self
.ctrl_rx
.is_closed
()
=>
{
self
.metrics
.gauge
(
"control_request_queue_size"
)
.set
(
self
.ctrl_rx
.len
()
as
i64
);
self
.state
.handle_control_request
(
req
);
}
Some
(
block
)
=
self
.return_rx
.recv
()
=>
{
self
.metrics
.gauge
(
"return_block_queue_size"
)
.set
(
self
.return_rx
.len
()
as
i64
);
self
.state
.handle_return_block
(
block
);
}
...
...
@@ -612,7 +589,6 @@ mod tests {
blocks
,
global_registry
,
async_runtime
,
metrics
,
default_duplication_setting
,
)
=
args
.dissolve
();
...
...
@@ -622,7 +598,6 @@ mod tests {
blocks
,
global_registry
,
async_runtime
,
metrics
,
default_duplication_setting
,
);
...
...
lib/llm/src/block_manager/pool/managed/state.rs
View file @
4224e57d
...
...
@@ -17,7 +17,6 @@ impl<S: Storage, L: LocalityProvider + 'static, M: BlockMetadata> State<S, L, M>
return_tx
:
tokio
::
sync
::
mpsc
::
UnboundedSender
<
Block
<
S
,
L
,
M
>>
,
global_registry
:
GlobalRegistry
,
async_runtime
:
Handle
,
metrics
:
Arc
<
PoolMetrics
>
,
)
->
Self
{
Self
{
active
:
ActiveBlockPool
::
new
(),
...
...
@@ -25,7 +24,6 @@ impl<S: Storage, L: LocalityProvider + 'static, M: BlockMetadata> State<S, L, M>
registry
:
BlockRegistry
::
new
(
event_manager
.clone
(),
global_registry
,
async_runtime
),
return_tx
,
event_manager
,
metrics
,
}
}
...
...
@@ -159,10 +157,6 @@ impl<S: Storage, L: LocalityProvider + 'static, M: BlockMetadata> State<S, L, M>
}
}
self
.metrics
.counter
(
"blocks_allocated"
)
.inc_by
(
count
as
u64
);
Ok
(
blocks
)
}
...
...
@@ -271,10 +265,6 @@ impl<S: Storage, L: LocalityProvider + 'static, M: BlockMetadata> State<S, L, M>
assert_eq!
(
immutable_blocks
.len
(),
expected_len
);
self
.metrics
.counter
(
"blocks_registered"
)
.inc_by
(
immutable_blocks
.len
()
as
u64
);
Ok
(
immutable_blocks
)
}
...
...
@@ -320,13 +310,6 @@ impl<S: Storage, L: LocalityProvider + 'static, M: BlockMetadata> State<S, L, M>
immutable_blocks
.push
(
immutable
);
}
self
.metrics
.counter
(
"cache_hits"
)
.inc_by
(
immutable_blocks
.len
()
as
u64
);
self
.metrics
.counter
(
"cache_misses"
)
.inc_by
(
sequence_hashes
.len
()
as
u64
-
immutable_blocks
.len
()
as
u64
);
immutable_blocks
}
...
...
lib/llm/src/block_manager/state.rs
View file @
4224e57d
...
...
@@ -16,7 +16,6 @@ use super::{
config
::
NixlOptions
,
events
::{
EventManager
,
NullEventManager
},
locality
::
LogicalResources
,
metrics
::
BlockManagerMetrics
,
offload
::{
OffloadFilters
,
OffloadManager
,
OffloadManagerConfig
,
filter
::
OffloadFilter
,
request
::
BlockResult
,
...
...
@@ -43,9 +42,6 @@ pub(crate) struct Resources {
// event manager for block manager events
pub
event_manager
:
Arc
<
dyn
EventManager
>
,
// metrics for the block manager
pub
metrics
:
Arc
<
BlockManagerMetrics
>
,
// config for the block manager
pub
config
:
KvBlockManagerConfig
,
}
...
...
@@ -155,7 +151,6 @@ impl<R: LogicalResources, Metadata: BlockMetadata>
let
offload_config
=
OffloadManagerConfig
{
nixl_agent
:
resources
.nixl_agent
.clone
(),
async_rt_handle
:
resources
.async_rt_handle
.clone
(),
metrics
:
resources
.metrics
.clone
(),
cancellation_token
:
resources
.cancellation_token
.clone
(),
model_config
,
kvbm_metrics
:
resources
.config.kvbm_metrics
.clone
(),
...
...
@@ -278,7 +273,6 @@ impl<Metadata: BlockMetadata> KvBlockManagerState<locality::Local, Metadata> {
let
offload_config
=
OffloadManagerConfig
{
nixl_agent
:
resources
.nixl_agent
.clone
(),
async_rt_handle
:
resources
.async_rt_handle
.clone
(),
metrics
:
resources
.metrics
.clone
(),
cancellation_token
:
resources
.cancellation_token
.clone
(),
model_config
,
kvbm_metrics
:
resources
.config.kvbm_metrics
.clone
(),
...
...
@@ -521,7 +515,7 @@ impl<Locality: LocalityProvider, Metadata: BlockMetadata> std::fmt::Debug
pub
(
crate
)
fn
create_block_pool
<
S
:
Storage
,
L
:
LocalityProvider
,
M
:
BlockMetadata
>
(
factory
:
impl
IntoBlocks
<
S
,
L
>
,
resources
:
&
Resources
,
pool_name
:
&
str
,
_
pool_name
:
&
str
,
)
->
Result
<
(
Arc
<
dyn
BlockPool
<
S
,
L
,
M
>>
,
Vec
<
Block
<
S
,
L
,
M
>>
,
...
...
@@ -532,7 +526,6 @@ pub(crate) fn create_block_pool<S: Storage, L: LocalityProvider, M: BlockMetadat
.global_registry
(
resources
.global_registry
.clone
())
.async_runtime
(
resources
.async_rt_handle
.clone
())
.event_manager
(
resources
.event_manager
.clone
())
.pool_metrics
(
resources
.metrics
.pool
(
pool_name
))
.build
()
?
;
let
offload_filter
=
factory
.offload_filter
();
...
...
lib/llm/src/block_manager/state/resources.rs
View file @
4224e57d
...
...
@@ -18,8 +18,6 @@ impl Resources {
let
global_registry
=
GlobalRegistry
::
default
();
let
metrics
=
BlockManagerMetrics
::
new
(
&
config
.runtime.metrics_registry
)
?
;
let
event_manager
=
config
.event_manager
.clone
()
...
...
@@ -75,7 +73,6 @@ impl Resources {
nixl_backends
,
global_registry
,
event_manager
,
metrics
,
config
,
})
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment