chore: KVBM pip wheel (#3826)

Signed-off-by: Anant Sharma <anants@nvidia.com> Co-authored-by: Anant Sharma <anants@nvidia.com>

chore: KVBM pip wheel (#3826)
Signed-off-by: Anant Sharma <anants@nvidia.com> Co-authored-by: Anant Sharma <anants@nvidia.com>
6afa679c · Richard Huo · GitHub · e5c109d8 · 6afa679c · 6afa679c
Unverified Commit 6afa679c authored Oct 31, 2025 by Richard Huo Committed by GitHub Oct 31, 2025
20 changed files
--- a/lib/kvbm/python/kvbm/__init__.py
+++ b/lib/kvbm/python/kvbm/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# flake8: noqa
+
+from kvbm._core import BlockManager as BlockManager
+from kvbm._core import KvbmLeader as KvbmLeader
+from kvbm._core import KvbmWorker as KvbmWorker
--- a/lib/kvbm/python/kvbm/_core.pyi
+++ b/lib/kvbm/python/kvbm/_core.pyi
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, List, Optional
+
+class Layer:
+    """
+    A KV cache block layer
+    """
+
+    ...
+
+    def __dlpack__(self, stream: Optional[Any] = None, max_version: Optional[Any] = None, dl_device: Optional[Any] = None, copy: Optional[bool] = None) -> Any:
+        """
+        Get a dlpack capsule of the layer
+        """
+        ...
+
+    def __dlpack_device__(self) -> Any:
+        """
+        Get the dlpack device of the layer
+        """
+        ...
+
+class Block:
+    """
+    A KV cache block
+    """
+
+    ...
+
+    def __len__(self) -> int:
+        """
+        Get the number of layers in the list
+        """
+        ...
+
+    def __getitem__(self, index: int) -> Layer:
+        """
+        Get a layer by index
+        """
+        ...
+
+    def __iter__(self) -> 'Block':
+        """
+        Get an iterator over the layers
+        """
+        ...
+
+    def __next__(self) -> Block:
+        """
+        Get the next layer in the iterator
+        """
+        ...
+
+    def to_list(self) -> List[Layer]:
+        """
+        Get a list of layers
+        """
+        ...
+
+    def __dlpack__(self, stream: Optional[Any] = None, max_version: Optional[Any] = None, dl_device: Optional[Any] = None, copy: Optional[bool] = None) -> Any:
+        """
+        Get a dlpack capsule of the block
+        Exception raised if the block is not contiguous
+        """
+        ...
+
+    def __dlpack_device__(self) -> Any:
+        """
+        Get the dlpack device of the block
+        """
+        ...
+
+class BlockList:
+    """
+    A list of KV cache blocks
+    """
+
+    ...
+
+    def __len__(self) -> int:
+        """
+        Get the number of blocks in the list
+        """
+        ...
+
+    def __getitem__(self, index: int) -> Block:
+        """
+        Get a block by index
+        """
+        ...
+
+    def __iter__(self) -> 'BlockList':
+        """
+        Get an iterator over the blocks
+        """
+        ...
+
+    def __next__(self) -> Block:
+        """
+        Get the next block in the iterator
+        """
+        ...
+
+    def to_list(self) -> List[Block]:
+        """
+        Get a list of blocks
+        """
+        ...
+
+class BlockManager:
+    """
+    A KV cache block manager
+    """
+
+    def __init__(
+        self,
+        worker_id: int,
+        num_layer: int,
+        page_size: int,
+        inner_dim: int,
+        dtype: Optional[str] = None,
+        host_num_blocks: Optional[int] = None,
+        device_num_blocks: Optional[int] = None,
+        device_id: int = 0
+    ) -> None:
+        """
+        Create a `BlockManager` object
+
+        Parameters:
+        -----------
+        worker_id: int
+            The worker ID for this block manager
+        num_layer: int
+            Number of layers in the model
+        page_size: int
+            Page size for blocks
+        inner_dim: int
+            Inner dimension size
+        dtype: Optional[str]
+            Data type (e.g., 'fp16', 'bf16', 'fp32'), defaults to 'fp16' if None
+        host_num_blocks: Optional[int]
+            Number of host blocks to allocate, None means no host blocks
+        device_num_blocks: Optional[int]
+            Number of device blocks to allocate, None means no device blocks
+        device_id: int
+            CUDA device ID, defaults to 0
+        """
+        ...
+
+    def allocate_host_blocks_blocking(self, count: int) -> BlockList:
+        """
+        Allocate a list of host blocks (blocking call)
+
+        Parameters:
+        -----------
+        count: int
+            Number of blocks to allocate
+
+        Returns:
+        --------
+        BlockList
+            List of allocated blocks
+        """
+        ...
+
+    async def allocate_host_blocks(self, count: int) -> BlockList:
+        """
+        Allocate a list of host blocks
+
+        Parameters:
+        -----------
+        count: int
+            Number of blocks to allocate
+
+        Returns:
+        --------
+        BlockList
+            List of allocated blocks
+        """
+        ...
+
+    def allocate_device_blocks_blocking(self, count: int) -> BlockList:
+        """
+        Allocate a list of device blocks (blocking call)
+
+        Parameters:
+        -----------
+        count: int
+            Number of blocks to allocate
+
+        Returns:
+        --------
+        BlockList
+            List of allocated blocks
+        """
+        ...
+
+    async def allocate_device_blocks(self, count: int) -> BlockList:
+        """
+        Allocate a list of device blocks
+
+        Parameters:
+        -----------
+        count: int
+            Number of blocks to allocate
+
+        Returns:
+        --------
+        BlockList
+            List of allocated blocks
+        """
+        ...
+
+class KvbmCacheManager:
+    """
+    A KV cache manager for VLLM
+    """
+
+    def __init__(self, block_manager: BlockManager) -> None:
+        ...
+
+
+class KvbmRequest:
+    """
+    A request for KV cache
+    """
+
+    def __init__(self, request_id: int, tokens: List[int], block_size: int) -> None:
+        ...
--- a/lib/bindings/python/src/dynamo/llm/trtllm_integration/__init__.py
+++ b/lib/bindings/python/src/dynamo/llm/trtllm_integration/__init__.py
--- a/lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/__init__.py
+++ b/lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/__init__.py
--- a/lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/kvbm_connector_leader.py
+++ b/lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/kvbm_connector_leader.py
@@ -2,8 +2,13 @@
 # SPDX-License-Identifier: Apache-2.0


-from typing import List
+from typing import List, Optional

+from kvbm import KvbmLeader
+from kvbm.trtllm_integration.rust import KvbmRequest
+from kvbm.trtllm_integration.rust import KvConnectorLeader as RustKvConnectorLeader
+from kvbm.trtllm_integration.rust import SchedulerOutput as RustSchedulerOutput
+from kvbm.utils import is_dyn_runtime_enabled
 from tensorrt_llm._torch.pyexecutor.kv_cache_connector import (
    KvCacheConnectorScheduler,
    SchedulerOutput,
@@ -11,19 +16,20 @@ from tensorrt_llm._torch.pyexecutor.kv_cache_connector import (
 from tensorrt_llm.bindings.internal.batch_manager import LlmRequest
 from tensorrt_llm.llmapi.llm_args import TorchLlmArgs

-from dynamo.llm import KvbmLeader
-from dynamo.llm.trtllm_integration.rust import KvbmRequest
-from dynamo.llm.trtllm_integration.rust import (
-    KvConnectorLeader as RustKvConnectorLeader,
-)
-from dynamo.llm.trtllm_integration.rust import SchedulerOutput as RustSchedulerOutput
-from dynamo.runtime import DistributedRuntime
+DistributedRuntime = None
+if is_dyn_runtime_enabled():
+    from dynamo.runtime import DistributedRuntime


 class DynamoKVBMConnectorLeader(KvCacheConnectorScheduler):
    def __init__(self, llm_args: TorchLlmArgs):
        super().__init__(llm_args)
-        self.drt = DistributedRuntime.detached()
+
+        drt: Optional[object] = None
+        if is_dyn_runtime_enabled():
+            drt = DistributedRuntime.detached()
+
+        self.drt = drt

        mappings = self._llm_args.parallel_config.to_mapping()


--- a/lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/kvbm_connector_worker.py
+++ b/lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/kvbm_connector_worker.py
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

+from typing import Optional
+
+# Keeping this import is important because it runs the code in nixl’s __init__.py
+# to set up the Nixl plugin path.
+import nixl  # noqa: F401
 import torch
+from kvbm.trtllm_integration.rust import KvConnectorWorker as RustKvConnectorWorker
+from kvbm.utils import is_dyn_runtime_enabled
 from tensorrt_llm import logger
 from tensorrt_llm._torch.pyexecutor.kv_cache_connector import KvCacheConnectorWorker
 from tensorrt_llm.llmapi.llm_args import TorchLlmArgs

-from dynamo.llm.trtllm_integration.rust import (
-    KvConnectorWorker as RustKvConnectorWorker,
-)
-from dynamo.runtime import DistributedRuntime
+DistributedRuntime = None
+if is_dyn_runtime_enabled():
+    from dynamo.runtime import DistributedRuntime


 class DynamoKVBMConnectorWorker(KvCacheConnectorWorker):
@@ -31,7 +37,11 @@ class DynamoKVBMConnectorWorker(KvCacheConnectorWorker):
    def __init__(self, llm_args: TorchLlmArgs):
        super().__init__(llm_args)

-        self.drt = DistributedRuntime.detached()
+        drt: Optional[object] = None
+        if is_dyn_runtime_enabled():
+            drt = DistributedRuntime.detached()
+
+        self.drt = drt

        mappings = self._llm_args.parallel_config.to_mapping()
        self.rank = mappings.rank

--- a/lib/bindings/python/src/dynamo/llm/trtllm_integration/rust.py
+++ b/lib/bindings/python/src/dynamo/llm/trtllm_integration/rust.py
@@ -7,7 +7,7 @@ Loader for the Rust-based TensorRT-LLM integration objects, using objects from _

 try:
    # TODO: use TRTLLM own integration module
-    from dynamo._core import _vllm_integration
+    from kvbm._core import _vllm_integration

    # Runtime - dynamically loaded classes from Rust extension
    KvbmRequest = getattr(_vllm_integration, "KvbmRequest")

--- a/lib/kvbm/python/kvbm/utils.py
+++ b/lib/kvbm/python/kvbm/utils.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import os
+
+
+def is_dyn_runtime_enabled() -> bool:
+    """
+    Return True if DYN_RUNTIME_ENABLED_KVBM is set to '1' or 'true' (case-insensitive).
+    DYN_RUNTIME_ENABLED_KVBM indicates if KVBM should use the existing DistributedRuntime
+    in the current environment.
+
+    WRN: Calling DistributedRuntime.detached() can crash the entire process if
+    dependencies are not satisfied, and it cannot be caught with try/except in Python.
+    TODO: Make DistributedRuntime.detached() raise a catchable Python exception and
+    avoid crashing the process.
+    """
+    val = os.environ.get("DYN_RUNTIME_ENABLED_KVBM", "").strip().lower()
+    return val in {"1", "true"}
--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/__init__.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/__init__.py
--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/connector/__init__.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/connector/__init__.py
--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/connector/dynamo_connector.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/connector/dynamo_connector.py
@@ -26,9 +26,9 @@ if TYPE_CHECKING:
    from vllm.v1.request import Request


-# from dynamo.llm.vllm_integration.kv_cache_utils import KvbmCacheBlocks
-from dynamo.llm.vllm_integration.connector_leader import KvConnectorLeader
-from dynamo.llm.vllm_integration.connector_worker import KvConnectorWorker
+# from kvbm.vllm_integration.kv_cache_utils import KvbmCacheBlocks
+from kvbm.vllm_integration.connector_leader import KvConnectorLeader
+from kvbm.vllm_integration.connector_worker import KvConnectorWorker

 EngineId = str


--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/connector/pd_connector.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/connector/pd_connector.py
@@ -3,6 +3,7 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING

+from kvbm.vllm_integration.connector.dynamo_connector import DynamoConnector
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorRole
 from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector import (
    LMCacheConnectorV1,
@@ -14,8 +15,6 @@ from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import NixlConnector
 from vllm.v1.core.sched.output import SchedulerOutput

-from dynamo.llm.vllm_integration.connector.dynamo_connector import DynamoConnector
-
 if TYPE_CHECKING:
    from vllm.config import VllmConfig
    from vllm.v1.core.kv_cache_manager import KVCacheBlocks

--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py
@@ -20,19 +20,23 @@ if TYPE_CHECKING:
    from vllm.v1.request import Request


-# from dynamo.llm.vllm_integration.kv_cache_utils import KvbmCacheBlocks
-# from dynamo.llm.vllm_integration.rust import BlockManager, KvbmRequest
-# from dynamo.llm.vllm_integration.rust import KvConnectorLeader as RustKvConnectorLeader
-# from dynamo.llm.vllm_integration.rust import (
+# from kvbm.vllm_integration.kv_cache_utils import KvbmCacheBlocks
+# from kvbm.vllm_integration.rust import BlockManager, KvbmRequest
+# from kvbm.vllm_integration.rust import KvConnectorLeader as RustKvConnectorLeader
+# from kvbm.vllm_integration.rust import (
 #     KvConnectorMetadata as RustKvConnectorMetadata,
 # )
-# from dynamo.llm.vllm_integration.rust import SchedulerOutput as RustSchedulerOutput
+# from kvbm.vllm_integration.rust import SchedulerOutput as RustSchedulerOutput

-from dynamo.llm import KvbmLeader
-from dynamo.llm.vllm_integration.rust import KvbmRequest
-from dynamo.llm.vllm_integration.rust import KvConnectorLeader as RustKvConnectorLeader
-from dynamo.llm.vllm_integration.rust import SchedulerOutput as RustSchedulerOutput
-from dynamo.runtime import DistributedRuntime
+from kvbm import KvbmLeader
+from kvbm.utils import is_dyn_runtime_enabled
+from kvbm.vllm_integration.rust import KvbmRequest
+from kvbm.vllm_integration.rust import KvConnectorLeader as RustKvConnectorLeader
+from kvbm.vllm_integration.rust import SchedulerOutput as RustSchedulerOutput
+
+DistributedRuntime = None
+if is_dyn_runtime_enabled():
+    from dynamo.runtime import DistributedRuntime


 class DynamoConnectorMetadata(KVConnectorMetadata):
@@ -51,12 +55,14 @@ class KvConnectorLeader:
    """

    def __init__(self, vllm_config: "VllmConfig", engine_id: str, **kwargs):
-        drt = kwargs.get("drt", None)
-        if drt is None:
-            self.drt = DistributedRuntime.detached()
+        drt: Optional[object] = kwargs.get("drt")
+
+        if drt is None and is_dyn_runtime_enabled():
+            drt = DistributedRuntime.detached()
        else:
-            self.drt = drt
+            drt = None

+        self.drt = drt
        self.vllm_config = vllm_config
        world_size = vllm_config.parallel_config.world_size


--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/connector_worker.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/connector_worker.py
@@ -9,7 +9,11 @@ from __future__ import annotations

 from typing import TYPE_CHECKING, Optional

+# Keeping this import is important because it runs the code in nixl’s __init__.py
+# to set up the Nixl plugin path when there is no pre-defined NIXL_PLUGIN_DIR
+import nixl  # noqa: F401
 import torch
+from kvbm.utils import is_dyn_runtime_enabled
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.model_executor.models.utils import extract_layer_index
@@ -21,15 +25,18 @@ if TYPE_CHECKING:
    from vllm.forward_context import ForwardContext


-# from dynamo.llm.vllm_integration.kv_cache_utils import KvbmCacheBlocks
-# from dynamo.llm.vllm_integration.rust import BlockManager
-# from dynamo.llm.vllm_integration.rust import (
+# from kvbm.vllm_integration.kv_cache_utils import KvbmCacheBlocks
+# from kvbm.vllm_integration.rust import BlockManager
+# from kvbm.vllm_integration.rust import (
 #     KvConnectorMetadata as RustKvConnectorMetadata,
 #     KvConnectorWorker as RustKvConnectorWorker,
 # )

-from dynamo.llm.vllm_integration.rust import KvConnectorWorker as RustKvConnectorWorker
-from dynamo.runtime import DistributedRuntime
+from kvbm.vllm_integration.rust import KvConnectorWorker as RustKvConnectorWorker
+
+DistributedRuntime = None
+if is_dyn_runtime_enabled():
+    from dynamo.runtime import DistributedRuntime


 class DynamoConnectorMetadata(KVConnectorMetadata):
@@ -40,11 +47,14 @@ class DynamoConnectorMetadata(KVConnectorMetadata):

 class KvConnectorWorker:
    def __init__(self, vllm_config: "VllmConfig", engine_id: str, **kwargs):
-        drt = kwargs.get("drt", None)
-        if drt is None:
-            self.drt = DistributedRuntime.detached()
+        drt: Optional[object] = kwargs.get("drt")
+
+        if drt is None and is_dyn_runtime_enabled():
+            drt = DistributedRuntime.detached()
        else:
-            self.drt = drt
+            drt = None
+
+        self.drt = drt

        self.vllm_config = vllm_config
        self._connector = RustKvConnectorWorker(self.drt, engine_id)

--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/consolidator_config.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/consolidator_config.py
--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/kv_cache_manager.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/kv_cache_manager.py
@@ -26,10 +26,10 @@ if TYPE_CHECKING:
    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
    from vllm.v1.request import Request

-from dynamo.llm.vllm_integration.kv_cache_utils import KvbmCacheBlocks
-from dynamo.llm.vllm_integration.rust import BlockManager
-from dynamo.llm.vllm_integration.rust import KvbmCacheManager as RustKvbmCacheManager
-from dynamo.llm.vllm_integration.rust import KvbmRequest, SlotUpdate
+from kvbm.vllm_integration.kv_cache_utils import KvbmCacheBlocks
+from kvbm.vllm_integration.rust import BlockManager
+from kvbm.vllm_integration.rust import KvbmCacheManager as RustKvbmCacheManager
+from kvbm.vllm_integration.rust import KvbmRequest, SlotUpdate


 class KvbmCacheManager(KVConnectorBase_V1):

--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/kv_cache_utils.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/kv_cache_utils.py
@@ -9,11 +9,10 @@ from __future__ import annotations

 from typing import List

+from kvbm.vllm_integration.rust import BlockState, BlockStates, KvbmBlockList
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.kv_cache_utils import KVCacheBlock

-from dynamo.llm.vllm_integration.rust import BlockState, BlockStates, KvbmBlockList
-
 # from vllm.logger import init_logger
 # logger = init_logger(__name__)


--- a/lib/bindings/python/src/dynamo/llm/vllm_integration/rust.py
+++ b/lib/bindings/python/src/dynamo/llm/vllm_integration/rust.py
@@ -6,7 +6,7 @@ Loader for the Rust-based vLLM integration objects.
 """

 try:
-    from dynamo._core import _vllm_integration
+    from kvbm._core import _vllm_integration

    # Runtime - dynamically loaded classes from Rust extension
    KvbmCacheManager = getattr(_vllm_integration, "KvbmCacheManager")
@@ -20,7 +20,7 @@ try:
    KvConnectorLeader = getattr(_vllm_integration, "PyKvConnectorLeader")
    SchedulerOutput = getattr(_vllm_integration, "SchedulerOutput")

-    from dynamo.llm import BlockManager
+    from kvbm import BlockManager

 except ImportError:
    print("Failed to import Dynamo KVBM. vLLM integration will not be available.")

--- a/lib/bindings/python/rust/llm/block_manager.rs
+++ b/lib/bindings/python/rust/llm/block_manager.rs
@@ -9,39 +9,11 @@ use dynamo_llm::block_manager::block::{
 use dynamo_llm::block_manager::kv_consolidator::KvEventConsolidatorConfig;
 use dynamo_llm::block_manager::offload::filter::FrequencyFilter;
 use dynamo_llm::block_manager::{BasicMetadata, BlockParallelismStrategy};
-
+use dynamo_runtime::DistributedRuntime;
 use pyo3::PyResult;
 use std::time::Duration;
 use tokio_util::sync::CancellationToken;

-/// Creates a disk offload filter based on environment configuration.
-/// Returns `Ok(None)` if the filter is disabled via `DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER`,
-/// otherwise constructs a `FrequencyFilter` with standard parameters.
-fn create_disk_offload_filter(
-    cancel_token: &CancellationToken,
-    runtime: &tokio::runtime::Handle,
-) -> Result<Option<Arc<FrequencyFilter>>> {
-    // Check if disk offload filter is disabled via environment variable
-    let disable_filter = std::env::var("DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER")
-        .map(|v| v == "true" || v == "1")
-        .unwrap_or(false);
-
-    if disable_filter {
-        return Ok(None);
-    }
-
-    // TODO: These values seem plausible for most use cases, but we need to figure out a better way to configure them.
-    let frequency_filter = FrequencyFilter::new(
-        2,
-        Duration::from_secs(600),
-        1_000_000,
-        cancel_token.child_token(),
-        runtime.clone(),
-    )?;
-
-    Ok(Some(Arc::new(frequency_filter)))
-}
-
 mod controller;
 mod distributed;

@@ -73,11 +45,39 @@ type VllmController = Arc<
    >,
 >;

+/// Creates a disk offload filter based on environment configuration.
+/// Returns `Ok(None)` if the filter is disabled via `DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER`,
+/// otherwise constructs a `FrequencyFilter` with standard parameters.
+fn create_disk_offload_filter(
+    cancel_token: &CancellationToken,
+    runtime: &tokio::runtime::Handle,
+) -> Result<Option<Arc<FrequencyFilter>>> {
+    // Check if disk offload filter is disabled via environment variable
+    let disable_filter = std::env::var("DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER")
+        .map(|v| v == "true" || v == "1")
+        .unwrap_or(false);
+
+    if disable_filter {
+        return Ok(None);
+    }
+
+    // TODO: These values seem plausible for most use cases, but we need to figure out a better way to configure them.
+    let frequency_filter = FrequencyFilter::new(
+        2,
+        Duration::from_secs(600),
+        1_000_000,
+        cancel_token.child_token(),
+        runtime.clone(),
+    )?;
+
+    Ok(Some(Arc::new(frequency_filter)))
+}
+
 #[pyclass]
 #[derive(Clone)]
 pub struct BlockManager {
    inner: VllmBlockManager,
-    drt: DistributedRuntime,
+    _drt: Option<Arc<DistributedRuntime>>,
    _controller: Option<VllmController>,
 }

@@ -126,19 +126,17 @@ impl BlockManager {

            if leader.num_host_blocks() > 0 {
                tracing::info!("Using {} host blocks", leader.num_host_blocks());
-
                let mut host_layout_config =
                    dynamo_llm::block_manager::KvManagerLayoutConfig::builder()
                        .num_blocks(leader.num_host_blocks())
                        .logical(Some(BlockParallelismStrategy::LeaderWorkerSharded));

-                if leader.num_disk_blocks() > 0 {
-                    if let Some(filter) =
-                        create_disk_offload_filter(&cancel_token, &rt.inner().runtime().primary())
+                if leader.num_disk_blocks() > 0
+                    && let Some(filter) =
+                        create_disk_offload_filter(&cancel_token, &get_current_tokio_handle())
                            .map_err(to_pyerr)?
-                    {
-                        host_layout_config = host_layout_config.offload_filter(Some(filter));
-                    }
+                {
+                    host_layout_config = host_layout_config.offload_filter(Some(filter));
                }

                config = config.host_layout(host_layout_config.build().map_err(to_pyerr)?);
@@ -181,7 +179,7 @@ impl BlockManager {
            // )
        };

-        let rt = drt.inner().runtime().primary();
+        let rt = get_current_tokio_handle();

        let config = config.build().map_err(to_pyerr)?;
        Ok(BlockManager {
@@ -197,7 +195,7 @@ impl BlockManager {
                    .await
                })
                .map_err(to_pyerr)?,
-            drt,
+            _drt: drt,
            _controller: None,
        })
    }
@@ -213,11 +211,7 @@ impl BlockManager {
        }

        let block_manager = self.inner.clone();
-        let controller = self
-            .drt
-            .inner()
-            .runtime()
-            .primary()
+        let controller = get_current_tokio_handle()
            .block_on(controller::Controller::new(
                block_manager,
                component.inner.clone(),
@@ -280,6 +274,7 @@ impl BlockManagerBuilder {
        self.disable_device_pool = yes;
        self
    }
+
    pub fn kvbm_metrics(
        mut self,
        metrics: dynamo_llm::block_manager::metrics_kvbm::KvbmMetrics,
@@ -339,12 +334,11 @@ impl BlockManagerBuilder {
                    .num_blocks(leader_inner.num_host_blocks())
                    .logical(Some(BlockParallelismStrategy::LeaderWorkerSharded));

-            if leader_inner.num_disk_blocks() > 0 {
-                if let Some(filter) =
-                    create_disk_offload_filter(&cancel_token, &drt.inner().runtime().primary())?
-                {
-                    host_layout_config = host_layout_config.offload_filter(Some(filter));
-                }
+            if leader_inner.num_disk_blocks() > 0
+                && let Some(filter) =
+                    create_disk_offload_filter(&cancel_token, &get_current_tokio_handle())?
+            {
+                host_layout_config = host_layout_config.offload_filter(Some(filter));
            }

            config = config.host_layout(host_layout_config.build()?);
@@ -382,7 +376,7 @@ impl BlockManagerBuilder {

        Ok(BlockManager {
            inner,
-            drt,
+            _drt: drt,
            _controller: None,
        })
    }

--- a/lib/bindings/python/rust/llm/block_manager/block.rs
+++ b/lib/bindings/python/rust/llm/block_manager/block.rs