chore: KVBM pip wheel (#3826)

Signed-off-by: Anant Sharma <anants@nvidia.com> Co-authored-by: Anant Sharma <anants@nvidia.com>

chore: KVBM pip wheel (#3826)
Signed-off-by: Anant Sharma <anants@nvidia.com> Co-authored-by: Anant Sharma <anants@nvidia.com>
6afa679c · Richard Huo · GitHub · e5c109d8 · 6afa679c · 6afa679c
Unverified Commit 6afa679c authored Oct 31, 2025 by Richard Huo Committed by GitHub Oct 31, 2025
18 changed files
--- a/lib/bindings/python/rust/llm/block_manager/vllm/slot_test_plan.md
+++ b/lib/bindings/python/rust/llm/block_manager/vllm/slot_test_plan.md
--- a/lib/kvbm/src/lib.rs
+++ b/lib/kvbm/src/lib.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+use pyo3::exceptions::{PyRuntimeError, PyTypeError};
+use pyo3::types::{PyCapsule, PyCapsuleMethods};
+use pyo3::{exceptions::PyException, prelude::*};
+use std::sync::OnceLock;
+use std::sync::Weak;
+use std::{fmt::Display, sync::Arc};
+use tokio::sync::Mutex;
+use tokio_util::sync::CancellationToken;
+use dynamo_runtime::{self as rs, RuntimeConfig, logging, traits::DistributedRuntimeProvider};
+use dynamo_llm::{self as llm_rs};
+mod block_manager;
+/// Entry point of the `_core` Python extension module.
+///
+/// The function name has to agree with the `lib.name` value in `Cargo.toml`;
+/// if the two differ, Python cannot import the module.
+///
+/// On import it initializes logging, sets up the process-wide Tokio runtime used
+/// by the bindings, and (only when the `block-manager` feature is enabled)
+/// registers the block manager items on the module.
+#[pymodule]
+fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    // Logging is initialized first, before the runtime is set up.
+    logging::init();
+    init_pyo3_tokio_rt();
+
+    #[cfg(feature = "block-manager")]
+    {
+        block_manager::add_to_module(m)?;
+    }
+
+    Ok(())
+}
+/// One-shot guard: the initialization closure in `init_pyo3_tokio_rt` runs at most once.
+static PYO3_TOKIO_INIT: OnceLock<()> = OnceLock::new();
+/// Tokio runtime built by `init_pyo3_tokio_rt` and registered with `pyo3_async_runtimes`.
+static PYO3_TOKIO_RT: OnceLock<tokio::runtime::Runtime> = OnceLock::new();
+/// Cancellation token created together with the runtime.
+static PYO3_TOKIO_CANCEL_TOKEN: OnceLock<CancellationToken> = OnceLock::new();
+// The runtime's threads do not survive when passing DistributedRuntime across bindings,
+// so we need to reinitialize the runtime thread pool.
+// This is also required in environments without a DistributedRuntime.
+/// Builds the process-wide multi-threaded Tokio runtime and cancellation token.
+///
+/// Safe to call repeatedly: only the first call does any work.
+///
+/// # Panics
+///
+/// Panics if the runtime configuration cannot be loaded from settings or if the
+/// Tokio runtime cannot be built.
+fn init_pyo3_tokio_rt() {
+    PYO3_TOKIO_INIT.get_or_init(|| {
+        let cfg =
+            RuntimeConfig::from_settings().expect("failed to build runtime config from settings");
+        // `available_parallelism` returns an error when the value cannot be determined;
+        // fall back to a single worker instead of panicking while the module is imported.
+        let worker_threads = cfg.num_worker_threads.unwrap_or_else(|| {
+            std::thread::available_parallelism().map_or(1, std::num::NonZeroUsize::get)
+        });
+        let rt = tokio::runtime::Builder::new_multi_thread()
+            .worker_threads(worker_threads)
+            .max_blocking_threads(cfg.max_blocking_threads)
+            .enable_all()
+            .build()
+            .expect("failed to build fallback tokio runtime for pyo3_async_runtimes");
+        // Store the runtime and borrow it back in one step; the surrounding guard
+        // ensures this is the only place the cell is populated.
+        let rt_ref = PYO3_TOKIO_RT.get_or_init(|| rt);
+        // Initialize the shared cancellation token
+        PYO3_TOKIO_CANCEL_TOKEN.get_or_init(CancellationToken::new);
+        // Initialize pyo3-async runtimes with this runtime.
+        // NOTE(review): the result is deliberately discarded (best effort), as before;
+        // presumably an error means a runtime was already registered - confirm whether
+        // that case should be surfaced to the caller.
+        let _ = pyo3_async_runtimes::tokio::init_with_runtime(rt_ref);
+    });
+}
+/// Returns a cloned handle to the Tokio runtime stored in `PYO3_TOKIO_RT`.
+///
+/// # Panics
+///
+/// Panics if the runtime has not been stored yet.
+pub fn get_current_tokio_handle() -> tokio::runtime::Handle {
+    let runtime = PYO3_TOKIO_RT
+        .get()
+        .expect("Tokio runtime not initialized!");
+    runtime.handle().clone()
+}
+/// Returns a clone of the cancellation token stored in `PYO3_TOKIO_CANCEL_TOKEN`.
+///
+/// # Panics
+///
+/// Panics if the token has not been stored yet.
+pub fn get_current_cancel_token() -> CancellationToken {
+    let token = PYO3_TOKIO_CANCEL_TOKEN
+        .get()
+        .expect("Cancellation token not initialized!");
+    token.clone()
+}
+pub fn to_pyerr<E>(err: E) -> PyErr
+where
+    E: Display,
+{
+    PyException::new_err(format!("{}", err))
+}
+/// Python-exposed (`#[pyclass]`) wrapper around a runtime component.
+#[pyclass]
+#[derive(Clone)]
+struct Component {
+    /// The wrapped component from `dynamo_runtime` (imported in this file as `rs`).
+    inner: rs::component::Component,
+}
+/// Recovers a strong `DistributedRuntime` reference from a Python object.
+///
+/// Returns `Ok(None)` when `drt_obj` is Python `None`. Otherwise the object must be
+/// an instance of `dynamo._core.DistributedRuntime`; its `to_capsule()` method is
+/// called and the capsule contents are read as a `Weak<DistributedRuntime>`, which
+/// is then upgraded.
+///
+/// # Errors
+///
+/// * `TypeError` if the object is not a `dynamo._core.DistributedRuntime`.
+/// * `RuntimeError` if the weak reference can no longer be upgraded.
+/// * Any Python error raised while importing `dynamo._core`, looking up the class,
+///   calling `to_capsule`, or downcasting its result to a capsule.
+pub fn extract_distributed_runtime_from_obj(
+    py: Python<'_>,
+    drt_obj: PyObject,
+) -> PyResult<Option<Arc<rs::DistributedRuntime>>> {
+    // `None` means "no distributed runtime supplied".
+    if drt_obj.is_none(py) {
+        return Ok(None);
+    }
+
+    let candidate = drt_obj.bind(py);
+    let expected_type = py.import("dynamo._core")?.getattr("DistributedRuntime")?;
+    let is_runtime = candidate.is_instance(&expected_type)?;
+    if !is_runtime {
+        return Err(PyTypeError::new_err(
+            "expected dynamo._core.DistributedRuntime",
+        ));
+    }
+
+    let capsule_obj = candidate.call_method0("to_capsule")?;
+    let capsule: &Bound<'_, PyCapsule> = capsule_obj.downcast()?;
+    // SAFETY: relies on `to_capsule()` of a `dynamo._core.DistributedRuntime` (type
+    // checked above) storing a `Weak<DistributedRuntime>` in the capsule.
+    // NOTE(review): this also assumes both extension modules agree on the layout of
+    // that type - confirm against the `to_capsule` implementation.
+    let weak_runtime: &Weak<rs::DistributedRuntime> =
+        unsafe { capsule.reference::<Weak<_>>() };
+
+    match weak_runtime.upgrade() {
+        Some(runtime) => Ok(Some(runtime)),
+        None => Err(PyRuntimeError::new_err(
+            "runtime is no longer alive (weak ref upgrade failed)",
+        )),
+    }
+}
--- a/lib/bindings/python/tests/test_kvbm_vllm_integration.py
+++ b/lib/bindings/python/tests/test_kvbm_vllm_integration.py
@@ -26,8 +26,8 @@ except ImportError:
    VLLM_NOT_AVAILABLE = True
 try:
-    from dynamo.llm import BlockManager
+    from kvbm import BlockManager
-    from dynamo.llm.vllm_integration.kv_cache_manager import KvbmCacheManager
+    from kvbm.vllm_integration.kv_cache_manager import KvbmCacheManager
    KVBM_NOT_AVAILABLE = False
 except ImportError:
@@ -819,7 +819,7 @@ def test_kvbm_wrong_blocks_provided():
 @pytest.mark.skipif(KVBM_NOT_AVAILABLE, reason="KVBM not available")
 @pytest.mark.skipif(VLLM_NOT_AVAILABLE, reason="VLLM not available")
-@patch("dynamo.llm.vllm_integration.kv_cache_manager.KvbmCacheManager")
+@patch("kvbm.vllm_integration.kv_cache_manager.KvbmCacheManager")
 def test_kvbm_new_matched_tokens_edge_case(MockCacheManager):
    PAGE_SIZE = 4
    NUM_BLOCKS = 3

--- a/lib/llm/src/block_manager/distributed.rs
+++ b/lib/llm/src/block_manager/distributed.rs
@@ -115,12 +115,6 @@ mod tests {
        }
    }
-    fn get_unique_barrier_id() -> String {
-        static COUNTER: AtomicUsize = AtomicUsize::new(0);
-        COUNTER.fetch_add(1, Ordering::Relaxed).to_string()
-    }
    async fn build_leader_and_workers(num_workers: usize) -> Result<(KvbmLeader, Vec<KvbmWorker>)> {
        let mut workers = Vec::new();

--- a/lib/llm/src/block_manager/distributed/leader.rs
+++ b/lib/llm/src/block_manager/distributed/leader.rs
@@ -69,6 +69,7 @@ impl KvbmLeaderConfig {
                "leader_pub_url and leader_ack_url must differ (same endpoint would fail to bind)."
            );
        }
        let cpu = &self.host_blocks_config;
        let disk = &self.disk_blocks_config;
        let cpu_configured = cpu.num_blocks_overriden > 0 || cpu.cache_size_in_gb > 0.0;

--- a/lib/llm/src/block_manager/distributed/worker.rs
+++ b/lib/llm/src/block_manager/distributed/worker.rs
@@ -29,7 +29,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
 use tokio::runtime::Handle;
 use tokio_util::sync::CancellationToken;
-use dynamo_runtime::{DistributedRuntime, utils::task::CriticalTaskExecutionHandle};
+use dynamo_runtime::utils::task::CriticalTaskExecutionHandle;
 use tokio::sync::{Mutex, RwLock, oneshot};
 struct WorkerState {
@@ -362,7 +362,7 @@ impl Handler for BlockTransferDispatch {
 #[derive(Builder, Clone)]
 #[builder(pattern = "owned")]
 pub struct KvbmWorkerConfig {
-    drt: DistributedRuntime,
+    cancel_token: CancellationToken,
    num_device_blocks: usize,
@@ -531,7 +531,7 @@ impl KvbmWorker {
        CriticalTaskExecutionHandle,
        oneshot::Receiver<transfer::BlockTransferHandler>,
    )> {
-        let cancel_token = config.drt.primary_token().clone();
+        let cancel_token = config.cancel_token.clone();
        // establish a oneshot channel to get back the raw BlockTransferHandler
        let (handler_tx, handler_rx) = oneshot::channel();
@@ -582,7 +582,7 @@ impl KvbmWorker {
        CriticalTaskExecutionHandle,
        oneshot::Receiver<transfer::BlockTransferHandler>,
    )> {
-        let cancel_token = config.drt.primary_token().clone();
+        let cancel_token = config.cancel_token.clone();
        let scheduler_client = config.scheduler_client.clone();
        // channel to get BlockTransferHandler back to the caller
@@ -682,8 +682,7 @@ impl KvbmWorker {
        scheduler_client: Option<TransferSchedulerClient>,
        bytes_per_block: usize,
    ) -> anyhow::Result<()> {
-        let drt = config.drt.clone();
+        let worker_id = config.device_id;
-        let worker_id = drt.connection_id() as usize;
        // Readiness gating for ping
        let state = Arc::new(WorkerState::new());

--- a/lib/llm/src/block_manager/state/resources.rs
+++ b/lib/llm/src/block_manager/state/resources.rs
@@ -57,13 +57,6 @@ impl Resources {
                tracing::debug!("Creating NIXL backends");
-                if let Ok((_, ucx_params)) = agent.get_plugin_params("UCX") {
-                    let backend = agent.create_backend("UCX", &ucx_params)?;
-                    nixl_backends.insert("UCX".to_string(), Arc::new(backend));
-                } else {
-                    tracing::warn!("No UCX plugin found; will not create UCX backend");
-                }
                if config.disk_layout.is_some() {
                    if let Ok((_, gds_mt_params)) = agent.get_plugin_params("GDS_MT") {
                        let backend = agent.create_backend("GDS_MT", &gds_mt_params)?;

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -147,6 +147,7 @@ addopts = [
    "--ignore-glob=*model.py",
    "--ignore-glob=*vllm_integration*",
    "--ignore-glob=*trtllm_integration*",
+    "--ignore-glob=*kvbm/python/kvbm*",
    "--ignore-glob=*_inc.py",
    "--ignore-glob=*/llm/tensorrtllm*",
    "--ignore-glob=docs/*",

--- a/tests/kvbm/README.md
+++ b/tests/kvbm/README.md
@@ -34,10 +34,10 @@ pytest -v -m "kvbm" -s
 Run the determinism test file directly inside dynamo repo:
 ```bash
-pytest -v tests/kvbm/test_determinism_agg.py -s
+pytest -v tests/kvbm_integration/test_determinism_agg.py -s
 # disagg needs 2 GPUs to run
-pytest -v tests/kvbm/test_determinism_disagg.py -s
+pytest -v tests/kvbm_integration/test_determinism_disagg.py -s
 ```
 ## Configuration
@@ -84,7 +84,7 @@ pytest -v -m "kvbm" -s
 ## Requirements
 - `vllm` executable available in PATH inside the test environment.
- The connector module path must be valid: `dynamo.llm.vllm_integration.connector`.
+- The connector module path must be valid: `kvbm.vllm_integration.connector`.
 - NATS and etcd services (provided automatically by the `runtime_services` fixture).
 - `datasets` library for IFEval concurrent testing (included in test dependencies).
 - For containerized workflows, follow the top-level `tests/README.md` guidance to build/run the appropriate image, then execute pytest inside the container.

--- a/tests/kvbm/__init__.py
+++ b/tests/kvbm/__init__.py
--- a/tests/kvbm/common.py
+++ b/tests/kvbm/common.py
--- a/tests/kvbm/engine_config_with_cuda_graph_and_kvbm.yaml
+++ b/tests/kvbm/engine_config_with_cuda_graph_and_kvbm.yaml
@@ -9,6 +9,6 @@ kv_cache_config:
  free_gpu_memory_fraction: 0.80
  max_tokens: 8192
 kv_connector_config:
-  connector_module: dynamo.llm.trtllm_integration.connector
+  connector_module: kvbm.trtllm_integration.connector
  connector_scheduler_class: DynamoKVBMConnectorLeader
  connector_worker_class: DynamoKVBMConnectorWorker
--- a/tests/kvbm/engine_config_without_cuda_graph_and_kvbm.yaml
+++ b/tests/kvbm/engine_config_without_cuda_graph_and_kvbm.yaml
@@ -8,6 +8,6 @@ kv_cache_config:
  free_gpu_memory_fraction: 0.80
  max_tokens: 8192
 kv_connector_config:
-  connector_module: dynamo.llm.trtllm_integration.connector
+  connector_module: kvbm.trtllm_integration.connector
  connector_scheduler_class: DynamoKVBMConnectorLeader
  connector_worker_class: DynamoKVBMConnectorWorker
--- a/tests/kvbm/test_consolidator_router_e2e.py
+++ b/tests/kvbm/test_consolidator_router_e2e.py
@@ -23,7 +23,7 @@ from pathlib import Path
 import pytest
 import requests
-from tests.kvbm.common import ApiTester, check_logs_for_patterns
+from tests.kvbm_integration.common import ApiTester, check_logs_for_patterns
 from tests.utils.managed_process import ManagedProcess
 # Check if vLLM is available

--- a/tests/kvbm/test_cuda_graph.py
+++ b/tests/kvbm/test_cuda_graph.py
--- a/tests/kvbm/test_determinism_agg.py
+++ b/tests/kvbm/test_determinism_agg.py
@@ -109,7 +109,7 @@ class LLMServerManager:
            "--port",
            str(self.port),
            "--kv-transfer-config",
-            '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}',
+            '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "kvbm.vllm_integration.connector"}',
            os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
            "--max-model-len",
            "8000",  # required to fit on L4 GPU when using 8b model
@@ -132,7 +132,7 @@ class LLMServerManager:
            "free_gpu_memory_fraction": 0.10,  # Set a small GPU fraction so that we can evict/reset the on-device kv cache faster
        }
        llm_api_config["kv_connector_config"] = {
-            "connector_module": "dynamo.llm.trtllm_integration.connector",
+            "connector_module": "kvbm.trtllm_integration.connector",
            "connector_scheduler_class": "DynamoKVBMConnectorLeader",
            "connector_worker_class": "DynamoKVBMConnectorWorker",
        }

--- a/tests/kvbm/test_determinism_disagg.py
+++ b/tests/kvbm/test_determinism_disagg.py
--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -100,8 +100,8 @@ vllm_configs = {
        ],
        timeout=700,
        request_payloads=[
-            chat_payload_default(),
+            chat_payload_default(expected_response=["joke"]),
-            completion_payload_default(),
+            completion_payload_default(expected_response=["joke"]),
        ],
    ),
    "multimodal_agg_llava": VLLMConfig(