Unverified commit 6afa679c, authored by Richard Huo, committed by GitHub
Browse files

chore: KVBM pip wheel (#3826)


Signed-off-by: Anant Sharma <anants@nvidia.com>
Co-authored-by: Anant Sharma <anants@nvidia.com>
parent e5c109d8
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use pyo3::exceptions::{PyRuntimeError, PyTypeError};
use pyo3::types::{PyCapsule, PyCapsuleMethods};
use pyo3::{exceptions::PyException, prelude::*};
use std::sync::OnceLock;
use std::sync::Weak;
use std::{fmt::Display, sync::Arc};
use tokio::sync::Mutex;
use tokio_util::sync::CancellationToken;
use dynamo_runtime::{self as rs, RuntimeConfig, logging, traits::DistributedRuntimeProvider};
use dynamo_llm::{self as llm_rs};
mod block_manager;
/// A Python module implemented in Rust. The name of this function must match
/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
/// import the module.
#[pymodule]
fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
// Initialise dynamo_runtime's logging before anything else in this module runs.
logging::init();
// Build (once) the process-wide Tokio runtime and cancellation token that
// `get_current_tokio_handle` / `get_current_cancel_token` hand out.
init_pyo3_tokio_rt();
// Only compiled with the `block-manager` cargo feature; the `block_manager`
// submodule then adds its own items to `m`, and any error it returns is
// propagated to the caller of this function.
#[cfg(feature = "block-manager")]
block_manager::add_to_module(m)?;
Ok(())
}
/// One-shot guard: `init_pyo3_tokio_rt` runs its setup closure through this
/// cell so the runtime and token below are populated at most once.
static PYO3_TOKIO_INIT: OnceLock<()> = OnceLock::new();
/// Tokio runtime built by `init_pyo3_tokio_rt`; read by `get_current_tokio_handle`.
static PYO3_TOKIO_RT: OnceLock<tokio::runtime::Runtime> = OnceLock::new();
/// Cancellation token created by `init_pyo3_tokio_rt`; cloned out by
/// `get_current_cancel_token`.
static PYO3_TOKIO_CANCEL_TOKEN: OnceLock<CancellationToken> = OnceLock::new();
// The runtime's threads do not survive when passing DistributedRuntime across bindings,
// so we need to reinitialize the runtime thread pool.
// This is also required in environments without a DistributedRuntime.
//
// Safe to call repeatedly: the whole setup runs inside `PYO3_TOKIO_INIT.get_or_init`,
// so only the first call builds the runtime and the cancellation token.
//
// Panics if the runtime configuration cannot be loaded from settings or if the
// Tokio runtime fails to build.
fn init_pyo3_tokio_rt() {
    PYO3_TOKIO_INIT.get_or_init(|| {
        let cfg =
            RuntimeConfig::from_settings().expect("failed to build runtime config from settings");

        // `available_parallelism` is allowed to fail (e.g. restricted or exotic
        // platforms). Fall back to a single worker instead of panicking while
        // Python is importing the module.
        let worker_threads = cfg
            .num_worker_threads
            .unwrap_or_else(|| std::thread::available_parallelism().map_or(1, |n| n.get()));

        let rt = tokio::runtime::Builder::new_multi_thread()
            .worker_threads(worker_threads)
            .max_blocking_threads(cfg.max_blocking_threads)
            .enable_all()
            .build()
            .expect("failed to build fallback tokio runtime for pyo3_async_runtimes");

        // The guard above ensures this closure runs once, so the `set` calls are
        // not expected to find an already-populated cell; their results are
        // intentionally ignored.
        let _ = PYO3_TOKIO_RT.set(rt);
        let rt_ref = PYO3_TOKIO_RT.get().expect("runtime missing after set");

        // Initialize the shared cancellation token
        let cancel_token = CancellationToken::new();
        let _ = PYO3_TOKIO_CANCEL_TOKEN.set(cancel_token);

        // Initialize pyo3-async runtimes with this runtime.
        // NOTE(review): the result is deliberately discarded (best effort);
        // presumably it only fails when pyo3_async_runtimes already has a
        // runtime installed — confirm against the crate docs.
        let _ = pyo3_async_runtimes::tokio::init_with_runtime(rt_ref);
    });
}
/// Returns a cloned [`tokio::runtime::Handle`] for the runtime stored in
/// `PYO3_TOKIO_RT`.
///
/// # Panics
///
/// Panics if `PYO3_TOKIO_RT` has not been populated yet (it is set by
/// `init_pyo3_tokio_rt`).
pub fn get_current_tokio_handle() -> tokio::runtime::Handle {
    let runtime = PYO3_TOKIO_RT
        .get()
        .expect("Tokio runtime not initialized!");
    runtime.handle().clone()
}
/// Returns a clone of the cancellation token stored in
/// `PYO3_TOKIO_CANCEL_TOKEN`.
///
/// # Panics
///
/// Panics if `PYO3_TOKIO_CANCEL_TOKEN` has not been populated yet (it is set
/// by `init_pyo3_tokio_rt`).
pub fn get_current_cancel_token() -> CancellationToken {
    let shared_token = PYO3_TOKIO_CANCEL_TOKEN
        .get()
        .expect("Cancellation token not initialized!");
    shared_token.clone()
}
/// Converts any [`Display`]-able error into a Python `Exception` whose message
/// is the error's `Display` rendering.
pub fn to_pyerr<E: Display>(err: E) -> PyErr {
    let message = err.to_string();
    PyException::new_err(message)
}
// Python-exposed (`#[pyclass]`), cloneable wrapper around the runtime's
// `rs::component::Component`.
// NOTE(review): nothing in the visible part of this file constructs or uses
// this type — confirm it is still needed.
#[pyclass]
#[derive(Clone)]
struct Component {
// The wrapped dynamo_runtime component.
inner: rs::component::Component,
}
/// Recovers the Rust `DistributedRuntime` behind a Python
/// `dynamo._core.DistributedRuntime` object.
///
/// Returns `Ok(None)` when `drt_obj` is Python `None`; otherwise the object's
/// `to_capsule()` result is read as a `Weak<rs::DistributedRuntime>` and
/// upgraded to a strong `Arc`.
///
/// # Errors
///
/// * `PyTypeError` if `drt_obj` is not an instance of
///   `dynamo._core.DistributedRuntime`.
/// * `PyRuntimeError` if the weak reference can no longer be upgraded.
/// * Any Python error raised while importing `dynamo._core`, looking up the
///   class, calling `to_capsule`, or downcasting its result to a capsule.
pub fn extract_distributed_runtime_from_obj(
    py: Python<'_>,
    drt_obj: PyObject,
) -> PyResult<Option<Arc<rs::DistributedRuntime>>> {
    // `None` means "no distributed runtime supplied"; that is not an error.
    if drt_obj.is_none(py) {
        return Ok(None);
    }

    let bound = drt_obj.bind(py);
    let expected_cls = py.import("dynamo._core")?.getattr("DistributedRuntime")?;
    let is_runtime = bound.is_instance(&expected_cls)?;
    if !is_runtime {
        return Err(PyTypeError::new_err(
            "expected dynamo._core.DistributedRuntime",
        ));
    }

    let capsule_obj = bound.call_method0("to_capsule")?;
    let capsule: &Bound<'_, PyCapsule> = capsule_obj.downcast()?;

    // SAFETY: relies on `dynamo._core.DistributedRuntime.to_capsule()` storing a
    // `Weak<rs::DistributedRuntime>` in the capsule; the instance check above is
    // the only guard. NOTE(review): the producer side is not visible in this
    // file — confirm the payload type matches.
    let weak_rt: &Weak<rs::DistributedRuntime> = unsafe { capsule.reference::<Weak<_>>() };

    match weak_rt.upgrade() {
        Some(strong_rt) => Ok(Some(strong_rt)),
        None => Err(PyRuntimeError::new_err(
            "runtime is no longer alive (weak ref upgrade failed)",
        )),
    }
}
...@@ -26,8 +26,8 @@ except ImportError: ...@@ -26,8 +26,8 @@ except ImportError:
VLLM_NOT_AVAILABLE = True VLLM_NOT_AVAILABLE = True
try: try:
from dynamo.llm import BlockManager from kvbm import BlockManager
from dynamo.llm.vllm_integration.kv_cache_manager import KvbmCacheManager from kvbm.vllm_integration.kv_cache_manager import KvbmCacheManager
KVBM_NOT_AVAILABLE = False KVBM_NOT_AVAILABLE = False
except ImportError: except ImportError:
...@@ -819,7 +819,7 @@ def test_kvbm_wrong_blocks_provided(): ...@@ -819,7 +819,7 @@ def test_kvbm_wrong_blocks_provided():
@pytest.mark.skipif(KVBM_NOT_AVAILABLE, reason="KVBM not available") @pytest.mark.skipif(KVBM_NOT_AVAILABLE, reason="KVBM not available")
@pytest.mark.skipif(VLLM_NOT_AVAILABLE, reason="VLLM not available") @pytest.mark.skipif(VLLM_NOT_AVAILABLE, reason="VLLM not available")
@patch("dynamo.llm.vllm_integration.kv_cache_manager.KvbmCacheManager") @patch("kvbm.vllm_integration.kv_cache_manager.KvbmCacheManager")
def test_kvbm_new_matched_tokens_edge_case(MockCacheManager): def test_kvbm_new_matched_tokens_edge_case(MockCacheManager):
PAGE_SIZE = 4 PAGE_SIZE = 4
NUM_BLOCKS = 3 NUM_BLOCKS = 3
......
...@@ -115,12 +115,6 @@ mod tests { ...@@ -115,12 +115,6 @@ mod tests {
} }
} }
fn get_unique_barrier_id() -> String {
static COUNTER: AtomicUsize = AtomicUsize::new(0);
COUNTER.fetch_add(1, Ordering::Relaxed).to_string()
}
async fn build_leader_and_workers(num_workers: usize) -> Result<(KvbmLeader, Vec<KvbmWorker>)> { async fn build_leader_and_workers(num_workers: usize) -> Result<(KvbmLeader, Vec<KvbmWorker>)> {
let mut workers = Vec::new(); let mut workers = Vec::new();
......
...@@ -69,6 +69,7 @@ impl KvbmLeaderConfig { ...@@ -69,6 +69,7 @@ impl KvbmLeaderConfig {
"leader_pub_url and leader_ack_url must differ (same endpoint would fail to bind)." "leader_pub_url and leader_ack_url must differ (same endpoint would fail to bind)."
); );
} }
let cpu = &self.host_blocks_config; let cpu = &self.host_blocks_config;
let disk = &self.disk_blocks_config; let disk = &self.disk_blocks_config;
let cpu_configured = cpu.num_blocks_overriden > 0 || cpu.cache_size_in_gb > 0.0; let cpu_configured = cpu.num_blocks_overriden > 0 || cpu.cache_size_in_gb > 0.0;
......
...@@ -29,7 +29,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; ...@@ -29,7 +29,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
use tokio::runtime::Handle; use tokio::runtime::Handle;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use dynamo_runtime::{DistributedRuntime, utils::task::CriticalTaskExecutionHandle}; use dynamo_runtime::utils::task::CriticalTaskExecutionHandle;
use tokio::sync::{Mutex, RwLock, oneshot}; use tokio::sync::{Mutex, RwLock, oneshot};
struct WorkerState { struct WorkerState {
...@@ -362,7 +362,7 @@ impl Handler for BlockTransferDispatch { ...@@ -362,7 +362,7 @@ impl Handler for BlockTransferDispatch {
#[derive(Builder, Clone)] #[derive(Builder, Clone)]
#[builder(pattern = "owned")] #[builder(pattern = "owned")]
pub struct KvbmWorkerConfig { pub struct KvbmWorkerConfig {
drt: DistributedRuntime, cancel_token: CancellationToken,
num_device_blocks: usize, num_device_blocks: usize,
...@@ -531,7 +531,7 @@ impl KvbmWorker { ...@@ -531,7 +531,7 @@ impl KvbmWorker {
CriticalTaskExecutionHandle, CriticalTaskExecutionHandle,
oneshot::Receiver<transfer::BlockTransferHandler>, oneshot::Receiver<transfer::BlockTransferHandler>,
)> { )> {
let cancel_token = config.drt.primary_token().clone(); let cancel_token = config.cancel_token.clone();
// establish a oneshot channel to get back the raw BlockTransferHandler // establish a oneshot channel to get back the raw BlockTransferHandler
let (handler_tx, handler_rx) = oneshot::channel(); let (handler_tx, handler_rx) = oneshot::channel();
...@@ -582,7 +582,7 @@ impl KvbmWorker { ...@@ -582,7 +582,7 @@ impl KvbmWorker {
CriticalTaskExecutionHandle, CriticalTaskExecutionHandle,
oneshot::Receiver<transfer::BlockTransferHandler>, oneshot::Receiver<transfer::BlockTransferHandler>,
)> { )> {
let cancel_token = config.drt.primary_token().clone(); let cancel_token = config.cancel_token.clone();
let scheduler_client = config.scheduler_client.clone(); let scheduler_client = config.scheduler_client.clone();
// channel to get BlockTransferHandler back to the caller // channel to get BlockTransferHandler back to the caller
...@@ -682,8 +682,7 @@ impl KvbmWorker { ...@@ -682,8 +682,7 @@ impl KvbmWorker {
scheduler_client: Option<TransferSchedulerClient>, scheduler_client: Option<TransferSchedulerClient>,
bytes_per_block: usize, bytes_per_block: usize,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let drt = config.drt.clone(); let worker_id = config.device_id;
let worker_id = drt.connection_id() as usize;
// Readiness gating for ping // Readiness gating for ping
let state = Arc::new(WorkerState::new()); let state = Arc::new(WorkerState::new());
......
...@@ -57,13 +57,6 @@ impl Resources { ...@@ -57,13 +57,6 @@ impl Resources {
tracing::debug!("Creating NIXL backends"); tracing::debug!("Creating NIXL backends");
if let Ok((_, ucx_params)) = agent.get_plugin_params("UCX") {
let backend = agent.create_backend("UCX", &ucx_params)?;
nixl_backends.insert("UCX".to_string(), Arc::new(backend));
} else {
tracing::warn!("No UCX plugin found; will not create UCX backend");
}
if config.disk_layout.is_some() { if config.disk_layout.is_some() {
if let Ok((_, gds_mt_params)) = agent.get_plugin_params("GDS_MT") { if let Ok((_, gds_mt_params)) = agent.get_plugin_params("GDS_MT") {
let backend = agent.create_backend("GDS_MT", &gds_mt_params)?; let backend = agent.create_backend("GDS_MT", &gds_mt_params)?;
......
...@@ -147,6 +147,7 @@ addopts = [ ...@@ -147,6 +147,7 @@ addopts = [
"--ignore-glob=*model.py", "--ignore-glob=*model.py",
"--ignore-glob=*vllm_integration*", "--ignore-glob=*vllm_integration*",
"--ignore-glob=*trtllm_integration*", "--ignore-glob=*trtllm_integration*",
"--ignore-glob=*kvbm/python/kvbm*",
"--ignore-glob=*_inc.py", "--ignore-glob=*_inc.py",
"--ignore-glob=*/llm/tensorrtllm*", "--ignore-glob=*/llm/tensorrtllm*",
"--ignore-glob=docs/*", "--ignore-glob=docs/*",
......
...@@ -34,10 +34,10 @@ pytest -v -m "kvbm" -s ...@@ -34,10 +34,10 @@ pytest -v -m "kvbm" -s
Run the determinism test file directly inside dynamo repo: Run the determinism test file directly inside dynamo repo:
```bash ```bash
pytest -v tests/kvbm/test_determinism_agg.py -s pytest -v tests/kvbm_integration/test_determinism_agg.py -s
# disagg needs 2 GPUs to run # disagg needs 2 GPUs to run
pytest -v tests/kvbm/test_determinism_disagg.py -s pytest -v tests/kvbm_integration/test_determinism_disagg.py -s
``` ```
## Configuration ## Configuration
...@@ -84,7 +84,7 @@ pytest -v -m "kvbm" -s ...@@ -84,7 +84,7 @@ pytest -v -m "kvbm" -s
## Requirements ## Requirements
- `vllm` executable available in PATH inside the test environment. - `vllm` executable available in PATH inside the test environment.
- The connector module path must be valid: `dynamo.llm.vllm_integration.connector`. - The connector module path must be valid: `kvbm.vllm_integration.connector`.
- NATS and etcd services (provided automatically by the `runtime_services` fixture). - NATS and etcd services (provided automatically by the `runtime_services` fixture).
- `datasets` library for IFEval concurrent testing (included in test dependencies). - `datasets` library for IFEval concurrent testing (included in test dependencies).
- For containerized workflows, follow the top-level `tests/README.md` guidance to build/run the appropriate image, then execute pytest inside the container. - For containerized workflows, follow the top-level `tests/README.md` guidance to build/run the appropriate image, then execute pytest inside the container.
......
...@@ -9,6 +9,6 @@ kv_cache_config: ...@@ -9,6 +9,6 @@ kv_cache_config:
free_gpu_memory_fraction: 0.80 free_gpu_memory_fraction: 0.80
max_tokens: 8192 max_tokens: 8192
kv_connector_config: kv_connector_config:
connector_module: dynamo.llm.trtllm_integration.connector connector_module: kvbm.trtllm_integration.connector
connector_scheduler_class: DynamoKVBMConnectorLeader connector_scheduler_class: DynamoKVBMConnectorLeader
connector_worker_class: DynamoKVBMConnectorWorker connector_worker_class: DynamoKVBMConnectorWorker
...@@ -8,6 +8,6 @@ kv_cache_config: ...@@ -8,6 +8,6 @@ kv_cache_config:
free_gpu_memory_fraction: 0.80 free_gpu_memory_fraction: 0.80
max_tokens: 8192 max_tokens: 8192
kv_connector_config: kv_connector_config:
connector_module: dynamo.llm.trtllm_integration.connector connector_module: kvbm.trtllm_integration.connector
connector_scheduler_class: DynamoKVBMConnectorLeader connector_scheduler_class: DynamoKVBMConnectorLeader
connector_worker_class: DynamoKVBMConnectorWorker connector_worker_class: DynamoKVBMConnectorWorker
...@@ -23,7 +23,7 @@ from pathlib import Path ...@@ -23,7 +23,7 @@ from pathlib import Path
import pytest import pytest
import requests import requests
from tests.kvbm.common import ApiTester, check_logs_for_patterns from tests.kvbm_integration.common import ApiTester, check_logs_for_patterns
from tests.utils.managed_process import ManagedProcess from tests.utils.managed_process import ManagedProcess
# Check if vLLM is available # Check if vLLM is available
......
...@@ -109,7 +109,7 @@ class LLMServerManager: ...@@ -109,7 +109,7 @@ class LLMServerManager:
"--port", "--port",
str(self.port), str(self.port),
"--kv-transfer-config", "--kv-transfer-config",
'{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}', '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "kvbm.vllm_integration.connector"}',
os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"), os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
"--max-model-len", "--max-model-len",
"8000", # required to fit on L4 GPU when using 8b model "8000", # required to fit on L4 GPU when using 8b model
...@@ -132,7 +132,7 @@ class LLMServerManager: ...@@ -132,7 +132,7 @@ class LLMServerManager:
"free_gpu_memory_fraction": 0.10, # Set a small GPU fraction so that we can evict/reset the on-device kv cache faster "free_gpu_memory_fraction": 0.10, # Set a small GPU fraction so that we can evict/reset the on-device kv cache faster
} }
llm_api_config["kv_connector_config"] = { llm_api_config["kv_connector_config"] = {
"connector_module": "dynamo.llm.trtllm_integration.connector", "connector_module": "kvbm.trtllm_integration.connector",
"connector_scheduler_class": "DynamoKVBMConnectorLeader", "connector_scheduler_class": "DynamoKVBMConnectorLeader",
"connector_worker_class": "DynamoKVBMConnectorWorker", "connector_worker_class": "DynamoKVBMConnectorWorker",
} }
......
...@@ -100,8 +100,8 @@ vllm_configs = { ...@@ -100,8 +100,8 @@ vllm_configs = {
], ],
timeout=700, timeout=700,
request_payloads=[ request_payloads=[
chat_payload_default(), chat_payload_default(expected_response=["joke"]),
completion_payload_default(), completion_payload_default(expected_response=["joke"]),
], ],
), ),
"multimodal_agg_llava": VLLMConfig( "multimodal_agg_llava": VLLMConfig(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment