Unverified commit 5a5bc51e, authored by Ryan Olson, committed by GitHub
Browse files

refactor: migrate kvbm python bindings to the python directory (#4318)


Signed-off-by: Ryan Olson <rolson@nvidia.com>
parent 334ce551
...@@ -39,6 +39,7 @@ jobs: ...@@ -39,6 +39,7 @@ jobs:
runs-on: runs-on:
group: Fastchecker group: Fastchecker
strategy: strategy:
# removing kvbm from here - it will fail to test with nixl dep enabled
matrix: { dir: ['.', 'lib/bindings/python', 'lib/runtime/examples', 'launch/dynamo-run'] } matrix: { dir: ['.', 'lib/bindings/python', 'lib/runtime/examples', 'launch/dynamo-run'] }
permissions: permissions:
contents: read contents: read
......
...@@ -14,7 +14,6 @@ members = [ ...@@ -14,7 +14,6 @@ members = [
"lib/bindings/c", "lib/bindings/c",
"lib/bindings/python/codegen", "lib/bindings/python/codegen",
"lib/engines/*", "lib/engines/*",
"lib/kvbm",
"lib/config", "lib/config",
] ]
# Exclude certain packages that are slow to build and we don't ship as flagship # Exclude certain packages that are slow to build and we don't ship as flagship
...@@ -107,7 +106,9 @@ tokio = { version = "1", features = ["full"] } ...@@ -107,7 +106,9 @@ tokio = { version = "1", features = ["full"] }
tokio-stream = { version = "0.1" } tokio-stream = { version = "0.1" }
tokio-util = { version = "0.7", features = ["codec", "net", "rt"] } tokio-util = { version = "0.7", features = ["codec", "net", "rt"] }
tower-http = { version = "0.6", features = ["trace"] } tower-http = { version = "0.6", features = ["trace"] }
axum = { version = "0.8", features = ["macros"] } axum = { version = "=0.8.4", features = ["macros"] }
hyper = { version = "=1.7.0" }
hyper-util = { version = "=0.1.17" }
tracing = { version = "0.1" } tracing = { version = "0.1" }
tracing-subscriber = { version = "0.3", features = [ tracing-subscriber = { version = "0.3", features = [
"env-filter", "env-filter",
...@@ -134,3 +135,4 @@ insta.opt-level = 3 ...@@ -134,3 +135,4 @@ insta.opt-level = 3
# These make the build much slower but shrink the binary, and could help performance # These make the build much slower but shrink the binary, and could help performance
codegen-units = 1 codegen-units = 1
lto = "thin" lto = "thin"
...@@ -317,7 +317,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -317,7 +317,7 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
cd /opt/dynamo/lib/bindings/python && \ cd /opt/dynamo/lib/bindings/python && \
maturin build --release --out /opt/dynamo/dist && \ maturin build --release --out /opt/dynamo/dist && \
if [ "$ENABLE_KVBM" = "true" ]; then \ if [ "$ENABLE_KVBM" = "true" ]; then \
cd /opt/dynamo/lib/kvbm && \ cd /opt/dynamo/lib/bindings/kvbm && \
maturin build --release --out /opt/dynamo/dist; \ maturin build --release --out /opt/dynamo/dist; \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo" /tmp/use-sccache.sh show-stats "Dynamo"
......
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
[workspace]
[package] [package]
name = "kvbm-py3" name = "kvbm-py3"
...@@ -23,8 +24,8 @@ default = ["block-manager"] ...@@ -23,8 +24,8 @@ default = ["block-manager"]
block-manager = ["dynamo-llm/block-manager", "dep:dlpark", "dep:cudarc"] block-manager = ["dynamo-llm/block-manager", "dep:dlpark", "dep:cudarc"]
[dependencies] [dependencies]
dynamo-llm = { path = "../llm" } dynamo-llm = { path = "../../llm" }
dynamo-runtime = { path = "../runtime" } dynamo-runtime = { path = "../../runtime" }
anyhow = { version = "1" } anyhow = { version = "1" }
async-stream = { version = "0.3" } async-stream = { version = "0.3" }
......
...@@ -19,7 +19,7 @@ limitations under the License. ...@@ -19,7 +19,7 @@ limitations under the License.
The Dynamo KVBM is a distributed KV-cache block management system designed for scalable LLM inference. It cleanly separates memory management from inference runtimes (vLLM, TensorRT-LLM, and SGLang), enabling GPU↔CPU↔Disk/Remote tiering, asynchronous block offload/onboard, and efficient block reuse. The Dynamo KVBM is a distributed KV-cache block management system designed for scalable LLM inference. It cleanly separates memory management from inference runtimes (vLLM, TensorRT-LLM, and SGLang), enabling GPU↔CPU↔Disk/Remote tiering, asynchronous block offload/onboard, and efficient block reuse.
![A block diagram showing a layered architecture view of Dynamo KV Block manager.](../../docs/images/kvbm-architecture.png) ![A block diagram showing a layered architecture view of Dynamo KV Block manager.](../../../docs/images/kvbm-architecture.png)
## Feature Highlights ## Feature Highlights
...@@ -84,7 +84,7 @@ DYN_KVBM_CPU_CACHE_GB=100 vllm serve \ ...@@ -84,7 +84,7 @@ DYN_KVBM_CPU_CACHE_GB=100 vllm serve \
Qwen/Qwen3-8B Qwen/Qwen3-8B
``` ```
For more detailed integration with dynamo, disaggregated serving support and benchmarking, please check [vllm-setup](../../docs/kvbm/vllm-setup.md) For more detailed integration with dynamo, disaggregated serving support and benchmarking, please check [vllm-setup](../../../docs/kvbm/vllm-setup.md)
### TensorRT-LLM ### TensorRT-LLM
...@@ -106,12 +106,12 @@ DYN_KVBM_CPU_CACHE_GB=100 trtllm-serve Qwen/Qwen3-8B \ ...@@ -106,12 +106,12 @@ DYN_KVBM_CPU_CACHE_GB=100 trtllm-serve Qwen/Qwen3-8B \
--extra_llm_api_options /tmp/kvbm_llm_api_config.yaml --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
``` ```
For more detailed integration with dynamo and benchmarking, please check [trtllm-setup](../../docs/kvbm/trtllm-setup.md) For more detailed integration with dynamo and benchmarking, please check [trtllm-setup](../../../docs/kvbm/trtllm-setup.md)
## 📚 Docs ## 📚 Docs
- [Architecture](../../docs/kvbm/kvbm_architecture.md) - [Architecture](../../../docs/kvbm/kvbm_architecture.md)
- [Motivation](../../docs/kvbm/kvbm_motivation.md) - [Motivation](../../../docs/kvbm/kvbm_motivation.md)
- [Design Deepdive](../../docs/kvbm/kvbm_design_deepdive.md) - [Design Deepdive](../../../docs/kvbm/kvbm_design_deepdive.md)
- [NIXL Overview](https://github.com/ai-dynamo/nixl/blob/main/docs/nixl.md) - [NIXL Overview](https://github.com/ai-dynamo/nixl/blob/main/docs/nixl.md)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment