Commit 6afa679c (unverified), authored by Richard Huo, committed by GitHub
Browse files

chore: KVBM pip wheel (#3826)


Signed-off-by: Anant Sharma <anants@nvidia.com>
Co-authored-by: Anant Sharma <anants@nvidia.com>
parent e5c109d8
......@@ -226,6 +226,18 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9"
[[package]]
name = "async-channel"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "924ed96dd52d1b75e9c1a3e6275715fd320f5f9439fb5a4a11fa51f4221158d2"
dependencies = [
"concurrent-queue",
"event-listener-strategy",
"futures-core",
"pin-project-lite",
]
[[package]]
name = "async-nats"
version = "0.40.0"
......@@ -1242,6 +1254,15 @@ dependencies = [
"static_assertions",
]
[[package]]
name = "concurrent-queue"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "config"
version = "0.15.17"
......@@ -2046,6 +2067,16 @@ dependencies = [
"syn 2.0.106",
]
[[package]]
name = "dlpark"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc178fc3bf4ce54c26ccffcf271ff574954ac4b940f15121be3d69f277194537"
dependencies = [
"half 2.6.0",
"pyo3",
]
[[package]]
name = "dlv-list"
version = "0.5.2"
......@@ -2581,6 +2612,27 @@ dependencies = [
"tower-service",
]
[[package]]
name = "event-listener"
version = "5.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab"
dependencies = [
"concurrent-queue",
"parking",
"pin-project-lite",
]
[[package]]
name = "event-listener-strategy"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93"
dependencies = [
"event-listener",
"pin-project-lite",
]
[[package]]
name = "eventsource-stream"
version = "0.2.3"
......@@ -3902,6 +3954,12 @@ dependencies = [
"web-time",
]
[[package]]
name = "indoc"
version = "2.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
[[package]]
name = "inlinable_string"
version = "0.1.15"
......@@ -3958,6 +4016,15 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "inventory"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc61209c082fbeb19919bee74b176221b27223e27b65d781eb91af24eb1fb46e"
dependencies = [
"rustversion",
]
[[package]]
name = "io-uring"
version = "0.7.10"
......@@ -4215,6 +4282,40 @@ dependencies = [
"winapi-build",
]
[[package]]
name = "kvbm-py3"
version = "0.1.0"
dependencies = [
"anyhow",
"async-stream",
"async-trait",
"cudarc 0.16.6",
"derive-getters",
"dlpark",
"dynamo-llm",
"dynamo-runtime",
"either",
"futures",
"local-ip-address",
"once_cell",
"prometheus",
"pyo3",
"pyo3-async-runtimes",
"pythonize",
"rand 0.9.2",
"rstest 0.25.0",
"serde",
"serde_json",
"socket2 0.6.0",
"thiserror 2.0.16",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid 1.18.1",
]
[[package]]
name = "lalrpop-util"
version = "0.20.2"
......@@ -4583,6 +4684,15 @@ dependencies = [
"autocfg",
]
[[package]]
name = "memoffset"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
dependencies = [
"autocfg",
]
[[package]]
name = "metal"
version = "0.27.0"
......@@ -5151,7 +5261,7 @@ dependencies = [
"bitflags 1.3.2",
"cfg-if 1.0.3",
"libc",
"memoffset",
"memoffset 0.7.1",
"pin-utils",
]
......@@ -5686,6 +5796,12 @@ dependencies = [
"serde",
]
[[package]]
name = "parking"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba"
[[package]]
name = "parking_lot"
version = "0.12.5"
......@@ -6268,6 +6384,107 @@ dependencies = [
"num-traits",
]
[[package]]
name = "pyo3"
version = "0.23.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872"
dependencies = [
"cfg-if 1.0.3",
"indoc",
"libc",
"memoffset 0.9.1",
"once_cell",
"portable-atomic",
"pyo3-build-config",
"pyo3-ffi",
"pyo3-macros",
"unindent",
]
[[package]]
name = "pyo3-async-runtimes"
version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "977dc837525cfd22919ba6a831413854beb7c99a256c03bf8624ad707e45810e"
dependencies = [
"async-channel",
"clap 4.5.48",
"futures",
"inventory",
"once_cell",
"pin-project-lite",
"pyo3",
"pyo3-async-runtimes-macros",
"tokio",
]
[[package]]
name = "pyo3-async-runtimes-macros"
version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2df2884957d2476731f987673befac5d521dff10abb0a7cbe12015bc7702fe9"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.106",
]
[[package]]
name = "pyo3-build-config"
version = "0.23.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb"
dependencies = [
"once_cell",
"target-lexicon",
]
[[package]]
name = "pyo3-ffi"
version = "0.23.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d"
dependencies = [
"libc",
"pyo3-build-config",
]
[[package]]
name = "pyo3-macros"
version = "0.23.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da"
dependencies = [
"proc-macro2",
"pyo3-macros-backend",
"quote",
"syn 2.0.106",
]
[[package]]
name = "pyo3-macros-backend"
version = "0.23.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028"
dependencies = [
"heck 0.5.0",
"proc-macro2",
"pyo3-build-config",
"quote",
"syn 2.0.106",
]
[[package]]
name = "pythonize"
version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91a6ee7a084f913f98d70cdc3ebec07e852b735ae3059a1500db2661265da9ff"
dependencies = [
"pyo3",
"serde",
]
[[package]]
name = "qoi"
version = "0.4.1"
......@@ -6856,6 +7073,18 @@ dependencies = [
"rustc_version",
]
[[package]]
name = "rstest"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fc39292f8613e913f7df8fa892b8944ceb47c247b78e1b1ae2f09e019be789d"
dependencies = [
"futures-timer",
"futures-util",
"rstest_macros 0.25.0",
"rustc_version",
]
[[package]]
name = "rstest_macros"
version = "0.18.2"
......@@ -6891,6 +7120,24 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "rstest_macros"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f168d99749d307be9de54d23fd226628d99768225ef08f6ffb52e0182a27746"
dependencies = [
"cfg-if 1.0.3",
"glob",
"proc-macro-crate",
"proc-macro2",
"quote",
"regex",
"relative-path",
"rustc_version",
"syn 2.0.106",
"unicode-ident",
]
[[package]]
name = "rstest_reuse"
version = "0.7.0"
......@@ -9341,6 +9588,12 @@ dependencies = [
"rand 0.8.5",
]
[[package]]
name = "unindent"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
[[package]]
name = "unsafe-libyaml"
version = "0.2.11"
......
......@@ -12,6 +12,7 @@ members = [
"lib/bindings/c",
"lib/bindings/python/codegen",
"lib/engines/*",
"lib/kvbm",
]
# Exclude certain packages that are slow to build and we don't ship as flagship
# features from default build, but keep them in workspace for convenience.
......
......@@ -435,7 +435,7 @@ def create_kv_transfer_config(config: Config) -> Optional[KVTransferConfig]:
elif connector == "kvbm":
connector_cfg = {
"kv_connector": "DynamoConnector",
"kv_connector_module_path": "dynamo.llm.vllm_integration.connector",
"kv_connector_module_path": "kvbm.vllm_integration.connector",
"kv_role": "kv_both",
}
multi_connectors.append(connector_cfg)
......@@ -450,7 +450,7 @@ def create_kv_transfer_config(config: Config) -> Optional[KVTransferConfig]:
kv_connector="PdConnector",
kv_role="kv_both",
kv_connector_extra_config={"connectors": multi_connectors},
kv_connector_module_path="dynamo.llm.vllm_integration.connector",
kv_connector_module_path="kvbm.vllm_integration.connector",
)
......
......@@ -8,6 +8,7 @@ import signal
from typing import Optional
import uvloop
from kvbm.vllm_integration.consolidator_config import get_consolidator_endpoints
from prometheus_client import REGISTRY
from vllm.distributed.kv_events import ZmqEventPublisher
from vllm.usage.usage_lib import UsageContext
......@@ -25,7 +26,6 @@ from dynamo.llm import (
fetch_llm,
register_llm,
)
from dynamo.llm.vllm_integration.consolidator_config import get_consolidator_endpoints
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.vllm.multimodal_handlers import (
......
......@@ -39,6 +39,7 @@ ARG SCCACHE_REGION=""
# NIXL configuration
ARG NIXL_UCX_REF=v1.19.0
ARG NIXL_REF=0.7.0
ARG NIXL_GDRCOPY_REF=v2.5.1
# Python configuration
ARG PYTHON_VERSION=3.12
......@@ -58,6 +59,7 @@ ARG SCCACHE_BUCKET
ARG SCCACHE_REGION
ARG NIXL_UCX_REF
ARG NIXL_REF
ARG NIXL_GDRCOPY_REF
USER root
WORKDIR /opt/dynamo
......@@ -103,46 +105,6 @@ RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.
rm rustup-init && \
chmod -R a+w $RUSTUP_HOME $CARGO_HOME
##################################
########## System Dependencies ###
##################################
# Install system packages
RUN apt-get update -y \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# NIXL build dependencies
autoconf \
automake \
cmake \
git \
git-lfs \
libtool \
meson \
net-tools \
ninja-build \
pybind11-dev \
# Rust build dependencies
clang \
libclang-dev \
protobuf-compiler \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# These headers are missing with the hpcx installer, required
# by UCX to build and use RDMA devices. Reinstall to make sure to recreate
# symlink .so to .so.1 in case some packages are already found.
RUN apt-get update -y \
&& DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall --no-install-recommends \
libibverbs-dev \
rdma-core \
ibverbs-utils \
libibumad-dev \
libnuma-dev \
librdmacm-dev \
ibverbs-providers \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
##################################
########## External Services #####
##################################
......@@ -161,110 +123,6 @@ RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/downlo
rm /tmp/etcd.tar.gz
ENV PATH=/usr/local/bin/etcd/:$PATH
##################################
########## UCX Build #############
##################################
# Build and install UCX
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
rm -rf /opt/hpcx/ucx && \
rm -rf /usr/local/ucx && \
echo "Building UCX with reference $NIXL_UCX_REF" && \
cd /usr/local/src && \
git clone https://github.com/openucx/ucx.git && \
cd ucx && git checkout $NIXL_UCX_REF && \
CC=${USE_SCCACHE:+sccache gcc} && \
CXX=${USE_SCCACHE:+sccache g++} && \
export CC=${CC} && \
export CXX=${CXX} && \
./autogen.sh && \
./configure \
--prefix=/usr/local/ucx \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=/usr/local/cuda \
--with-verbs \
--with-efa \
--with-dm \
--with-gdrcopy=/usr/local \
--enable-mt && \
make -j$(nproc) && \
make -j$(nproc) install-strip && \
/tmp/use-sccache.sh show-stats "UCX" && \
echo "/usr/local/ucx/lib" > /etc/ld.so.conf.d/ucx.conf && \
echo "/usr/local/ucx/lib/ucx" >> /etc/ld.so.conf.d/ucx.conf && \
ldconfig && \
cd /usr/local/src && \
rm -rf ucx
# UCX environment variables
ENV CPATH=/usr/include \
PATH=/usr/bin:/usr/local/ucx/bin:$PATH \
PKG_CONFIG_PATH=/usr/lib/pkgconfig
##################################
########## NIXL Setup ############
##################################
# NIXL environment setup
ENV NIXL_SRC_DIR=/opt/nixl \
NIXL_PREFIX=/opt/nvidia/nvda_nixl \
NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins
# Build and install NIXL
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
git clone --depth 1 --branch ${NIXL_REF} "https://github.com/ai-dynamo/nixl.git" ${NIXL_SRC_DIR} && \
cd ${NIXL_SRC_DIR} && \
if [ "$ARCH" = "arm64" ]; then \
nixl_build_args="-Ddisable_gds_backend=true"; \
else \
nixl_build_args=""; \
fi && \
meson setup build/ --buildtype=release --prefix=$NIXL_PREFIX $nixl_build_args && \
ninja -C build/ -j$(nproc) && ninja -C build/ install && \
/tmp/use-sccache.sh show-stats "NIXL" && \
echo "$NIXL_LIB_DIR" > /etc/ld.so.conf.d/nixl.conf && \
echo "$NIXL_PLUGIN_DIR" >> /etc/ld.so.conf.d/nixl.conf && \
ldconfig
# Build NIXL Python module
# TODO OPS-590: Move gds_path selection based on arch into NIXL build and re-enable gds backend for arm64
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
if [ "$ARCH" = "arm64" ]; then \
cd ${NIXL_SRC_DIR} && uv build . --out-dir /opt/dynamo/wheelhouse/nixl --python $PYTHON_VERSION \
--config-settings=setup-args="-Ddisable_gds_backend=true"; \
else \
cd ${NIXL_SRC_DIR} && uv build . --out-dir /opt/dynamo/wheelhouse/nixl --python $PYTHON_VERSION; \
fi
##################################
########## Python Environment ####
##################################
# Create and activate virtual environment
ARG PYTHON_VERSION
RUN mkdir -p /opt/dynamo/venv && \
uv venv /opt/dynamo/venv --python $PYTHON_VERSION
ENV VIRTUAL_ENV=/opt/dynamo/venv \
PATH="/opt/dynamo/venv/bin:${PATH}"
# Install common and test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
--mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
UV_GIT_LFS=1 uv pip install \
--no-cache \
--requirement /tmp/requirements.txt \
--requirement /tmp/requirements.test.txt
##################################
##### Wheel Build Image ##########
......@@ -279,27 +137,42 @@ FROM quay.io/pypa/manylinux_2_28_${ARCH_ALT} AS wheel_builder
ARG ARCH
ARG ARCH_ALT
ARG CARGO_BUILD_JOBS
ARG PYTHON_VERSION
ARG ENABLE_KVBM
ARG USE_SCCACHE
ARG SCCACHE_BUCKET
ARG SCCACHE_REGION
ARG NIXL_UCX_REF
ARG NIXL_REF
ARG NIXL_GDRCOPY_REF
WORKDIR /opt/dynamo
# Set environment variables
ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
RUSTUP_HOME=/usr/local/rustup \
CARGO_HOME=/usr/local/cargo \
CARGO_TARGET_DIR=/opt/dynamo/target \
VIRTUAL_ENV=/opt/dynamo/venv \
NIXL_PREFIX=/opt/nvidia/nvda_nixl \
PATH=/usr/local/cargo/bin:/opt/dynamo/venv/bin:$PATH
WORKDIR /workspace
# Install system dependencies
RUN dnf update -y \
&& dnf install -y llvm-toolset protobuf-compiler wget unzip \
&& dnf clean all \
&& rm -rf /var/cache/dnf
RUN yum groupinstall -y 'Development Tools' && \
dnf install -y almalinux-release-synergy && \
dnf config-manager --set-enabled powertools && \
dnf install -y \
# Build tools
cmake \
ninja-build \
clang-devel \
gcc-c++ \
flex \
wget \
# Kernel module build dependencies
dkms \
# Protobuf support
protobuf-compiler \
# RDMA/InfiniBand support (required for UCX build with --with-verbs)
libibverbs \
libibverbs-devel \
rdma-core \
rdma-core-devel \
libibumad \
libibumad-devel \
librdmacm-devel \
numactl-devel
# Ensure a modern protoc is available (required for --experimental_allow_proto3_optional)
RUN set -eux; \
......@@ -319,18 +192,16 @@ RUN set -eux; \
# Point build tools explicitly at the modern protoc
ENV PROTOC=/usr/local/bin/protoc
# Set environment variables first so they can be used in COPY commands
ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} \
RUSTUP_HOME=/usr/local/rustup \
CARGO_HOME=/usr/local/cargo \
CARGO_TARGET_DIR=/opt/dynamo/target \
PATH=/usr/local/cargo/bin:$PATH
# Copy artifacts from base stage
COPY --from=base $RUSTUP_HOME $RUSTUP_HOME
COPY --from=base $CARGO_HOME $CARGO_HOME
COPY --from=base $NIXL_PREFIX $NIXL_PREFIX
ARG PYTHON_VERSION
RUN mkdir -p /opt/dynamo/venv && \
uv venv /opt/dynamo/venv --python $PYTHON_VERSION
ENV VIRTUAL_ENV=/opt/dynamo/venv \
PATH="/opt/dynamo/venv/bin:${PATH}"
# Install SCCACHE if requested
COPY container/use-sccache.sh /tmp/use-sccache.sh
......@@ -343,21 +214,110 @@ ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}} \
RUSTC_WRAPPER=${USE_SCCACHE:+sccache}
# Copy CUDA from base stage
COPY --from=base /usr/local/cuda /usr/local/cuda
COPY --from=base /etc/ld.so.conf.d/hpcx.conf /etc/ld.so.conf.d/hpcx.conf
ENV CUDA_PATH=/usr/local/cuda \
PATH=/usr/local/cuda/bin:$PATH \
LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:$LD_LIBRARY_PATH \
NVIDIA_DRIVER_CAPABILITIES=video,compute,utility
# Create virtual environment for building wheels
ENV VIRTUAL_ENV=/workspace/.venv
RUN uv venv ${VIRTUAL_ENV} --python $PYTHON_VERSION && \
uv pip install --upgrade meson pybind11 patchelf maturin[patchelf]
# Build and install gdrcopy
RUN git clone --depth 1 --branch ${NIXL_GDRCOPY_REF} https://github.com/NVIDIA/gdrcopy.git && \
cd gdrcopy/packages && \
CUDA=/usr/local/cuda ./build-rpm-packages.sh && \
rpm -Uvh gdrcopy-kmod-*.el8.noarch.rpm && \
rpm -Uvh gdrcopy-*.el8.${ARCH_ALT}.rpm && \
rpm -Uvh gdrcopy-devel-*.el8.noarch.rpm
# Build and install UCX
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
CC=${USE_SCCACHE:+sccache gcc} && \
CXX=${USE_SCCACHE:+sccache g++} && \
export CC=${CC} && \
export CXX=${CXX} && \
cd /usr/local/src && \
git clone https://github.com/openucx/ucx.git && \
cd ucx && \
git checkout $NIXL_UCX_REF && \
./autogen.sh && ./configure \
--prefix=/usr/local/ucx \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=/usr/local/cuda \
--with-verbs \
--with-dm \
--with-gdrcopy=/usr/local \
--with-efa \
--enable-mt && \
make -j && \
make -j install-strip && \
/tmp/use-sccache.sh show-stats "UCX" && \
echo "/usr/local/ucx/lib" > /etc/ld.so.conf.d/ucx.conf && \
echo "/usr/local/ucx/lib/ucx" >> /etc/ld.so.conf.d/ucx.conf && \
ldconfig
# build and install nixl
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
source ${VIRTUAL_ENV}/bin/activate && \
git clone --depth 1 --branch ${NIXL_REF} "https://github.com/ai-dynamo/nixl.git" && \
cd nixl && \
mkdir build && \
meson setup build/ --prefix=/opt/nvidia/nvda_nixl --buildtype=release \
-Dcudapath_lib="/usr/local/cuda/lib64" \
-Dcudapath_inc="/usr/local/cuda/include" \
-Ducx_path="/usr/local/ucx" && \
cd build && \
ninja && \
ninja install && \
/tmp/use-sccache.sh show-stats "NIXL"
ENV NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib64 \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib64/plugins \
NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV LD_LIBRARY_PATH=${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH}
RUN echo "$NIXL_LIB_DIR" > /etc/ld.so.conf.d/nixl.conf && \
echo "$NIXL_PLUGIN_DIR" >> /etc/ld.so.conf.d/nixl.conf && \
ldconfig
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX="${SCCACHE_S3_KEY_PREFIX:-${ARCH}}" && \
cd /workspace/nixl && \
uv build . --out-dir /opt/dynamo/dist/nixl --python $PYTHON_VERSION
# Copy source code (order matters for layer caching)
COPY pyproject.toml README.md LICENSE Cargo.toml Cargo.lock rust-toolchain.toml hatch_build.py /opt/dynamo/
COPY launch/ /opt/dynamo/launch/
COPY lib/ /opt/dynamo/lib/
COPY components/ /opt/dynamo/components/
# Build wheels
# Build dynamo wheels
RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
--mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
source ${VIRTUAL_ENV}/bin/activate && \
cd /opt/dynamo && \
uv build --wheel --out-dir /opt/dynamo/dist && \
cd /opt/dynamo/lib/bindings/python && \
uv pip install maturin[patchelf] && \
maturin build --release --out /opt/dynamo/dist && \
if [ "$ENABLE_KVBM" = "true" ]; then \
maturin build --release --features block-manager --out /opt/dynamo/dist; \
else \
cd /opt/dynamo/lib/kvbm && \
maturin build --release --out /opt/dynamo/dist; \
fi && \
/tmp/use-sccache.sh show-stats "Dynamo"
......@@ -368,23 +328,66 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
FROM base AS dev
ARG ENABLE_KVBM
ARG ARCH_ALT
# Application environment variables
ENV DYNAMO_HOME=/opt/dynamo \
CARGO_TARGET_DIR=/opt/dynamo/target
WORKDIR /opt/dynamo
# NIXL environment variables
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl \
NIXL_LIB_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu \
NIXL_PLUGIN_DIR=/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/plugins
ENV LD_LIBRARY_PATH=${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH}
# Copy ucx and nixl libs
COPY --from=wheel_builder /usr/local/ucx/ /usr/local/ucx/
COPY --from=wheel_builder ${NIXL_PREFIX}/ ${NIXL_PREFIX}/
COPY --from=wheel_builder /opt/nvidia/nvda_nixl/lib64/. ${NIXL_LIB_DIR}/
# Copy built artifacts
COPY --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
COPY --from=wheel_builder /opt/dynamo/dist/*.whl /opt/dynamo/wheelhouse/
COPY --from=wheel_builder $CARGO_TARGET_DIR $CARGO_TARGET_DIR
COPY --from=wheel_builder $CARGO_HOME $CARGO_HOME
# Install Python packages
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
# required for AIC perf files
git \
git-lfs \
# rust build packages
clang \
libclang-dev \
protobuf-compiler \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Create and activate virtual environment
ARG PYTHON_VERSION
RUN mkdir -p /opt/dynamo/venv && \
uv venv /opt/dynamo/venv --python $PYTHON_VERSION
ENV VIRTUAL_ENV=/opt/dynamo/venv \
PATH="/opt/dynamo/venv/bin:${PATH}"
# Install common and test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
--mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
UV_GIT_LFS=1 uv pip install \
--no-cache \
--requirement /tmp/requirements.txt \
--requirement /tmp/requirements.test.txt
COPY benchmarks/ /opt/dynamo/benchmarks/
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "$ENABLE_KVBM" = "true" ]; then \
uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \
&& cd /opt/dynamo/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache . \
&& cd - \
......@@ -393,7 +396,8 @@ RUN uv pip install \
# Setup launch banner
RUN --mount=type=bind,source=./container/launch_message.txt,target=/opt/dynamo/launch_message.txt \
sed '/^#\s/d' /opt/dynamo/launch_message.txt > ~/.launch_screen && \
echo "cat ~/.launch_screen" >> ~/.bashrc
echo "cat ~/.launch_screen" >> ~/.bashrc && \
echo "source $VIRTUAL_ENV/bin/activate" >> ~/.bashrc
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
......@@ -64,6 +64,7 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv
ARG ARCH_ALT
ARG PYTHON_VERSION
ARG ENABLE_KVBM
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
......@@ -241,9 +242,12 @@ ENV LD_LIBRARY_PATH=${TENSORRT_LIB_DIR}:${LD_LIBRARY_PATH}
COPY benchmarks/ /opt/dynamo/benchmarks/
COPY --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
&& if [ "${ENABLE_KVBM}" = "true" ]; then \
uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \
&& cd /opt/dynamo/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache . \
&& cd - \
......
......@@ -178,6 +178,7 @@ ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ARG ARCH_ALT
ARG PYTHON_VERSION
ARG ENABLE_KVBM
ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
......@@ -253,9 +254,12 @@ COPY --from=framework ${VIRTUAL_ENV} ${VIRTUAL_ENV}
COPY benchmarks/ /opt/dynamo/benchmarks/
COPY --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
&& if [ "${ENABLE_KVBM}" = "true" ]; then \
uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \
&& cd /opt/dynamo/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache . \
&& cd - \
......
......@@ -780,7 +780,7 @@ if [[ $FRAMEWORK == "VLLM" ]] || [[ $FRAMEWORK == "TRTLLM" ]]; then
fi
if [ ! -z ${ENABLE_KVBM} ]; then
echo "Enabling the KVBM in the ai-dynamo-runtime"
echo "Enabling the KVBM in the dynamo image"
BUILD_ARGS+=" --build-arg ENABLE_KVBM=${ENABLE_KVBM} "
fi
......
......@@ -77,4 +77,4 @@ cd builddir && ninja install
cd ../..
rm -rf nixl* # Remove NIXL source tree to save space
echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}"
\ No newline at end of file
echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}"
......@@ -87,7 +87,7 @@ kv_cache_config:
enable_partial_reuse: false
free_gpu_memory_fraction: 0.80
kv_connector_config:
connector_module: dynamo.llm.trtllm_integration.connector
connector_module: kvbm.trtllm_integration.connector
connector_scheduler_class: DynamoKVBMConnectorLeader
connector_worker_class: DynamoKVBMConnectorWorker
EOF
......
......@@ -101,7 +101,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
Alternatively, you can use `vllm serve` directly to use KVBM for aggregated serving:
```bash
vllm serve --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}' Qwen/Qwen3-0.6B
vllm serve --kv-transfer-config '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "kvbm.vllm_integration.connector"}' Qwen/Qwen3-0.6B
```
## Enable and View KVBM Metrics
......
......@@ -358,7 +358,7 @@ version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbfd150b5dbdb988bcc8fb1fe787eb6b7ee6180ca24da683b61ea5405f3d43ff"
dependencies = [
"bindgen 0.69.5",
"bindgen",
"cc",
"cmake",
"dunce",
......@@ -558,26 +558,6 @@ dependencies = [
"which",
]
[[package]]
name = "bindgen"
version = "0.71.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3"
dependencies = [
"bitflags 2.9.3",
"cexpr",
"clang-sys",
"itertools 0.13.0",
"log",
"prettyplease",
"proc-macro2",
"quote",
"regex",
"rustc-hash 2.1.1",
"shlex",
"syn 2.0.106",
]
[[package]]
name = "bit-set"
version = "0.5.3"
......@@ -1463,15 +1443,6 @@ dependencies = [
"uuid",
]
[[package]]
name = "dynamo-kvbm-kernels"
version = "0.6.0"
dependencies = [
"cc",
"cudarc",
"once_cell",
]
[[package]]
name = "dynamo-llm"
version = "0.6.1"
......@@ -1479,7 +1450,6 @@ dependencies = [
"ahash",
"aho-corasick",
"akin",
"aligned-vec",
"anyhow",
"async-nats",
"async-stream",
......@@ -1495,13 +1465,11 @@ dependencies = [
"bytes",
"candle-core",
"chrono",
"cudarc",
"dashmap",
"derive-getters",
"derive_builder",
"dialoguer",
"dynamo-async-openai",
"dynamo-kvbm-kernels",
"dynamo-parsers",
"dynamo-runtime",
"either",
......@@ -1518,9 +1486,6 @@ dependencies = [
"minijinja-contrib",
"modelexpress-client",
"modelexpress-common",
"ndarray",
"nix 0.26.4",
"nixl-sys",
"offset-allocator",
"oneshot",
"parking_lot",
......@@ -1639,7 +1604,7 @@ dependencies = [
"local-ip-address",
"log",
"nid",
"nix 0.29.0",
"nix",
"nuid",
"once_cell",
"opentelemetry",
......@@ -2988,15 +2953,6 @@ dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.14.0"
......@@ -3344,16 +3300,6 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
[[package]]
name = "matrixmultiply"
version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08"
dependencies = [
"autocfg",
"rawpointer",
]
[[package]]
name = "maybe-rayon"
version = "0.1.1"
......@@ -3386,15 +3332,6 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38d1115007560874e373613744c6fba374c17688327a71c1476d1a5954cc857b"
[[package]]
name = "memoffset"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
dependencies = [
"autocfg",
]
[[package]]
name = "memoffset"
version = "0.9.1"
......@@ -3572,21 +3509,6 @@ version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084"
[[package]]
name = "ndarray"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841"
dependencies = [
"matrixmultiply",
"num-complex",
"num-integer",
"num-traits",
"portable-atomic",
"portable-atomic-util",
"rawpointer",
]
[[package]]
name = "neli"
version = "0.6.5"
......@@ -3640,19 +3562,6 @@ dependencies = [
"thiserror 1.0.69",
]
[[package]]
name = "nix"
version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b"
dependencies = [
"bitflags 1.3.2",
"cfg-if 1.0.3",
"libc",
"memoffset 0.7.1",
"pin-utils",
]
[[package]]
name = "nix"
version = "0.29.0"
......@@ -3665,22 +3574,6 @@ dependencies = [
"libc",
]
[[package]]
name = "nixl-sys"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a73b92494c94b2ff2d004cd9274d966863089e867dc9cd98bc640aefe7622036"
dependencies = [
"bindgen 0.71.1",
"cc",
"libc",
"os_info",
"pkg-config",
"serde",
"thiserror 2.0.16",
"tracing",
]
[[package]]
name = "nkeys"
version = "0.4.5"
......@@ -4057,18 +3950,6 @@ dependencies = [
"hashbrown 0.14.5",
]
[[package]]
name = "os_info"
version = "3.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0e1ac5fde8d43c34139135df8ea9ee9465394b2d8d20f032d38998f64afffc3"
dependencies = [
"log",
"plist",
"serde",
"windows-sys 0.52.0",
]
[[package]]
name = "overload"
version = "0.1.1"
......@@ -4294,19 +4175,6 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "plist"
version = "1.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3af6b589e163c5a788fab00ce0c0366f6efbb9959c2f9874b224936af7fce7e1"
dependencies = [
"base64 0.22.1",
"indexmap 2.11.0",
"quick-xml",
"serde",
"time",
]
[[package]]
name = "png"
version = "0.17.16"
......@@ -4586,7 +4454,7 @@ dependencies = [
"cfg-if 1.0.3",
"indoc",
"libc",
"memoffset 0.9.1",
"memoffset",
"once_cell",
"portable-atomic",
"pyo3-build-config",
......@@ -4693,15 +4561,6 @@ version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
[[package]]
name = "quick-xml"
version = "0.38.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42a232e7487fc2ef313d96dde7948e7a3c05101870d8985e4fd8d26aedd27b89"
dependencies = [
"memchr",
]
[[package]]
name = "quinn"
version = "0.11.9"
......@@ -4909,12 +4768,6 @@ dependencies = [
"bitflags 2.9.3",
]
[[package]]
name = "rawpointer"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
[[package]]
name = "rayon"
version = "1.11.0"
......
......@@ -23,7 +23,6 @@ crate-type = ["cdylib", "rlib"]
[features]
default = []
block-manager = ["dynamo-llm/block-manager", "dep:dlpark", "dep:cudarc"]
[dependencies]
dynamo-llm = { path = "../../llm" }
......
......@@ -6,15 +6,20 @@ use futures::StreamExt;
use once_cell::sync::OnceCell;
use pyo3::IntoPyObjectExt;
use pyo3::exceptions::PyStopAsyncIteration;
use pyo3::types::PyCapsule;
use pyo3::types::{PyDict, PyString};
use pyo3::{exceptions::PyException, prelude::*};
use rand::seq::IteratorRandom as _;
use rs::pipeline::network::Ingress;
use std::ffi::CString;
use std::fs;
use std::net::{IpAddr, Ipv4Addr, SocketAddr, SocketAddrV4};
use std::path::PathBuf;
use std::time::Duration;
use std::{fmt::Display, sync::Arc};
use std::{
fmt::Display,
sync::{Arc, Weak},
};
use tokio::sync::Mutex;
use tracing::Instrument;
......@@ -191,9 +196,6 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
prometheus_metrics::add_to_module(&prometheus_metrics)?;
m.add_submodule(&prometheus_metrics)?;
#[cfg(feature = "block-manager")]
llm::block_manager::add_to_module(m)?;
Ok(())
}
......@@ -629,6 +631,21 @@ impl DistributedRuntime {
let inner = self.inner.runtime().child_token();
CancellationToken { inner }
}
// This is used to pass the DistributedRuntime from the dynamo-runtime bindings
// to the KVBM bindings, since KVBM cannot directly use the struct from this cdylib.
// TODO: Create a separate crate "dynamo-python" so that all binding crates can import
// from it and share the same crate path. This will allow PyO3 to automatically
// recognize that both bindings use the same PyClass.
//
// Returns a `PyCapsule` named "dynamo.runtime.weak" whose payload starts with a
// `Weak<rs::DistributedRuntime>`. The capsule also owns the matching strong `Arc`,
// so the weak handle stays upgradable for as long as the capsule object is alive.
// (If the only `Arc` were a local dropped on return, the exported `Weak` could
// never be upgraded.)
#[pyo3(name = "to_capsule")]
fn to_capsule<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyCapsule>> {
    // `repr(C)` pins `weak` at offset 0, so a pointer to this payload is also a
    // valid pointer to a `Weak<rs::DistributedRuntime>`.
    // NOTE(review): assumes the consumer reads the capsule payload as
    // `Weak<rs::DistributedRuntime>` and upgrades it — confirm against the KVBM side.
    #[repr(C)]
    struct CapsulePayload {
        // Only read through the raw capsule pointer by the consumer.
        #[allow(dead_code)]
        weak: Weak<rs::DistributedRuntime>,
        // Keeps the allocation behind `weak` alive until the capsule is destroyed.
        _owner: Arc<rs::DistributedRuntime>,
    }

    let owner: Arc<rs::DistributedRuntime> = Arc::new(self.inner.clone());
    let payload = CapsulePayload {
        weak: Arc::downgrade(&owner),
        _owner: owner,
    };

    let name = CString::new("dynamo.runtime.weak").expect("valid capsule name");
    PyCapsule::new(py, payload, Some(name))
}
}
// Bind a TCP port and return a socket held until dropped.
......
......@@ -33,6 +33,3 @@ pub mod kv;
pub mod local_model;
pub mod model_card;
pub mod preprocessor;
#[cfg(feature = "block-manager")]
pub mod block_manager;
......@@ -5,13 +5,6 @@
import logging
try:
from dynamo._core import BlockManager as BlockManager
from dynamo._core import KvbmLeader as KvbmLeader
from dynamo._core import KvbmWorker as KvbmWorker
except ImportError:
pass # BlockManager is not enabled by default
from dynamo._core import ApproxKvIndexer as ApproxKvIndexer
from dynamo._core import DisaggregatedRouter as DisaggregatedRouter
from dynamo._core import EngineType
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
[package]
name = "kvbm-py3"
version = "0.1.0"
# Remaining keys are listed alphabetically.
authors = ["NVIDIA"]
edition = "2024"
homepage = "https://github.com/ai-dynamo/dynamo"
license = "Apache-2.0"
repository = "https://github.com/ai-dynamo/dynamo.git"
[lib]
name = "_core"
path = "src/lib.rs"
# Two artifact kinds are built from this crate:
#   - "cdylib": the shared library that Python imports.
#   - "rlib": a Rust library, required so that doctests are supported.
crate-type = ["cdylib", "rlib"]
[features]
# The block manager is enabled by default for this crate.
default = ["block-manager"]
# Turns on dynamo-llm's block manager and pulls in the optional
# dlpark and cudarc dependencies.
block-manager = [
    "dynamo-llm/block-manager",
    "dep:dlpark",
    "dep:cudarc",
]
[dependencies]
# Workspace-local crates, referenced by relative path.
dynamo-llm = { path = "../llm" }
dynamo-runtime = { path = "../runtime" }
anyhow = { version = "1" }
async-stream = { version = "0.3" }
async-trait = { version = "0.1" }
derive-getters = "0.5"
either = { version = "1.13", features = ["serde"] }
futures = { version = "0.3" }
local-ip-address = { version = "0.6" }
once_cell = { version = "1.20.3" }
rand = { version = "0.9" }
socket2 = { version = "0.6" }
serde = { version = "1" }
serde_json = { version = "1.0.138" }
thiserror = { version = "2.0" }
tokio = { version = "1.46.0", features = ["full"] }
# Pinned to the 0.1 series: a bare "0" requirement would accept any 0.x release,
# and for 0.x crates a minor-version bump may be breaking.
tokio-stream = { version = "0.1" }
tokio-util = { version = "0.7", features = ["rt"] }
# Pinned to the 0.1 series for the same reason as tokio-stream.
tracing = { version = "0.1" }
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
uuid = { version = "1.17", features = ["v4", "serde"] }
# "extension-module" tells pyo3 we want to build an extension module (skips linking against libpython.so)
# "abi3-py310" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.10
pyo3 = { version = "0.23.4", default-features = false, features = [
    "macros",
    "experimental-async",
    "experimental-inspect",
    "extension-module",
    "py-clone",
    "abi3-py310",
] }
pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = [
    "attributes",
    "testing",
    "tokio-runtime",
    "unstable-streams",
] }
pythonize = "0.23"
# Optional: only compiled when the "block-manager" feature is enabled.
dlpark = { version = "0.5", features = ["pyo3", "half"], optional = true }
cudarc = { version = "0.16.2", features = ["cuda-12020"], optional = true }
prometheus = "0.14.0"
[dev-dependencies]
# Needed only when building and running this crate's tests.
rstest = { version = "0.25" }
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
<!--
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Dynamo KVBM
The Dynamo KVBM is a distributed KV-cache block management system designed for scalable LLM inference. It cleanly separates memory management from inference runtimes (vLLM, TensorRT-LLM, and SGLang), enabling GPU↔CPU↔Disk/Remote tiering, asynchronous block offload/onboard, and efficient block reuse.
![A block diagram showing a layered architecture view of Dynamo KV Block manager.](../../docs/images/kvbm-architecture.png)
## Feature Highlights
- **Distributed KV-Cache Management:** Unified GPU↔CPU↔Disk↔Remote tiering for scalable LLM inference.
- **Async Offload & Reuse:** Seamlessly move KV blocks between memory tiers using GDS-accelerated transfers powered by NIXL, without recomputation.
- **Runtime-Agnostic:** Works out-of-the-box with vLLM, TensorRT-LLM, and SGLang via lightweight connectors.
- **Memory-Safe & Modular:** RAII lifecycle and pluggable design for reliability, portability, and backend extensibility.
## Build and Installation
The pip wheel is built through a Docker build process:
```bash
# Build the Docker image with KVBM enabled (from the dynamo repo root)
./container/build.sh --framework none --enable-kvbm --tag local-kvbm
```
Once built, you can either:
**Option 1: Run and use the container directly**
```bash
./container/run.sh --framework none -it
```
**Option 2: Extract the wheel file to your local filesystem**
```bash
# Create a temporary container from the built image
docker create --name temp-kvbm-container local-kvbm:latest
# Copy the wheelhouse (which contains the KVBM wheel) to ./dynamo_wheelhouse
docker cp temp-kvbm-container:/opt/dynamo/wheelhouse/ ./dynamo_wheelhouse
# Clean up the temporary container
docker rm temp-kvbm-container
# Install the wheel locally
pip install ./dynamo_wheelhouse/kvbm*.whl
```
Note: the default pip wheel is not currently compatible with CUDA 13.
## Integrations
### Environment Variables
| Variable | Description | Default |
|-----------|--------------|----------|
| `DYN_KVBM_CPU_CACHE_GB` | CPU pinned memory cache size (GB) | required |
| `DYN_KVBM_DISK_CACHE_GB` | SSD Disk/Storage system cache size (GB) | optional |
| `DYN_KVBM_LEADER_WORKER_INIT_TIMEOUT_SECS` | Timeout (in seconds) for the KVBM leader and worker to synchronize and allocate the required memory and storage. Increase this value if allocating large amounts of memory or storage. | 120 |
| `DYN_KVBM_METRICS` | Enable metrics endpoint | `false` |
| `DYN_KVBM_METRICS_PORT` | Metrics port | `6880` |
| `DYN_KVBM_DISABLE_DISK_OFFLOAD_FILTER` | Disable disk offload filtering to remove SSD lifespan protection | `false` |
### vLLM
```bash
DYN_KVBM_CPU_CACHE_GB=100 vllm serve \
--kv-transfer-config '{"kv_connector":"DynamoConnector","kv_role":"kv_both","kv_connector_module_path":"kvbm.vllm_integration.connector"}' \
Qwen/Qwen3-8B
```
For more detailed integration with Dynamo, disaggregated serving support, and benchmarking, see [vllm-setup](../../docs/kvbm/vllm-setup.md).
### TensorRT-LLM
```bash
cat >/tmp/kvbm_llm_api_config.yaml <<EOF
cuda_graph_config: null
kv_cache_config:
enable_partial_reuse: false
free_gpu_memory_fraction: 0.80
kv_connector_config:
connector_module: kvbm.trtllm_integration.connector
connector_scheduler_class: DynamoKVBMConnectorLeader
connector_worker_class: DynamoKVBMConnectorWorker
EOF
DYN_KVBM_CPU_CACHE_GB=100 trtllm-serve Qwen/Qwen3-8B \
--host localhost --port 8000 \
--backend pytorch \
--extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
```
For more detailed integration with Dynamo and benchmarking, see [trtllm-setup](../../docs/kvbm/trtllm-setup.md).
## 📚 Docs
- [Architecture](../../docs/kvbm/kvbm_architecture.md)
- [Motivation](../../docs/kvbm/kvbm_motivation.md)
- [Design Deepdive](../../docs/kvbm/kvbm_design_deepdive.md)
- [NIXL Overview](https://github.com/ai-dynamo/nixl/blob/main/docs/nixl.md)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[project]
name = "kvbm"
version = "0.6.1"
description = "Dynamo KVBM"
readme = "README.md"
requires-python = ">=3.10"
license = { text = "Apache-2.0" }
license-files = ["LICENSE"]
authors = [{ name = "NVIDIA Inc.", email = "sw-dl-dynamo@nvidia.com" }]
keywords = ["llm", "genai", "inference", "nvidia", "kvcache", "dynamo"]
# Trove classifiers, sorted alphabetically.
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Intended Audience :: Information Technology",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: POSIX :: Linux",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
# Runtime requirements; nixl is pinned to an exact version.
dependencies = [
    "nixl==0.7.0",
]
[tool.maturin]
# Rust crate manifest that maturin builds.
manifest-path = "Cargo.toml"
# Import path of the compiled extension module inside the Python package.
module-name = "kvbm._core"
# Directory that holds the pure-Python sources, and the packages shipped from it.
python-source = "python"
python-packages = ["kvbm"]
[build-system]
# maturin is the build backend; patchelf is requested as a build requirement alongside it.
build-backend = "maturin"
requires = [
    "maturin>=1.0,<2.0",
    "patchelf",
]
# Build config settings used by uv; "build-args" carries extra command-line flags.
[tool.uv.config-settings]
build-args = "--auditwheel repair --manylinux"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment