Commit 5bcdb734 authored by Neelay Shah, committed by GitHub

refactor: rename vllm_nixl to vllm and make default (#100)

parent a7c35dcf
......@@ -30,8 +30,7 @@ jobs:
strategy:
matrix:
framework:
- standard
- vllm_nixl
- vllm
name: Build and Test - ${{ matrix.framework }}
env:
CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_${{ github.job }}_${{ matrix.framework }}
......
......@@ -8,6 +8,133 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev
USER root
### NIXL SETUP ###
ARG MOFED_VERSION=24.10-1.1.4.0
ARG PYTHON_VERSION=3.12
ARG NSYS_URL=https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_1/
ARG NSYS_PKG=NsightSystems-linux-cli-public-2025.1.1.131-3554042.deb
RUN apt-get update -y && apt-get -y install curl \
git \
libnuma-dev \
numactl \
wget \
autotools-dev \
automake \
libtool \
libz-dev \
libiberty-dev \
flex \
build-essential \
cmake \
libibverbs-dev \
libgoogle-glog-dev \
libgtest-dev \
libjsoncpp-dev \
libpython3-dev \
libboost-all-dev \
libssl-dev \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
pybind11-dev \
python3-full \
python3-pip \
python3-numpy \
etcd-server \
net-tools \
pciutils \
libpci-dev \
vim \
tmux \
screen \
ibverbs-utils \
libibmad-dev
RUN apt-get install -y linux-tools-common linux-tools-generic ethtool iproute2
RUN apt-get install -y dkms linux-headers-generic
RUN apt-get install -y meson ninja-build uuid-dev gdb
RUN apt-get update && apt install -y wget libglib2.0-0
RUN wget ${NSYS_URL}${NSYS_PKG} && dpkg -i $NSYS_PKG && rm $NSYS_PKG
RUN cd /usr/local/src && \
curl -fSsL "https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu24.04-x86_64.tgz" -o mofed.tgz && \
tar -xf /usr/local/src/mofed.tgz && \
cd MLNX_OFED_LINUX-* && \
apt-get update && apt-get install -y --no-install-recommends \
./DEBS/libibverbs* ./DEBS/ibverbs-providers* ./DEBS/librdmacm* ./DEBS/libibumad* && \
rm -rf /var/lib/apt/lists/* /usr/local/src/*
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64 \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/lib \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
WORKDIR /workspace
RUN git clone https://github.com/NVIDIA/gdrcopy.git
RUN PREFIX=/usr/local DESTLIB=/usr/local/lib make -C /workspace/gdrcopy lib_install
RUN cp gdrcopy/src/libgdrapi.so.2.* /usr/lib/x86_64-linux-gnu/
RUN ldconfig
ARG UCX_VERSION=v1.18.0
RUN cd /usr/local/src && \
curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz && \
cd openucx-ucx* && \
./autogen.sh && ./configure \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=/usr/local/cuda \
--with-verbs \
--with-dm \
--with-gdrcopy=/usr/local \
--enable-mt \
--with-mlx5-dv && \
make -j && \
make -j install-strip && \
ldconfig
ENV LD_LIBRARY_PATH=/usr/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/include:$CPATH
ENV PATH=/usr/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig:$PKG_CONFIG_PATH
SHELL ["/bin/bash", "-c"]
WORKDIR /workspace
ENV LD_LIBRARY_PATH=/usr/local/ompi/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/local/ompi/include:$CPATH
ENV PATH=/usr/local/ompi/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/local/ompi/lib/pkgconfig:$PKG_CONFIG_PATH
COPY --from=nixl . /opt/nixl
RUN cd /opt/nixl && \
mkdir build && \
meson setup build/ --prefix=/usr/local/nixl && \
cd build/ && \
ninja && \
ninja install
ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PYTHONPATH=/usr/local/nixl/lib/python3/dist-packages/:/opt/nixl/test/python/:$PYTHONPATH
ENV UCX_TLS=^cuda_ipc
ENV NIXL_PLUGIN_DIR=/usr/local/nixl/lib/x86_64-linux-gnu/plugins
RUN ls -l /usr/local/nixl/
RUN ls -l /usr/local/nixl/include/
RUN ls -l /usr/local/nixl/include/internal/
RUN ls /opt/nixl
# Install utilities
RUN apt update -y && apt install -y git wget curl nvtop tmux vim
# nats
......@@ -31,6 +158,10 @@ RUN mkdir /opt/dynamo && \
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Common dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF="v0.7.2"
......@@ -39,7 +170,6 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
# Install genai-perf for benchmarking
# TODO: Move to tag when fix for genai-perf will be released
ARG GENAI_PERF_TAG="25d0188713adc47868d6b3f22426375237a90529"
RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"
......@@ -47,7 +177,7 @@ RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
### MISC UTILITY SETUP ###
# ### MISC UTILITY SETUP ###
# Finish pyright install
RUN pyright --help > /dev/null 2>&1
......@@ -103,11 +233,6 @@ COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c && \
cargo build --release --locked && cargo doc --no-deps
# Generate C bindings for kv cache routing in vLLM
COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c && \
cargo build --release --locked && cargo doc --no-deps
COPY deploy/dynamo/sdk /workspace/deploy/dynamo/sdk
# Build dynamo wheel
RUN source /opt/dynamo/venv/bin/activate && \
......@@ -135,50 +260,5 @@ ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
### Lean Runtime Image Stage ###
# FIXME: Separate build and runtime images
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS runtime
USER root
# Install tools for interactive convenience
RUN apt update -y && \
apt install -y curl tmux vim && \
echo "set -g mouse on" >> /root/.tmux.conf
# Set environment variables
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
ENV VLLM_KV_CAPI_PATH="/opt/dynamo/bindings/lib/libdynamo_llm_capi.so"
# Copy binaries
COPY --from=dev /usr/local/bin/http /usr/local/bin/http
COPY --from=dev /usr/local/bin/llmctl /usr/local/bin/llmctl
COPY --from=dev /usr/local/bin/etcd/etcd /usr/local/bin/etcd
COPY --from=dev /usr/bin/nats-server /usr/local/bin/nats-server
COPY --from=dev /bin/uv /usr/local/bin/uv
COPY --from=dev /bin/uvx /usr/local/bin/uvx
# Copy venv with installed packages
RUN uv python install 3.12
COPY --from=dev /opt/vllm /opt/vllm
COPY --from=dev ${VIRTUAL_ENV} ${VIRTUAL_ENV}
# Copy minimal set of files for testing. May consider separate stage for testing
# if test dependencies start to negatively impact deployment environment/size.
COPY pyproject.toml /workspace/pyproject.toml
COPY container/deps/vllm /workspace/container/deps/vllm
# Add library for KV routing
COPY --from=dev ${VLLM_KV_CAPI_PATH} ${VLLM_KV_CAPI_PATH}
# Copy minimal set of files for deployment/examples
# FIXME: Use a more consolidated path after directory restructure
COPY examples/python_rs/llm/vllm /workspace/examples/python_rs/llm/vllm
WORKDIR /workspace
### TODO Lean Runtime Image Stage ###
# FIXME: May want a modification with dynamo banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev
USER root
### NIXL SETUP ###
ARG MOFED_VERSION=24.10-1.1.4.0
ARG PYTHON_VERSION=3.12
ARG NSYS_URL=https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_1/
ARG NSYS_PKG=NsightSystems-linux-cli-public-2025.1.1.131-3554042.deb
RUN apt-get update -y && apt-get -y install curl \
git \
libnuma-dev \
numactl \
wget \
autotools-dev \
automake \
libtool \
libz-dev \
libiberty-dev \
flex \
build-essential \
cmake \
libibverbs-dev \
libgoogle-glog-dev \
libgtest-dev \
libjsoncpp-dev \
libpython3-dev \
libboost-all-dev \
libssl-dev \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
pybind11-dev \
python3-full \
python3-pip \
python3-numpy \
etcd-server \
net-tools \
pciutils \
libpci-dev \
vim \
tmux \
screen \
ibverbs-utils \
libibmad-dev
RUN apt-get install -y linux-tools-common linux-tools-generic ethtool iproute2
RUN apt-get install -y dkms linux-headers-generic
RUN apt-get install -y meson ninja-build uuid-dev gdb
RUN apt-get update && apt install -y wget libglib2.0-0
RUN wget ${NSYS_URL}${NSYS_PKG} && dpkg -i $NSYS_PKG && rm $NSYS_PKG
RUN cd /usr/local/src && \
curl -fSsL "https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu24.04-x86_64.tgz" -o mofed.tgz && \
tar -xf /usr/local/src/mofed.tgz && \
cd MLNX_OFED_LINUX-* && \
apt-get update && apt-get install -y --no-install-recommends \
./DEBS/libibverbs* ./DEBS/ibverbs-providers* ./DEBS/librdmacm* ./DEBS/libibumad* && \
rm -rf /var/lib/apt/lists/* /usr/local/src/*
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64 \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/lib \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
WORKDIR /workspace
RUN git clone https://github.com/NVIDIA/gdrcopy.git
RUN PREFIX=/usr/local DESTLIB=/usr/local/lib make -C /workspace/gdrcopy lib_install
RUN cp gdrcopy/src/libgdrapi.so.2.* /usr/lib/x86_64-linux-gnu/
RUN ldconfig
ARG UCX_VERSION=v1.18.0
RUN cd /usr/local/src && \
curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz && \
cd openucx-ucx* && \
./autogen.sh && ./configure \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=/usr/local/cuda \
--with-verbs \
--with-dm \
--with-gdrcopy=/usr/local \
--enable-mt \
--with-mlx5-dv && \
make -j && \
make -j install-strip && \
ldconfig
ENV LD_LIBRARY_PATH=/usr/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/include:$CPATH
ENV PATH=/usr/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig:$PKG_CONFIG_PATH
SHELL ["/bin/bash", "-c"]
WORKDIR /workspace
ENV LD_LIBRARY_PATH=/usr/local/ompi/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/local/ompi/include:$CPATH
ENV PATH=/usr/local/ompi/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/local/ompi/lib/pkgconfig:$PKG_CONFIG_PATH
COPY --from=nixl . /opt/nixl
RUN cd /opt/nixl && \
mkdir build && \
meson setup build/ --prefix=/usr/local/nixl && \
cd build/ && \
ninja && \
ninja install
ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PYTHONPATH=/usr/local/nixl/lib/python3/dist-packages/:/opt/nixl/test/python/:$PYTHONPATH
ENV UCX_TLS=^cuda_ipc
ENV NIXL_PLUGIN_DIR=/usr/local/nixl/lib/x86_64-linux-gnu/plugins
RUN ls -l /usr/local/nixl/
RUN ls -l /usr/local/nixl/include/
RUN ls -l /usr/local/nixl/include/internal/
RUN ls /opt/nixl
# Install utilities
RUN apt update -y && apt install -y git wget curl nvtop tmux vim
# nats
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb
# etcd
ENV ETCD_VERSION="v3.5.18"
RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \
mkdir -p /usr/local/bin/etcd && \
tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1
ENV PATH=/usr/local/bin/etcd/:$PATH
### VIRTUAL ENVIRONMENT SETUP ###
# Install uv and create virtualenv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/dynamo && \
uv venv /opt/dynamo/venv --python 3.12
# Activate virtual environment
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Common dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}-dynamo-kv-disagg-patch.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
# Install genai-perf for benchmarking
ARG GENAI_PERF_TAG="25d0188713adc47868d6b3f22426375237a90529"
RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"
# Install test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# ### MISC UTILITY SETUP ###
# Finish pyright install
RUN pyright --help > /dev/null 2>&1
# Enable Git operations in the /workspace directory
RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig
RUN ln -sf /bin/bash /bin/sh
### BUILDS ###
# Rust build/dev dependencies
RUN apt update -y && \
apt install -y \
build-essential \
protobuf-compiler \
cmake \
libssl-dev \
pkg-config && \
curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN rustup toolchain install 1.85.0-x86_64-unknown-linux-gnu
# Working directory
WORKDIR /workspace
# Copy Python wheel configuration files
COPY pyproject.toml /workspace/
COPY README.md /workspace/
COPY LICENSE /workspace/
# Build Rust runtime
COPY lib/runtime /workspace/lib/runtime
RUN cd lib/runtime && \
cargo build --release --locked && cargo doc --no-deps
# Build OpenAI HTTP Service binaries
COPY lib/llm /workspace/lib/llm
COPY components /workspace/components
RUN cd components && \
cargo build --release && \
cp target/release/http /usr/local/bin/
# Build Dynamo Run binaries
COPY launch /workspace/launch
RUN cd launch && \
cargo build --release --features mistralrs,sglang,vllm,python && \
cp target/release/dynamo-run /usr/local/bin/ && \
cp target/release/llmctl /usr/local/bin/
# Generate C bindings for kv cache routing in vLLM
COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c && \
cargo build --release --locked && cargo doc --no-deps
COPY deploy/dynamo/sdk /workspace/deploy/dynamo/sdk
# Build dynamo wheel
RUN source /opt/dynamo/venv/bin/activate && \
uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/ai_dynamo*cp312*.whl && \
cd /workspace/deploy/dynamo/sdk && \
uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/ai_dynamo_sdk*any.whl
# Package the bindings
RUN mkdir -p /opt/dynamo/bindings/wheels && \
mkdir /opt/dynamo/bindings/lib && \
cp dist/ai_dynamo*cp312*.whl /opt/dynamo/bindings/wheels/. && \
cp lib/bindings/c/target/release/libdynamo_llm_capi.so /opt/dynamo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/dynamo/bindings/.
# Tell vllm to use the Dynamo LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/dynamo/bindings/lib/libdynamo_llm_capi.so"
# FIXME: Copy more specific folders in for dev/debug after directory restructure
COPY . /workspace
# FIXME: May want a modification with dynamo banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
### TODO Lean Runtime Image Stage ###
......@@ -44,7 +44,7 @@ PYTHON_PACKAGE_VERSION=${current_tag:-$latest_tag.dev+$commit_id}
# installed within framework specific sections of the Dockerfile.
declare -A FRAMEWORKS=(["STANDARD"]=1 ["TENSORRTLLM"]=2 ["VLLM"]=3 ["VLLM_NIXL"]=4)
DEFAULT_FRAMEWORK=STANDARD
DEFAULT_FRAMEWORK=VLLM
SOURCE_DIR=$(dirname "$(readlink -f "$0")")
DOCKERFILE=${SOURCE_DIR}/Dockerfile
......@@ -64,9 +64,6 @@ TENSORRTLLM_PIP_WHEEL_PATH=""
VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
VLLM_NIXL_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
VLLM_NIXL_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
NIXL_COMMIT=3ce6a673b266b4f293909ceb17ca7975f1ba5cd7
NIXL_REPO=ai-dynamo/nixl.git
......@@ -197,6 +194,10 @@ get_options() {
FRAMEWORK=$DEFAULT_FRAMEWORK
fi
if [[ ${FRAMEWORK^^} == "VLLM_NIXL" ]]; then
FRAMEWORK="VLLM"
fi
if [ ! -z "$FRAMEWORK" ]; then
FRAMEWORK=${FRAMEWORK^^}
......@@ -283,17 +284,14 @@ error() {
get_options "$@"
# Update DOCKERFILE if framework is VLLM
if [[ $FRAMEWORK == "VLLM" ]]; then
DOCKERFILE=${SOURCE_DIR}/Dockerfile.vllm
elif [[ $FRAMEWORK == "VLLM_NIXL" ]]; then
DOCKERFILE=${SOURCE_DIR}/Dockerfile.vllm_nixl
elif [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
DOCKERFILE=${SOURCE_DIR}/Dockerfile.tensorrt_llm
fi
if [[ $FRAMEWORK == "VLLM_NIXL" ]]; then
if [[ $FRAMEWORK == "VLLM" ]]; then
TEMP_DIR=$(mktemp -d)
# Clean up temp directory on script exit
......
......@@ -23,7 +23,7 @@ RUN_PREFIX=
# installed within framework specific sections of the Dockerfile.
declare -A FRAMEWORKS=(["STANDARD"]=1 ["TENSORRTLLM"]=2 ["VLLM"]=3 ["VLLM_NIXL"]=4)
DEFAULT_FRAMEWORK=STANDARD
DEFAULT_FRAMEWORK=VLLM
SOURCE_DIR=$(dirname "$(readlink -f "$0")")
......@@ -170,6 +170,10 @@ get_options() {
FRAMEWORK=$DEFAULT_FRAMEWORK
fi
if [[ ${FRAMEWORK^^} == "VLLM_NIXL" ]]; then
FRAMEWORK="VLLM"
fi
if [ ! -z "$FRAMEWORK" ]; then
FRAMEWORK=${FRAMEWORK^^}
if [[ ! -n "${FRAMEWORKS[$FRAMEWORK]}" ]]; then
......
......@@ -15,9 +15,7 @@ See the License for the specific language governing permissions and
limitations under the License.
-->
# vLLM Integration with Dynamo
This example demonstrates how to use Dynamo to serve large language models with the vLLM engine, enabling efficient model serving with both monolithic and disaggregated deployment options.
> **NOTE**: This example is based on an internal NVIDIA library that will soon be publicly released. The example won't work until the official release.
## Prerequisites
......@@ -25,7 +23,7 @@ Start required services (etcd and NATS):
Option A: Using [Docker Compose](/deploy/docker-compose.yml) (Recommended)
```bash
docker compose -f ./deploy/docker-compose.yml up -d
docker compose -f deploy/docker-compose.yml up -d
```
Option B: Manual Setup
......@@ -35,282 +33,195 @@ Start required services (etcd and NATS):
- [etcd](https://etcd.io) server
- follow instructions in [etcd installation](https://etcd.io/docs/v3.5/install/) to start an `etcd-server` locally
## Build docker
## Building the Environment
The example is designed to run in a containerized environment using Dynamo, vLLM, and associated dependencies. To build the container:
```bash
# Build image
./container/build.sh --framework VLLM
```
## Launching the Environment
./container/build.sh
```
# Run image interactively
./container/run.sh --framework VLLM -it
```
## Deployment
### 1. HTTP Server
## Run container
Run the HTTP server (with debug-level logging):
```bash
DYN_LOG=DEBUG http
```
By default the server will run on port 8080.
Add model to the server:
```bash
llmctl http add chat deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.vllm.chat/completions
llmctl http add completions deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.vllm.completions
```
##### Example Output
```
+------------+------------------------------------------+-----------+-----------+----------+
| MODEL TYPE | MODEL NAME | NAMESPACE | COMPONENT | ENDPOINT |
+------------+------------------------------------------+-----------+-----------+----------+
| chat | deepseek-ai/DeepSeek-R1-Distill-Llama-8B | dynamo | vllm | generate |
+------------+------------------------------------------+-----------+-----------+----------+
./container/run.sh -it
```
### 2. Workers
All of the commands below are run inside the same container.
#### 2.1. Monolithic Deployment
## Run deployment
In a separate terminal run the vllm worker:
This figure shows an overview of the major components to deploy:
```bash
# Activate virtual environment
source /opt/dynamo/venv/bin/activate
# Launch worker
cd /workspace/examples/python_rs/llm/vllm
python3 -m monolith.worker \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--enforce-eager
```
##### Example Output
+----------------+
+------| prefill worker |-------+
notify | | (optional) | |
finished | +----------------+ | pull
v v
+------+ +-----------+ +------------------+ push +---------------+
| HTTP |----->| processor |----->| decode/monolith |------------>| prefill queue |
| |<-----| |<-----| worker | (if disagg) | (optional) |
+------+ +-----------+ +------------------+ +---------------+
| ^ |
query best | | return | publish kv events
worker | | worker_id v
| | +------------------+
| +---------| kv-router |
+------------->| (optional) |
+------------------+
```
INFO 03-02 05:30:36 __init__.py:190] Automatically detected platform cuda.
WARNING 03-02 05:30:36 nixl.py:43] NIXL is not available
INFO 03-02 05:30:43 config.py:542] This model supports multiple tasks: {'embed', 'score', 'generate', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 03-02 05:30:43 base_engine.py:43] Initializing engine client
INFO 03-02 05:30:43 api_server.py:206] Started engine process with PID 1151
INFO 03-02 05:30:44 config.py:542] This model supports multiple tasks: {'embed', 'score', 'generate', 'classify', 'reward'}. Defaulting to 'generate'.
<SNIP>
INFO 03-02 05:32:20 llm_engine.py:476] init engine (profile, create kv cache, warmup model) took 4.22 seconds
Add model to dynamo and start http server.
```
llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo-init.process.chat/completions
TRT_LOG=DEBUG http --port 8181
```
#### 2.2. Disaggregated Deployment
This deployment option splits the model serving across prefill and decode workers, enabling more efficient resource utilization.
### Processor
**Terminal 1 - Prefill Worker:**
```bash
# Activate virtual environment
source /opt/dynamo/venv/bin/activate
The processor routes requests to the (decode) workers. Three scheduling strategies are supported: random, round-robin, and kv (see [Kv Router](#kv-router)).
# Launch prefill worker
cd /workspace/examples/python_rs/llm/vllm
VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregated.prefill_worker \
```
# Processor must take the same args as the (decode) worker
# This is temporary until we communicate the ModelDeploymentCard over etcd
RUST_LOG=info python3 processor.py \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--gpu-memory-utilization 0.8 \
--enforce-eager \
--tensor-parallel-size 1 \
--kv-transfer-config \
'{"kv_connector":"DynamoNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
--tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--block-size 64 \
--max-model-len 16384 \
--router <random/round-robin/kv>
```
##### Example Output
```
INFO 03-02 05:59:44 worker.py:269] the current vLLM instance can use total_gpu_memory (47.54GiB) x gpu_memory_utilization (0.40) = 19.01GiB
INFO 03-02 05:59:44 worker.py:269] model weights take 14.99GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 2.78GiB.
INFO 03-02 05:59:44 executor_base.py:110] # CUDA blocks: 1423, # CPU blocks: 2048
INFO 03-02 05:59:44 executor_base.py:115] Maximum concurrency for 10 tokens per request: 2276.80x
INFO 03-02 05:59:47 llm_engine.py:476] init engine (profile, create kv cache, warmup model) took 3.41 seconds
Alternatively, the processor can be bypassed by directly hitting the worker endpoints:
```
llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo-init.vllm.generate
**Terminal 2 - Decode Worker:**
```bash
# Activate virtual environment
source /opt/dynamo/venv/bin/activate
# monolithic
CUDA_VISIBLE_DEVICES=0 python3 routerless/worker.py \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--enforce-eager
# Launch decode worker
cd /workspace/examples/python_rs/llm/vllm
VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggregated.decode_worker \
# disaggregated
CUDA_VISIBLE_DEVICES=0 python routerless/prefill_worker.py \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--gpu-memory-utilization 0.8 \
--enforce-eager \
--tensor-parallel-size 2 \
--kv-transfer-config \
'{"kv_connector":"DynamoNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
```
The disaggregated deployment utilizes separate GPUs for prefill and decode operations, allowing for optimized resource allocation and improved performance. For more details on the disaggregated deployment, please refer to the [vLLM documentation](https://docs.vllm.ai/en/latest/features/disagg_prefill.html).
##### Example Output
```
INFO 03-02 05:59:44 worker.py:269] the current vLLM instance can use total_gpu_memory (47.54GiB) x gpu_memory_utilization (0.40) = 19.01GiB
INFO 03-02 05:59:44 worker.py:269] model weights take 14.99GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 1.19GiB; the rest of the memory reserved for KV Cache is 2.78GiB.
INFO 03-02 05:59:44 executor_base.py:110] # CUDA blocks: 1423, # CPU blocks: 2048
INFO 03-02 05:59:44 executor_base.py:115] Maximum concurrency for 10 tokens per request: 2276.80x
INFO 03-02 05:59:47 llm_engine.py:476] init engine (profile, create kv cache, warmup model) took 3.41 seconds
--kv-transfer-config '{"kv_connector":"DynamoNixlConnector"}'
CUDA_VISIBLE_DEVICES=1 python3 routerless/worker.py \
--remote-prefill \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--enforce-eager \
--kv-transfer-config '{"kv_connector":"DynamoNixlConnector"}'
```
### Kv Router
### 3. Client
```bash
curl localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"messages": [
{"role": "user", "content": "What is the capital of France?"}
]
}'
```
The KV Router is a component that aggregates KV Events from all the workers and maintains
a prefix tree of the cached tokens. It makes decisions on which worker to route requests
to based on the length of the prefix match and the load on the workers.
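To make the routing decision concrete before the setup steps, here is a minimal illustrative sketch (hypothetical names, not the actual Dynamo implementation): the worker with the longest cached-prefix match wins, with lower load breaking ties.

```python
# Illustrative routing sketch (hypothetical helper, not the real KV Router):
# prefer the worker with the longest cached-prefix match, then the lower load.
def route(prefix_match_len: dict[str, int], active_requests: dict[str, int]) -> str:
    return max(
        prefix_match_len,
        key=lambda worker: (prefix_match_len[worker], -active_requests[worker]),
    )

# Worker "a" has 128 prompt tokens already cached, "b" has 64 -> route to "a".
print(route({"a": 128, "b": 64}, {"a": 3, "b": 1}))
```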
There are three steps needed to enable the kv router:
1. Use `--router kv` in the processor.
2. Use `--router kv` and `--enable-prefix-caching` in all the (decode) workers.
3. Launch the kv router in a separate terminal.
```
RUST_LOG=info python3 kv_router.py \
--model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--block-size 64 \
--min-workers 1
```
where `--min-workers` is the number of (decode) workers.
##### Example Output
```json
{
"id": "5b04e7b0-0dcd-4c45-baa0-1d03d924010c",
"choices": [{
"message": {
"role": "assistant",
"content": "The capital of France is Paris. Paris is a major city known for iconic landmarks like the Eiffel Tower and the Louvre Museum."
},
"index": 0,
"finish_reason": "stop"
}],
"created": 1739548787,
"model": "vllm",
"object": "chat.completion",
"usage": null,
"system_fingerprint": null
}
```
You can choose only the prefix strategy for now:
- `prefix`: Route requests to the worker that has the longest prefix match.
### 4. Multi-Node Deployment
### Disaggregated Router
The vLLM workers can be deployed across multiple nodes by configuring the NATS and etcd connection endpoints through environment variables. This enables distributed inference across a cluster.
The disaggregated router determines whether a request should be sent to a
remote prefill engine or prefilled locally, based on the prefill length. If
the kv router is enabled, the disaggregated router uses the remaining
prefill length (actual prefill length minus the prefix hit length) to make
the decision.
Set the following environment variables on each node before running the workers:
When prefilling locally, the vLLM scheduler will prioritize the prefill
request and pause any ongoing decode requests.
```bash
export NATS_SERVER="nats://<nats-server-host>:<nats-server-port>"
export ETCD_ENDPOINTS="http://<etcd-server-host1>:<etcd-server-port>,http://<etcd-server-host2>:<etcd-server-port>",...
To enable the disaggregated router, add the following flags to the decode workers:
```
For disaggregated deployment, you will also need to pass the `kv_ip` and `kv_port` to the workers in the `kv_transfer_config` argument:
```bash
python worker.py \
...
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":<rank>,"kv_parallel_size":2,"kv_ip":<master_node_ip>,"kv_port":<kv_port>}'
--conditional-disagg \
--max-local-prefill-length <length>
```
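For intuition, the decision described above can be sketched as follows (illustrative only; the helper name is made up):

```python
# Sketch of the disaggregated-router rule (hypothetical helper): prefill
# remotely only when the tokens that still need prefilling exceed
# --max-local-prefill-length.
def should_prefill_remotely(prompt_len: int, prefix_hit_len: int,
                            max_local_prefill_length: int) -> bool:
    remaining_prefill = prompt_len - prefix_hit_len  # prefix hit applies with kv router
    return remaining_prefill > max_local_prefill_length

# A 3000-token prompt with 2800 tokens already cached is prefilled locally
# when --max-local-prefill-length is 512.
assert should_prefill_remotely(3000, 2800, 512) is False
```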
### Worker
### 5. KV Router Deployment
The KV Router is a component that aggregates KV Events from all the workers and maintains a prefix tree of the cached tokens. It makes decisions on which worker to route requests to based on the length of the prefix match and the load on the workers.
#### Monolithic
You can run the router and workers in separate terminal sessions or use the `kv-router-run.sh` script to launch them all at once in their own tmux sessions.
#### Deploying using tmux
The helper script `kv-router-run.sh` will launch the router and workers in their own tmux sessions.
`kv-router-run.sh <number_of_workers> <routing_strategy> [<model_name>]`
Example:
```bash
# Launch 8 workers with prefix routing strategy and use deepseek-ai/DeepSeek-R1-Distill-Llama-8B as the model
bash /workspace/examples/python_rs/llm/vllm/scripts/kv-router-run.sh 8 test deepseek-ai/DeepSeek-R1-Distill-Llama-8B
Only kv router is supported for monolithic deployment.
# List tmux sessions
tmux ls
# Attach to the tmux sessions
tmux a -t v-1 # worker 1 - use ctrl + b, d to detach
tmux a -t v-router # kv router - use ctrl + b, d to detach
# Close the tmux sessions
tmux ls | grep 'v-' | cut -d: -f1 | xargs -I{} tmux kill-session -t {}
```
#### Deploying using separate terminals
**Terminal 1 - Router:**
```bash
# Activate virtual environment
source /opt/dynamo/venv/bin/activate
# Launch prefill worker
cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m kv_router.router \
--routing-strategy prefix \
--model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--min-workers 1 \
--block-size 64
CUDA_VISIBLE_DEVICES=0 python3 worker.py \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--enforce-eager \
--block-size 64 \
--max-model-len 16384 \
<optional kv router args: --router kv --enable-prefix-caching>
```
You can choose only the prefix strategy for now:
- `prefix`: Route requests to the worker that has the longest prefix match.
#### Disaggregated
**Terminal 2 - Processor:**
```bash
# Activate virtual environment
source /opt/dynamo/venv/bin/activate
Kv router and disaggregated router are supported and can be turned on/off individually.
# Processor must take the same args as the worker
# This is temporary until we communicate the ModelDeploymentCard over etcd
cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m kv_router.processor \
```
# start prefill worker in one terminal
# Note: prefix caching is not supported in the prefill for now
CUDA_VISIBLE_DEVICES=0 python3 prefill_worker.py \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--enable-prefix-caching \
--enforce-eager \
--kv-transfer-config '{"kv_connector":"DynamoNixlConnector"}' \
--block-size 64 \
--max-num-batched-tokens 16384 \
--max-model-len 16384
```
**Terminal 3 and 4 - Workers:**
```bash
# Activate virtual environment
source /opt/dynamo/venv/bin/activate
# Launch Worker 1 and Worker 2 with same command
cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m kv_router.worker \
# start decode worker in another terminal
CUDA_VISIBLE_DEVICES=1 python3 worker.py \
--remote-prefill \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--enable-prefix-caching \
--enforce-eager \
--tensor-parallel-size 1 \
--kv-transfer-config '{"kv_connector":"DynamoNixlConnector"}' \
--block-size 64 \
--max-model-len 16384
--max-num-batched-tokens 16384 \
--max-model-len 16384 \
<optional kv router args: --router kv --enable-prefix-caching>
<optional disaggregated router args: --conditional-disagg --max-local-prefill-length <length>>
```
Note: Prefix caching must be enabled for the KV Router to work.
Note: `--block-size` must be 64; the router currently only works with a block size of 64 tokens.
### Multi-Node Deployment
For multi-node deployment, etcd, nats, processor, and kv router
are only required on the head node. The only components that need
to be deployed on all nodes are the workers.
**Terminal 5 - Client:**
Don't forget to add the model to the server:
Set the following environment variables on each node before running the workers:
```bash
llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Llama-8B dynamo.process.chat/completions
export NATS_SERVER="nats://<nats-server-host>:<nats-server-port>"
export ETCD_ENDPOINTS="http://<etcd-server-host>:<etcd-server-port>"
```
```bash
curl localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
### Common Issues
If the torch GLOO backend complains that the file name is too long, set
```
export GLOO_SOCKET_IFNAME=lo
```
## Client
In another terminal:
```
# this test request has an input sequence length (ISL) of around 200 tokens
curl localhost:8181/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"messages": [
{
......@@ -322,83 +233,68 @@ curl localhost:8080/v1/chat/completions -H "Content-Type: application/json"
"max_tokens": 30
}'
```
##### Example Output
```json
{
"id": "f435d1aa-d423-40a0-a616-00bc428a3e32",
"choices": [
{
"message": {
"role": "assistant",
"content": "Alright, the user is playing a character in a D&D setting. They want a detailed background for their character, set in the world of Eldoria, particularly in the city of Aeloria. The user mentioned it's about an intrepid explorer"
},
"index": 0,
"finish_reason": "length"
}
],
"created": 1740020570,
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"object": "chat.completion",
"usage": null,
"system_fingerprint": null
}
```
### 6. Preprocessor and backend
This deployment splits the pre-processing and backend for model serving.
Run the following commands in 4 terminals:
**Terminal 1 - vLLM Worker:**
```bash
# Activate virtual environment
source /opt/dynamo/venv/bin/activate
cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m preprocessor.worker --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
```
**Terminal 2 - preprocessor:**
```bash
# Activate virtual environment
source /opt/dynamo/venv/bin/activate
cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m preprocessor.processor --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
```
**Terminal 3 - HTTP Server**
Run the HTTP server (with debug-level logging):
```bash
DYN_LOG=DEBUG http
```
By default the server will run on port 8080.
Add model to the server:
```bash
llmctl http add chat-models deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B dynamo.preprocessor.generate
```
**Terminal 4 - client**
## Run genai-perf
`genai-perf` is a tool for profiling and benchmarking LLM servers. It is already installed in the container. For more details, please refer to the [genai-perf README](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html).
```bash
curl localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"messages": [
{"role": "user", "content": "What is the capital of France?"}
]
}'
```
### 7. Known Issues and Limitations
- vLLM is not working well with the `fork` method for multiprocessing and TP > 1. This is a known issue and a workaround is to use the `spawn` method instead. See [vLLM issue](https://github.com/vllm-project/vllm/issues/6152).
- `kv_rank` of `kv_producer` must be smaller than that of `kv_consumer`.
- Instances with the same `kv_role` must have the same `--tensor-parallel-size`.
- Currently only `--pipeline-parallel-size 1` is supported for XpYd disaggregated deployment.
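The first two constraints can be expressed as a small sanity check (illustrative sketch with a hypothetical helper; the config values mirror the examples above):

```python
# Hypothetical sanity check for a disaggregated pair of kv-transfer configs.
def check_disagg_configs(producer: dict, consumer: dict) -> None:
    assert producer["kv_role"] == "kv_producer" and consumer["kv_role"] == "kv_consumer"
    # kv_rank of the producer must be smaller than that of the consumer.
    assert producer["kv_rank"] < consumer["kv_rank"]

check_disagg_configs(
    {"kv_connector": "DynamoNcclConnector", "kv_role": "kv_producer", "kv_rank": 0, "kv_parallel_size": 2},
    {"kv_connector": "DynamoNcclConnector", "kv_role": "kv_consumer", "kv_rank": 1, "kv_parallel_size": 2},
)
```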
```bash
genai-perf profile \
-m deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--url localhost:8181 \
--endpoint-type chat \
--streaming \
--service-kind openai \
--endpoint v1/chat/completions \
--warmup-request-count 10 \
--random-seed 123 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-stddev 0 \
--tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--synthetic-input-tokens-mean 3000 \
--output-tokens-mean 150 \
--extra-inputs min_tokens:150 \
--extra-inputs max_tokens:150 \
--profile-export-file my_profile_export.json \
--artifact-dir artifacts/ \
--concurrency 10 \
--request-count 40 \
-- -v \
--async
```
## Close deployment
Kill all python processes and clean up metadata files:
```
pkill -9 -f python
```
## TODOs, limitations, known issues
- [ ] Add etcd for discovery
- [ ] Multi-node deployment support
- [ ] Enable chunked prefill
- [ ] Process many remote prefill in one iteration
- [ ] Support recompute preemption
- [ ] Make sure decode does not preempt blocks before xfer finishes
- [ ] Layer wise transfer
- [ ] Non blocking send in prefill (cache manager should check xfer status)
- [ ] Test under load
- [ ] Support pp > 1
- [ ] Check why adding extra seed input is crashing vllm with remote prefill
- [ ] Unified worker for both prefill and decode
- [x] Support mixed tp
- [x] Require sending two parallel requests to start decode for the first time
- [x] Concurrency > 2 is not working
- [x] Parse cmdline args
- [x] Manual nixl example with tp1
- [x] Zero copy
- [x] Conditional remote prefill
- [x] Manual example with tp > 1
- [x] Run on dynamo distributed runtime
- [x] add oai http endpoint
- [x] Sample only on decode, do not return remote prefill response
- [x] Check if all transfers finished before moving to decode
- [x] Enable async output processing - could be working
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import logging
from common.chat_processor import ChatProcessor
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
logger = logging.getLogger("vllm")
class BaseVllmEngine:
"""
Request handler for the generate endpoint
"""
def __init__(self, engine_args: AsyncEngineArgs):
self.engine_args = engine_args
self.model_config = self.engine_args.create_model_config()
self.engine_client = None
self.chat_processor: ChatProcessor | None = None
self._engine_context = None
async def initialize(self):
"""Initialize the engine client and related components."""
logger.info("Initializing engine client")
self._engine_context = build_async_engine_client_from_engine_args(
self.engine_args
)
if self._engine_context is not None:
self.engine_client = await self._engine_context.__aenter__()
self.tokenizer = await self.engine_client.get_tokenizer()
self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
else:
raise RuntimeError("Failed to initialize engine client")
async def cleanup(self):
"""Cleanup resources."""
print("Cleaning up engine client")
if self._engine_context is not None:
await self._engine_context.__aexit__(None, None, None)
self._engine_context = None
self.engine_client = None
self.chat_processor = None
async def __aenter__(self):
"""Initialize with context manager syntax."""
await self.initialize()
return self
async def __aexit__(self, exc_type, exc_value, traceback):
await self.cleanup()
@abc.abstractmethod
async def generate(self, raw_request):
pass
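A minimal usage sketch for the wrapper above (illustrative only; EchoEngine and the model name are assumptions, and it must run inside the example container with a GPU since initialize() starts the real vLLM engine): subclass BaseVllmEngine, implement generate, and drive it with async with so initialize() and cleanup() are called automatically.

import asyncio
from vllm.engine.arg_utils import AsyncEngineArgs

class EchoEngine(BaseVllmEngine):
    async def generate(self, raw_request):
        # A real subclass would call self.engine_client.generate(...) here.
        yield {"echo": raw_request}

async def main():
    engine_args = AsyncEngineArgs(model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
    async with EchoEngine(engine_args) as engine:  # runs initialize()/cleanup()
        async for out in engine.generate({"prompt": "hello"}):
            print(out)

asyncio.run(main())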
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import time
from typing import AsyncIterator, List, Optional, Protocol, Union, runtime_checkable
from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.chat_utils import ConversationMessage
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
CompletionRequest,
RequestResponseMetadata,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_engine import RequestPrompt
from vllm.inputs.data import TokensPrompt
from vllm.transformers_utils.tokenizer import AnyTokenizer
@runtime_checkable
class ProcessMixInRequired(Protocol):
engine_args: AsyncEngineArgs
chat_processor: "ChatProcessor | None"
completions_processor: "CompletionsProcessor | None"
model_config: ModelConfig
class ProcessMixIn(ProcessMixInRequired):
"""
Mixin for pre and post processing for vLLM
Requires engine_args, chat_processor, completions_processor, and model_config to be initialized
"""
engine_args: AsyncEngineArgs
chat_processor: "ChatProcessor | None"
completions_processor: "CompletionsProcessor | None"
model_config: ModelConfig
def __init__(self):
pass
def _get_processor(
self, raw_request: Union[CompletionRequest, ChatCompletionRequest]
):
# Determine the processor type based on the request structure
return (
self.chat_processor
if isinstance(raw_request, ChatCompletionRequest)
else self.completions_processor
)
async def _parse_raw_request(
self, raw_request: Union[CompletionRequest, ChatCompletionRequest]
):
processor = self._get_processor(raw_request)
if processor is None:
raise RuntimeError("Processor has not been initialized")
request = processor.parse_raw_request(raw_request)
preprocess_result = await processor.preprocess(raw_request)
default_max_tokens = self.model_config.max_model_len - len(
preprocess_result.engine_prompt["prompt_token_ids"]
)
default_sampling_params = self.model_config.get_diff_sampling_param()
sampling_params = request.to_sampling_params(
default_max_tokens,
self.model_config.logits_processor_pattern,
default_sampling_params,
)
return (
request,
preprocess_result.conversation,
preprocess_result.request_prompt,
preprocess_result.engine_prompt,
sampling_params,
)
async def _stream_response(self, request, generator, request_id, conversation):
processor = self._get_processor(request)
if processor is None:
raise RuntimeError("processor has not been initialized")
return processor.stream_response(
request,
generator,
request_id,
conversation,
)
class PreprocessResult:
def __init__(
self,
conversation: Optional[ConversationMessage],
request_prompt: RequestPrompt,
engine_prompt: TokensPrompt,
):
self.conversation = conversation
self.request_prompt = request_prompt
self.engine_prompt = engine_prompt
class ChatProcessor:
def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
self.tokenizer = tokenizer
self.model_config = model_config
self.openai_serving = OpenAIServingChat(
engine_client=None,
model_config=model_config,
models=None,
request_logger=None,
response_role="assistant",
chat_template=None,
chat_template_content_format="auto",
)
def parse_raw_request(
self, raw_request: ChatCompletionRequest
) -> ChatCompletionRequest:
return ChatCompletionRequest.parse_obj(raw_request)
async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request)
(
conversation,
request_prompts,
engine_prompts,
) = await self.openai_serving._preprocess_chat(
request,
self.tokenizer,
request.messages,
chat_template=request.chat_template or self.tokenizer.chat_template,
chat_template_content_format=self.openai_serving.chat_template_content_format,
add_generation_prompt=request.add_generation_prompt,
continue_final_message=request.continue_final_message,
tool_dicts=None,
documents=request.documents,
chat_template_kwargs=request.chat_template_kwargs,
tool_parser=self.openai_serving.tool_parser,
truncate_prompt_tokens=request.truncate_prompt_tokens,
add_special_tokens=request.add_special_tokens,
)
return PreprocessResult(conversation[0], request_prompts[0], engine_prompts[0])
async def stream_response(
self,
request: ChatCompletionRequest,
result_generator: AsyncIterator,
request_id: str,
conversation: List,
):
request_metadata = RequestResponseMetadata(request_id=request_id)
if not request.stream:
raise ValueError("Only streaming responses are supported")
async for raw_response in self.openai_serving.chat_completion_stream_generator(
request,
result_generator,
request_id,
request.model,
conversation,
self.tokenizer,
request_metadata,
):
if raw_response.startswith("data: [DONE]"):
break
response = json.loads(raw_response.lstrip("data: "))
yield response
class CompletionsProcessor:
def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
self.tokenizer = tokenizer
self.model_config = model_config
self.openai_serving = OpenAIServingCompletion(
engine_client=None,
model_config=model_config,
models=None,
request_logger=None,
)
def parse_raw_request(self, raw_request: CompletionRequest) -> CompletionRequest:
return CompletionRequest.parse_obj(raw_request)
async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request)
(
request_prompts,
engine_prompts,
) = await self.openai_serving._preprocess_completion(
request,
self.tokenizer,
input_or_inputs=request.prompt,
truncate_prompt_tokens=request.truncate_prompt_tokens,
add_special_tokens=request.add_special_tokens,
)
return PreprocessResult(None, request_prompts[0], engine_prompts[0])
async def stream_response(
self,
request: CompletionRequest,
result_generator: AsyncIterator,
request_id: str,
conversation: Optional[List[ConversationMessage]] = None,
):
request_metadata = RequestResponseMetadata(request_id=request_id)
if not request.stream:
raise ValueError("Only streaming responses are supported")
async for raw_response in self.openai_serving.completion_stream_generator(
request,
result_generator,
request_id,
int(time.time()), # created_time
request.model,
1, # num_prompts
self.tokenizer,
request_metadata,
):
if raw_response.startswith("data: [DONE]"):
break
response = json.loads(raw_response.lstrip("data: "))
yield response
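A minimal wiring sketch (illustrative only; _SketchEngine is not part of the example): a class that uses ProcessMixIn only needs the attributes declared in ProcessMixInRequired before calling _parse_raw_request() or _stream_response().

class _SketchEngine(ProcessMixIn):
    def __init__(self, engine_args: AsyncEngineArgs, tokenizer: AnyTokenizer):
        self.engine_args = engine_args
        self.model_config = engine_args.create_model_config()
        self.chat_processor = ChatProcessor(tokenizer, self.model_config)
        self.completions_processor = CompletionsProcessor(tokenizer, self.model_config)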
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
import uvloop
from dynamo.runtime import DistributedRuntime, dynamo_worker
from .protocol import Request
@dynamo_worker()
async def worker(
runtime: DistributedRuntime,
component: str,
prompt: str,
max_tokens: int,
temperature: float,
):
"""
Instantiate a `backend` client and call the `generate` endpoint
"""
# get endpoint
endpoint = runtime.namespace("dynamo").component(component).endpoint("generate")
# create client
client = await endpoint.client()
# issue request
tasks = []
for _ in range(1):
tasks.append(
client.generate(
Request(
prompt=prompt,
sampling_params={
"temperature": temperature,
"max_tokens": max_tokens,
},
).model_dump_json()
)
)
streams = await asyncio.gather(*tasks)
# process response
for stream in streams:
async for resp in stream:
print(resp)
if __name__ == "__main__":
uvloop.install()
parser = argparse.ArgumentParser()
parser.add_argument("--prompt", type=str, default="what is the capital of france?")
parser.add_argument("--component", type=str, default="vllm")
parser.add_argument("--max-tokens", type=int, default=10)
parser.add_argument("--temperature", type=float, default=0.5)
args = parser.parse_args()
asyncio.run(worker(args.component, args.prompt, args.max_tokens, args.temperature))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_vllm_args() -> AsyncEngineArgs:
parser = FlexibleArgumentParser()
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
return AsyncEngineArgs.from_cli_args(args)
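A usage sketch (illustrative; the simulated command line is an assumption): any standard vLLM engine flag passed on argv is converted into a ready AsyncEngineArgs.

import sys

# Simulate a worker command line (flags are standard vLLM engine args).
sys.argv = [
    "worker.py",
    "--model", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "--enforce-eager",
    "--block-size", "64",
]
engine_args = parse_vllm_args()
print(engine_args.model, engine_args.block_size)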
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from typing import Any, List, Optional
import msgspec
from pydantic import BaseModel, ConfigDict, field_validator
from pydantic_core import core_schema
from typing_extensions import NotRequired
from vllm.inputs.data import TokensPrompt
from vllm.outputs import CompletionOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import PromptLogprobs, RequestMetrics
class Request(BaseModel):
prompt: str
sampling_params: dict
class Tokens(BaseModel):
tokens: list[int]
class PrefillRequest(Request):
request_id: str
class Response(BaseModel):
text: str
class PrefillResponse(BaseModel):
prefilled: bool
# Hack to override the type of multi_modal_data in TokensPrompt
# as pydantic doesn't understand generic types
# TokensPrompt is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/inputs/data.py#L38
# multi_modal_data is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L103
# ModalityData is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L80
class PatchedTokensPrompt(TokensPrompt):
multi_modal_data: NotRequired[Optional[Any]] # type: ignore
# Monkey-patch the SamplingParams type to add a dummy core schema so pydantic can validate it
# SamplingParams is a msgspec struct
# SamplingParams is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/sampling_params.py#L88
SamplingParams.__get_pydantic_core_schema__ = classmethod(
lambda cls, source, handler: core_schema.any_schema()
)
class vLLMGenerateRequest(BaseModel):
"""
Serializable class of all the fields vLLM engine requires for inference
"""
model_config = ConfigDict(arbitrary_types_allowed=True, json_encoders={SamplingParams: lambda v: msgspec.json.encode(v)})
engine_prompt: PatchedTokensPrompt
sampling_params: SamplingParams
request_id: str
@field_validator("sampling_params", mode="before")
@classmethod
def parse_sampling_params(cls, v: Any) -> SamplingParams:
if isinstance(v, str):
v = json.loads(v)
if isinstance(v, dict):
return SamplingParams(**v)
return v
class MyRequestOutput(BaseModel):
"""
RequestOutput from vLLM is not serializable by default
https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/outputs.py#L85
This class is used to serialize the RequestOutput and any recursively defined types
We can do this because PromptLogprobs, RequestMetrics, and CompletionOutput are all serializable dataclasses
"""
model_config = ConfigDict(arbitrary_types_allowed=True)
request_id: str
prompt: Optional[str] = None
prompt_token_ids: Optional[List[int]] = None
prompt_logprobs: Optional[PromptLogprobs] = None
outputs: List[CompletionOutput]
finished: bool
metrics: Optional[RequestMetrics] = None
# lora_request: Optional[LoRARequest] = None
# encoder_prompt: Optional[str] = None
# encoder_prompt_token_ids: Optional[List[int]] = None
# num_cached_tokens: Optional[int] = None
# multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
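A construction sketch (illustrative values): thanks to the monkey-patched schema and the parse_sampling_params validator above, callers can pass sampling_params as a plain dict (or JSON string) and get a real SamplingParams back.

req = vLLMGenerateRequest(
    engine_prompt=PatchedTokensPrompt(prompt_token_ids=[1, 2, 3]),
    sampling_params={"temperature": 0.7, "max_tokens": 16},  # dict -> SamplingParams
    request_id="example-request-0",
)
print(type(req.sampling_params).__name__, req.sampling_params.max_tokens)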
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import socket
import uuid
import msgspec
import uvloop
from common.base_engine import BaseVllmEngine
from common.chat_processor import ProcessMixIn
from common.parser import parse_vllm_args
from common.protocol import PrefillRequest
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionStreamResponse,
)
from vllm.logger import logger as vllm_logger
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
"""
Request handler for the generate endpoint
"""
def __init__(self, engine_args: AsyncEngineArgs, prefill):
assert (
engine_args.kv_transfer_config.is_kv_consumer
), "Decode worker must be a KV consumer"
if engine_args.enable_chunked_prefill is not False:
vllm_logger.info(
"Chunked prefill is not supported in disaggregated mode, disabling it"
)
engine_args.enable_chunked_prefill = False
super().__init__(engine_args)
self.prefill = prefill
self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
self.kv_rank = self.kv_transfer_config.kv_rank
@dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate(self, raw_request):
if self.engine_client is None:
await self.initialize()
vllm_logger.debug(f"Got raw request: {raw_request}")
(
request,
conversation,
request_prompt,
engine_prompt,
sampling_params,
) = await self._parse_raw_request(raw_request)
# TODO: pass decode info through a separate request param
request_id = f"{uuid.uuid4()}___decode_hostname_{socket.gethostname()}___decode_kv_rank_{self.kv_rank}"
prefill_sampling_params = {**msgspec.to_builtins(sampling_params)}
prefill_sampling_params["max_tokens"] = 1
prefill_sampling_params["min_tokens"] = 1
prefill_request = PrefillRequest(
prompt=request_prompt, # TODO: we should use engine prompt to avoid extra tokenization
sampling_params=prefill_sampling_params,
request_id=request_id,
)
vllm_logger.debug(f"Prefill request: {prefill_request}")
prefill_output = self.prefill.generate(
prefill_request.model_dump_json(),
)
vllm_logger.debug(
f"Running generate with engine_prompt: {engine_prompt}, sampling_params: {sampling_params}, request_id: {request_id}"
)
if self.engine_client is None:
raise RuntimeError("Engine client not initialized")
else:
generator = self.engine_client.generate(
engine_prompt, sampling_params, request_id
)
async for response in await self._stream_response(
request, generator, request_id, conversation
):
vllm_logger.debug(f"Generated response: {response}")
yield response
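        # Await the remote prefill call so it has fully completed before the handler returns.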
await prefill_output
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
    Instantiate a `vllm` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component = runtime.namespace("dynamo").component("vllm")
await component.create_service()
prefill = (
await runtime.namespace("dynamo")
.component("prefill")
.endpoint("generate")
.client()
)
async with VllmDecodeEngine(engine_args, prefill) as decode_engine:
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(decode_engine.generate)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import uvloop
import vllm
from common.base_engine import BaseVllmEngine
from common.parser import parse_vllm_args
from common.protocol import PrefillRequest, PrefillResponse
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.logger import logger as vllm_logger
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmPrefillEngine(BaseVllmEngine):
"""
    Request handler for the generate endpoint (prefill worker)
"""
def __init__(self, engine_args: AsyncEngineArgs):
assert (
engine_args.kv_transfer_config.is_kv_producer
), "Prefill worker must be a KV producer"
if engine_args.enable_chunked_prefill is not False:
vllm_logger.info(
"Chunked prefill is not supported in disaggregated mode, disabling it"
)
engine_args.enable_chunked_prefill = False
super().__init__(engine_args)
self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
self.kv_rank = self.kv_transfer_config.kv_rank
@dynamo_endpoint(PrefillRequest, PrefillResponse)
async def generate(self, request):
if self.engine_client is None:
await self.initialize()
vllm_logger.debug(f"Received prefill request: {request}")
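        # sampling_params arrive as plain builtins (serialized with msgspec on the decode
        # side), so re-hydrate them into a vLLM SamplingParams object before generating.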
sampling_params = vllm.sampling_params.SamplingParams(**request.sampling_params)
if self.engine_client is None:
raise RuntimeError("Engine client not initialized")
else:
async for response in self.engine_client.generate(
request.prompt, sampling_params, request.request_id
):
vllm_logger.debug(f"Generated response: {response}")
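                # The prefill side discards the generated text; yielding True only signals
                # that prefill finished, while the decode worker streams the actual tokens.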
yield True
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
    Instantiate a `prefill` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component = runtime.namespace("dynamo").component("prefill")
await component.create_service()
async with VllmPrefillEngine(engine_args) as prefill_engine:
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(prefill_engine.generate)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import uuid
from enum import Enum
from typing import AsyncIterator, Tuple, Union
import uvloop
from common.chat_processor import ChatProcessor, CompletionsProcessor, ProcessMixIn
from common.parser import parse_vllm_args
from common.protocol import MyRequestOutput, Tokens, vLLMGenerateRequest
from transformers import AutoTokenizer
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionStreamResponse,
CompletionRequest,
CompletionStreamResponse,
)
from vllm.logger import logger as vllm_logger
from vllm.outputs import RequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer
from dynamo.runtime import Client, DistributedRuntime, dynamo_endpoint, dynamo_worker
class RequestType(Enum):
CHAT = "chat"
COMPLETION = "completion"
class Processor(ProcessMixIn):
"""
vLLM pre and post processing
"""
def __init__(
self,
engine_args: AsyncEngineArgs,
router_client: Client,
workers_client: Client,
):
self.engine_args = engine_args
self.model_config = self.engine_args.create_model_config()
self.tokenizer = self._create_tokenizer(engine_args)
self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
self.completions_processor = CompletionsProcessor(
self.tokenizer, self.model_config
)
self.router_client = router_client
self.workers_client = workers_client
def _create_tokenizer(self, engine_args: AsyncEngineArgs) -> AnyTokenizer:
"""Create a TokenizerGroup using engine arguments similar to VLLM's approach"""
model_path = engine_args.model
# Create the base tokenizer with VLLM's typical settings
base_tokenizer = AutoTokenizer.from_pretrained(
model_path,
trust_remote_code=True,
padding_side="left",
truncation_side="left",
use_fast=True, # VLLM might use the fast tokenizer for efficiency
)
return base_tokenizer
async def _generate(
self,
raw_request: Union[CompletionRequest, ChatCompletionRequest],
request_type: RequestType,
):
request_id = str(uuid.uuid4())
vllm_logger.debug(f"Got raw request: {raw_request}")
(
request,
conversation,
prompt,
engine_prompt,
sampling_params,
) = await self._parse_raw_request(raw_request)
worker_id_generator: AsyncIterator = await self.router_client.generate(
Tokens(tokens=engine_prompt["prompt_token_ids"]).model_dump_json()
)
worker_id = (
await worker_id_generator.__anext__()
) # only one worker id is returned
worker_id = worker_id.data()
vllm_logger.info(f"Worker ID: {worker_id}")
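        # An empty worker id means the router had no basis to prefer any worker (no KV
        # overlap and no useful metrics), so fall back to random load balancing.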
if worker_id == "":
engine_generator = await self.workers_client.random(
vLLMGenerateRequest(
engine_prompt=engine_prompt,
sampling_params=sampling_params,
request_id=request_id,
).model_dump_json()
)
else:
engine_generator = await self.workers_client.direct(
vLLMGenerateRequest(
engine_prompt=engine_prompt,
sampling_params=sampling_params,
request_id=request_id,
).model_dump_json(),
int(worker_id),
)
output = self._generate_responses(engine_generator, request_type)
async for response in await self._stream_response(
request, output, request_id, conversation
):
yield response
async def _generate_responses(
self, engine_generator: AsyncIterator[RequestOutput], request_type: RequestType
) -> AsyncIterator[Union[RequestOutput, Tuple[int, RequestOutput]]]:
prompt_idx = 0
async for resp in engine_generator:
# Deserialize the response from the engine
# Creates correct vLLM objects for each field
output = MyRequestOutput.model_validate_json(resp.data())
# OpenAIServingChat.chat_completion_stream_generator() method expects a RequestOutput object
request_output = RequestOutput(
request_id=output.request_id,
prompt=output.prompt,
prompt_token_ids=output.prompt_token_ids,
prompt_logprobs=output.prompt_logprobs,
outputs=output.outputs,
finished=output.finished,
metrics=output.metrics,
)
if request_type == RequestType.CHAT:
# For chat requests, yield the request_output directly.
yield request_output
elif request_type == RequestType.COMPLETION:
                # Completion requests can carry multiple prompts, so the stream generator needs the prompt index
yield (prompt_idx, request_output)
else:
raise NotImplementedError(
f"Request type {request_type} not implemented"
)
@dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate_chat(self, raw_request):
async for response in self._generate(raw_request, RequestType.CHAT):
yield response
@dynamo_endpoint(CompletionRequest, CompletionStreamResponse)
async def generate_completions(self, raw_request):
async for response in self._generate(raw_request, RequestType.COMPLETION):
yield response
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Set up clients to the router and workers.
    Serve the dynamo.process chat/completions and completions endpoints.
"""
workers_client = (
await runtime.namespace("dynamo")
.component("vllm")
.endpoint("generate")
.client()
)
router_client = (
await runtime.namespace("dynamo")
.component("router")
.endpoint("generate")
.client()
)
preprocess_component = runtime.namespace("dynamo").component("process")
await preprocess_component.create_service()
chat_endpoint = preprocess_component.endpoint("chat/completions")
completions_endpoint = preprocess_component.endpoint("completions")
processor = Processor(engine_args, router_client, workers_client)
await asyncio.gather(
chat_endpoint.serve_endpoint(processor.generate_chat),
completions_endpoint.serve_endpoint(processor.generate_completions),
)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import random
from argparse import Namespace
from typing import AsyncIterator
import uvloop
from common.protocol import Tokens
from vllm.logger import logger as vllm_logger
from dynamo.llm import AggregatedMetrics, KvIndexer, KvMetricsAggregator, OverlapScores
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
WorkerId = str
class CustomRouter:
"""
    KV-cache-aware router that selects a worker id for each request
"""
def __init__(
self,
workers_client,
indexer: KvIndexer,
metrics_aggregator: KvMetricsAggregator,
):
vllm_logger.info("Initializing Custom Router")
self.indexer = indexer
self.metrics_aggregator = metrics_aggregator
self.workers_client = workers_client
def _cost_function(
self,
scores: OverlapScores | None,
metrics: AggregatedMetrics | None,
token_length: int,
):
worker_scores = {}
if scores:
for worker_id, score in scores.scores.items():
                # score is the number of matching KV blocks; multiply by block_size to get
                # tokens and normalize by token_length. Larger cache hits score higher.
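                # Illustrative example: 2 matching blocks with block_size 64 on a
                # 256-token prompt gives a score of 2 * 64 / 256 = 0.5.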
worker_scores[worker_id] = (
score * self.indexer.block_size() / token_length
)
worker_metrics = {}
# pull metrics for each worker
max_waiting = 0.0
if metrics:
for endpoint in metrics.endpoints:
worker_id = endpoint.worker_id
worker_metrics[worker_id] = {
"gpu_cache_usage_perc": endpoint.gpu_cache_usage_perc
if hasattr(endpoint, "gpu_cache_usage_perc")
else 0.0,
"num_requests_waiting": endpoint.num_requests_waiting
if hasattr(endpoint, "num_requests_waiting")
else 0.0,
"gpu_prefix_cache_hit_rate": endpoint.gpu_prefix_cache_hit_rate
if hasattr(endpoint, "gpu_prefix_cache_hit_rate")
else 0.0,
}
max_waiting = max(
max_waiting, worker_metrics[worker_id]["num_requests_waiting"]
)
# Get all worker IDs from the client. This is needed because scores / metrics may not have values for all workers
# and we want all workers to be considered in the logit calculation
worker_ids = self.workers_client.endpoint_ids()
worker_logits = {}
for worker_id in worker_ids:
# Use default values if worker not in scores or metrics
score = worker_scores.get(worker_id, 0.0)
metrics_dict = worker_metrics.get(
worker_id,
{
"gpu_cache_usage_perc": 0.0,
"num_requests_waiting": 0.0,
"gpu_prefix_cache_hit_rate": 0.0,
},
)
normalized_waiting = (
metrics_dict["num_requests_waiting"] / max_waiting
if max_waiting > 0
else 0.0
)
            # One term rewards KV-cache hits; the other two penalize KV-cache pressure
            # and request queuing on the worker.
worker_logits[worker_id] = (
2 * score - metrics_dict["gpu_cache_usage_perc"] - normalized_waiting
)
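            # Illustrative example: score 0.5, gpu_cache_usage_perc 0.3 and
            # normalized_waiting 0.2 yield a logit of 2 * 0.5 - 0.3 - 0.2 = 0.5.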
vllm_logger.info(
f"Formula for {worker_id}: {worker_logits[worker_id]:.3f} = 2.0 * {score:.3f} - {metrics_dict['gpu_cache_usage_perc']:.3f} - {normalized_waiting:.3f}"
)
if not worker_logits or all(logit == 0 for logit in worker_logits.values()):
return ""
        # Select the worker with the highest logit, breaking ties randomly
        max_logit = max(worker_logits.values())
        best_workers = [
            wid for wid, logit in worker_logits.items() if logit == max_logit
        ]
        best_worker_id = random.choice(best_workers)
# Log the metrics for the selected worker
if best_worker_id:
vllm_logger.info(
f"Selected worker: {best_worker_id}, logit: {worker_logits[best_worker_id]:.3f}"
)
vllm_logger.info(
f"Score: {scores.scores.get(best_worker_id, 0.0) if scores else 0.0:.3f}"
)
metrics_dict = worker_metrics.get(best_worker_id, {})
vllm_logger.info(
f"GPU Cache Hit Rate: {metrics_dict.get('gpu_prefix_cache_hit_rate', 0.0):.3f}"
)
vllm_logger.info(
f"GPU Cache Usage: {metrics_dict.get('gpu_cache_usage_perc', 0.0):.3f}"
)
vllm_logger.info(
f"Requests Waiting: {metrics_dict.get('num_requests_waiting', 0.0) / max_waiting if max_waiting > 0 else 0.0:.3f}"
)
return best_worker_id
@dynamo_endpoint(Tokens, WorkerId)
async def generate(self, request) -> AsyncIterator[WorkerId]:
lora_id = 0
worker_id = ""
try:
scores = await self.indexer.find_matches_for_request(
request.tokens, lora_id
)
except Exception as e:
            scores = None  # no prefix-cache match information available
vllm_logger.exception(f"Error finding matches: {e}")
token_length = len(request.tokens)
metrics = await self.metrics_aggregator.get_metrics()
worker_id = self._cost_function(scores, metrics, token_length)
vllm_logger.info(f"Scheduling to worker_id: {worker_id}")
vllm_logger.info("########")
yield str(worker_id)
@dynamo_worker()
async def worker(runtime: DistributedRuntime, args: Namespace):
"""
Set up the worker clients.
Serve the dynamo.router.generate endpoint.
"""
workers_client = (
await runtime.namespace("dynamo")
.component("vllm")
.endpoint("generate")
.client()
)
while len(workers_client.endpoint_ids()) < args.min_workers:
vllm_logger.info(
f"Waiting for more workers... Current: {len(workers_client.endpoint_ids())}, Required: {args.min_workers}"
)
await asyncio.sleep(5)
vllm_logger.info(
f"Required number of workers ({args.min_workers}) are ready:\n"
+ "\n".join(f"id: {id}" for id in workers_client.endpoint_ids())
)
kv_listener = runtime.namespace("dynamo").component("vllm")
await kv_listener.create_service()
router_component = runtime.namespace("dynamo").component("router")
await router_component.create_service()
endpoint = router_component.endpoint("generate")
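    # KvIndexer supplies per-worker prefix-overlap scores and KvMetricsAggregator supplies
    # per-worker load metrics (both consumed in CustomRouter._cost_function); both observe
    # the "vllm" component registered above as kv_listener.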
indexer = KvIndexer(kv_listener, args.block_size)
metrics_aggregator = KvMetricsAggregator(kv_listener)
await endpoint.serve_endpoint(
CustomRouter(workers_client, indexer, metrics_aggregator).generate
)
if __name__ == "__main__":
uvloop.install()
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
"--min-workers",
type=int,
default=1,
help="Minimum number of workers required before proceeding",
)
# TODO: Read block size
parser.add_argument(
"--block-size",
type=int,
default=64,
help="Block size for the KV Indexer",
)
args = parser.parse_args()
asyncio.run(worker(args))