"git@developer.sourcefind.cn:OpenDAS/dynamo.git" did not exist on "a48d932e1a43043b6af2bfa404d80fd8b16514ce"
Commit 5bcdb734 authored by Neelay Shah, committed by GitHub

refactor: rename vllm_nixl to vllm and make default (#100)

parent a7c35dcf
@@ -30,8 +30,7 @@ jobs:
   strategy:
     matrix:
       framework:
-        - standard
-        - vllm_nixl
+        - vllm
   name: Build and Test - ${{ matrix.framework }}
   env:
     CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_${{ github.job }}_${{ matrix.framework }}
...
@@ -8,6 +8,133 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev
USER root
### NIXL SETUP ###
ARG MOFED_VERSION=24.10-1.1.4.0
ARG PYTHON_VERSION=3.12
ARG NSYS_URL=https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_1/
ARG NSYS_PKG=NsightSystems-linux-cli-public-2025.1.1.131-3554042.deb
RUN apt-get update -y && apt-get -y install curl \
git \
libnuma-dev \
numactl \
wget \
autotools-dev \
automake \
libtool \
libz-dev \
libiberty-dev \
flex \
build-essential \
cmake \
libibverbs-dev \
libgoogle-glog-dev \
libgtest-dev \
libjsoncpp-dev \
libpython3-dev \
libboost-all-dev \
libssl-dev \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
pybind11-dev \
python3-full \
python3-pip \
python3-numpy \
etcd-server \
net-tools \
pciutils \
libpci-dev \
vim \
tmux \
screen \
ibverbs-utils \
libibmad-dev
RUN apt-get install -y linux-tools-common linux-tools-generic ethtool iproute2
RUN apt-get install -y dkms linux-headers-generic
RUN apt-get install -y meson ninja-build uuid-dev gdb
RUN apt-get update && apt install -y wget libglib2.0-0
RUN wget ${NSYS_URL}${NSYS_PKG} && dpkg -i $NSYS_PKG && rm $NSYS_PKG
RUN cd /usr/local/src && \
curl -fSsL "https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu24.04-x86_64.tgz" -o mofed.tgz && \
tar -xf /usr/local/src/mofed.tgz && \
cd MLNX_OFED_LINUX-* && \
apt-get update && apt-get install -y --no-install-recommends \
./DEBS/libibverbs* ./DEBS/ibverbs-providers* ./DEBS/librdmacm* ./DEBS/libibumad* && \
rm -rf /var/lib/apt/lists/* /usr/local/src/*
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64 \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/lib \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
WORKDIR /workspace
RUN git clone https://github.com/NVIDIA/gdrcopy.git
RUN PREFIX=/usr/local DESTLIB=/usr/local/lib make -C /workspace/gdrcopy lib_install
RUN cp gdrcopy/src/libgdrapi.so.2.* /usr/lib/x86_64-linux-gnu/
RUN ldconfig
ARG UCX_VERSION=v1.18.0
RUN cd /usr/local/src && \
curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz && \
cd openucx-ucx* && \
./autogen.sh && ./configure \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=/usr/local/cuda \
--with-verbs \
--with-dm \
--with-gdrcopy=/usr/local \
--enable-mt \
--with-mlx5-dv && \
make -j && \
make -j install-strip && \
ldconfig
ENV LD_LIBRARY_PATH=/usr/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/include:$CPATH
ENV PATH=/usr/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig:$PKG_CONFIG_PATH
SHELL ["/bin/bash", "-c"]
WORKDIR /workspace
ENV LD_LIBRARY_PATH=/usr/local/ompi/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/local/ompi/include:$CPATH
ENV PATH=/usr/local/ompi/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/local/ompi/lib/pkgconfig:$PKG_CONFIG_PATH
COPY --from=nixl . /opt/nixl
RUN cd /opt/nixl && \
mkdir build && \
meson setup build/ --prefix=/usr/local/nixl && \
cd build/ && \
ninja && \
ninja install
ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PYTHONPATH=/usr/local/nixl/lib/python3/dist-packages/:/opt/nixl/test/python/:$PYTHONPATH
ENV UCX_TLS=^cuda_ipc
ENV NIXL_PLUGIN_DIR=/usr/local/nixl/lib/x86_64-linux-gnu/plugins
RUN ls -l /usr/local/nixl/
RUN ls -l /usr/local/nixl/include/
RUN ls -l /usr/local/nixl/include/internal/
RUN ls /opt/nixl
# Install utilities
RUN apt update -y && apt install -y git wget curl nvtop tmux vim
# nats
@@ -31,6 +158,10 @@ RUN mkdir /opt/dynamo && \
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Common dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF="v0.7.2"
@@ -39,7 +170,6 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
    bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
# Install genai-perf for benchmarking
-# TODO: Move to tag when fix for genai-perf will be released
ARG GENAI_PERF_TAG="25d0188713adc47868d6b3f22426375237a90529"
RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"
@@ -47,7 +177,7 @@ RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
    uv pip install --requirement /tmp/requirements.txt
-### MISC UTILITY SETUP ###
+# ### MISC UTILITY SETUP ###
# Finish pyright install
RUN pyright --help > /dev/null 2>&1
@@ -103,11 +233,6 @@ COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c && \
    cargo build --release --locked && cargo doc --no-deps
-# Generate C bindings for kv cache routing in vLLM
-COPY lib/bindings /workspace/lib/bindings
-RUN cd lib/bindings/c && \
-    cargo build --release --locked && cargo doc --no-deps
COPY deploy/dynamo/sdk /workspace/deploy/dynamo/sdk
# Build dynamo wheel
RUN source /opt/dynamo/venv/bin/activate && \
@@ -135,50 +260,5 @@ ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
-### Lean Runtime Image Stage ###
+### TODO Lean Runtime Image Stage ###
-# FIXME: Separate build and runtime images
-FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS runtime
-USER root
-# Install tools for interactive convenience
-RUN apt update -y && \
-    apt install -y curl tmux vim && \
-    echo "set -g mouse on" >> /root/.tmux.conf
-# Set environment variables
-ENV VIRTUAL_ENV=/opt/dynamo/venv
-ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
-ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
-ENV VLLM_KV_CAPI_PATH="/opt/dynamo/bindings/lib/libdynamo_llm_capi.so"
-# Copy binaries
-COPY --from=dev /usr/local/bin/http /usr/local/bin/http
-COPY --from=dev /usr/local/bin/llmctl /usr/local/bin/llmctl
-COPY --from=dev /usr/local/bin/etcd/etcd /usr/local/bin/etcd
-COPY --from=dev /usr/bin/nats-server /usr/local/bin/nats-server
-COPY --from=dev /bin/uv /usr/local/bin/uv
-COPY --from=dev /bin/uvx /usr/local/bin/uvx
-# Copy venv with installed packages
-RUN uv python install 3.12
-COPY --from=dev /opt/vllm /opt/vllm
-COPY --from=dev ${VIRTUAL_ENV} ${VIRTUAL_ENV}
-# Copy minimal set of files for testing. May consider separate stage for testing
-# if test dependencies start to negatively impact deployment environment/size.
-COPY pyproject.toml /workspace/pyproject.toml
-COPY container/deps/vllm /workspace/container/deps/vllm
-# Add library for KV routing
-COPY --from=dev ${VLLM_KV_CAPI_PATH} ${VLLM_KV_CAPI_PATH}
-# Copy minimal set of files for deployment/examples
-# FIXME: Use a more consolidated path after directory restructure
-COPY examples/python_rs/llm/vllm /workspace/examples/python_rs/llm/vllm
-WORKDIR /workspace
-# FIXME: May want a modification with dynamo banner on entry
-ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
-CMD []
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev
USER root
### NIXL SETUP ###
ARG MOFED_VERSION=24.10-1.1.4.0
ARG PYTHON_VERSION=3.12
ARG NSYS_URL=https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_1/
ARG NSYS_PKG=NsightSystems-linux-cli-public-2025.1.1.131-3554042.deb
RUN apt-get update -y && apt-get -y install curl \
git \
libnuma-dev \
numactl \
wget \
autotools-dev \
automake \
libtool \
libz-dev \
libiberty-dev \
flex \
build-essential \
cmake \
libibverbs-dev \
libgoogle-glog-dev \
libgtest-dev \
libjsoncpp-dev \
libpython3-dev \
libboost-all-dev \
libssl-dev \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
pybind11-dev \
python3-full \
python3-pip \
python3-numpy \
etcd-server \
net-tools \
pciutils \
libpci-dev \
vim \
tmux \
screen \
ibverbs-utils \
libibmad-dev
RUN apt-get install -y linux-tools-common linux-tools-generic ethtool iproute2
RUN apt-get install -y dkms linux-headers-generic
RUN apt-get install -y meson ninja-build uuid-dev gdb
RUN apt-get update && apt install -y wget libglib2.0-0
RUN wget ${NSYS_URL}${NSYS_PKG} && dpkg -i $NSYS_PKG && rm $NSYS_PKG
RUN cd /usr/local/src && \
curl -fSsL "https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu24.04-x86_64.tgz" -o mofed.tgz && \
tar -xf /usr/local/src/mofed.tgz && \
cd MLNX_OFED_LINUX-* && \
apt-get update && apt-get install -y --no-install-recommends \
./DEBS/libibverbs* ./DEBS/ibverbs-providers* ./DEBS/librdmacm* ./DEBS/libibumad* && \
rm -rf /var/lib/apt/lists/* /usr/local/src/*
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64 \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/lib \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
WORKDIR /workspace
RUN git clone https://github.com/NVIDIA/gdrcopy.git
RUN PREFIX=/usr/local DESTLIB=/usr/local/lib make -C /workspace/gdrcopy lib_install
RUN cp gdrcopy/src/libgdrapi.so.2.* /usr/lib/x86_64-linux-gnu/
RUN ldconfig
ARG UCX_VERSION=v1.18.0
RUN cd /usr/local/src && \
curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz && \
cd openucx-ucx* && \
./autogen.sh && ./configure \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=/usr/local/cuda \
--with-verbs \
--with-dm \
--with-gdrcopy=/usr/local \
--enable-mt \
--with-mlx5-dv && \
make -j && \
make -j install-strip && \
ldconfig
ENV LD_LIBRARY_PATH=/usr/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/include:$CPATH
ENV PATH=/usr/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig:$PKG_CONFIG_PATH
SHELL ["/bin/bash", "-c"]
WORKDIR /workspace
ENV LD_LIBRARY_PATH=/usr/local/ompi/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/local/ompi/include:$CPATH
ENV PATH=/usr/local/ompi/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/local/ompi/lib/pkgconfig:$PKG_CONFIG_PATH
COPY --from=nixl . /opt/nixl
RUN cd /opt/nixl && \
mkdir build && \
meson setup build/ --prefix=/usr/local/nixl && \
cd build/ && \
ninja && \
ninja install
ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PYTHONPATH=/usr/local/nixl/lib/python3/dist-packages/:/opt/nixl/test/python/:$PYTHONPATH
ENV UCX_TLS=^cuda_ipc
ENV NIXL_PLUGIN_DIR=/usr/local/nixl/lib/x86_64-linux-gnu/plugins
RUN ls -l /usr/local/nixl/
RUN ls -l /usr/local/nixl/include/
RUN ls -l /usr/local/nixl/include/internal/
RUN ls /opt/nixl
# Install utilities
RUN apt update -y && apt install -y git wget curl nvtop tmux vim
# nats
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb
# etcd
ENV ETCD_VERSION="v3.5.18"
RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \
mkdir -p /usr/local/bin/etcd && \
tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1
ENV PATH=/usr/local/bin/etcd/:$PATH
### VIRTUAL ENVIRONMENT SETUP ###
# Install uv and create virtualenv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/dynamo && \
uv venv /opt/dynamo/venv --python 3.12
# Activate virtual environment
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Common dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}-dynamo-kv-disagg-patch.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
# Install genai-perf for benchmarking
ARG GENAI_PERF_TAG="25d0188713adc47868d6b3f22426375237a90529"
RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"
# Install test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# ### MISC UTILITY SETUP ###
# Finish pyright install
RUN pyright --help > /dev/null 2>&1
# Enable Git operations in the /workspace directory
RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig
RUN ln -sf /bin/bash /bin/sh
### BUILDS ###
# Rust build/dev dependencies
RUN apt update -y && \
apt install -y \
build-essential \
protobuf-compiler \
cmake \
libssl-dev \
pkg-config && \
curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN rustup toolchain install 1.85.0-x86_64-unknown-linux-gnu
# Working directory
WORKDIR /workspace
# Copy Python wheel configuration files
COPY pyproject.toml /workspace/
COPY README.md /workspace/
COPY LICENSE /workspace/
# Build Rust runtime
COPY lib/runtime /workspace/lib/runtime
RUN cd lib/runtime && \
cargo build --release --locked && cargo doc --no-deps
# Build OpenAI HTTP Service binaries
COPY lib/llm /workspace/lib/llm
COPY components /workspace/components
RUN cd components && \
cargo build --release && \
cp target/release/http /usr/local/bin/
# Build Dynamo Run binaries
COPY launch /workspace/launch
RUN cd launch && \
cargo build --release --features mistralrs,sglang,vllm,python && \
cp target/release/dynamo-run /usr/local/bin/ && \
cp target/release/llmctl /usr/local/bin/
# Generate C bindings for kv cache routing in vLLM
COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c && \
cargo build --release --locked && cargo doc --no-deps
COPY deploy/dynamo/sdk /workspace/deploy/dynamo/sdk
# Build dynamo wheel
RUN source /opt/dynamo/venv/bin/activate && \
uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/ai_dynamo*cp312*.whl && \
cd /workspace/deploy/dynamo/sdk && \
uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/ai_dynamo_sdk*any.whl
# Package the bindings
RUN mkdir -p /opt/dynamo/bindings/wheels && \
mkdir /opt/dynamo/bindings/lib && \
cp dist/ai_dynamo*cp312*.whl /opt/dynamo/bindings/wheels/. && \
cp lib/bindings/c/target/release/libdynamo_llm_capi.so /opt/dynamo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/dynamo/bindings/.
# Tell vllm to use the Dynamo LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/dynamo/bindings/lib/libdynamo_llm_capi.so"
# FIXME: Copy more specific folders in for dev/debug after directory restructure
COPY . /workspace
# FIXME: May want a modification with dynamo banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
### TODO Lean Runtime Image Stage ###
@@ -44,7 +44,7 @@ PYTHON_PACKAGE_VERSION=${current_tag:-$latest_tag.dev+$commit_id}
# installed within framework specific sections of the Dockerfile.
declare -A FRAMEWORKS=(["STANDARD"]=1 ["TENSORRTLLM"]=2 ["VLLM"]=3 ["VLLM_NIXL"]=4)
-DEFAULT_FRAMEWORK=STANDARD
+DEFAULT_FRAMEWORK=VLLM
SOURCE_DIR=$(dirname "$(readlink -f "$0")")
DOCKERFILE=${SOURCE_DIR}/Dockerfile
@@ -64,9 +64,6 @@ TENSORRTLLM_PIP_WHEEL_PATH=""
VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
-VLLM_NIXL_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
-VLLM_NIXL_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
NIXL_COMMIT=3ce6a673b266b4f293909ceb17ca7975f1ba5cd7
NIXL_REPO=ai-dynamo/nixl.git
@@ -197,6 +194,10 @@ get_options() {
        FRAMEWORK=$DEFAULT_FRAMEWORK
    fi
if [[ ${FRAMEWORK^^} == "VLLM_NIXL" ]]; then
FRAMEWORK="VLLM"
fi
    if [ ! -z "$FRAMEWORK" ]; then
        FRAMEWORK=${FRAMEWORK^^}
@@ -283,17 +284,14 @@ error() {
get_options "$@"
# Update DOCKERFILE if framework is VLLM
if [[ $FRAMEWORK == "VLLM" ]]; then
    DOCKERFILE=${SOURCE_DIR}/Dockerfile.vllm
-elif [[ $FRAMEWORK == "VLLM_NIXL" ]]; then
-    DOCKERFILE=${SOURCE_DIR}/Dockerfile.vllm_nixl
elif [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
    DOCKERFILE=${SOURCE_DIR}/Dockerfile.tensorrt_llm
fi
-if [[ $FRAMEWORK == "VLLM_NIXL" ]]; then
+if [[ $FRAMEWORK == "VLLM" ]]; then
    TEMP_DIR=$(mktemp -d)
    # Clean up temp directory on script exit
...
@@ -23,7 +23,7 @@ RUN_PREFIX=
# installed within framework specific sections of the Dockerfile.
declare -A FRAMEWORKS=(["STANDARD"]=1 ["TENSORRTLLM"]=2 ["VLLM"]=3 ["VLLM_NIXL"]=4)
-DEFAULT_FRAMEWORK=STANDARD
+DEFAULT_FRAMEWORK=VLLM
SOURCE_DIR=$(dirname "$(readlink -f "$0")")
@@ -170,6 +170,10 @@ get_options() {
        FRAMEWORK=$DEFAULT_FRAMEWORK
    fi
if [[ ${FRAMEWORK^^} == "VLLM_NIXL" ]]; then
FRAMEWORK="VLLM"
fi
    if [ ! -z "$FRAMEWORK" ]; then
        FRAMEWORK=${FRAMEWORK^^}
        if [[ ! -n "${FRAMEWORKS[$FRAMEWORK]}" ]]; then
...
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import logging
from common.chat_processor import ChatProcessor
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
logger = logging.getLogger("vllm")
class BaseVllmEngine(abc.ABC):
    """
    Base class that manages a vLLM engine client, tokenizer, and chat processor lifecycle.
    """
def __init__(self, engine_args: AsyncEngineArgs):
self.engine_args = engine_args
self.model_config = self.engine_args.create_model_config()
self.engine_client = None
self.chat_processor: ChatProcessor | None = None
self._engine_context = None
async def initialize(self):
"""Initialize the engine client and related components."""
logger.info("Initializing engine client")
self._engine_context = build_async_engine_client_from_engine_args(
self.engine_args
)
if self._engine_context is not None:
self.engine_client = await self._engine_context.__aenter__()
self.tokenizer = await self.engine_client.get_tokenizer()
self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
else:
raise RuntimeError("Failed to initialize engine client")
async def cleanup(self):
"""Cleanup resources."""
print("Cleaning up engine client")
if self._engine_context is not None:
await self._engine_context.__aexit__(None, None, None)
self._engine_context = None
self.engine_client = None
self.chat_processor = None
    async def __aenter__(self):
        """Initialize with context manager syntax."""
        await self.initialize()
        return self
async def __aexit__(self, exc_type, exc_value, traceback):
await self.cleanup()
@abc.abstractmethod
async def generate(self, raw_request):
pass
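BaseVllmEngine is meant to be used as an async context manager, so subclasses inherit engine setup and teardown. A minimal lifecycle sketch, assuming vllm is installed; EchoEngine and the model name are illustrative only, not part of this commit:

import asyncio

from common.base_engine import BaseVllmEngine
from vllm.engine.arg_utils import AsyncEngineArgs


class EchoEngine(BaseVllmEngine):
    """Hypothetical subclass that satisfies the abstract generate()."""

    async def generate(self, raw_request):
        yield raw_request  # a real subclass would call self.engine_client


async def main():
    engine_args = AsyncEngineArgs(model="facebook/opt-125m")  # illustrative model
    # __aenter__ runs initialize(); __aexit__ runs cleanup()
    async with EchoEngine(engine_args) as engine:
        assert engine.chat_processor is not None


asyncio.run(main())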
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import time
from typing import AsyncIterator, List, Optional, Protocol, Union, runtime_checkable
from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.chat_utils import ConversationMessage
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
CompletionRequest,
RequestResponseMetadata,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_engine import RequestPrompt
from vllm.inputs.data import TokensPrompt
from vllm.transformers_utils.tokenizer import AnyTokenizer
@runtime_checkable
class ProcessMixInRequired(Protocol):
engine_args: AsyncEngineArgs
chat_processor: "ChatProcessor | None"
completions_processor: "CompletionsProcessor | None"
model_config: ModelConfig
class ProcessMixIn(ProcessMixInRequired):
"""
    Mixin providing vLLM pre- and post-processing helpers.
    Requires engine_args, chat_processor, completions_processor, and model_config
    to be initialized by the host class.
"""
engine_args: AsyncEngineArgs
chat_processor: "ChatProcessor | None"
completions_processor: "CompletionsProcessor | None"
model_config: ModelConfig
def __init__(self):
pass
def _get_processor(
self, raw_request: Union[CompletionRequest, ChatCompletionRequest]
):
# Determine the processor type based on the request structure
return (
self.chat_processor
if isinstance(raw_request, ChatCompletionRequest)
else self.completions_processor
)
async def _parse_raw_request(
self, raw_request: Union[CompletionRequest, ChatCompletionRequest]
):
processor = self._get_processor(raw_request)
if processor is None:
raise RuntimeError("Processor has not been initialized")
request = processor.parse_raw_request(raw_request)
preprocess_result = await processor.preprocess(raw_request)
default_max_tokens = self.model_config.max_model_len - len(
preprocess_result.engine_prompt["prompt_token_ids"]
)
default_sampling_params = self.model_config.get_diff_sampling_param()
sampling_params = request.to_sampling_params(
default_max_tokens,
self.model_config.logits_processor_pattern,
default_sampling_params,
)
return (
request,
preprocess_result.conversation,
preprocess_result.request_prompt,
preprocess_result.engine_prompt,
sampling_params,
)
async def _stream_response(self, request, generator, request_id, conversation):
processor = self._get_processor(request)
if processor is None:
raise RuntimeError("processor has not been initialized")
return processor.stream_response(
request,
generator,
request_id,
conversation,
)
class PreprocessResult:
def __init__(
self,
conversation: Optional[ConversationMessage],
request_prompt: RequestPrompt,
engine_prompt: TokensPrompt,
):
self.conversation = conversation
self.request_prompt = request_prompt
self.engine_prompt = engine_prompt
class ChatProcessor:
def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
self.tokenizer = tokenizer
self.model_config = model_config
self.openai_serving = OpenAIServingChat(
engine_client=None,
model_config=model_config,
models=None,
request_logger=None,
response_role="assistant",
chat_template=None,
chat_template_content_format="auto",
)
def parse_raw_request(
self, raw_request: ChatCompletionRequest
) -> ChatCompletionRequest:
return ChatCompletionRequest.parse_obj(raw_request)
async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request)
(
conversation,
request_prompts,
engine_prompts,
) = await self.openai_serving._preprocess_chat(
request,
self.tokenizer,
request.messages,
chat_template=request.chat_template or self.tokenizer.chat_template,
chat_template_content_format=self.openai_serving.chat_template_content_format,
add_generation_prompt=request.add_generation_prompt,
continue_final_message=request.continue_final_message,
tool_dicts=None,
documents=request.documents,
chat_template_kwargs=request.chat_template_kwargs,
tool_parser=self.openai_serving.tool_parser,
truncate_prompt_tokens=request.truncate_prompt_tokens,
add_special_tokens=request.add_special_tokens,
)
return PreprocessResult(conversation[0], request_prompts[0], engine_prompts[0])
async def stream_response(
self,
request: ChatCompletionRequest,
result_generator: AsyncIterator,
request_id: str,
conversation: List,
):
request_metadata = RequestResponseMetadata(request_id=request_id)
if not request.stream:
raise ValueError("Only streaming responses are supported")
async for raw_response in self.openai_serving.chat_completion_stream_generator(
request,
result_generator,
request_id,
request.model,
conversation,
self.tokenizer,
request_metadata,
):
if raw_response.startswith("data: [DONE]"):
break
            response = json.loads(raw_response.removeprefix("data: "))
yield response
class CompletionsProcessor:
def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
self.tokenizer = tokenizer
self.model_config = model_config
self.openai_serving = OpenAIServingCompletion(
engine_client=None,
model_config=model_config,
models=None,
request_logger=None,
)
def parse_raw_request(self, raw_request: CompletionRequest) -> CompletionRequest:
return CompletionRequest.parse_obj(raw_request)
async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
request = self.parse_raw_request(raw_request)
(
request_prompts,
engine_prompts,
) = await self.openai_serving._preprocess_completion(
request,
self.tokenizer,
input_or_inputs=request.prompt,
truncate_prompt_tokens=request.truncate_prompt_tokens,
add_special_tokens=request.add_special_tokens,
)
return PreprocessResult(None, request_prompts[0], engine_prompts[0])
async def stream_response(
self,
request: CompletionRequest,
result_generator: AsyncIterator,
request_id: str,
conversation: Optional[List[ConversationMessage]] = None,
):
request_metadata = RequestResponseMetadata(request_id=request_id)
if not request.stream:
raise ValueError("Only streaming responses are supported")
async for raw_response in self.openai_serving.completion_stream_generator(
request,
result_generator,
request_id,
int(time.time()), # created_time
request.model,
1, # num_prompts
self.tokenizer,
request_metadata,
):
if raw_response.startswith("data: [DONE]"):
break
            response = json.loads(raw_response.removeprefix("data: "))
yield response
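Both processors consume vLLM's OpenAI-style SSE stream, where each chunk is prefixed with "data: " and the stream ends with a "data: [DONE]" sentinel. A self-contained sketch of that parsing convention on canned chunks (the payloads below are made up):

import json

# Simulated SSE lines as produced by vLLM's OpenAI-compatible stream generators.
raw_stream = [
    'data: {"id": "chatcmpl-1", "choices": [{"delta": {"content": "Hi"}}]}',
    'data: {"id": "chatcmpl-1", "choices": [{"delta": {"content": "!"}}]}',
    "data: [DONE]",
]

for raw_response in raw_stream:
    if raw_response.startswith("data: [DONE]"):
        break  # end-of-stream sentinel, not JSON
    chunk = json.loads(raw_response.removeprefix("data: "))
    print(chunk["choices"][0]["delta"]["content"])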
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
import uvloop
from dynamo.runtime import DistributedRuntime, dynamo_worker
from .protocol import Request
@dynamo_worker()
async def worker(
runtime: DistributedRuntime,
component: str,
prompt: str,
max_tokens: int,
temperature: float,
):
"""
Instantiate a `backend` client and call the `generate` endpoint
"""
# get endpoint
endpoint = runtime.namespace("dynamo").component(component).endpoint("generate")
# create client
client = await endpoint.client()
# issue request
tasks = []
for _ in range(1):
tasks.append(
client.generate(
Request(
prompt=prompt,
sampling_params={
"temperature": temperature,
"max_tokens": max_tokens,
},
).model_dump_json()
)
)
streams = await asyncio.gather(*tasks)
# process response
for stream in streams:
async for resp in stream:
print(resp)
if __name__ == "__main__":
uvloop.install()
parser = argparse.ArgumentParser()
parser.add_argument("--prompt", type=str, default="what is the capital of france?")
parser.add_argument("--component", type=str, default="vllm")
parser.add_argument("--max-tokens", type=int, default=10)
parser.add_argument("--temperature", type=float, default=0.5)
args = parser.parse_args()
asyncio.run(worker(args.component, args.prompt, args.max_tokens, args.temperature))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_vllm_args() -> AsyncEngineArgs:
parser = FlexibleArgumentParser()
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
return AsyncEngineArgs.from_cli_args(args)
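Because parse_vllm_args delegates to AsyncEngineArgs.add_cli_args, a worker accepts the standard vLLM engine flags directly. A small usage sketch; the launch command and flags shown are ordinary vLLM options, not additions from this commit:

# Hypothetical launch:
#   python worker.py --model facebook/opt-125m --max-model-len 2048 --enforce-eager
from common.parser import parse_vllm_args

engine_args = parse_vllm_args()
print(engine_args.model, engine_args.max_model_len, engine_args.enforce_eager)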
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from typing import Any, List, Optional
import msgspec
from pydantic import BaseModel, ConfigDict, field_validator
from pydantic_core import core_schema
from typing_extensions import NotRequired
from vllm.inputs.data import TokensPrompt
from vllm.outputs import CompletionOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import PromptLogprobs, RequestMetrics
class Request(BaseModel):
prompt: str
sampling_params: dict
class Tokens(BaseModel):
tokens: list[int]
class PrefillRequest(Request):
request_id: str
class Response(BaseModel):
text: str
class PrefillResponse(BaseModel):
prefilled: bool
# Hack to override the type of multi_modal_data in TokensPrompt
# as pydantic doesn't understand generic types
# TokensPrompt is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/inputs/data.py#L38
# multi_modal_data is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L103
# ModalityData is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L80
class PatchedTokensPrompt(TokensPrompt):
multi_modal_data: NotRequired[Optional[Any]] # type: ignore
# Monkey-patch the SamplingParams type to add a dummy core schema so pydantic can validate it
# SamplingParams is a msgspec struct
# SamplingParams is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/sampling_params.py#L88
SamplingParams.__get_pydantic_core_schema__ = classmethod(
lambda cls, source, handler: core_schema.any_schema()
)
class vLLMGenerateRequest(BaseModel):
"""
Serializable class of all the fields vLLM engine requires for inference
"""
    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        json_encoders={SamplingParams: lambda v: msgspec.json.encode(v)},
    )
engine_prompt: PatchedTokensPrompt
sampling_params: SamplingParams
request_id: str
@field_validator("sampling_params", mode="before")
@classmethod
def parse_sampling_params(cls, v: Any) -> SamplingParams:
if isinstance(v, str):
v = json.loads(v)
if isinstance(v, dict):
return SamplingParams(**v)
return v
class MyRequestOutput(BaseModel):
"""
RequestOutput from vLLM is not serializable by default
https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/outputs.py#L85
This class is used to serialize the RequestOutput and any recursively defined types
We can do this because PromptLogprobs, RequestMetrics, and CompletionOutput are all serializable dataclasses
"""
model_config = ConfigDict(arbitrary_types_allowed=True)
request_id: str
prompt: Optional[str] = None
prompt_token_ids: Optional[List[int]] = None
prompt_logprobs: Optional[PromptLogprobs] = None
outputs: List[CompletionOutput]
finished: bool
metrics: Optional[RequestMetrics] = None
# lora_request: Optional[LoRARequest] = None
# encoder_prompt: Optional[str] = None
# encoder_prompt_token_ids: Optional[List[int]] = None
# num_cached_tokens: Optional[int] = None
# multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
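With the patched core schema in place, pydantic accepts sampling_params as a dict (or JSON string), and the field validator coerces it to a real SamplingParams. A sketch of that validation path, assuming vllm is installed; exact behavior can vary across pydantic/vllm versions:

from common.protocol import PatchedTokensPrompt, vLLMGenerateRequest
from vllm.sampling_params import SamplingParams

req = vLLMGenerateRequest(
    engine_prompt=PatchedTokensPrompt(prompt_token_ids=[1, 2, 3]),
    sampling_params={"temperature": 0.5, "max_tokens": 8},  # dict is coerced
    request_id="req-0",
)
assert isinstance(req.sampling_params, SamplingParams)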
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import socket
import uuid
import msgspec
import uvloop
from common.base_engine import BaseVllmEngine
from common.chat_processor import ProcessMixIn
from common.parser import parse_vllm_args
from common.protocol import PrefillRequest
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionStreamResponse,
)
from vllm.logger import logger as vllm_logger
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
"""
Request handler for the generate endpoint
"""
def __init__(self, engine_args: AsyncEngineArgs, prefill):
assert (
engine_args.kv_transfer_config.is_kv_consumer
), "Decode worker must be a KV consumer"
if engine_args.enable_chunked_prefill is not False:
vllm_logger.info(
"Chunked prefill is not supported in disaggregated mode, disabling it"
)
engine_args.enable_chunked_prefill = False
super().__init__(engine_args)
self.prefill = prefill
self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
self.kv_rank = self.kv_transfer_config.kv_rank
@dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate(self, raw_request):
if self.engine_client is None:
await self.initialize()
vllm_logger.debug(f"Got raw request: {raw_request}")
(
request,
conversation,
request_prompt,
engine_prompt,
sampling_params,
) = await self._parse_raw_request(raw_request)
# TODO: pass decode info through a separate request param
request_id = f"{uuid.uuid4()}___decode_hostname_{socket.gethostname()}___decode_kv_rank_{self.kv_rank}"
prefill_sampling_params = {**msgspec.to_builtins(sampling_params)}
prefill_sampling_params["max_tokens"] = 1
prefill_sampling_params["min_tokens"] = 1
prefill_request = PrefillRequest(
prompt=request_prompt, # TODO: we should use engine prompt to avoid extra tokenization
sampling_params=prefill_sampling_params,
request_id=request_id,
)
vllm_logger.debug(f"Prefill request: {prefill_request}")
prefill_output = self.prefill.generate(
prefill_request.model_dump_json(),
)
vllm_logger.debug(
f"Running generate with engine_prompt: {engine_prompt}, sampling_params: {sampling_params}, request_id: {request_id}"
)
if self.engine_client is None:
raise RuntimeError("Engine client not initialized")
else:
generator = self.engine_client.generate(
engine_prompt, sampling_params, request_id
)
async for response in await self._stream_response(
request, generator, request_id, conversation
):
vllm_logger.debug(f"Generated response: {response}")
yield response
await prefill_output
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component = runtime.namespace("dynamo").component("vllm")
await component.create_service()
prefill = (
await runtime.namespace("dynamo")
.component("prefill")
.endpoint("generate")
.client()
)
async with VllmDecodeEngine(engine_args, prefill) as decode_engine:
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(decode_engine.generate)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
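The decode worker threads its hostname and KV rank to the prefill side through the request id string (the TODO above notes this should eventually be a dedicated request field). A pure-Python sketch of that encoding, plus a hypothetical inverse for illustration:

import socket
import uuid


def encode_request_id(kv_rank: int) -> str:
    # Mirrors the decode worker's convention:
    # "<uuid>___decode_hostname_<host>___decode_kv_rank_<rank>"
    return (
        f"{uuid.uuid4()}"
        f"___decode_hostname_{socket.gethostname()}"
        f"___decode_kv_rank_{kv_rank}"
    )


def decode_request_id(request_id: str) -> tuple[str, str, int]:
    # Hypothetical helper, not part of this commit.
    req, host, rank = request_id.split("___")
    return (
        req,
        host.removeprefix("decode_hostname_"),
        int(rank.removeprefix("decode_kv_rank_")),
    )


print(decode_request_id(encode_request_id(kv_rank=0)))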
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import uvloop
import vllm
from common.base_engine import BaseVllmEngine
from common.parser import parse_vllm_args
from common.protocol import PrefillRequest, PrefillResponse
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.logger import logger as vllm_logger
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
class VllmPrefillEngine(BaseVllmEngine):
"""
Request handler for the generate endpoint
"""
def __init__(self, engine_args: AsyncEngineArgs):
assert (
engine_args.kv_transfer_config.is_kv_producer
), "Prefill worker must be a KV producer"
if engine_args.enable_chunked_prefill is not False:
vllm_logger.info(
"Chunked prefill is not supported in disaggregated mode, disabling it"
)
engine_args.enable_chunked_prefill = False
super().__init__(engine_args)
self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
self.kv_rank = self.kv_transfer_config.kv_rank
@dynamo_endpoint(PrefillRequest, PrefillResponse)
async def generate(self, request):
if self.engine_client is None:
await self.initialize()
vllm_logger.debug(f"Received prefill request: {request}")
sampling_params = vllm.sampling_params.SamplingParams(**request.sampling_params)
if self.engine_client is None:
raise RuntimeError("Engine client not initialized")
else:
async for response in self.engine_client.generate(
request.prompt, sampling_params, request.request_id
):
vllm_logger.debug(f"Generated response: {response}")
yield True
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component = runtime.namespace("dynamo").component("prefill")
await component.create_service()
async with VllmPrefillEngine(engine_args) as prefill_engine:
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(prefill_engine.generate)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import uuid
from enum import Enum
from typing import AsyncIterator, Tuple, Union
import uvloop
from common.chat_processor import ChatProcessor, CompletionsProcessor, ProcessMixIn
from common.parser import parse_vllm_args
from common.protocol import MyRequestOutput, Tokens, vLLMGenerateRequest
from transformers import AutoTokenizer
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionStreamResponse,
CompletionRequest,
CompletionStreamResponse,
)
from vllm.logger import logger as vllm_logger
from vllm.outputs import RequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer
from dynamo.runtime import Client, DistributedRuntime, dynamo_endpoint, dynamo_worker
class RequestType(Enum):
CHAT = "chat"
COMPLETION = "completion"
class Processor(ProcessMixIn):
"""
vLLM pre and post processing
"""
def __init__(
self,
engine_args: AsyncEngineArgs,
router_client: Client,
workers_client: Client,
):
self.engine_args = engine_args
self.model_config = self.engine_args.create_model_config()
self.tokenizer = self._create_tokenizer(engine_args)
self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
self.completions_processor = CompletionsProcessor(
self.tokenizer, self.model_config
)
self.router_client = router_client
self.workers_client = workers_client
def _create_tokenizer(self, engine_args: AsyncEngineArgs) -> AnyTokenizer:
"""Create a TokenizerGroup using engine arguments similar to VLLM's approach"""
model_path = engine_args.model
# Create the base tokenizer with VLLM's typical settings
base_tokenizer = AutoTokenizer.from_pretrained(
model_path,
trust_remote_code=True,
padding_side="left",
truncation_side="left",
use_fast=True, # VLLM might use the fast tokenizer for efficiency
)
return base_tokenizer
async def _generate(
self,
raw_request: Union[CompletionRequest, ChatCompletionRequest],
request_type: RequestType,
):
request_id = str(uuid.uuid4())
vllm_logger.debug(f"Got raw request: {raw_request}")
(
request,
conversation,
prompt,
engine_prompt,
sampling_params,
) = await self._parse_raw_request(raw_request)
worker_id_generator: AsyncIterator = await self.router_client.generate(
Tokens(tokens=engine_prompt["prompt_token_ids"]).model_dump_json()
)
worker_id = (
await worker_id_generator.__anext__()
) # only one worker id is returned
worker_id = worker_id.data()
vllm_logger.info(f"Worker ID: {worker_id}")
if worker_id == "":
engine_generator = await self.workers_client.random(
vLLMGenerateRequest(
engine_prompt=engine_prompt,
sampling_params=sampling_params,
request_id=request_id,
).model_dump_json()
)
else:
engine_generator = await self.workers_client.direct(
vLLMGenerateRequest(
engine_prompt=engine_prompt,
sampling_params=sampling_params,
request_id=request_id,
).model_dump_json(),
int(worker_id),
)
output = self._generate_responses(engine_generator, request_type)
async for response in await self._stream_response(
request, output, request_id, conversation
):
yield response
async def _generate_responses(
self, engine_generator: AsyncIterator[RequestOutput], request_type: RequestType
) -> AsyncIterator[Union[RequestOutput, Tuple[int, RequestOutput]]]:
prompt_idx = 0
async for resp in engine_generator:
# Deserialize the response from the engine
# Creates correct vLLM objects for each field
output = MyRequestOutput.model_validate_json(resp.data())
# OpenAIServingChat.chat_completion_stream_generator() method expects a RequestOutput object
request_output = RequestOutput(
request_id=output.request_id,
prompt=output.prompt,
prompt_token_ids=output.prompt_token_ids,
prompt_logprobs=output.prompt_logprobs,
outputs=output.outputs,
finished=output.finished,
metrics=output.metrics,
)
if request_type == RequestType.CHAT:
# For chat requests, yield the request_output directly.
yield request_output
elif request_type == RequestType.COMPLETION:
# Completion requests can have multiple prompts and stream generator requires the prompt index
yield (prompt_idx, request_output)
else:
raise NotImplementedError(
f"Request type {request_type} not implemented"
)
@dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
async def generate_chat(self, raw_request):
async for response in self._generate(raw_request, RequestType.CHAT):
yield response
@dynamo_endpoint(CompletionRequest, CompletionStreamResponse)
async def generate_completions(self, raw_request):
async for response in self._generate(raw_request, RequestType.COMPLETION):
yield response
@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
"""
Set up clients to the router and workers.
Serve the dynamo.process.chat/completions endpoint.
"""
workers_client = (
await runtime.namespace("dynamo")
.component("vllm")
.endpoint("generate")
.client()
)
router_client = (
await runtime.namespace("dynamo")
.component("router")
.endpoint("generate")
.client()
)
preprocess_component = runtime.namespace("dynamo").component("process")
await preprocess_component.create_service()
chat_endpoint = preprocess_component.endpoint("chat/completions")
completions_endpoint = preprocess_component.endpoint("completions")
processor = Processor(engine_args, router_client, workers_client)
await asyncio.gather(
chat_endpoint.serve_endpoint(processor.generate_chat),
completions_endpoint.serve_endpoint(processor.generate_completions),
)
if __name__ == "__main__":
uvloop.install()
engine_args = parse_vllm_args()
asyncio.run(worker(engine_args))
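The processor's dispatch rule is deliberately simple: an empty worker id from the router means there was no informative KV signal, so the request falls back to random placement; otherwise direct() pins it to the chosen worker. A condensed sketch of that decision, using the same client calls as the file above:

async def dispatch(workers_client, payload: str, worker_id: str):
    # Empty id: the router had no useful overlap/metric signal.
    if worker_id == "":
        return await workers_client.random(payload)
    # Non-empty id: pin the request to the selected worker.
    return await workers_client.direct(payload, int(worker_id))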
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import random
from argparse import Namespace
from typing import AsyncIterator
import uvloop
from common.protocol import Tokens
from vllm.logger import logger as vllm_logger
from dynamo.llm import AggregatedMetrics, KvIndexer, KvMetricsAggregator, OverlapScores
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker
WorkerId = str
class CustomRouter:
"""
    Routes each request to the best-scoring worker using KV cache overlap and load metrics
"""
def __init__(
self,
workers_client,
indexer: KvIndexer,
metrics_aggregator: KvMetricsAggregator,
):
vllm_logger.info("Initializing Custom Router")
self.indexer = indexer
self.metrics_aggregator = metrics_aggregator
self.workers_client = workers_client
def _cost_function(
self,
scores: OverlapScores | None,
metrics: AggregatedMetrics | None,
token_length: int,
):
worker_scores = {}
if scores:
for worker_id, score in scores.scores.items():
                # score is the number of matching KV blocks; multiply by block_size
                # to get matched tokens and normalize by token_length. A larger
                # cache hit yields a higher score.
worker_scores[worker_id] = (
score * self.indexer.block_size() / token_length
)
worker_metrics = {}
# pull metrics for each worker
max_waiting = 0.0
if metrics:
for endpoint in metrics.endpoints:
worker_id = endpoint.worker_id
worker_metrics[worker_id] = {
"gpu_cache_usage_perc": endpoint.gpu_cache_usage_perc
if hasattr(endpoint, "gpu_cache_usage_perc")
else 0.0,
"num_requests_waiting": endpoint.num_requests_waiting
if hasattr(endpoint, "num_requests_waiting")
else 0.0,
"gpu_prefix_cache_hit_rate": endpoint.gpu_prefix_cache_hit_rate
if hasattr(endpoint, "gpu_prefix_cache_hit_rate")
else 0.0,
}
max_waiting = max(
max_waiting, worker_metrics[worker_id]["num_requests_waiting"]
)
# Get all worker IDs from the client. This is needed because scores / metrics may not have values for all workers
# and we want all workers to be considered in the logit calculation
worker_ids = self.workers_client.endpoint_ids()
worker_logits = {}
for worker_id in worker_ids:
# Use default values if worker not in scores or metrics
score = worker_scores.get(worker_id, 0.0)
metrics_dict = worker_metrics.get(
worker_id,
{
"gpu_cache_usage_perc": 0.0,
"num_requests_waiting": 0.0,
"gpu_prefix_cache_hit_rate": 0.0,
},
)
normalized_waiting = (
metrics_dict["num_requests_waiting"] / max_waiting
if max_waiting > 0
else 0.0
)
# Have 1 metric that weights towards cache hit
# 2 metrics that penalize overloaded worker and queuing
worker_logits[worker_id] = (
2 * score - metrics_dict["gpu_cache_usage_perc"] - normalized_waiting
)
vllm_logger.info(
f"Formula for {worker_id}: {worker_logits[worker_id]:.3f} = 2.0 * {score:.3f} - {metrics_dict['gpu_cache_usage_perc']:.3f} - {normalized_waiting:.3f}"
)
        if not worker_logits or all(logit == 0 for logit in worker_logits.values()):
            return ""

        # Select the worker with the highest logit, breaking ties randomly
        max_logit = max(worker_logits.values())
        best_workers = [
            wid for wid, logit in worker_logits.items() if logit == max_logit
        ]
        best_worker_id = random.choice(best_workers)
# Log the metrics for the selected worker
if best_worker_id:
vllm_logger.info(
f"Selected worker: {best_worker_id}, logit: {worker_logits[best_worker_id]:.3f}"
)
vllm_logger.info(
f"Score: {scores.scores.get(best_worker_id, 0.0) if scores else 0.0:.3f}"
)
metrics_dict = worker_metrics.get(best_worker_id, {})
vllm_logger.info(
f"GPU Cache Hit Rate: {metrics_dict.get('gpu_prefix_cache_hit_rate', 0.0):.3f}"
)
vllm_logger.info(
f"GPU Cache Usage: {metrics_dict.get('gpu_cache_usage_perc', 0.0):.3f}"
)
vllm_logger.info(
f"Requests Waiting: {metrics_dict.get('num_requests_waiting', 0.0) / max_waiting if max_waiting > 0 else 0.0:.3f}"
)
return best_worker_id
@dynamo_endpoint(Tokens, WorkerId)
async def generate(self, request) -> AsyncIterator[WorkerId]:
lora_id = 0
worker_id = ""
try:
scores = await self.indexer.find_matches_for_request(
request.tokens, lora_id
)
except Exception as e:
scores = {}
vllm_logger.exception(f"Error finding matches: {e}")
token_length = len(request.tokens)
metrics = await self.metrics_aggregator.get_metrics()
worker_id = self._cost_function(scores, metrics, token_length)
vllm_logger.info(f"Scheduling to worker_id: {worker_id}")
vllm_logger.info("########")
yield str(worker_id)
@dynamo_worker()
async def worker(runtime: DistributedRuntime, args: Namespace):
"""
Set up the worker clients.
Serve the dynamo.router.generate endpoint.
"""
workers_client = (
await runtime.namespace("dynamo")
.component("vllm")
.endpoint("generate")
.client()
)
while len(workers_client.endpoint_ids()) < args.min_workers:
vllm_logger.info(
f"Waiting for more workers... Current: {len(workers_client.endpoint_ids())}, Required: {args.min_workers}"
)
await asyncio.sleep(5)
vllm_logger.info(
f"Required number of workers ({args.min_workers}) are ready:\n"
+ "\n".join(f"id: {id}" for id in workers_client.endpoint_ids())
)
kv_listener = runtime.namespace("dynamo").component("vllm")
await kv_listener.create_service()
router_component = runtime.namespace("dynamo").component("router")
await router_component.create_service()
endpoint = router_component.endpoint("generate")
indexer = KvIndexer(kv_listener, args.block_size)
metrics_aggregator = KvMetricsAggregator(kv_listener)
await endpoint.serve_endpoint(
CustomRouter(workers_client, indexer, metrics_aggregator).generate
)
if __name__ == "__main__":
uvloop.install()
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
"--min-workers",
type=int,
default=1,
help="Minimum number of workers required before proceeding",
)
# TODO: Read block size
parser.add_argument(
"--block-size",
type=int,
default=64,
help="Block size for the KV Indexer",
)
args = parser.parse_args()
asyncio.run(worker(args))
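The router's scoring collapses to one logit per worker: twice the normalized prefix-cache overlap, minus GPU cache usage, minus queue pressure normalized by the busiest worker. A self-contained sketch of that arithmetic on made-up metrics:

import random

block_size = 64
token_length = 256

# Made-up per-worker data: matched KV blocks and load metrics.
matched_blocks = {"w0": 3, "w1": 1}
metrics = {
    "w0": {"gpu_cache_usage_perc": 0.80, "num_requests_waiting": 4.0},
    "w1": {"gpu_cache_usage_perc": 0.20, "num_requests_waiting": 1.0},
}

max_waiting = max(m["num_requests_waiting"] for m in metrics.values())
worker_logits = {}
for wid, m in metrics.items():
    score = matched_blocks.get(wid, 0) * block_size / token_length
    normalized_waiting = (
        m["num_requests_waiting"] / max_waiting if max_waiting > 0 else 0.0
    )
    worker_logits[wid] = 2 * score - m["gpu_cache_usage_perc"] - normalized_waiting

max_logit = max(worker_logits.values())
best = random.choice([w for w, l in worker_logits.items() if l == max_logit])
print(worker_logits, "->", best)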