refactor: replace vllm with vllm_v1 container (#1953)

Co-authored-by: alec-flowers <aflowers@nvidia.com>

refactor: replace vllm with vllm_v1 container (#1953)
Co-authored-by: alec-flowers <aflowers@nvidia.com>
6d2be143 · Biswa Panda · GitHub · 4d2a31ab · 6d2be143 · 4d2a31ab
Commit 6d2be143 (unverified) · authored Jul 17, 2025 by Biswa Panda · committed by GitHub on Jul 17, 2025
12 changed files
--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -69,7 +69,8 @@ RUN apt-get update -y && \
    tmux \
    vim \
    autoconf \
-    libtool
+    libtool \
+    net-tools

 # These headers are missing with the hpcx installer, required
 # by UCX to find RDMA devices
@@ -120,12 +121,21 @@ WORKDIR /workspace
 # Copy nixl source, and use commit hash as cache hint
 COPY --from=nixl_base /opt/nixl /opt/nixl
 COPY --from=nixl_base /opt/nixl/commit.txt /opt/nixl/commit.txt
-RUN cd /opt/nixl && \
-    mkdir build && \
-    meson setup build/ --buildtype=release --prefix=/usr/local/nixl && \
-    cd build/ && \
-    ninja && \
-    ninja install
+RUN if [ "$ARCH" = "arm64" ]; then \
+        cd /opt/nixl && \
+        mkdir build && \
+        meson setup build/ --buildtype=release --prefix=/usr/local/nixl -Dgds_path=/usr/local/cuda/targets/sbsa-linux && \
+        cd build/ && \
+        ninja && \
+        ninja install; \
+    else \
+        cd /opt/nixl && \
+        mkdir build && \
+        meson setup build/ --buildtype=release --prefix=/usr/local/nixl && \
+        cd build/ && \
+        ninja && \
+        ninja install; \
+    fi

 ### NATS & ETCD SETUP ###
 # nats
@@ -152,65 +162,37 @@ ENV VIRTUAL_ENV=/opt/dynamo/venv
 ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

 # Install NIXL Python module
-RUN cd /opt/nixl && uv build . --out-dir /workspace/wheels/nixl
+# TODO: Move gds_path selection based on arch into NIXL build
+RUN if [ "$ARCH" = "arm64" ]; then \
+        cd /opt/nixl && uv build . --out-dir /workspace/wheels/nixl \
+        --config-settings=setup-args="-Dgds_path=/usr/local/cuda/targets/sbsa-linux"; \
+    else \
+        cd /opt/nixl && uv build . --out-dir /workspace/wheels/nixl; \
+    fi

 # Install the wheel
 # TODO: Move NIXL wheel install to the wheel_builder stage
 RUN uv pip install /workspace/wheels/nixl/*.whl

-# Install patched vllm - keep this early in Dockerfile to avoid
+# Install vllm - keep this early in Dockerfile to avoid
 # rebuilds from unrelated source code changes
-ARG VLLM_REF="0.8.4"
-ARG VLLM_PATCH="vllm_v${VLLM_REF}-dynamo-kv-disagg-patch.patch"
-ARG VLLM_PATCHED_PACKAGE_NAME="ai_dynamo_vllm"
-ARG VLLM_PATCHED_PACKAGE_VERSION="0.8.4.post4"
-ARG VLLM_MAX_JOBS=4
+ARG VLLM_REF="059d4cd"
+ENV CUDA_HOME=/usr/local/cuda
 RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
    --mount=type=cache,target=/root/.cache/uv \
-    mkdir /tmp/vllm && \
-    uv pip install pip wheel && \
-    # NOTE: vLLM build from source on ARM can take several hours, see VLLM_MAX_JOBS details.
-    if [ "$ARCH" = "arm64" ]; then \
-        # PyTorch 2.7 supports CUDA 12.8 and aarch64 installs
-        # NIXL has a torch dependency, so need to force-reinstall to install the correct version
-        uv pip install torch==2.7.0 torchvision torchaudio --force-reinstall --index-url https://download.pytorch.org/whl/cu128 && \
-        # Download vLLM source with version matching patch
-        git clone --branch v${VLLM_REF} --depth 1 https://github.com/vllm-project/vllm.git /tmp/vllm/vllm-${VLLM_REF} && \
-        cd /tmp/vllm/vllm-${VLLM_REF}/ && \
-        # Patch vLLM source with dynamo additions
-        patch -p1 < /tmp/deps/vllm/${VLLM_PATCH} && \
-        # WAR: Set package version check to 'vllm' instead of 'ai_dynamo_vllm' to avoid
-        # platform detection issues on ARM install.
-        # TODO: Rename package from vllm to ai_dynamo_vllm like x86 path below to remove this WAR.
-        sed -i 's/version("ai_dynamo_vllm")/version("vllm")/g' vllm/platforms/__init__.py && \
-        # Remove pytorch from vllm install dependencies
-        python use_existing_torch.py && \
-        # Build/install vllm from source
-        uv pip install -r requirements/build.txt && \
-        # MAX_JOBS set to avoid running OOM on vllm-flash-attn build, this can
-        # significantly impact the overall build time. Each job can take up
-        # to -16GB RAM each, so tune according to available system memory.
-        MAX_JOBS=${VLLM_MAX_JOBS} uv pip install -vv . --no-build-isolation ; \
-    # Handle x86_64: Download wheel, unpack, setup for later steps
-    else \
-        python -m pip download --only-binary=:all: --no-deps --dest /tmp/vllm vllm==v${VLLM_REF} && \
-        # Patch vLLM pre-built download with dynamo additions
-        cd /tmp/vllm && \
-        wheel unpack *.whl && \
-        cd vllm-${VLLM_REF}/ && \
-        patch -p1 < /tmp/deps/vllm/${VLLM_PATCH} && \
-        # Rename the package from vllm to ai_dynamo_vllm
-        mv vllm-${VLLM_REF}.dist-info ${VLLM_PATCHED_PACKAGE_NAME}-${VLLM_PATCHED_PACKAGE_VERSION}.dist-info && \
-        sed -i "s/^Name: vllm/Name: ${VLLM_PATCHED_PACKAGE_NAME}/g" ${VLLM_PATCHED_PACKAGE_NAME}-${VLLM_PATCHED_PACKAGE_VERSION}.dist-info/METADATA && \
-        sed -i "s/^Version: ${VLLM_REF}/Version: ${VLLM_PATCHED_PACKAGE_VERSION}/g" ${VLLM_PATCHED_PACKAGE_NAME}-${VLLM_PATCHED_PACKAGE_VERSION}.dist-info/METADATA && \
-        # Update wheel tag from linux_${ARCH_ALT} to manylinux1_${ARCH_ALT} in WHEEL file
-        sed -i "s/Tag: cp38-abi3-linux_${ARCH_ALT}/Tag: cp38-abi3-manylinux1_${ARCH_ALT}/g" ${VLLM_PATCHED_PACKAGE_NAME}-${VLLM_PATCHED_PACKAGE_VERSION}.dist-info/WHEEL && \
-        # Also update the tag in RECORD file to match
-        sed -i "s/-cp38-abi3-linux_${ARCH_ALT}.whl/-cp38-abi3-manylinux1_${ARCH_ALT}.whl/g" ${VLLM_PATCHED_PACKAGE_NAME}-${VLLM_PATCHED_PACKAGE_VERSION}.dist-info/RECORD && \
-        mkdir -p /workspace/dist && \
-        wheel pack . --dest-dir /workspace/dist && \
-        uv pip install /workspace/dist/${VLLM_PATCHED_PACKAGE_NAME}-*.whl ; \
-    fi
+    uv pip install pip cuda-python && \
+    mkdir /opt/vllm && \
+    cd /opt/vllm && \
+    git clone https://github.com/vllm-project/vllm.git && \
+    cd vllm && \
+    git checkout $VLLM_REF && \
+    VLLM_USE_PRECOMPILED=1 uv pip install -e . && \
+    cd tools/ep_kernels && \
+    bash install_python_libraries.sh && \
+    cd ep_kernels_workspace && \
+    git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git && \
+    cd DeepGEMM && \
+    python setup.py install

 # Common dependencies
 RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
@@ -324,8 +306,6 @@ RUN SNIPPET="export PROMPT_COMMAND='history -a' && export HISTFILE=$HOME/.comman

 RUN mkdir -p /home/$USERNAME/.cache/

-ENV VLLM_KV_CAPI_PATH=$HOME/dynamo/.build/target/debug/libdynamo_llm_capi.so
-
 ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]

 ##################################
@@ -443,12 +423,9 @@ RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/la
    sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
    echo "cat ~/.launch_screen" >> ~/.bashrc

-# Tell vllm to use the Dynamo LLM C API for KV Cache Routing
-ENV VLLM_KV_CAPI_PATH=/opt/dynamo/bindings/lib/libdynamo_llm_capi.so
-
+# NIXL lives under /opt/nvidia/nvda_nixl in this stage; its lib directory is named
+# after the target triplet, so derive it from ARCH_ALT instead of hard-coding x86_64
+# (keeps arm64/aarch64 builds resolving the NIXL shared libraries).
 ARG ARCH_ALT
-ENV NIXL_PLUGIN_DIR=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu/plugins
-ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu:/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu/plugins:/usr/local/ucx/lib:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/nvda_nixl/lib/${ARCH_ALT}-linux-gnu/

 ########################################
 ########## Development Image ###########
@@ -519,16 +494,13 @@ COPY --from=base /workspace/wheels/nixl/*.whl wheelhouse/
 COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/
 RUN uv pip install ai-dynamo[vllm] --find-links wheelhouse && \
    uv pip install nixl --find-links wheelhouse && \
-    ln -sf $VIRTUAL_ENV/bin/* /usr/local/bin/
-
-# Tell vllm to use the Dynamo LLM C API for KV Cache Routing
-ENV VLLM_KV_CAPI_PATH="/opt/dynamo/bindings/lib/libdynamo_llm_capi.so"
+    ln -sf $VIRTUAL_ENV/bin/* /usr/local/bin/ && \
+    rm -r wheelhouse

 # Copy launch banner
 RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
    sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
    echo "cat ~/.launch_screen" >> ~/.bashrc

-
 ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
 CMD []
--- a/container/Dockerfile.vllm_v1
+++ b/container/Dockerfile.vllm_v1
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
-# FIXME: NCCL will hang with 25.03, so use 25.01 for now
-# Please check https://github.com/ai-dynamo/dynamo/pull/1065
-# for details and reproducer to manually test if the image
-# can be updated to later versions.
-ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
-ARG RELEASE_BUILD
-ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
-ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
-
-# Define general architecture ARGs for supporting both x86 and aarch64 builds.
-#   ARCH: Used for package suffixes (e.g., amd64, arm64)
-#   ARCH_ALT: Used for Rust targets, manylinux suffix (e.g., x86_64, aarch64)
-#
-# Default values are for x86/amd64:
-#   --build-arg ARCH=amd64 --build-arg ARCH_ALT=x86_64
-#
-# For arm64/aarch64, build with:
-#   --build-arg ARCH=arm64 --build-arg ARCH_ALT=aarch64
-#
-# NOTE: There isn't an easy way to define one of these values based on the other value
-# without adding if statements everywhere, so just define both as ARGs for now.
-ARG ARCH=amd64
-ARG ARCH_ALT=x86_64
-
-FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS nixl_base
-
-# Redeclare ARCH and ARCH_ALT so they're available in this stage
-ARG ARCH
-ARG ARCH_ALT
-
-WORKDIR /opt/nixl
-# Add a cache hint that only changes when the nixl commit changes
-ARG NIXL_COMMIT
-# This line acts as a cache key - it only changes when NIXL_COMMIT changes
-RUN echo "NIXL commit: ${NIXL_COMMIT}" > /opt/nixl/commit.txt
-# Copy the nixl source
-COPY --from=nixl . .
-
-##################################
-########## Base Image ############
-##################################
-
-FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base
-
-# Redeclare ARCH and ARCH_ALT so they're available in this stage
-ARG ARCH
-ARG ARCH_ALT
-
-USER root
-ARG PYTHON_VERSION=3.12
-
-RUN apt-get update -y && \
-    apt-get install -y \
-    # NIXL build dependencies
-    cmake \
-    meson \
-    ninja-build \
-    pybind11-dev \
-    # Rust build dependencies
-	clang \
-    libclang-dev \
-	git \
-    # Install utilities
-    nvtop \
-    tmux \
-    vim \
-    autoconf \
-    libtool \
-    net-tools
-
-# These headers are missing with the hpcx installer, required
-# by UCX to find RDMA devices
-RUN apt-get update -y && \
-    apt-get install -y --no-install-recommends \
-    --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev \
-    libnuma-dev librdmacm-dev ibverbs-providers
-
-ARG NIXL_UCX_REF=v1.19.x
-
-WORKDIR /workspace
-
-### UCX EFA Setup ###
-RUN rm -rf /opt/hpcx/ucx
-RUN rm -rf /usr/local/ucx
-RUN echo "Building UCX with reference $NIXL_UCX_REF"
-RUN cd /usr/local/src &&                            \
-    git clone https://github.com/openucx/ucx.git && \
-    cd ucx &&                                       \
-    git checkout $NIXL_UCX_REF &&                   \
-    ./autogen.sh && ./configure                     \
-    --prefix=/usr/local/ucx                         \
-    --enable-shared                                 \
-    --disable-static                                \
-    --disable-doxygen-doc                           \
-    --enable-optimizations                          \
-    --enable-cma                                    \
-    --enable-devel-headers                          \
-    --with-cuda=/usr/local/cuda                     \
-    --with-verbs                                    \
-    --with-efa                                      \
-    --with-dm                                       \
-    --with-gdrcopy=/usr/local                       \
-    --enable-mt &&                                  \
-    make -j &&                                      \
-    make -j install-strip &&                        \
-    ldconfig
-
-ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH
-ENV CPATH=/usr/include
-ENV PATH=/usr/bin:$PATH
-ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig
-SHELL ["/bin/bash", "-c"]
-
-WORKDIR /workspace
-
-### NIXL SETUP ###
-# Copy nixl source, and use commit hash as cache hint
-COPY --from=nixl_base /opt/nixl /opt/nixl
-COPY --from=nixl_base /opt/nixl/commit.txt /opt/nixl/commit.txt
-RUN if [ "$ARCH" = "arm64" ]; then \
-        cd /opt/nixl && \
-        mkdir build && \
-        meson setup build/ --buildtype=release --prefix=/usr/local/nixl -Dgds_path=/usr/local/cuda/targets/sbsa-linux && \
-        cd build/ && \
-        ninja && \
-        ninja install; \
-    else \
-        cd /opt/nixl && \
-        mkdir build && \
-        meson setup build/ --buildtype=release --prefix=/usr/local/nixl && \
-        cd build/ && \
-        ninja && \
-        ninja install; \
-    fi
-
-### NATS & ETCD SETUP ###
-# nats
-RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \
-    dpkg -i nats-server-v2.10.28-${ARCH}.deb && rm nats-server-v2.10.28-${ARCH}.deb
-# etcd
-ENV ETCD_VERSION="v3.5.21"
-RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \
-    mkdir -p /usr/local/bin/etcd && \
-    tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \
-    rm /tmp/etcd.tar.gz
-ENV PATH=/usr/local/bin/etcd/:$PATH
-
-
-### VIRTUAL ENVIRONMENT SETUP ###
-
-# Install uv and create virtualenv
-COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
-RUN mkdir /opt/dynamo && \
-    uv venv /opt/dynamo/venv --python 3.12
-
-# Activate virtual environment
-ENV VIRTUAL_ENV=/opt/dynamo/venv
-ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
-
-# Install NIXL Python module
-# TODO: Move gds_path selection based on arch into NIXL build
-RUN if [ "$ARCH" = "arm64" ]; then \
-        cd /opt/nixl && uv build . --out-dir /workspace/wheels/nixl \
-        --config-settings=setup-args="-Dgds_path=/usr/local/cuda/targets/sbsa-linux"; \
-    else \
-        cd /opt/nixl && uv build . --out-dir /workspace/wheels/nixl; \
-    fi
-
-# Install the wheel
-# TODO: Move NIXL wheel install to the wheel_builder stage
-RUN uv pip install /workspace/wheels/nixl/*.whl
-
-# Install vllm - keep this early in Dockerfile to avoid
-# rebuilds from unrelated source code changes
-ARG VLLM_REF="059d4cd"
-ENV CUDA_HOME=/usr/local/cuda
-RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install pip cuda-python && \
-    mkdir /opt/vllm && \
-    cd /opt/vllm && \
-    git clone https://github.com/vllm-project/vllm.git && \
-    cd vllm && \
-    git checkout $VLLM_REF && \
-    VLLM_USE_PRECOMPILED=1 uv pip install -e . && \
-    cd tools/ep_kernels && \
-    bash install_python_libraries.sh && \
-    cd ep_kernels_workspace && \
-    git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git && \
-    cd DeepGEMM && \
-    python setup.py install
-
-# Common dependencies
-RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
-    uv pip install --requirement /tmp/requirements.txt
-
-# Install test dependencies
-RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
-    uv pip install --requirement /tmp/requirements.txt
-
-# ### MISC UTILITY SETUP ###
-
-# Finish pyright install
-RUN pyright --help > /dev/null 2>&1
-
-# Enable Git operations in the /workspace directory
-RUN printf "[safe]\n      directory=/workspace\n" > /root/.gitconfig
-
-# Install prometheus
-ARG PROM_VERSION=3.4.1
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    curl tar ca-certificates && \
-    rm -rf /var/lib/apt/lists/*
-RUN ARCH=$(dpkg --print-architecture) && \
-    case "$ARCH" in \
-        amd64) PLATFORM=linux-amd64 ;; \
-        arm64) PLATFORM=linux-arm64 ;; \
-        *) echo "Unsupported architecture: $ARCH" && exit 1 ;; \
-    esac && \
-    curl -fsSL https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.${PLATFORM}.tar.gz \
-    | tar -xz -C /tmp && \
-    mv /tmp/prometheus-${PROM_VERSION}.${PLATFORM}/prometheus /usr/local/bin/ && \
-    chmod +x /usr/local/bin/prometheus && \
-    rm -rf /tmp/prometheus-${PROM_VERSION}.${PLATFORM}
-
-### BUILDS ###
-
-# Rust build/dev dependencies
-RUN apt update -y && \
-    apt install --no-install-recommends -y \
-    build-essential \
-    protobuf-compiler \
-    cmake \
-    libssl-dev \
-    pkg-config
-
-ENV RUSTUP_HOME=/usr/local/rustup \
-    CARGO_HOME=/usr/local/cargo \
-    PATH=/usr/local/cargo/bin:$PATH \
-    RUST_VERSION=1.87.0
-
-# Define Rust target based on ARCH_ALT ARG
-ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu
-
-# Install Rust using RUSTARCH derived from ARCH_ALT
-RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \
-    # TODO: Add SHA check back based on RUSTARCH
-    chmod +x rustup-init && \
-    ./rustup-init -y --no-modify-path --profile default --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \
-    rm rustup-init && \
-    chmod -R a+w $RUSTUP_HOME $CARGO_HOME
-
-ARG CARGO_BUILD_JOBS
-# Set CARGO_BUILD_JOBS to 16 if not provided
-# This is to prevent cargo from building $(nproc) jobs in parallel,
-# which might exceed the number of opened files limit.
-ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16}
-
-#######################################
-########## Local Development ##########
-#######################################
-
-FROM base AS local-dev
-
-# https://code.visualstudio.com/remote/advancedcontainers/add-nonroot-user
-# Will use the default ubuntu user, but give sudo access
-# Needed so files permissions aren't set to root ownership when writing from inside container
-
-# Don't want ubuntu to be editable, just change uid and gid. User ubuntu is hardcoded in .devcontainer
-ENV USERNAME=ubuntu
-ARG USER_UID=1000
-ARG USER_GID=1000
-
-RUN apt-get update && apt-get install -y sudo gnupg2 gnupg1 \
-    && echo "$USERNAME ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/$USERNAME \
-    && chmod 0440 /etc/sudoers.d/$USERNAME \
-    && mkdir -p /home/$USERNAME \
-    && chown -R $USERNAME:$USERNAME /home/$USERNAME \
-    && rm -rf /var/lib/apt/lists/* \
-    && chsh -s /bin/bash $USERNAME
-
-# This is a slow operation (~40s on my cpu)
-# Much better than chown -R $USERNAME:$USERNAME /opt/dynamo/venv (~10min on my cpu)
-COPY --from=base --chown=$USER_UID:$USER_GID /opt/dynamo/venv/ /opt/dynamo/venv/
-RUN chown $USERNAME:$USERNAME /opt/dynamo/venv
-COPY --from=base --chown=$USERNAME:$USERNAME /usr/local/bin /usr/local/bin
-
-# so we can use maturin develop
-RUN uv pip install maturin[patchelf]
-
-USER $USERNAME
-ENV HOME=/home/$USERNAME
-ENV PYTHONPATH=$HOME/dynamo/deploy/sdk/src:$PYTHONPATH:$HOME/dynamo/components/planner/src:$PYTHONPATH
-ENV CARGO_TARGET_DIR=$HOME/dynamo/.build/target
-WORKDIR $HOME
-
-# https://code.visualstudio.com/remote/advancedcontainers/persist-bash-history
-RUN SNIPPET="export PROMPT_COMMAND='history -a' && export HISTFILE=$HOME/.commandhistory/.bash_history" \
-    && mkdir -p $HOME/.commandhistory \
-    && touch $HOME/.commandhistory/.bash_history \
-    && echo "$SNIPPET" >> "$HOME/.bashrc"
-
-RUN mkdir -p /home/$USERNAME/.cache/
-
-ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
-
-##################################
-##### Wheel Build Image ##########
-##################################
-
-# Redeclare ARCH_ALT ARG so it's available for interpolation in the FROM instruction
-ARG ARCH_ALT
-
-FROM quay.io/pypa/manylinux_2_28_${ARCH_ALT} AS wheel_builder
-
-ARG CARGO_BUILD_JOBS
-# Set CARGO_BUILD_JOBS to 16 if not provided
-# This is to prevent cargo from building $(nproc) jobs in parallel,
-# which might exceed the number of opened files limit.
-ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16}
-# Use build arg RELEASE_BUILD = true to generate wheels for Python 3.10, 3.11 and 3.12.
-ARG RELEASE_BUILD
-
-WORKDIR /workspace
-
-RUN yum update -y \
-    && yum install -y llvm-toolset \
-    && yum install -y python3.12-devel \
-    && yum install -y protobuf-compiler \
-    && yum clean all \
-    && rm -rf /var/cache/yum
-
-ENV RUSTUP_HOME=/usr/local/rustup \
-    CARGO_HOME=/usr/local/cargo \
-    CARGO_TARGET_DIR=/workspace/target \
-    VIRTUAL_ENV=/opt/dynamo/venv
-
-COPY --from=base $RUSTUP_HOME $RUSTUP_HOME
-COPY --from=base $CARGO_HOME $CARGO_HOME
-COPY --from=base /usr/local/nixl /opt/nvidia/nvda_nixl
-COPY --from=base /workspace /workspace
-COPY --from=base $VIRTUAL_ENV $VIRTUAL_ENV
-ENV PATH=$CARGO_HOME/bin:$VIRTUAL_ENV/bin:$PATH
-
-# Copy configuration files
-COPY pyproject.toml /workspace/
-COPY README.md /workspace/
-COPY LICENSE /workspace/
-COPY Cargo.toml /workspace/
-COPY Cargo.lock /workspace/
-COPY rust-toolchain.toml /workspace/
-COPY hatch_build.py /workspace/
-
-# Copy source code
-COPY lib/ /workspace/lib/
-COPY components /workspace/components
-COPY launch /workspace/launch
-COPY deploy/sdk /workspace/deploy/sdk
-
-RUN cargo build \
-	--release \
-	--locked \
-	--features dynamo-llm/block-manager \
-	--workspace
-
-# Build dynamo wheel
-RUN uv build --wheel --out-dir /workspace/dist && \
-    cd /workspace/lib/bindings/python && \
-    uv pip install maturin[patchelf] && \
-    maturin build --release --features block-manager --out /workspace/dist && \
-    if [ "$RELEASE_BUILD" = "true" ]; then \
-        # do not enable KVBM feature, ensure compatibility with lower glibc
-        uv run --python 3.11 maturin build --release --out /workspace/dist && \
-        uv run --python 3.10 maturin build --release --out /workspace/dist; \
-    fi
-
-#######################################
-########## CI Minimum Image ###########
-#######################################
-FROM base AS ci_minimum
-
-ENV DYNAMO_HOME=/workspace
-ENV CARGO_TARGET_DIR=/workspace/target
-
-WORKDIR /workspace
-
-COPY --from=wheel_builder /workspace /workspace
-COPY --from=wheel_builder /opt/nvidia/nvda_nixl /opt/nvidia/nvda_nixl
-# Copy Cargo cache to avoid re-downloading dependencies
-COPY --from=wheel_builder $CARGO_HOME $CARGO_HOME
-
-# Copy rest of the code
-COPY . /workspace
-
-# Build C bindings, creates lib/bindings/c/include
-#
-# TODO: In theory the 'cargo build' in earlier stage covers this, we "just" need to copy the
-# `lib/bindings/c/include` folder that build.rs generated across.
-# I couldn't get that to work, hence TODO.
-RUN cd /workspace/lib/bindings/c && cargo build --release --locked
-
-# Package the bindings
-RUN mkdir -p /opt/dynamo/bindings/wheels && \
-    mkdir /opt/dynamo/bindings/lib && \
-    cp dist/ai_dynamo*cp312*.whl /opt/dynamo/bindings/wheels/. && \
-    cp target/release/libdynamo_llm_capi.so /opt/dynamo/bindings/lib/. && \
-    cp -r lib/bindings/c/include /opt/dynamo/bindings/.  && \
-    cp target/release/dynamo-run /usr/local/bin && \
-    cp target/release/metrics /usr/local/bin && \
-    cp target/release/mock_worker /usr/local/bin
-
-RUN uv pip install /workspace/dist/ai_dynamo_runtime*cp312*.whl && \
-    uv pip install /workspace/dist/ai_dynamo*any.whl
-
-RUN uv pip install /workspace/benchmarks
-
-# Copy launch banner
-RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
-    sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
-    echo "cat ~/.launch_screen" >> ~/.bashrc
-
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/nvda_nixl/lib/x86_64-linux-gnu/
-
-########################################
-########## Development Image ###########
-########################################
-FROM ci_minimum AS dev
-
-ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
-
-CMD []
-
-####################################
-########## Runtime Image ###########
-####################################
-
-FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
-
-WORKDIR /workspace
-ENV DYNAMO_HOME=/workspace
-ENV VIRTUAL_ENV=/opt/dynamo/venv
-ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
-
-# Install build-essential and python3-dev as apt dependencies
-RUN apt-get update && \
-    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-        build-essential \
-        python3-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-### COPY BINDINGS ###
-# Copy all bindings (wheels, lib, include) from ci_minimum
-COPY --from=ci_minimum /opt/dynamo/bindings /opt/dynamo/bindings
-### COPY NATS & ETCD ###
-# Copy nats and etcd from base image
-COPY --from=base /usr/bin/nats-server /usr/bin/nats-server
-COPY --from=base /usr/local/bin/etcd/ /usr/local/bin/etcd/
-
-# Copy UCX from base image as plugin for NIXL
-# Copy NIXL source from base image (required for NIXL plugins)
-COPY --from=base /usr/local/ucx /usr/local/ucx
-COPY --from=base /usr/local/nixl /usr/local/nixl
-ARG ARCH_ALT
-ENV NIXL_PLUGIN_DIR=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu/plugins
-ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu:/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu/plugins:/usr/local/ucx/lib:$LD_LIBRARY_PATH
-
-# Setup the python environment
-COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
-RUN uv venv $VIRTUAL_ENV --python 3.12 && \
-    echo "source $VIRTUAL_ENV/bin/activate" >> ~/.bashrc
-
-# Common dependencies
-RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
-    uv pip install --requirement /tmp/requirements.txt
-
-# Install the wheels and symlink executables to /usr/local/bin so dynamo components can use them
-# Dynamo components currently do not have the VIRTUAL_ENV in their PATH, so we need to symlink the executables
-#Copy NIXL and Dynamo wheels into wheelhouse
-COPY --from=base /workspace/wheels/nixl/*.whl wheelhouse/
-COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/
-RUN uv pip install ai-dynamo --find-links wheelhouse && \
-    uv pip install nixl --find-links wheelhouse && \
-    ln -sf $VIRTUAL_ENV/bin/* /usr/local/bin/ && \
-    rm -r wheelhouse
-
-# Copy launch banner
-RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
-    sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
-    echo "cat ~/.launch_screen" >> ~/.bashrc
-
-# Copy examples
-COPY ./examples examples/
-
-ENTRYPOINT [ "/usr/bin/bash" ]
-CMD []
--- a/container/build.sh
+++ b/container/build.sh
@@ -49,7 +49,7 @@ PYTHON_PACKAGE_VERSION=${current_tag:-$latest_tag.dev+$commit_id}
 # dependencies are specified in the /container/deps folder and
 # installed within framework specific sections of the Dockerfile.

-declare -A FRAMEWORKS=(["VLLM"]=1 ["TENSORRTLLM"]=2 ["NONE"]=3 ["SGLANG"]=4 ["VLLM_V1"]=5)
+declare -A FRAMEWORKS=(["VLLM"]=1 ["TENSORRTLLM"]=2 ["NONE"]=3 ["SGLANG"]=4)
 DEFAULT_FRAMEWORK=VLLM

 SOURCE_DIR=$(dirname "$(readlink -f "$0")")
@@ -111,9 +111,6 @@ NONE_BASE_IMAGE_TAG="24.04"
 SGLANG_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 SGLANG_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"

-VLLM_V1_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
-VLLM_V1_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
-
 NIXL_COMMIT=3c47a48955e6f96bd5d4fb43a9d80bb64722f8e4
 NIXL_REPO=ai-dynamo/nixl.git

@@ -403,8 +400,6 @@ elif [[ $FRAMEWORK == "NONE" ]]; then
    DOCKERFILE=${SOURCE_DIR}/Dockerfile.none
 elif [[ $FRAMEWORK == "SGLANG" ]]; then
    DOCKERFILE=${SOURCE_DIR}/Dockerfile.sglang
-elif [[ $FRAMEWORK == "VLLM_V1" ]]; then
-    DOCKERFILE=${SOURCE_DIR}/Dockerfile.vllm_v1
 fi

 NIXL_DIR="/tmp/nixl/nixl_src"

--- a/container/deps/vllm/README.md
+++ b/container/deps/vllm/README.md
-<!--
-SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: Apache-2.0
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-Apply this patch to Python source code from vLLM release [v0.7.2](https://github.com/vllm-project/vllm/releases/tag/v0.7.2).
\ No newline at end of file
--- a/container/deps/vllm/prepare_patch.sh
+++ b/container/deps/vllm/prepare_patch.sh
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -e
-
-# Function to print usage
-print_usage() {
-    echo "Usage: $0 --original-ref <original_tag_or_branch> --fork-repo <fork_repo_url> --fork-ref <fork_tag_or_branch> --output <patch_output_path>"
-    echo
-    echo "Arguments:"
-    echo "  --original-ref    The tag or branch name from the original vllm-project/vllm repo"
-    echo "  --fork-repo   The URL of the forked repository"
-    echo "  --fork-ref    The tag or branch name from the forked repository"
-    echo "  --output      Path where the generated patch file should be saved"
-    echo
-    echo "Example:"
-    echo "  $0 --original-ref v0.2.0 --fork-repo https://github.com/user/vllm.git --fork-ref feature-branch --output ./my-patch.diff"
-    exit 1
-}
-
-# Parse named arguments
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --original-ref)
-            ORIGINAL_REF="$2"
-            shift 2
-            ;;
-        --fork-repo)
-            FORK_REPO="$2"
-            shift 2
-            ;;
-        --fork-ref)
-            FORK_REF="$2"
-            shift 2
-            ;;
-        --output)
-            PATCH_OUTPUT="$2"
-            shift 2
-            ;;
-        *)
-            print_usage
-            ;;
-    esac
-done
-
-# Check if all required arguments are provided
-if [ -z "$ORIGINAL_REF" ] || [ -z "$FORK_REPO" ] || [ -z "$FORK_REF" ] || [ -z "$PATCH_OUTPUT" ]; then
-    print_usage
-fi
-
-# Convert patch output path to absolute path if it's relative
-if [[ ! "$PATCH_OUTPUT" = /* ]]; then
-    PATCH_OUTPUT="$(pwd)/${PATCH_OUTPUT}"
-fi
-
-TEMP_DIR=$(mktemp -d)
-
-# Clean up temp directory on script exit
-trap 'rm -rf "$TEMP_DIR"' EXIT
-
-# Clone original vLLM to a temp directory
-git clone https://github.com/vllm-project/vllm.git "$TEMP_DIR/original_vllm"
-
-cd "$TEMP_DIR/original_vllm"
-
-git remote add fork "$FORK_REPO"
-git fetch fork "$FORK_REF"
-git diff "$ORIGINAL_REF" fork/"$FORK_REF" > "$PATCH_OUTPUT"
-
-echo "Patch created successfully: $PATCH_OUTPUT"
\ No newline at end of file
--- a/container/deps/vllm/tests/test_patch_install.py
+++ b/container/deps/vllm/tests/test_patch_install.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-try:
-    import vllm
-except ImportError:
-    vllm = None  # type: ignore
-
-pytestmark = pytest.mark.pre_merge
-
-
-# TODO: Consider `pytest.mark.vllm` and running tests based on environment
-@pytest.mark.skipif(vllm is None, reason="Skipping vllm tests, vllm not installed")
-def test_version():
-    # Verify that the image has the patched version of vllm
-    assert vllm.__version__.endswith("0.8.4")  # type: ignore
--- a/container/deps/vllm/vllm_v0.7.2-dynamo-kv-disagg-patch.patch
+++ b/container/deps/vllm/vllm_v0.7.2-dynamo-kv-disagg-patch.patch
--- a/container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch
+++ b/container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch
--- a/container/run.sh
+++ b/container/run.sh
@@ -24,7 +24,7 @@ RUN_PREFIX=
 # dependencies are specified in the /container/deps folder and
 # installed within framework specific sections of the Dockerfile.

-declare -A FRAMEWORKS=(["VLLM"]=1 ["TENSORRTLLM"]=2 ["SGLANG"]=3 ["VLLM_V1"]=4)
+declare -A FRAMEWORKS=(["VLLM"]=1 ["TENSORRTLLM"]=2 ["SGLANG"]=3)
 DEFAULT_FRAMEWORK=VLLM

 SOURCE_DIR=$(dirname "$(readlink -f "$0")")

--- a/deploy/metrics/README.md
+++ b/deploy/metrics/README.md
@@ -25,7 +25,7 @@ graph TD

 The dcgm-exporter service in the Docker Compose network is configured to use port 9401 instead of the default port 9400. This adjustment is made to avoid port conflicts with other dcgm-exporter instances that may be running simultaneously. Such a configuration is typical in distributed systems like SLURM.

-As of Q2 2025, Dynamo HTTP Frontend metrics are exposed when you build containers with `--framework VLLM_V1` or `--framework TENSORRTLLM`.
+As of Q2 2025, Dynamo HTTP Frontend metrics are exposed when you build containers with `--framework VLLM` or `--framework TENSORRTLLM`.

 ## Getting Started


--- a/examples/vllm/README.md
+++ b/examples/vllm/README.md
@@ -36,11 +36,11 @@ docker compose -f deploy/metrics/docker-compose.yml up -d
 ### Build and Run docker

 ```bash
-./container/build.sh --framework VLLM_V1
+./container/build.sh
 ```

 ```bash
-./container/run.sh -it --framework VLLM_V1 [--mount-workspace]
+./container/run.sh -it [--mount-workspace]
 ```

 This includes the specific commit [vllm-project/vllm#19790](https://github.com/vllm-project/vllm/pull/19790) which enables support for external control of the DP ranks.
@@ -129,9 +129,9 @@ For Kubernetes deployment, YAML manifests are provided in the `deploy/` director

 - **Dynamo Cloud**: Follow the [Quickstart Guide](../../docs/guides/dynamo_deploy/quickstart.md) to deploy Dynamo Cloud first.

-- **Container Images**: The deployment files currently require access to `nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime`. If you don't have access, build and push your own image:
+- **Container Images**: The deployment files currently require access to `nvcr.io/nvidian/nim-llm-dev/vllm-runtime`. If you don't have access, build and push your own image:
  ```bash
-  ./container/build.sh --framework VLLM_V1
+  ./container/build.sh --framework VLLM
  # Tag and push to your container registry
  # Update the image references in the YAML files
  ```

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -186,7 +186,7 @@ class VLLMProcess(ManagedProcess):
 vllm_configs = {
    "aggregated": VLLMConfig(
        name="aggregated",
-        directory="/workspace/examples/llm",
+        directory="/workspace/examples/vllm",
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1, pytest.mark.vllm],
        endpoints=["v1/chat/completions", "v1/completions"],
@@ -199,7 +199,7 @@ vllm_configs = {
    ),
    "disaggregated": VLLMConfig(
        name="disaggregated",
-        directory="/workspace/examples/llm",
+        directory="/workspace/examples/vllm",
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.vllm],
        endpoints=["v1/chat/completions", "v1/completions"],