Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -32,7 +32,7 @@ ARG DEADSNAKES_GPGKEY_URL

 # The PyPA get-pip.py script is a self contained script+zip file, that provides
 # both the installer script and the pip base85-encoded zip archive. This allows
-# bootstrapping pip in environment where a dsitribution package does not exist.
+# bootstrapping pip in environment where a distribution package does not exist.
 #
 # By parameterizing the URL for get-pip.py installation script, we allow
 # third-party to use their own copy of the script stored in a private mirror.
@@ -73,15 +73,13 @@ ARG INSTALL_KV_CONNECTORS=false
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
 FROM ${BUILD_BASE_IMAGE} AS base
+
 ARG CUDA_VERSION
 ARG PYTHON_VERSION
-ARG TARGETPLATFORM
-ARG INSTALL_KV_CONNECTORS=false
-ENV DEBIAN_FRONTEND=noninteractive

-ARG GET_PIP_URL
+ENV DEBIAN_FRONTEND=noninteractive

-# Install system dependencies and uv, then create Python virtual environment
+# Install system dependencies including build tools
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
@@ -107,32 +105,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && ln -s /opt/venv/bin/pip /usr/bin/pip \
    && python3 --version && python3 -m pip --version

-ARG PIP_INDEX_URL UV_INDEX_URL
-ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
-ARG PYTORCH_CUDA_INDEX_BASE_URL
-ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
-
 # Activate virtual environment and add uv to PATH
 ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
 ENV VIRTUAL_ENV="/opt/venv"

-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
+# Environment for uv
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy

-RUN <<EOF
-gcc --version
-EOF
+# Verify GCC version
+RUN gcc --version

-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
+# Workaround for triton/pytorch issues
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

+# ============================================================
+# SLOW-CHANGING DEPENDENCIES BELOW
+# These are the expensive layers that we want to cache
+# ============================================================
+
+# Install PyTorch and core CUDA dependencies
+# This is ~2GB and rarely changes
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+
 WORKDIR /workspace

 # install build and runtime dependencies
@@ -142,13 +138,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

-# cuda arch list used by torch
-# can be useful for both `dev` and `test`
-# explicitly set the list to avoid issues with torch 2.2
-# see https://github.com/pytorch/pytorch/pull/123243
+# CUDA arch list used by torch
+# Explicitly set the list to avoid issues with torch 2.2
+# See https://github.com/pytorch/pytorch/pull/123243
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
-#################### BASE BUILD IMAGE ####################
+#################### BASE BUILD IMAGE ####################

 #################### CSRC BUILD IMAGE ####################
 FROM base AS csrc-build
@@ -188,7 +183,7 @@ ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads

 ARG USE_SCCACHE
-ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
+ARG SCCACHE_DOWNLOAD_URL
 ARG SCCACHE_ENDPOINT
 ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
@@ -206,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
+        && case "${TARGETPLATFORM}" in \
+          linux/arm64) SCCACHE_ARCH="aarch64" ;; \
+          linux/amd64) SCCACHE_ARCH="x86_64" ;; \
+          *) echo "Unsupported TARGETPLATFORM for sccache: ${TARGETPLATFORM}" >&2; exit 1 ;; \
+        esac \
+        && export SCCACHE_DOWNLOAD_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
        && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
        && tar -xzf sccache.tar.gz \
-        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
-        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
+        && sudo mv sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
+        && rm -rf sccache.tar.gz sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl \
        && if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
@@ -241,6 +242,50 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
    fi
 #################### CSRC BUILD IMAGE ####################

+#################### EXTENSIONS BUILD IMAGE ####################
+# Build DeepGEMM, pplx-kernels, DeepEP - runs in PARALLEL with csrc-build
+# This stage is independent and doesn't affect csrc cache
+FROM base AS extensions-build
+ARG CUDA_VERSION
+
+# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
+ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_LINK_MODE=copy
+
+WORKDIR /workspace
+
+# Build DeepGEMM wheel
+ARG DEEPGEMM_GIT_REF
+COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
+RUN --mount=type=cache,target=/root/.cache/uv \
+    mkdir -p /tmp/deepgemm/dist && \
+    VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh \
+        --cuda-version "${CUDA_VERSION}" \
+        ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} \
+        --wheel-dir /tmp/deepgemm/dist || \
+    echo "DeepGEMM build skipped (CUDA version requirement not met)"
+
+# Ensure the wheel dir exists so COPY won't fail when DeepGEMM is skipped
+RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
+
+# Build pplx-kernels and DeepEP wheels
+COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
+ARG PPLX_COMMIT_HASH
+ARG DEEPEP_COMMIT_HASH
+ARG NVSHMEM_VER
+RUN --mount=type=cache,target=/root/.cache/uv \
+    mkdir -p /tmp/ep_kernels_workspace/dist && \
+    export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
+    /tmp/install_python_libraries.sh \
+        --workspace /tmp/ep_kernels_workspace \
+        --mode wheel \
+        ${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
+        ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} \
+        ${NVSHMEM_VER:+--nvshmem-ver "$NVSHMEM_VER"} && \
+    find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
+#################### EXTENSIONS BUILD IMAGE ####################
+
 #################### WHEEL BUILD IMAGE ####################
 FROM base AS build
 ARG TARGETPLATFORM
@@ -265,6 +310,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \

 WORKDIR /workspace

+# Copy pre-built csrc wheel directly
 COPY --from=csrc-build /workspace/dist /precompiled-wheels

 COPY . .
@@ -286,27 +332,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    fi && \
    python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38

-# Install DeepGEMM from source
-ARG DEEPGEMM_GIT_REF
-COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
-RUN --mount=type=cache,target=/root/.cache/uv \
-    VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} --wheel-dir /tmp/deepgemm/dist
-
-# Ensure the wheel dir exists so later-stage COPY won't fail when DeepGEMM is skipped
-RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
-
-COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
-# Install EP kernels(pplx-kernels and DeepEP)
-ARG PPLX_COMMIT_HASH
-ARG DEEPEP_COMMIT_HASH
-RUN --mount=type=cache,target=/root/.cache/uv \
-    export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
-    /tmp/install_python_libraries.sh \
-        --workspace /tmp/ep_kernels_workspace \
-        --mode wheel \
-        ${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
-        ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
-    find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
+# Copy extension wheels from extensions-build stage for later use
+COPY --from=extensions-build /tmp/deepgemm/dist /tmp/deepgemm/dist
+COPY --from=extensions-build /tmp/ep_kernels_workspace/dist /tmp/ep_kernels_workspace/dist

 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
@@ -344,32 +372,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### DEV IMAGE ####################
-
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 FROM ${FINAL_BASE_IMAGE} AS vllm-base
+
 ARG CUDA_VERSION
 ARG PYTHON_VERSION
-ARG INSTALL_KV_CONNECTORS=false
-WORKDIR /vllm-workspace
-ENV DEBIAN_FRONTEND=noninteractive
-ARG TARGETPLATFORM
-
-# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
-ARG GDRCOPY_CUDA_VERSION=12.8
-# Keep in line with FINAL_BASE_IMAGE
-ARG GDRCOPY_OS_VERSION=Ubuntu22_04
-
-SHELL ["/bin/bash", "-c"]
-
 ARG DEADSNAKES_MIRROR_URL
 ARG DEADSNAKES_GPGKEY_URL
 ARG GET_PIP_URL

+ENV DEBIAN_FRONTEND=noninteractive
+WORKDIR /vllm-workspace
+
+
+# Python version string for paths (e.g., "312" for 3.12)
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

-# Install Python and other dependencies
+# Install Python and system dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
@@ -408,62 +429,103 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

-# Install CUDA development tools and build essentials for runtime JIT compilation
+# Install CUDA development tools for runtime JIT compilation
 # (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
 RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
    apt-get update -y && \
    apt-get install -y --no-install-recommends \
-    cuda-nvcc-${CUDA_VERSION_DASH} \
-    cuda-cudart-${CUDA_VERSION_DASH} \
-    cuda-nvrtc-${CUDA_VERSION_DASH} \
-    cuda-cuobjdump-${CUDA_VERSION_DASH} \
-    # https://github.com/vllm-project/vllm/issues/29590
-    libcurand-dev-${CUDA_VERSION_DASH} \
-    libcublas-${CUDA_VERSION_DASH} \
-    # Fixes nccl_allocator requiring nccl.h at runtime
-    # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
-    libnccl-dev && \
+        cuda-nvcc-${CUDA_VERSION_DASH} \
+        cuda-cudart-${CUDA_VERSION_DASH} \
+        cuda-nvrtc-${CUDA_VERSION_DASH} \
+        cuda-cuobjdump-${CUDA_VERSION_DASH} \
+        libcurand-dev-${CUDA_VERSION_DASH} \
+        libcublas-${CUDA_VERSION_DASH} \
+        # Fixes nccl_allocator requiring nccl.h at runtime
+        # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
+        libnccl-dev && \
    rm -rf /var/lib/apt/lists/*

-ARG PIP_INDEX_URL UV_INDEX_URL
-ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
-ARG PYTORCH_CUDA_INDEX_BASE_URL
-ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
-
 # Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
+RUN python3 -m pip install uv

-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
+# Environment for uv
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy

-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
+# Workaround for triton/pytorch issues
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

-# Install vllm wheel first, so that torch etc will be installed.
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
-        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+# ============================================================
+# SLOW-CHANGING DEPENDENCIES BELOW
+# These are the expensive layers that we want to cache
+# ============================================================
+
+# Install PyTorch and core CUDA dependencies
+# This is ~2GB and rarely changes
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+COPY requirements/common.txt /tmp/common.txt
+COPY requirements/cuda.txt /tmp/requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r /tmp/requirements-cuda.txt \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
+    rm /tmp/requirements-cuda.txt /tmp/common.txt

 # Install FlashInfer pre-compiled kernel cache and binaries
+# This is ~1.1GB and only changes when FlashInfer version bumps
 # https://docs.flashinfer.ai/installation.html
+ARG FLASHINFER_VERSION=0.5.3
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.5.3 \
-    && uv pip install --system flashinfer-jit-cache==0.5.3 \
+    uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
+    && uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
    && flashinfer show-config

-COPY examples examples
-COPY benchmarks benchmarks
-COPY ./vllm/collect_env.py .
+# ============================================================
+# OPENAI API SERVER DEPENDENCIES
+# Pre-install these to avoid reinstalling on every vLLM wheel rebuild
+# ============================================================
+
+# Install gdrcopy (saves ~6s per build)
+# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
+ARG GDRCOPY_CUDA_VERSION=12.8
+ARG GDRCOPY_OS_VERSION=Ubuntu22_04
+ARG TARGETPLATFORM
+COPY tools/install_gdrcopy.sh /tmp/install_gdrcopy.sh
+RUN set -eux; \
+    case "${TARGETPLATFORM}" in \
+      linux/arm64) UUARCH="aarch64" ;; \
+      linux/amd64) UUARCH="x64" ;; \
+      *) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
+    esac; \
+    /tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}" && \
+    rm /tmp/install_gdrcopy.sh
+
+# Install vllm-openai dependencies (saves ~2.6s per build)
+# These are stable packages that don't depend on vLLM itself
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        BITSANDBYTES_VERSION="0.42.0"; \
+    else \
+        BITSANDBYTES_VERSION="0.46.1"; \
+    fi; \
+    uv pip install --system accelerate hf_transfer modelscope \
+        "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
+
+# ============================================================
+# VLLM INSTALLATION (depends on build stage)
+# ============================================================
+
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
+
+# Install the vllm wheel (the CUDA requirements, including torch, were pre-installed above).
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system dist/*.whl --verbose \
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
@@ -478,7 +540,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
              echo "No DeepGEMM wheels to install; skipping."; \
           fi'

-# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH (https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/.ci/manywheel/build_cuda.sh#L141C14-L141C36)
+# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH
 ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

 # Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
@@ -487,23 +549,17 @@ RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm
    uv pip install --system ep_kernels/dist/*.whl --verbose \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

-RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=/tmp/install_gdrcopy.sh,ro \
-    set -eux; \
-    case "${TARGETPLATFORM}" in \
-      linux/arm64) UUARCH="aarch64" ;; \
-      linux/amd64) UUARCH="x64" ;; \
-      *) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
-    esac; \
-    /tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"
-
 # CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
 # return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
 # consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
 # Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}

+# Copy examples and benchmarks at the end to minimize cache invalidation
+COPY examples examples
+COPY benchmarks benchmarks
+COPY ./vllm/collect_env.py .
 #################### vLLM installation IMAGE ####################
-
 #################### TEST IMAGE ####################
 # image to run unit testing suite
 # note that this uses vllm installed by `pip`
@@ -561,6 +617,7 @@ RUN mv vllm src/vllm
 FROM vllm-base AS vllm-openai-base
 ARG TARGETPLATFORM
 ARG INSTALL_KV_CONNECTORS=false
+ARG CUDA_VERSION

 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@@ -569,18 +626,32 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500

-# install additional dependencies for openai api server
+# install kv_connectors if requested
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
+    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
+    CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \
+    # lmcache requires CUDA_HOME to be specified explicitly; export it so the build subprocesses see it
+    export CUDA_HOME=/usr/local/cuda; \
+    BUILD_PKGS="libcusparse-dev-${CUDA_VERSION_DASH} \
+                libcublas-dev-${CUDA_VERSION_DASH} \
+                libcusolver-dev-${CUDA_VERSION_DASH}"; \
    if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
-        uv pip install --system -r /tmp/kv_connectors.txt; \
-    fi; \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        BITSANDBYTES_VERSION="0.42.0"; \
-    else \
-        BITSANDBYTES_VERSION="0.46.1"; \
-    fi; \
-    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
+        if [ "$CUDA_MAJOR" -ge 13 ]; then \
+            uv pip install --system nixl-cu13; \
+        fi; \
+        uv pip install --system -r /tmp/kv_connectors.txt --no-build || ( \
+            # if the prebuilt-only install above fails, install from source
+            apt-get update -y && \
+            apt-get install -y --no-install-recommends ${BUILD_PKGS} && \
+            uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation && \
+            # clean up the -dev packages (runtime libraries are kept) and the apt lists
+            apt-get purge -y ${BUILD_PKGS} && \
+            rm -rf /var/lib/apt/lists/* \
+        ); \
+    fi

 ENV VLLM_USAGE_SOURCE production-docker-image


--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -17,7 +17,7 @@
 #   VLLM_CPU_DISABLE_AVX512=false (default)|true
 #   VLLM_CPU_AVX512BF16=false (default)|true
 #   VLLM_CPU_AVX512VNNI=false (default)|true
-#   VLLM_CPU_AMXBF16=false (default)|true
+#   VLLM_CPU_AMXBF16=false|true (default)
 #

 ######################### COMMON BASE IMAGE #########################
@@ -95,7 +95,7 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
 ARG VLLM_CPU_AVX512VNNI=0
 ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
 # Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
-ARG VLLM_CPU_AMXBF16=0
+ARG VLLM_CPU_AMXBF16=1
 ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}

 WORKDIR /workspace/vllm
@@ -147,7 +147,9 @@ WORKDIR /workspace/vllm

 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get install -y --no-install-recommends vim numactl xz-utils
+    apt-get install -y --no-install-recommends vim numactl xz-utils make clangd-14
+
+RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \

--- a/docker/Dockerfile.ppc64le
+++ b/docker/Dockerfile.ppc64le
@@ -22,13 +22,13 @@ RUN microdnf install -y dnf && dnf install -y gcc-toolset-14 make wget unzip \
 ###############################################################
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS centos-deps-builder
 RUN  microdnf install -y dnf && \ 
-     dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
-        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
+     dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-26.el9.noarch.rpm \
+        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-26.el9.noarch.rpm \
        https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
        dnf config-manager --set-enabled crb

-RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel && \
-    dnf remove -y centos-gpg-keys-9.0-24.el9.noarch centos-stream-repos-9.0-24.el9.noarch 
+RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel yajl-devel && \
+    dnf remove -y centos-gpg-keys-9.0-26.el9.noarch centos-stream-repos-9.0-26.el9.noarch


 ###############################################################
@@ -346,4 +346,4 @@ WORKDIR /workspace/

 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

-ENTRYPOINT ["vllm", "serve"]
\ No newline at end of file
+ENTRYPOINT ["vllm", "serve"]
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -3,6 +3,14 @@ ARG REMOTE_VLLM="0"
 ARG COMMON_WORKDIR=/app
 ARG BASE_IMAGE=rocm/vllm-dev:base

+# Sccache configuration (only used in release pipeline)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
+ARG SCCACHE_REGION_NAME=us-west-2
+ARG SCCACHE_S3_NO_CREDENTIALS=0
+
 FROM ${BASE_IMAGE} AS base

 ARG ARG_PYTORCH_ROCM_ARCH
@@ -14,9 +22,14 @@ ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
 RUN apt-get update -q -y && apt-get install -q -y \
    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
    apt-transport-https ca-certificates wget curl
-# Remove sccache
 RUN python3 -m pip install --upgrade pip
-RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
+# Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
+ARG USE_SCCACHE
+RUN if [ "$USE_SCCACHE" != "1" ]; then \
+        apt-get purge -y sccache || true; \
+        python3 -m pip uninstall -y sccache || true; \
+        rm -f "$(which sccache)" || true; \
+    fi

 # Install UV
 RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
@@ -28,6 +41,39 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy

+# Install sccache if USE_SCCACHE is enabled (for release builds)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME
+ARG SCCACHE_REGION_NAME
+ARG SCCACHE_S3_NO_CREDENTIALS
+RUN if [ "$USE_SCCACHE" = "1" ]; then \
+        if command -v sccache >/dev/null 2>&1; then \
+            echo "sccache already installed, skipping installation"; \
+            sccache --version; \
+        else \
+            echo "Installing sccache..." \
+            && SCCACHE_ARCH="x86_64" \
+            && SCCACHE_VERSION="v0.8.1" \
+            && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
+            && curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
+            && tar -xzf /tmp/sccache.tar.gz -C /tmp \
+            && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
+            && chmod +x /usr/bin/sccache \
+            && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
+            && sccache --version; \
+        fi; \
+    fi
+
+# Populate the sccache environment variables only when USE_SCCACHE is non-empty (any value, not just 1);
+# otherwise they expand to empty strings, so no S3 config leaks into images that do not use sccache
+ARG USE_SCCACHE
+ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
+ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
+ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
+ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
+
 ARG COMMON_WORKDIR
 WORKDIR ${COMMON_WORKDIR}

@@ -39,6 +85,8 @@ ONBUILD COPY ./ vllm/
 FROM base AS fetch_vllm_1
 ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
 ARG VLLM_BRANCH="main"
+ENV VLLM_REPO=${VLLM_REPO}
+ENV VLLM_BRANCH=${VLLM_BRANCH}
 ONBUILD RUN git clone ${VLLM_REPO} \
 	    && cd vllm \
 	    && git fetch -v --prune -- origin ${VLLM_BRANCH} \
@@ -51,7 +99,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 # -----------------------
 # vLLM build stages
 FROM fetch_vllm AS build_vllm
-# Build vLLM
+# Build vLLM (setup.py auto-detects sccache in PATH)
 RUN cd vllm \
    && python3 -m pip install -r requirements/rocm.txt \
    && python3 setup.py clean --all  \
@@ -67,6 +115,178 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1

+# RIXL/UCX build stages
+FROM base AS build_rixl
+ARG RIXL_BRANCH="f33a5599"
+ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
+ARG UCX_BRANCH="da3fac2a"
+ARG UCX_REPO="https://github.com/ROCm/ucx.git"
+ENV ROCM_PATH=/opt/rocm
+ENV UCX_HOME=/usr/local/ucx
+ENV RIXL_HOME=/usr/local/rixl
+ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
+
+# RIXL build system dependencies and RDMA support
+RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
+    libgrpc-dev \
+    libgrpc++-dev \
+    libprotobuf-dev \
+    protobuf-compiler-grpc \
+    libcpprest-dev \
+    libaio-dev \
+    librdmacm1 \
+    librdmacm-dev \
+    libibverbs1 \
+    libibverbs-dev \
+    ibverbs-utils \
+    rdmacm-utils \
+    ibverbs-providers \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN uv pip install --system meson auditwheel patchelf tomlkit
+
+RUN cd /usr/local/src && \
+    git clone ${UCX_REPO} &&  \
+    cd ucx  && \
+    git checkout ${UCX_BRANCH} && \
+    ./autogen.sh && \
+    mkdir build && cd build && \
+    ../configure \
+        --prefix=/usr/local/ucx \
+        --enable-shared \
+        --disable-static \
+        --disable-doxygen-doc \
+        --enable-optimizations \
+        --enable-devel-headers \
+        --with-rocm=/opt/rocm \
+        --with-verbs \
+        --with-dm \
+        --enable-mt && \
+    make -j"$(nproc)" && \
+    make install
+
+ENV PATH=/usr/local/ucx/bin:$PATH
+ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
+
+RUN git clone ${RIXL_REPO} /opt/rixl && \
+    cd /opt/rixl && \
+    git checkout ${RIXL_BRANCH} && \
+    meson setup build --prefix=${RIXL_HOME} \
+                     -Ducx_path=${UCX_HOME} \
+                     -Drocm_path=${ROCM_PATH} && \
+    cd build && \
+    ninja && \
+    ninja install
+
+# Generate RIXL wheel
+RUN cd /opt/rixl && mkdir -p /app/install && \
+    ./contrib/build-wheel.sh \
+        --output-dir /app/install \
+        --rocm-dir ${ROCM_PATH} \
+        --ucx-plugins-dir ${UCX_HOME}/lib/ucx \
+        --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
+
+
+# -----------------------
+# vLLM wheel release build stage (for building distributable wheels)
+# This stage pins dependencies to custom ROCm wheel versions and handles version detection
+FROM fetch_vllm AS build_vllm_wheel_release
+
+ARG COMMON_WORKDIR
+
+# Create /install directory for custom wheels
+RUN mkdir -p /install
+
+# Copy custom ROCm wheels from docker/context if they exist
+# COPY ensures Docker cache is invalidated when wheels change
+# .keep file ensures directory always exists for COPY to work
+COPY docker/context/base-wheels/ /tmp/base-wheels/
+# The presence of wheels here tells us whether this is a wheel-release build.
+# If no wheels are found, we are not building for a wheel release,
+# so we exit with an error rather than continue with this stage.
+RUN if [ -n "$(ls /tmp/base-wheels/*.whl 2>/dev/null)" ]; then \
+        echo "Found custom wheels - copying to /install"; \
+        cp /tmp/base-wheels/*.whl /install/ && \
+        echo "Copied custom wheels:"; \
+        ls -lh /install/; \
+    else \
+        echo "ERROR: No custom wheels found in docker/context/base-wheels/"; \
+        echo "Wheel releases require pre-built ROCm wheels."; \
+        exit 1; \
+    fi
+
+# GIT_REPO_CHECK: Verify repo is clean and tags are available (for release builds)
+# This matches CUDA's Dockerfile behavior for proper version detection via setuptools_scm
+ARG GIT_REPO_CHECK=0
+RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
+        echo "Running repository checks..."; \
+        cd vllm && bash tools/check_repo.sh; \
+    fi
+
+# Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
+# This ensures setuptools_scm sees clean repo state for version detection
+RUN --mount=type=bind,source=.git,target=vllm/.git \
+    cd vllm \
+    && pip install setuptools_scm \
+    && VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
+    && echo "Detected vLLM version: ${VLLM_VERSION}" \
+    && echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
+
+# Fail if git-based package dependencies are found in requirements files
+# (uv doesn't handle git+ URLs well, and packages should be distributed on PyPI)
+# Extra notes: pip install is able to handle git+ URLs, but uv doesn't.
+RUN echo "Checking for git-based packages in requirements files..." \
+    && echo "Checking common.txt for git-based packages:" \
+    && if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; then \
+         echo "ERROR: Git-based packages found in common.txt:"; \
+         grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; \
+         echo "Please publish these packages to PyPI instead of using git dependencies."; \
+         exit 1; \
+       else \
+         echo "  ✓ No git-based packages found in common.txt"; \
+       fi \
+    && echo "Checking rocm.txt for git-based packages:" \
+    && if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; then \
+         echo "ERROR: Git-based packages found in rocm.txt:"; \
+         grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; \
+         echo "Please publish these packages to PyPI instead of using git dependencies."; \
+         exit 1; \
+       else \
+         echo "  ✓ No git-based packages found in rocm.txt"; \
+       fi \
+    && echo "All requirements files are clean - no git-based packages found"
+
+# Pin vLLM dependencies to exact versions of custom ROCm wheels
+# This ensures 'pip install vllm' automatically installs correct torch/triton/torchvision/amdsmi
+COPY tools/vllm-rocm/pin_rocm_dependencies.py /tmp/pin_rocm_dependencies.py
+RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
+    && python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt
+
+# Install dependencies using custom wheels from /install
+RUN cd vllm \
+    && echo "Building vLLM with custom wheels from /install" \
+    && python3 -m pip install --find-links /install -r requirements/rocm.txt \
+    && python3 setup.py clean --all
+
+# Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
+# (setup.py auto-detects sccache in PATH)
+RUN --mount=type=bind,source=.git,target=vllm/.git \
+    cd vllm \
+    && export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \
+    && echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
+    && python3 setup.py bdist_wheel --dist-dir=dist
+
+FROM scratch AS export_vllm_wheel_release
+ARG COMMON_WORKDIR
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/dist/*.whl /
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/requirements /requirements
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
+
 # -----------------------
 # Test vLLM image
 FROM base AS test
@@ -83,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    && pip uninstall -y vllm \
    && uv pip install --system *.whl

+# Install RIXL wheel
+RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
+    uv pip install --system /rixl_install/*.whl
+
 WORKDIR /vllm-workspace
 ARG COMMON_WORKDIR
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
@@ -97,6 +321,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER=1

+# install audio decode package `torchcodec` from source (required due to 
+# ROCm and torch version mismatch) for tests with datasets package
+COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh
+RUN bash /tmp/install_torchcodec.sh \
+    && rm /tmp/install_torchcodec.sh \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
 # Copy in the v1 package (for python-only install test group)
 COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1

@@ -130,6 +362,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    && uv pip install --system *.whl

 ARG COMMON_WORKDIR
+ARG BASE_IMAGE

 # Copy over the benchmark scripts as well
 COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
@@ -144,4 +377,13 @@ ENV SAFETENSORS_FAST_GPU=1
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1

+# Workaround for ROCm profiler limits
+RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf
+ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
+RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
+
 CMD ["/bin/bash"]
+
+# Set entrypoint for vllm-openai official images.
+# Builds on the `final` stage and only swaps the default shell CMD for `vllm serve`.
+FROM final AS vllm-openai
+ENTRYPOINT ["vllm", "serve"]
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
 ARG TRITON_BRANCH="57c693b6"
 ARG TRITON_REPO="https://github.com/ROCm/triton.git"
-ARG PYTORCH_BRANCH="1c57644d"
-ARG PYTORCH_VISION_BRANCH="v0.23.0"
+ARG PYTORCH_BRANCH="89075173"
 ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
+ARG PYTORCH_VISION_BRANCH="v0.24.1"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
 ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
 ARG FA_BRANCH="0e60e394"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="59bd8ff2"
+ARG AITER_BRANCH="6af8b687"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
+ARG MORI_BRANCH="2d02c6a9"
+ARG MORI_REPO="https://github.com/ROCm/mori.git"
+
+# Sccache configuration (only used in release pipeline)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
+ARG SCCACHE_REGION_NAME=us-west-2
+ARG SCCACHE_S3_NO_CREDENTIALS=0

 FROM ${BASE_IMAGE} AS base

@@ -20,6 +30,7 @@ ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
 ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
 ENV AITER_ROCM_ARCH=gfx942;gfx950
+ENV MORI_GPU_ARCHS=gfx942;gfx950

 # Required for RCCL in ROCm7.1
 ENV HSA_NO_SCRATCH_RECLAIM=1
@@ -33,7 +44,7 @@ ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
 RUN apt-get update -y \
-    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
+    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev \
    && for i in 1 2 3; do \
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
@@ -50,6 +61,53 @@ RUN apt-get update -y \
 RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
 RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*

+# Install sccache if USE_SCCACHE is enabled (for release builds)
+ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL
+ARG SCCACHE_ENDPOINT
+ARG SCCACHE_BUCKET_NAME
+ARG SCCACHE_REGION_NAME
+ARG SCCACHE_S3_NO_CREDENTIALS
+# Download the sccache release tarball and install the binary into /usr/bin.
+# `curl -f` turns an HTTP error into an immediate build failure instead of
+# saving an error page that only fails later (and confusingly) in `tar`.
+# NOTE: a custom SCCACHE_DOWNLOAD_URL must still unpack to the
+# sccache-<version>-<arch>-unknown-linux-musl/ directory used by the `mv` below.
+RUN if [ "$USE_SCCACHE" = "1" ]; then \
+        echo "Installing sccache..." \
+        && SCCACHE_ARCH="x86_64" \
+        && SCCACHE_VERSION="v0.8.1" \
+        && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
+        && curl -fL -o /tmp/sccache.tar.gz "${SCCACHE_DL_URL}" \
+        && tar -xzf /tmp/sccache.tar.gz -C /tmp \
+        && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
+        && chmod +x /usr/bin/sccache \
+        && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
+        && sccache --version; \
+    fi
+
+# Setup sccache for HIP compilation via HIP_CLANG_PATH
+# This creates wrapper scripts in a separate directory and points HIP to use them
+# This avoids modifying the original ROCm binaries which can break detection
+# NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm)
+# Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1
+RUN if [ "$USE_SCCACHE" = "1" ]; then \
+        echo "Setting up sccache wrappers for HIP compilation..." \
+        && mkdir -p /opt/sccache-wrappers \
+        && printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
+        && chmod +x /opt/sccache-wrappers/clang++ \
+        && printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
+        && chmod +x /opt/sccache-wrappers/clang \
+        && echo "sccache wrappers created in /opt/sccache-wrappers"; \
+    fi
+
+# Populate the sccache environment variables only when USE_SCCACHE is set to a
+# non-empty value (the `:+` expansion yields an empty string otherwise).
+# This prevents S3 config from leaking into images when sccache is not used
+ARG USE_SCCACHE
+ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
+ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
+ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
+ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
+
+
+###
+### Triton Build
+###
 FROM base AS build_triton
 ARG TRITON_BRANCH
 ARG TRITON_REPO
@@ -62,11 +120,19 @@ RUN cd triton \
 RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \
    && python3 -m build --wheel && cp dist/*.whl /app/install; fi

+
+###
+### AMD SMI Build
+###
 FROM base AS build_amdsmi
 RUN cd /opt/rocm/share/amd_smi \
    && pip wheel . --wheel-dir=dist
 RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install

+
+###
+### Pytorch build
+###
 FROM base AS build_pytorch
 ARG PYTORCH_BRANCH
 ARG PYTORCH_VISION_BRANCH
@@ -74,42 +140,93 @@ ARG PYTORCH_AUDIO_BRANCH
 ARG PYTORCH_REPO
 ARG PYTORCH_VISION_REPO
 ARG PYTORCH_AUDIO_REPO
+ARG USE_SCCACHE

 RUN git clone ${PYTORCH_REPO} pytorch
 RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
    && pip install -r requirements.txt && git submodule update --init --recursive \
    && python3 tools/amd_build/build_amd.py \
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+           export HIP_CLANG_PATH=/opt/sccache-wrappers \
+           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
+           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache \
+           && sccache --show-stats; \
+       fi \
    && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
    && pip install dist/*.whl
 RUN git clone ${PYTORCH_VISION_REPO} vision
 RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+           export HIP_CLANG_PATH=/opt/sccache-wrappers \
+           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
+           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
+       fi \
    && python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
    && pip install dist/*.whl
 RUN git clone ${PYTORCH_AUDIO_REPO} audio
 RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
    && git submodule update --init --recursive \
    && pip install -r requirements.txt \
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+           export HIP_CLANG_PATH=/opt/sccache-wrappers \
+           && export CMAKE_C_COMPILER_LAUNCHER=sccache \
+           && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
+       fi \
    && python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
    && pip install dist/*.whl
 RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install \
    && cp /app/audio/dist/*.whl /app/install

+
+###
+### MORI Build
+###
+FROM base AS build_mori
+ARG MORI_BRANCH
+ARG MORI_REPO
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    pip install /install/*.whl
+RUN git clone ${MORI_REPO}
+RUN cd mori \
+    && git checkout ${MORI_BRANCH} \
+    && git submodule update --init --recursive \
+    && python3 setup.py bdist_wheel --dist-dir=dist && ls /app/mori/dist/*.whl
+RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
+
+
+###
+### FlashAttention Build
+###
 FROM base AS build_fa
 ARG FA_BRANCH
 ARG FA_REPO
+ARG USE_SCCACHE
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
 RUN git clone ${FA_REPO}
 RUN cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
-    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+           export HIP_CLANG_PATH=/opt/sccache-wrappers \
+           && sccache --show-stats; \
+       fi \
+    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi
 RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install

+
+###
+### AITER Build
+###
 FROM base AS build_aiter
 ARG AITER_BRANCH
 ARG AITER_REPO
+ARG USE_SCCACHE
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
 RUN git clone --recursive ${AITER_REPO}
@@ -117,9 +234,37 @@ RUN cd aiter \
    && git checkout ${AITER_BRANCH} \
    && git submodule update --init --recursive \
    && pip install -r requirements.txt
-RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
+RUN pip install pyyaml && cd aiter \
+    && if [ "$USE_SCCACHE" = "1" ]; then \
+           export HIP_CLANG_PATH=/opt/sccache-wrappers \
+           && sccache --show-stats; \
+       fi \
+    && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
+    && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
+    && ls /app/aiter/dist/*.whl
 RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install

+
+###
+### Final Build
+###
+
+# Wheel release stage:
+# only includes dependencies used by the wheel release pipeline
+FROM base AS debs_wheel_release
+RUN mkdir /app/debs
+RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs
+
+# Full debs stage - includes Mori (used by Docker releases)
 FROM base AS debs
 RUN mkdir /app/debs
 RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
@@ -132,6 +277,8 @@ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
 RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
    cp /install/*.whl /app/debs
+RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
+    cp /install/*.whl /app/debs

 FROM base AS final
 RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
@@ -150,6 +297,8 @@ ARG FA_BRANCH
 ARG FA_REPO
 ARG AITER_BRANCH
 ARG AITER_REPO
+ARG MORI_BRANCH
+ARG MORI_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
    && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
@@ -162,4 +311,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
-    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
\ No newline at end of file
+    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
+    && echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
+    && echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -2,7 +2,7 @@ FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
-    add-apt-repository -y ppa:kobuk-team/intel-graphics
+    add-apt-repository -y ppa:kobuk-team/intel-graphics-staging

 RUN apt clean && apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
@@ -28,10 +28,14 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
 RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc

 # This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
-RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.6/intel-oneccl-2021.15.6.9_offline.sh
-RUN bash intel-oneccl-2021.15.6.9_offline.sh -a --silent --eula accept && \
+ARG ONECCL_INSTALLER="intel-oneccl-2021.15.7.6_offline.sh"
+RUN wget "https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.7/${ONECCL_INSTALLER}" && \
+    bash "${ONECCL_INSTALLER}" -a --silent --eula accept && \
+    rm "${ONECCL_INSTALLER}" && \
    echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \
    echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc
+RUN rm -f /opt/intel/oneapi/ccl/latest && \
+    ln -s /opt/intel/oneapi/ccl/2021.15 /opt/intel/oneapi/ccl/latest

 SHELL ["bash", "-c"]
 CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
@@ -47,6 +51,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --no-cache-dir \
    -r requirements/xpu.txt

+# arctic-inference is built from source which needs torch-xpu properly installed
+# used for suffix method speculative decoding
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install --no-cache-dir arctic-inference==0.1.1
+
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"

 COPY . .

--- a/docker/docker-bake.hcl
+++ b/docker/docker-bake.hcl
+# docker-bake.hcl - vLLM Docker build configuration
+#
+# This file lives in vLLM repo at docker/docker-bake.hcl
+#
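+# Usage (run from the repository root, since targets use context "."):
+#   docker buildx bake -f docker/docker-bake.hcl           # Build default target (openai)
+#   docker buildx bake -f docker/docker-bake.hcl test      # Build test target
+#   docker buildx bake -f docker/docker-bake.hcl --print   # Show resolved config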
+#
+# Reference: https://docs.docker.com/build/bake/reference/
+
+# Build configuration
+
+variable "MAX_JOBS" {
+  default = 16
+}
+
+variable "NVCC_THREADS" {
+  default = 8
+}
+
+variable "TORCH_CUDA_ARCH_LIST" {
+  default = "8.0 8.9 9.0 10.0"
+}
+
+variable "COMMIT" {
+  default = ""
+}
+
+# Groups
+
+group "default" {
+  targets = ["openai"]
+}
+
+# Base targets
+
+target "_common" {
+  dockerfile = "docker/Dockerfile"
+  context    = "."
+  args = {
+    max_jobs             = MAX_JOBS
+    nvcc_threads         = NVCC_THREADS
+    torch_cuda_arch_list = TORCH_CUDA_ARCH_LIST
+  }
+}
+
+target "_labels" {
+  labels = {
+    "org.opencontainers.image.source"      = "https://github.com/vllm-project/vllm"
+    "org.opencontainers.image.vendor"      = "vLLM"
+    "org.opencontainers.image.title"       = "vLLM"
+    "org.opencontainers.image.description" = "vLLM: A high-throughput and memory-efficient inference and serving engine for LLMs"
+    "org.opencontainers.image.licenses"    = "Apache-2.0"
+    "org.opencontainers.image.revision"    = COMMIT
+  }
+  annotations = [
+      "index,manifest:org.opencontainers.image.revision=${COMMIT}",
+  ]
+}
+
+# Build targets
+
+target "test" {
+  inherits = ["_common", "_labels"]
+  target   = "test"
+  tags     = ["vllm:test"]
+  output   = ["type=docker"]
+}
+
+target "openai" {
+  inherits = ["_common", "_labels"]
+  target   = "vllm-openai"
+  tags     = ["vllm:openai"]
+  output   = ["type=docker"]
+}
--- a/docs/README.md
+++ b/docs/README.md
@@ -62,7 +62,7 @@ vLLM is flexible and easy to use with:

 For more information, check out the following:

- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention)
+- [vLLM announcing blog post](https://blog.vllm.ai/2023/06/20/vllm.html) (intro to PagedAttention)
 - [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
 - [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
 - [vLLM Meetups](community/meetups.md)
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -72,7 +72,6 @@ Internal data structures.
 - [vllm.multimodal.inputs.MultiModalFieldConfig][]
 - [vllm.multimodal.inputs.MultiModalKwargsItem][]
 - [vllm.multimodal.inputs.MultiModalKwargsItems][]
- [vllm.multimodal.inputs.MultiModalKwargs][]
 - [vllm.multimodal.inputs.MultiModalInputs][]

 ### Data Parsing

--- a/docs/assets/contributing/dockerfile-stages-dependency.png
+++ b/docs/assets/contributing/dockerfile-stages-dependency.png
--- a/docs/assets/deployment/claude-code-example.png
+++ b/docs/assets/deployment/claude-code-example.png
--- a/docs/benchmarking/dashboard.md
+++ b/docs/benchmarking/dashboard.md
@@ -8,12 +8,19 @@ The results are automatically published to the public [vLLM Performance Dashboar
 ## Manually Trigger the benchmark

 Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
-For CPU environment, please use the image with "-cpu" postfix.
+For x86 CPU environment, please use the image with "-cpu" postfix. For AArch64 CPU environment, please use the image with "-arm64-cpu" postfix.

-Here is an example for docker run command for CPU.
+Here is an example for docker run command for CPU. For GPUs skip setting the `ON_CPU` env var.

 ```bash
-docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface  -e HF_TOKEN=''  --shm-size=16g --name vllm-cpu-ci  public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
+export VLLM_COMMIT=1da94e673c257373280026f75ceb4effac80e892 # use full commit hash from the main branch
+export HF_TOKEN=<valid Hugging Face token>
+if [[ "$(uname -m)" == aarch64 || "$(uname -m)" == arm64 ]]; then
+  IMG_SUFFIX="arm64-cpu"
+else
+  IMG_SUFFIX="cpu"
+fi
+docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN=$HF_TOKEN -e ON_CPU=1 --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}-${IMG_SUFFIX}
 ```

 Then, run below command inside the docker instance.
@@ -26,14 +33,65 @@ When run, benchmark script generates results under **benchmark/results** folder,

 ### Runtime environment variables

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.

-For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).
+### Visualization
+
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table with real benchmarking results.
+You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
+If you do not see the table, please wait until the benchmark finishes running.
+The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
+The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
+
+#### Performance Results Comparison
+
+The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
+When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
+`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.  
+If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
+
+Here is an example using the script to compare `results_a` and `results_b` with max concurrency and qps for the same Model, Dataset name, and input/output length.
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
+
+***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]***
+
+|    | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
+|----|------|-----|-----------|----------|----------|
+| 0  | 12 | inf | 24.98   | 186.03 |  7.45 |
+| 1  | 16 | inf|  25.49  | 246.92 | 9.69 |
+| 2  | 24 | inf| 27.74  | 293.34 |  10.57 |
+| 3  | 32 | inf| 28.61  |306.69 | 10.72 |
+
+***compare-json-results.py – Command-Line Parameters***  
+
+compare-json-results.py provides configurable parameters to compare one or more benchmark_results.json files and generate summary tables and plots.  
+In most cases, users only need to specify --file to parse the desired benchmark results.
+
+| Parameter              | Type               | Default Value           | Description                                                                                           |
+| ---------------------- | ------------------ | ----------------------- | ----------------------------------------------------------------------------------------------------- |
+| `--file`               | `str` (appendable) | *None*                  | Input JSON result file(s). Can be specified multiple times to compare multiple benchmark outputs.     |
+| `--debug`              | `bool`             | `False`                 | Enables debug mode. When set, prints all available information to aid troubleshooting and validation. |
+| `--plot` / `--no-plot` | `bool`             | `True`                  | Controls whether performance plots are generated. Use `--no-plot` to disable graph generation.        |
+| `--xaxis`              | `str`              | `# of max concurrency.` | Column name used as the X-axis in comparison plots (for example, concurrency or batch size).          |
+| `--latency`            | `str`              | `p99`                   | Latency aggregation method used for TTFT/TPOT. Supported values: `median` or `p99`.                   |
+| `--ttft-max-ms`        | `float`            | `3000.0`                | Reference upper bound (milliseconds) for TTFT plots, typically used to visualize SLA thresholds.      |
+| `--tpot-max-ms`        | `float`            | `100.0`                 | Reference upper bound (milliseconds) for TPOT plots, typically used to visualize SLA thresholds.      |
+
+***Valid Max Concurrency Summary***  
+
+Based on the configured TTFT and TPOT SLA thresholds, compare-json-results.py computes the maximum valid concurrency for each benchmark result.  
+The “Max # of max concurrency. (Both)” column represents the highest concurrency level that satisfies both TTFT and TPOT constraints simultaneously.  
+This value is typically used in capacity planning and sizing guides.  
+
+| # | Configuration  | Max # of max concurrency. (TTFT ≤ 10000 ms) | Max # of max concurrency. (TPOT ≤ 100 ms) | Max # of max concurrency. (Both) | Output Tput @ Both (tok/s) | TTFT @ Both (ms) | TPOT @ Both (ms) |
+| - | -------------- | ------------------------------------------- | ----------------------------------------- | -------------------------------- | -------------------------- | ---------------- | ---------------- |
+| 0 | results-a      | 128.00                                      | 12.00                                     | 12.00                            | 127.76                     | 3000.82          | 93.24            |
+| 1 | results-b      | 128.00                                      | 32.00                                     | 32.00                            | 371.42                     | 2261.53          | 81.74            |

 More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).


--- a/docs/benchmarking/sweeps.md
+++ b/docs/benchmarking/sweeps.md
@@ -129,10 +129,10 @@ vllm bench sweep serve_sla \

 The algorithm for adjusting the SLA variable is as follows:

-1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
-    - For example, the initial request rate is set to the concurrency under infinite QPS.
-2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
-3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
+1. Run the benchmark once with maximum possible QPS, and once with minimum possible QPS. For each run, calculate the distance of the SLA metrics from their targets, resulting in data points of QPS vs SLA distance.
+2. Perform spline interpolation between the data points to estimate the QPS that results in zero SLA distance.
+3. Run the benchmark with the estimated QPS and add the resulting data point to the history.
+4. Repeat Steps 2 and 3 until the maximum QPS that passes SLA and the minimum QPS that fails SLA in the history are close enough to each other.

 !!! important
    SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.

--- a/docs/cli/bench/latency.md
+++ b/docs/cli/bench/latency.md
@@ -6,4 +6,4 @@

 ## Arguments

--8<-- "docs/argparse/bench_latency.inc.md"
+--8<-- "docs/generated/argparse/bench_latency.inc.md"
--- a/docs/cli/bench/mm_processor.md
+++ b/docs/cli/bench/mm_processor.md
+# vllm bench mm-processor
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Arguments
+
+--8<-- "docs/generated/argparse/bench_mm_processor.inc.md"
--- a/docs/cli/bench/serve.md
+++ b/docs/cli/bench/serve.md
@@ -6,4 +6,4 @@

 ## Arguments

--8<-- "docs/argparse/bench_serve.inc.md"
+--8<-- "docs/generated/argparse/bench_serve.inc.md"
--- a/docs/cli/bench/sweep/plot.md
+++ b/docs/cli/bench/sweep/plot.md
@@ -6,4 +6,4 @@

 ## Arguments

--8<-- "docs/argparse/bench_sweep_plot.inc.md"
+--8<-- "docs/generated/argparse/bench_sweep_plot.inc.md"
--- a/docs/cli/bench/sweep/plot_pareto.md
+++ b/docs/cli/bench/sweep/plot_pareto.md
@@ -6,4 +6,4 @@

 ## Arguments

--8<-- "docs/argparse/bench_sweep_plot_pareto.inc.md"
+--8<-- "docs/generated/argparse/bench_sweep_plot_pareto.inc.md"
--- a/docs/cli/bench/sweep/serve.md
+++ b/docs/cli/bench/sweep/serve.md
@@ -6,4 +6,4 @@

 ## Arguments

--8<-- "docs/argparse/bench_sweep_serve.inc.md"
+--8<-- "docs/generated/argparse/bench_sweep_serve.inc.md"
--- a/docs/cli/bench/sweep/serve_sla.md
+++ b/docs/cli/bench/sweep/serve_sla.md
@@ -6,4 +6,4 @@

 ## Arguments

--8<-- "docs/argparse/bench_sweep_serve_sla.inc.md"
+--8<-- "docs/generated/argparse/bench_sweep_serve_sla.inc.md"