"vllm/vscode:/vscode.git/clone" did not exist on "067c34a1559400e956311f067ddd185f54207a2b"
Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
...@@ -32,7 +32,7 @@ ARG DEADSNAKES_GPGKEY_URL ...@@ -32,7 +32,7 @@ ARG DEADSNAKES_GPGKEY_URL
# The PyPA get-pip.py script is a self contained script+zip file, that provides # The PyPA get-pip.py script is a self contained script+zip file, that provides
# both the installer script and the pip base85-encoded zip archive. This allows # both the installer script and the pip base85-encoded zip archive. This allows
# bootstrapping pip in environment where a dsitribution package does not exist. # bootstrapping pip in environment where a distribution package does not exist.
# #
# By parameterizing the URL for get-pip.py installation script, we allow # By parameterizing the URL for get-pip.py installation script, we allow
# third-party to use their own copy of the script stored in a private mirror. # third-party to use their own copy of the script stored in a private mirror.
...@@ -73,15 +73,13 @@ ARG INSTALL_KV_CONNECTORS=false ...@@ -73,15 +73,13 @@ ARG INSTALL_KV_CONNECTORS=false
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
# prepare basic build environment # prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base FROM ${BUILD_BASE_IMAGE} AS base
ARG CUDA_VERSION ARG CUDA_VERSION
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false
ENV DEBIAN_FRONTEND=noninteractive
ARG GET_PIP_URL ENV DEBIAN_FRONTEND=noninteractive
# Install system dependencies and uv, then create Python virtual environment # Install system dependencies including build tools
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
...@@ -107,32 +105,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -107,32 +105,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& ln -s /opt/venv/bin/pip /usr/bin/pip \ && ln -s /opt/venv/bin/pip /usr/bin/pip \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Activate virtual environment and add uv to PATH # Activate virtual environment and add uv to PATH
ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH" ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv" ENV VIRTUAL_ENV="/opt/venv"
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Environment for uv
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match" ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy ENV UV_LINK_MODE=copy
RUN <<EOF # Verify GCC version
gcc --version RUN gcc --version
EOF
# Workaround for https://github.com/openai/triton/issues/2507 and # Workaround for triton/pytorch issues
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# ============================================================
# SLOW-CHANGING DEPENDENCIES BELOW
# These are the expensive layers that we want to cache
# ============================================================
# Install PyTorch and core CUDA dependencies
# This is ~2GB and rarely changes
ARG PYTORCH_CUDA_INDEX_BASE_URL
WORKDIR /workspace WORKDIR /workspace
# install build and runtime dependencies # install build and runtime dependencies
...@@ -142,13 +138,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -142,13 +138,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \ uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch # CUDA arch list used by torch
# can be useful for both `dev` and `test` # Explicitly set the list to avoid issues with torch 2.2
# explicitly set the list to avoid issues with torch 2.2 # See https://github.com/pytorch/pytorch/pull/123243
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0' ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE #################### #################### BUILD BASE IMAGE ####################
#################### CSRC BUILD IMAGE #################### #################### CSRC BUILD IMAGE ####################
FROM base AS csrc-build FROM base AS csrc-build
...@@ -188,7 +183,7 @@ ARG nvcc_threads=8 ...@@ -188,7 +183,7 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads ENV NVCC_THREADS=$nvcc_threads
ARG USE_SCCACHE ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_REGION_NAME=us-west-2
...@@ -206,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build" ...@@ -206,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$USE_SCCACHE" = "1" ]; then \ if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \ echo "Installing sccache..." \
&& case "${TARGETPLATFORM}" in \
linux/arm64) SCCACHE_ARCH="aarch64" ;; \
linux/amd64) SCCACHE_ARCH="x86_64" ;; \
*) echo "Unsupported TARGETPLATFORM for sccache: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac \
&& export SCCACHE_DOWNLOAD_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \ && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
&& tar -xzf sccache.tar.gz \ && tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ && sudo mv sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ && rm -rf sccache.tar.gz sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl \
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \ && if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
...@@ -241,6 +242,50 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ ...@@ -241,6 +242,50 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
fi fi
#################### CSRC BUILD IMAGE #################### #################### CSRC BUILD IMAGE ####################
#################### EXTENSIONS BUILD IMAGE ####################
# Build DeepGEMM, pplx-kernels, DeepEP - runs in PARALLEL with csrc-build
# This stage is independent and doesn't affect csrc cache
FROM base AS extensions-build
ARG CUDA_VERSION
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE=copy
WORKDIR /workspace
# Build DeepGEMM wheel
ARG DEEPGEMM_GIT_REF
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
mkdir -p /tmp/deepgemm/dist && \
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh \
--cuda-version "${CUDA_VERSION}" \
${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} \
--wheel-dir /tmp/deepgemm/dist || \
echo "DeepGEMM build skipped (CUDA version requirement not met)"
# Ensure the wheel dir exists so COPY won't fail when DeepGEMM is skipped
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
# Build pplx-kernels and DeepEP wheels
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
ARG PPLX_COMMIT_HASH
ARG DEEPEP_COMMIT_HASH
ARG NVSHMEM_VER
RUN --mount=type=cache,target=/root/.cache/uv \
mkdir -p /tmp/ep_kernels_workspace/dist && \
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
/tmp/install_python_libraries.sh \
--workspace /tmp/ep_kernels_workspace \
--mode wheel \
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} \
${NVSHMEM_VER:+--nvshmem-ver "$NVSHMEM_VER"} && \
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
#################### EXTENSIONS BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE #################### #################### WHEEL BUILD IMAGE ####################
FROM base AS build FROM base AS build
ARG TARGETPLATFORM ARG TARGETPLATFORM
...@@ -265,6 +310,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -265,6 +310,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
WORKDIR /workspace WORKDIR /workspace
# Copy pre-built csrc wheel directly
COPY --from=csrc-build /workspace/dist /precompiled-wheels COPY --from=csrc-build /workspace/dist /precompiled-wheels
COPY . . COPY . .
...@@ -286,27 +332,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -286,27 +332,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
fi && \ fi && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
# Install DeepGEMM from source # Copy extension wheels from extensions-build stage for later use
ARG DEEPGEMM_GIT_REF COPY --from=extensions-build /tmp/deepgemm/dist /tmp/deepgemm/dist
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh COPY --from=extensions-build /tmp/ep_kernels_workspace/dist /tmp/ep_kernels_workspace/dist
RUN --mount=type=cache,target=/root/.cache/uv \
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} --wheel-dir /tmp/deepgemm/dist
# Ensure the wheel dir exists so later-stage COPY won't fail when DeepGEMM is skipped
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
# Install EP kernels(pplx-kernels and DeepEP)
ARG PPLX_COMMIT_HASH
ARG DEEPEP_COMMIT_HASH
RUN --mount=type=cache,target=/root/.cache/uv \
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
/tmp/install_python_libraries.sh \
--workspace /tmp/ep_kernels_workspace \
--mode wheel \
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
# Check the size of the wheel if RUN_WHEEL_CHECK is true # Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py COPY .buildkite/check-wheel-size.py check-wheel-size.py
...@@ -344,32 +372,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -344,32 +372,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \ uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE #################### #################### DEV IMAGE ####################
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
# image with vLLM installed # image with vLLM installed
FROM ${FINAL_BASE_IMAGE} AS vllm-base FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION ARG CUDA_VERSION
ARG PYTHON_VERSION ARG PYTHON_VERSION
ARG INSTALL_KV_CONNECTORS=false
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM
# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
ARG GDRCOPY_CUDA_VERSION=12.8
# Keep in line with FINAL_BASE_IMAGE
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
SHELL ["/bin/bash", "-c"]
ARG DEADSNAKES_MIRROR_URL ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL ARG GET_PIP_URL
ENV DEBIAN_FRONTEND=noninteractive
WORKDIR /vllm-workspace
# Python version string for paths (e.g., "312" for 3.12)
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies # Install Python and system dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
...@@ -408,62 +429,103 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -408,62 +429,103 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
# Install CUDA development tools and build essentials for runtime JIT compilation # Install CUDA development tools for runtime JIT compilation
# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime) # (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \ RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
apt-get update -y && \ apt-get update -y && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_VERSION_DASH} \ cuda-nvcc-${CUDA_VERSION_DASH} \
cuda-cudart-${CUDA_VERSION_DASH} \ cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \ cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \ cuda-cuobjdump-${CUDA_VERSION_DASH} \
# https://github.com/vllm-project/vllm/issues/29590 libcurand-dev-${CUDA_VERSION_DASH} \
libcurand-dev-${CUDA_VERSION_DASH} \ libcublas-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} \ # Fixes nccl_allocator requiring nccl.h at runtime
# Fixes nccl_allocator requiring nccl.h at runtime # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
# https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22 libnccl-dev && \
libnccl-dev && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install uv for faster pip installs # Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \ RUN python3 -m pip install uv
python3 -m pip install uv
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Environment for uv
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match" ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy ENV UV_LINK_MODE=copy
# Workaround for https://github.com/openai/triton/issues/2507 and # Workaround for triton/pytorch issues
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install vllm wheel first, so that torch etc will be installed. # ============================================================
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ # SLOW-CHANGING DEPENDENCIES BELOW
--mount=type=cache,target=/root/.cache/uv \ # These are the expensive layers that we want to cache
uv pip install --system dist/*.whl --verbose \ # ============================================================
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Install PyTorch and core CUDA dependencies
# This is ~2GB and rarely changes
ARG PYTORCH_CUDA_INDEX_BASE_URL
COPY requirements/common.txt /tmp/common.txt
COPY requirements/cuda.txt /tmp/requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r /tmp/requirements-cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
rm /tmp/requirements-cuda.txt /tmp/common.txt
# Install FlashInfer pre-compiled kernel cache and binaries # Install FlashInfer pre-compiled kernel cache and binaries
# This is ~1.1GB and only changes when FlashInfer version bumps
# https://docs.flashinfer.ai/installation.html # https://docs.flashinfer.ai/installation.html
ARG FLASHINFER_VERSION=0.5.3
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==0.5.3 \ uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
&& uv pip install --system flashinfer-jit-cache==0.5.3 \ && uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config && flashinfer show-config
COPY examples examples # ============================================================
COPY benchmarks benchmarks # OPENAI API SERVER DEPENDENCIES
COPY ./vllm/collect_env.py . # Pre-install these to avoid reinstalling on every vLLM wheel rebuild
# ============================================================
# Install gdrcopy (saves ~6s per build)
# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
ARG GDRCOPY_CUDA_VERSION=12.8
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
ARG TARGETPLATFORM
COPY tools/install_gdrcopy.sh /tmp/install_gdrcopy.sh
RUN set -eux; \
case "${TARGETPLATFORM}" in \
linux/arm64) UUARCH="aarch64" ;; \
linux/amd64) UUARCH="x64" ;; \
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac; \
/tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}" && \
rm /tmp/install_gdrcopy.sh
# Install vllm-openai dependencies (saves ~2.6s per build)
# These are stable packages that don't depend on vLLM itself
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
BITSANDBYTES_VERSION="0.42.0"; \
else \
BITSANDBYTES_VERSION="0.46.1"; \
fi; \
uv pip install --system accelerate hf_transfer modelscope \
"bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3'
# ============================================================
# VLLM INSTALLATION (depends on build stage)
# ============================================================
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \ . /etc/environment && \
...@@ -478,7 +540,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -478,7 +540,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
echo "No DeepGEMM wheels to install; skipping."; \ echo "No DeepGEMM wheels to install; skipping."; \
fi' fi'
# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH (https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/.ci/manywheel/build_cuda.sh#L141C14-L141C36) # Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage # Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
...@@ -487,23 +549,17 @@ RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm ...@@ -487,23 +549,17 @@ RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm
uv pip install --system ep_kernels/dist/*.whl --verbose \ uv pip install --system ep_kernels/dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=/tmp/install_gdrcopy.sh,ro \
set -eux; \
case "${TARGETPLATFORM}" in \
linux/arm64) UUARCH="aarch64" ;; \
linux/amd64) UUARCH="x64" ;; \
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac; \
/tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will # CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers # return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
# consistently from the host (see https://github.com/vllm-project/vllm/issues/18859). # consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
# Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override. # Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH} ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
# Copy examples and benchmarks at the end to minimize cache invalidation
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
#################### TEST IMAGE #################### #################### TEST IMAGE ####################
# image to run unit testing suite # image to run unit testing suite
# note that this uses vllm installed by `pip` # note that this uses vllm installed by `pip`
...@@ -561,6 +617,7 @@ RUN mv vllm src/vllm ...@@ -561,6 +617,7 @@ RUN mv vllm src/vllm
FROM vllm-base AS vllm-openai-base FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM ARG TARGETPLATFORM
ARG INSTALL_KV_CONNECTORS=false ARG INSTALL_KV_CONNECTORS=false
ARG CUDA_VERSION
ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
...@@ -569,18 +626,32 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ...@@ -569,18 +626,32 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
# install additional dependencies for openai api server # install kv_connectors if requested
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \ --mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \
CUDA_HOME=/usr/local/cuda; \
# lmcache requires explicit specifying CUDA_HOME
BUILD_PKGS="libcusparse-dev-${CUDA_VERSION_DASH} \
libcublas-dev-${CUDA_VERSION_DASH} \
libcusolver-dev-${CUDA_VERSION_DASH}"; \
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \ if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
uv pip install --system -r /tmp/kv_connectors.txt; \ if [ "$CUDA_MAJOR" -ge 13 ]; then \
fi; \ uv pip install --system nixl-cu13; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ fi; \
BITSANDBYTES_VERSION="0.42.0"; \ uv pip install --system -r /tmp/kv_connectors.txt --no-build || ( \
else \ # if the above fails, install from source
BITSANDBYTES_VERSION="0.46.1"; \ apt-get update -y && \
fi; \ apt-get install -y --no-install-recommends ${BUILD_PKGS} && \
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.3' uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation && \
apt-get purge -y ${BUILD_PKGS} && \
# clean up -dev packages, keep runtime libraries
rm -rf /var/lib/apt/lists/* \
); \
fi
ENV VLLM_USAGE_SOURCE production-docker-image ENV VLLM_USAGE_SOURCE production-docker-image
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
# VLLM_CPU_DISABLE_AVX512=false (default)|true # VLLM_CPU_DISABLE_AVX512=false (default)|true
# VLLM_CPU_AVX512BF16=false (default)|true # VLLM_CPU_AVX512BF16=false (default)|true
# VLLM_CPU_AVX512VNNI=false (default)|true # VLLM_CPU_AVX512VNNI=false (default)|true
# VLLM_CPU_AMXBF16=false (default)|true # VLLM_CPU_AMXBF16=false |true (default)
# #
######################### COMMON BASE IMAGE ######################### ######################### COMMON BASE IMAGE #########################
...@@ -95,7 +95,7 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16} ...@@ -95,7 +95,7 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
ARG VLLM_CPU_AVX512VNNI=0 ARG VLLM_CPU_AVX512VNNI=0
ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI} ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ... # Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
ARG VLLM_CPU_AMXBF16=0 ARG VLLM_CPU_AMXBF16=1
ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16} ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
...@@ -147,7 +147,9 @@ WORKDIR /workspace/vllm ...@@ -147,7 +147,9 @@ WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get install -y --no-install-recommends vim numactl xz-utils apt-get install -y --no-install-recommends vim numactl xz-utils make clangd-14
RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
......
...@@ -22,13 +22,13 @@ RUN microdnf install -y dnf && dnf install -y gcc-toolset-14 make wget unzip \ ...@@ -22,13 +22,13 @@ RUN microdnf install -y dnf && dnf install -y gcc-toolset-14 make wget unzip \
############################################################### ###############################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS centos-deps-builder FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS centos-deps-builder
RUN microdnf install -y dnf && \ RUN microdnf install -y dnf && \
dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \ dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-26.el9.noarch.rpm \
https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \ https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-26.el9.noarch.rpm \
https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
dnf config-manager --set-enabled crb dnf config-manager --set-enabled crb
RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel && \ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel yajl-devel && \
dnf remove -y centos-gpg-keys-9.0-24.el9.noarch centos-stream-repos-9.0-24.el9.noarch dnf remove -y centos-gpg-keys-9.0-24.el9.noarch centos-stream-repos-9.0-26.el9.noarch
############################################################### ###############################################################
...@@ -346,4 +346,4 @@ WORKDIR /workspace/ ...@@ -346,4 +346,4 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["vllm", "serve"] ENTRYPOINT ["vllm", "serve"]
\ No newline at end of file
...@@ -3,6 +3,14 @@ ARG REMOTE_VLLM="0" ...@@ -3,6 +3,14 @@ ARG REMOTE_VLLM="0"
ARG COMMON_WORKDIR=/app ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base ARG BASE_IMAGE=rocm/vllm-dev:base
# Sccache configuration (only used in release pipeline)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
ARG ARG_PYTORCH_ROCM_ARCH ARG ARG_PYTORCH_ROCM_ARCH
...@@ -14,9 +22,14 @@ ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 ...@@ -14,9 +22,14 @@ ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
RUN apt-get update -q -y && apt-get install -q -y \ RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl apt-transport-https ca-certificates wget curl
# Remove sccache
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
ARG USE_SCCACHE
RUN if [ "$USE_SCCACHE" != "1" ]; then \
apt-get purge -y sccache || true; \
python3 -m pip uninstall -y sccache || true; \
rm -f "$(which sccache)" || true; \
fi
# Install UV # Install UV
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
...@@ -28,6 +41,39 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" ...@@ -28,6 +41,39 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts # Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy ENV UV_LINK_MODE=copy
# Install sccache if USE_SCCACHE is enabled (for release builds)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME
ARG SCCACHE_REGION_NAME
ARG SCCACHE_S3_NO_CREDENTIALS
RUN if [ "$USE_SCCACHE" = "1" ]; then \
if command -v sccache >/dev/null 2>&1; then \
echo "sccache already installed, skipping installation"; \
sccache --version; \
else \
echo "Installing sccache..." \
&& SCCACHE_ARCH="x86_64" \
&& SCCACHE_VERSION="v0.8.1" \
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
&& chmod +x /usr/bin/sccache \
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
&& sccache --version; \
fi; \
fi
# Set sccache environment variables only when USE_SCCACHE=1
# This prevents S3 config from leaking into images when sccache is not used
ARG USE_SCCACHE
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR} WORKDIR ${COMMON_WORKDIR}
...@@ -39,6 +85,8 @@ ONBUILD COPY ./ vllm/ ...@@ -39,6 +85,8 @@ ONBUILD COPY ./ vllm/
FROM base AS fetch_vllm_1 FROM base AS fetch_vllm_1
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
ARG VLLM_BRANCH="main" ARG VLLM_BRANCH="main"
ENV VLLM_REPO=${VLLM_REPO}
ENV VLLM_BRANCH=${VLLM_BRANCH}
ONBUILD RUN git clone ${VLLM_REPO} \ ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \ && cd vllm \
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \ && git fetch -v --prune -- origin ${VLLM_BRANCH} \
...@@ -51,7 +99,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm ...@@ -51,7 +99,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# ----------------------- # -----------------------
# vLLM build stages # vLLM build stages
FROM fetch_vllm AS build_vllm FROM fetch_vllm AS build_vllm
# Build vLLM # Build vLLM (setup.py auto-detects sccache in PATH)
RUN cd vllm \ RUN cd vllm \
&& python3 -m pip install -r requirements/rocm.txt \ && python3 -m pip install -r requirements/rocm.txt \
&& python3 setup.py clean --all \ && python3 setup.py clean --all \
...@@ -67,6 +115,178 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ ...@@ -67,6 +115,178 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
# RIXL/UCX build stages
FROM base AS build_rixl
ARG RIXL_BRANCH="f33a5599"
ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
ARG UCX_BRANCH="da3fac2a"
ARG UCX_REPO="https://github.com/ROCm/ucx.git"
ENV ROCM_PATH=/opt/rocm
ENV UCX_HOME=/usr/local/ucx
ENV RIXL_HOME=/usr/local/rixl
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
# RIXL build system dependences and RDMA support
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
libcpprest-dev \
libaio-dev \
librdmacm1 \
librdmacm-dev \
libibverbs1 \
libibverbs-dev \
ibverbs-utils \
rdmacm-utils \
ibverbs-providers \
&& rm -rf /var/lib/apt/lists/*
RUN uv pip install --system meson auditwheel patchelf tomlkit
RUN cd /usr/local/src && \
git clone ${UCX_REPO} && \
cd ucx && \
git checkout ${UCX_BRANCH} && \
./autogen.sh && \
mkdir build && cd build && \
../configure \
--prefix=/usr/local/ucx \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-devel-headers \
--with-rocm=/opt/rocm \
--with-verbs \
--with-dm \
--enable-mt && \
make -j && \
make install
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
RUN git clone ${RIXL_REPO} /opt/rixl && \
cd /opt/rixl && \
git checkout ${RIXL_BRANCH} && \
meson setup build --prefix=${RIXL_HOME} \
-Ducx_path=${UCX_HOME} \
-Drocm_path=${ROCM_PATH} && \
cd build && \
ninja && \
ninja install
# Generate RIXL wheel
RUN cd /opt/rixl && mkdir -p /app/install && \
./contrib/build-wheel.sh \
--output-dir /app/install \
--rocm-dir ${ROCM_PATH} \
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
# -----------------------
# vLLM wheel release build stage (for building distributable wheels)
# This stage pins dependencies to custom ROCm wheel versions and handles version detection
FROM fetch_vllm AS build_vllm_wheel_release
ARG COMMON_WORKDIR
# Create /install directory for custom wheels
RUN mkdir -p /install
# Copy custom ROCm wheels from docker/context if they exist
# COPY ensures Docker cache is invalidated when wheels change
# .keep file ensures directory always exists for COPY to work
COPY docker/context/base-wheels/ /tmp/base-wheels/
# This is how we know if we are building for a wheel release or not.
# If there are not wheels found there, we are not building for a wheel release.
# So we exit with an error. To skip this stage.
RUN if [ -n "$(ls /tmp/base-wheels/*.whl 2>/dev/null)" ]; then \
echo "Found custom wheels - copying to /install"; \
cp /tmp/base-wheels/*.whl /install/ && \
echo "Copied custom wheels:"; \
ls -lh /install/; \
else \
echo "ERROR: No custom wheels found in docker/context/base-wheels/"; \
echo "Wheel releases require pre-built ROCm wheels."; \
exit 1; \
fi
# GIT_REPO_CHECK: Verify repo is clean and tags are available (for release builds)
# This matches CUDA's Dockerfile behavior for proper version detection via setuptools_scm
ARG GIT_REPO_CHECK=0
RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
echo "Running repository checks..."; \
cd vllm && bash tools/check_repo.sh; \
fi
# Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
# This ensures setuptools_scm sees clean repo state for version detection
RUN --mount=type=bind,source=.git,target=vllm/.git \
cd vllm \
&& pip install setuptools_scm \
&& VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
&& echo "Detected vLLM version: ${VLLM_VERSION}" \
&& echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
# Fail if git-based package dependencies are found in requirements files
# (uv doesn't handle git+ URLs well, and packages should be distributed on PyPI)
# Extra notes: pip install is able to handle git+ URLs, but uv doesn't.
RUN echo "Checking for git-based packages in requirements files..." \
&& echo "Checking common.txt for git-based packages:" \
&& if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; then \
echo "ERROR: Git-based packages found in common.txt:"; \
grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; \
echo "Please publish these packages to PyPI instead of using git dependencies."; \
exit 1; \
else \
echo " ✓ No git-based packages found in common.txt"; \
fi \
&& echo "Checking rocm.txt for git-based packages:" \
&& if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; then \
echo "ERROR: Git-based packages found in rocm.txt:"; \
grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; \
echo "Please publish these packages to PyPI instead of using git dependencies."; \
exit 1; \
else \
echo " ✓ No git-based packages found in rocm.txt"; \
fi \
&& echo "All requirements files are clean - no git-based packages found"
# Pin vLLM dependencies to exact versions of custom ROCm wheels
# This ensures 'pip install vllm' automatically installs correct torch/triton/torchvision/amdsmi
COPY tools/vllm-rocm/pin_rocm_dependencies.py /tmp/pin_rocm_dependencies.py
RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
&& python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt
# Install dependencies using custom wheels from /install
RUN cd vllm \
&& echo "Building vLLM with custom wheels from /install" \
&& python3 -m pip install --find-links /install -r requirements/rocm.txt \
&& python3 setup.py clean --all
# Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
# (setup.py auto-detects sccache in PATH)
RUN --mount=type=bind,source=.git,target=vllm/.git \
cd vllm \
&& export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \
&& echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
&& python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm_wheel_release
ARG COMMON_WORKDIR
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/requirements /requirements
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
# ----------------------- # -----------------------
# Test vLLM image # Test vLLM image
FROM base AS test FROM base AS test
...@@ -83,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ ...@@ -83,6 +303,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
&& pip uninstall -y vllm \ && pip uninstall -y vllm \
&& uv pip install --system *.whl && uv pip install --system *.whl
# Install RIXL wheel
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
uv pip install --system /rixl_install/*.whl
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
...@@ -97,6 +321,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -97,6 +321,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1 ENV HF_HUB_ENABLE_HF_TRANSFER=1
# install audio decode package `torchcodec` from source (required due to
# ROCm and torch version mismatch) for tests with datasets package
COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh
RUN bash /tmp/install_torchcodec.sh \
&& rm /tmp/install_torchcodec.sh \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Copy in the v1 package (for python-only install test group) # Copy in the v1 package (for python-only install test group)
COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
...@@ -130,6 +362,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ ...@@ -130,6 +362,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
&& uv pip install --system *.whl && uv pip install --system *.whl
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
ARG BASE_IMAGE
# Copy over the benchmark scripts as well # Copy over the benchmark scripts as well
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
...@@ -144,4 +377,13 @@ ENV SAFETENSORS_FAST_GPU=1 ...@@ -144,4 +377,13 @@ ENV SAFETENSORS_FAST_GPU=1
# Performance environment variable. # Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1 ENV HIP_FORCE_DEV_KERNARG=1
# Workaround for ROCm profiler limits
RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf
ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
CMD ["/bin/bash"] CMD ["/bin/bash"]
#Set entrypoint for vllm-openai official images
FROM final As vllm-openai
ENTRYPOINT ["vllm", "serve"]
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="57c693b6" ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git" ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="1c57644d" ARG PYTORCH_BRANCH="89075173"
ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_BRANCH="v0.24.1"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG PYTORCH_AUDIO_BRANCH="v2.9.0" ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git" ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
ARG FA_BRANCH="0e60e394" ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="59bd8ff2" ARG AITER_BRANCH="6af8b687"
ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_REPO="https://github.com/ROCm/aiter.git"
ARG MORI_BRANCH="2d02c6a9"
ARG MORI_REPO="https://github.com/ROCm/mori.git"
# Sccache configuration (only used in release pipeline)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
...@@ -20,6 +30,7 @@ ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: ...@@ -20,6 +30,7 @@ ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151 ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV AITER_ROCM_ARCH=gfx942;gfx950 ENV AITER_ROCM_ARCH=gfx942;gfx950
ENV MORI_GPU_ARCHS=gfx942;gfx950
# Required for RCCL in ROCm7.1 # Required for RCCL in ROCm7.1
ENV HSA_NO_SCRATCH_RECLAIM=1 ENV HSA_NO_SCRATCH_RECLAIM=1
...@@ -33,7 +44,7 @@ ENV DEBIAN_FRONTEND=noninteractive ...@@ -33,7 +44,7 @@ ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies # Install Python and other dependencies
RUN apt-get update -y \ RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \ && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev \
&& for i in 1 2 3; do \ && for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \ add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
...@@ -50,6 +61,53 @@ RUN apt-get update -y \ ...@@ -50,6 +61,53 @@ RUN apt-get update -y \
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/* RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*
# Install sccache if USE_SCCACHE is enabled (for release builds)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME
ARG SCCACHE_REGION_NAME
ARG SCCACHE_S3_NO_CREDENTIALS
RUN if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& SCCACHE_ARCH="x86_64" \
&& SCCACHE_VERSION="v0.8.1" \
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
&& chmod +x /usr/bin/sccache \
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
&& sccache --version; \
fi
# Setup sccache for HIP compilation via HIP_CLANG_PATH
# This creates wrapper scripts in a separate directory and points HIP to use them
# This avoids modifying the original ROCm binaries which can break detection
# NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm)
# Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1
RUN if [ "$USE_SCCACHE" = "1" ]; then \
echo "Setting up sccache wrappers for HIP compilation..." \
&& mkdir -p /opt/sccache-wrappers \
&& printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
&& chmod +x /opt/sccache-wrappers/clang++ \
&& printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
&& chmod +x /opt/sccache-wrappers/clang \
&& echo "sccache wrappers created in /opt/sccache-wrappers"; \
fi
# Set sccache environment variables only when USE_SCCACHE=1
# This prevents S3 config from leaking into images when sccache is not used
ARG USE_SCCACHE
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
###
### Triton Build
###
FROM base AS build_triton FROM base AS build_triton
ARG TRITON_BRANCH ARG TRITON_BRANCH
ARG TRITON_REPO ARG TRITON_REPO
...@@ -62,11 +120,19 @@ RUN cd triton \ ...@@ -62,11 +120,19 @@ RUN cd triton \
RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \ RUN if [ -d triton/python/triton_kernels ]; then pip install build && cd triton/python/triton_kernels \
&& python3 -m build --wheel && cp dist/*.whl /app/install; fi && python3 -m build --wheel && cp dist/*.whl /app/install; fi
###
### AMD SMI Build
###
FROM base AS build_amdsmi FROM base AS build_amdsmi
RUN cd /opt/rocm/share/amd_smi \ RUN cd /opt/rocm/share/amd_smi \
&& pip wheel . --wheel-dir=dist && pip wheel . --wheel-dir=dist
RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
###
### Pytorch build
###
FROM base AS build_pytorch FROM base AS build_pytorch
ARG PYTORCH_BRANCH ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH ARG PYTORCH_VISION_BRANCH
...@@ -74,42 +140,93 @@ ARG PYTORCH_AUDIO_BRANCH ...@@ -74,42 +140,93 @@ ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_REPO ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_REPO ARG PYTORCH_AUDIO_REPO
ARG USE_SCCACHE
RUN git clone ${PYTORCH_REPO} pytorch RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \ RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
&& pip install -r requirements.txt && git submodule update --init --recursive \ && pip install -r requirements.txt && git submodule update --init --recursive \
&& python3 tools/amd_build/build_amd.py \ && python3 tools/amd_build/build_amd.py \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache \
&& sccache --show-stats; \
fi \
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& pip install dist/*.whl && pip install dist/*.whl
RUN git clone ${PYTORCH_VISION_REPO} vision RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
fi \
&& python3 setup.py bdist_wheel --dist-dir=dist \ && python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& pip install dist/*.whl && pip install dist/*.whl
RUN git clone ${PYTORCH_AUDIO_REPO} audio RUN git clone ${PYTORCH_AUDIO_REPO} audio
RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \ RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
&& git submodule update --init --recursive \ && git submodule update --init --recursive \
&& pip install -r requirements.txt \ && pip install -r requirements.txt \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
fi \
&& python3 setup.py bdist_wheel --dist-dir=dist \ && python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& pip install dist/*.whl && pip install dist/*.whl
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \ && cp /app/vision/dist/*.whl /app/install \
&& cp /app/audio/dist/*.whl /app/install && cp /app/audio/dist/*.whl /app/install
###
### MORI Build
###
FROM base AS build_mori
ARG MORI_BRANCH
ARG MORI_REPO
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN git clone ${MORI_REPO}
RUN cd mori \
&& git checkout ${MORI_BRANCH} \
&& git submodule update --init --recursive \
&& python3 setup.py bdist_wheel --dist-dir=dist && ls /app/mori/dist/*.whl
RUN mkdir -p /app/install && cp /app/mori/dist/*.whl /app/install
###
### FlashAttention Build
###
FROM base AS build_fa FROM base AS build_fa
ARG FA_BRANCH ARG FA_BRANCH
ARG FA_REPO ARG FA_REPO
ARG USE_SCCACHE
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl pip install /install/*.whl
RUN git clone ${FA_REPO} RUN git clone ${FA_REPO}
RUN cd flash-attention \ RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \ && git checkout ${FA_BRANCH} \
&& git submodule update --init \ && git submodule update --init \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist && if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& sccache --show-stats; \
fi \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
###
### AITER Build
###
FROM base AS build_aiter FROM base AS build_aiter
ARG AITER_BRANCH ARG AITER_BRANCH
ARG AITER_REPO ARG AITER_REPO
ARG USE_SCCACHE
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl pip install /install/*.whl
RUN git clone --recursive ${AITER_REPO} RUN git clone --recursive ${AITER_REPO}
...@@ -117,9 +234,37 @@ RUN cd aiter \ ...@@ -117,9 +234,37 @@ RUN cd aiter \
&& git checkout ${AITER_BRANCH} \ && git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \ && git submodule update --init --recursive \
&& pip install -r requirements.txt && pip install -r requirements.txt
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl RUN pip install pyyaml && cd aiter \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& sccache --show-stats; \
fi \
&& PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
###
### Final Build
###
# Wheel release stage -
# only includes dependencies used by wheel release pipeline
FROM base AS debs_wheel_release
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
# Full debs stage - includes Mori (used by Docker releases)
FROM base AS debs FROM base AS debs
RUN mkdir /app/debs RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
...@@ -132,6 +277,8 @@ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ ...@@ -132,6 +277,8 @@ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
cp /install/*.whl /app/debs cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
cp /install/*.whl /app/debs cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_mori,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
FROM base AS final FROM base AS final
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \ RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
...@@ -150,6 +297,8 @@ ARG FA_BRANCH ...@@ -150,6 +297,8 @@ ARG FA_BRANCH
ARG FA_REPO ARG FA_REPO
ARG AITER_BRANCH ARG AITER_BRANCH
ARG AITER_REPO ARG AITER_REPO
ARG MORI_BRANCH
ARG MORI_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
&& echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
...@@ -162,4 +311,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ ...@@ -162,4 +311,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \
\ No newline at end of file && echo "MORI_BRANCH: ${MORI_BRANCH}" >> /app/versions.txt \
&& echo "MORI_REPO: ${MORI_REPO}" >> /app/versions.txt
...@@ -2,7 +2,7 @@ FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base ...@@ -2,7 +2,7 @@ FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
add-apt-repository -y ppa:kobuk-team/intel-graphics add-apt-repository -y ppa:kobuk-team/intel-graphics-staging
RUN apt clean && apt-get update -y && \ RUN apt clean && apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \ apt-get install -y --no-install-recommends --fix-missing \
...@@ -28,10 +28,14 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 ...@@ -28,10 +28,14 @@ RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc
# This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2. # This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.6/intel-oneccl-2021.15.6.9_offline.sh ARG ONECCL_INSTALLER="intel-oneccl-2021.15.7.6_offline.sh"
RUN bash intel-oneccl-2021.15.6.9_offline.sh -a --silent --eula accept && \ RUN wget "https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.7/${ONECCL_INSTALLER}" && \
bash "${ONECCL_INSTALLER}" -a --silent --eula accept && \
rm "${ONECCL_INSTALLER}" && \
echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \ echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \
echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc
RUN rm -f /opt/intel/oneapi/ccl/latest && \
ln -s /opt/intel/oneapi/ccl/2021.15 /opt/intel/oneapi/ccl/latest
SHELL ["bash", "-c"] SHELL ["bash", "-c"]
CMD ["bash", "-c", "source /root/.bashrc && exec bash"] CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
...@@ -47,6 +51,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -47,6 +51,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir \ pip install --no-cache-dir \
-r requirements/xpu.txt -r requirements/xpu.txt
# arctic-inference is built from source which needs torch-xpu properly installed
# used for suffix method speculative decoding
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir arctic-inference==0.1.1
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
COPY . . COPY . .
......
# docker-bake.hcl - vLLM Docker build configuration
#
# This file lives in vLLM repo at docker/docker-bake.hcl
#
# Usage:
# cd docker && docker buildx bake # Build default target (openai)
# cd docker && docker buildx bake test # Build test target
# docker buildx bake --print # Show resolved config
#
# Reference: https://docs.docker.com/build/bake/reference/
# Build configuration
variable "MAX_JOBS" {
default = 16
}
variable "NVCC_THREADS" {
default = 8
}
variable "TORCH_CUDA_ARCH_LIST" {
default = "8.0 8.9 9.0 10.0"
}
variable "COMMIT" {
default = ""
}
# Groups
group "default" {
targets = ["openai"]
}
# Base targets
target "_common" {
dockerfile = "docker/Dockerfile"
context = "."
args = {
max_jobs = MAX_JOBS
nvcc_threads = NVCC_THREADS
torch_cuda_arch_list = TORCH_CUDA_ARCH_LIST
}
}
target "_labels" {
labels = {
"org.opencontainers.image.source" = "https://github.com/vllm-project/vllm"
"org.opencontainers.image.vendor" = "vLLM"
"org.opencontainers.image.title" = "vLLM"
"org.opencontainers.image.description" = "vLLM: A high-throughput and memory-efficient inference and serving engine for LLMs"
"org.opencontainers.image.licenses" = "Apache-2.0"
"org.opencontainers.image.revision" = COMMIT
}
annotations = [
"index,manifest:org.opencontainers.image.revision=${COMMIT}",
]
}
# Build targets
target "test" {
inherits = ["_common", "_labels"]
target = "test"
tags = ["vllm:test"]
output = ["type=docker"]
}
target "openai" {
inherits = ["_common", "_labels"]
target = "vllm-openai"
tags = ["vllm:openai"]
output = ["type=docker"]
}
...@@ -62,7 +62,7 @@ vLLM is flexible and easy to use with: ...@@ -62,7 +62,7 @@ vLLM is flexible and easy to use with:
For more information, check out the following: For more information, check out the following:
- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) - [vLLM announcing blog post](https://blog.vllm.ai/2023/06/20/vllm.html) (intro to PagedAttention)
- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) - [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. - [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
- [vLLM Meetups](community/meetups.md) - [vLLM Meetups](community/meetups.md)
...@@ -72,7 +72,6 @@ Internal data structures. ...@@ -72,7 +72,6 @@ Internal data structures.
- [vllm.multimodal.inputs.MultiModalFieldConfig][] - [vllm.multimodal.inputs.MultiModalFieldConfig][]
- [vllm.multimodal.inputs.MultiModalKwargsItem][] - [vllm.multimodal.inputs.MultiModalKwargsItem][]
- [vllm.multimodal.inputs.MultiModalKwargsItems][] - [vllm.multimodal.inputs.MultiModalKwargsItems][]
- [vllm.multimodal.inputs.MultiModalKwargs][]
- [vllm.multimodal.inputs.MultiModalInputs][] - [vllm.multimodal.inputs.MultiModalInputs][]
### Data Parsing ### Data Parsing
......
...@@ -8,12 +8,19 @@ The results are automatically published to the public [vLLM Performance Dashboar ...@@ -8,12 +8,19 @@ The results are automatically published to the public [vLLM Performance Dashboar
## Manually Trigger the benchmark ## Manually Trigger the benchmark
Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite. Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with vLLM benchmark suite.
For CPU environment, please use the image with "-cpu" postfix. For x86 CPU environment, please use the image with "-cpu" postfix. For AArch64 CPU environment, please use the image with "-arm64-cpu" postfix.
Here is an example for docker run command for CPU. Here is an example for docker run command for CPU. For GPUs skip setting the `ON_CPU` env var.
```bash ```bash
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu export VLLM_COMMIT=1da94e673c257373280026f75ceb4effac80e892 # use full commit hash from the main branch
export HF_TOKEN=<valid Hugging Face token>
if [[ "$(uname -m)" == aarch64 || "$(uname -m)" == arm64 ]]; then
IMG_SUFFIX="arm64-cpu"
else
IMG_SUFFIX="cpu"
fi
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN=$HF_TOKEN -e ON_ARM64_CPU=1 --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}-${IMG_SUFFIX}
``` ```
Then, run below command inside the docker instance. Then, run below command inside the docker instance.
...@@ -26,14 +33,65 @@ When run, benchmark script generates results under **benchmark/results** folder, ...@@ -26,14 +33,65 @@ When run, benchmark script generates results under **benchmark/results** folder,
### Runtime environment variables ### Runtime environment variables
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. - `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file). - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string. - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
For more results visualization, check the [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results). ### Visualization
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
#### Performance Results Comparison
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
Here is an example using the script to compare result_a and result_b with max concurrency and qps for same Model, Dataset name, input/output length.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
***Output Tput (tok/s) — Model : [ meta-llama/Llama-3.1-8B-Instruct ] , Dataset Name : [ random ] , Input Len : [ 2048.0 ] , Output Len : [ 2048.0 ]***
| | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|------|-----|-----------|----------|----------|
| 0 | 12 | inf | 24.98 | 186.03 | 7.45 |
| 1 | 16 | inf| 25.49 | 246.92 | 9.69 |
| 2 | 24 | inf| 27.74 | 293.34 | 10.57 |
| 3 | 32 | inf| 28.61 |306.69 | 10.72 |
***compare-json-results.py – Command-Line Parameters***
compare-json-results.py provides configurable parameters to compare one or more benchmark_results.json files and generate summary tables and plots.
In most cases, users only need to specify --file to parse the desired benchmark results.
| Parameter | Type | Default Value | Description |
| ---------------------- | ------------------ | ----------------------- | ----------------------------------------------------------------------------------------------------- |
| `--file` | `str` (appendable) | *None* | Input JSON result file(s). Can be specified multiple times to compare multiple benchmark outputs. |
| `--debug` | `bool` | `False` | Enables debug mode. When set, prints all available information to aid troubleshooting and validation. |
| `--plot` / `--no-plot` | `bool` | `True` | Controls whether performance plots are generated. Use `--no-plot` to disable graph generation. |
| `--xaxis` | `str` | `# of max concurrency.` | Column name used as the X-axis in comparison plots (for example, concurrency or batch size). |
| `--latency` | `str` | `p99` | Latency aggregation method used for TTFT/TPOT. Supported values: `median` or `p99`. |
| `--ttft-max-ms` | `float` | `3000.0` | Reference upper bound (milliseconds) for TTFT plots, typically used to visualize SLA thresholds. |
| `--tpot-max-ms` | `float` | `100.0` | Reference upper bound (milliseconds) for TPOT plots, typically used to visualize SLA thresholds. |
***Valid Max Concurrency Summary***
Based on the configured TTFT and TPOT SLA thresholds, compare-json-results.py computes the maximum valid concurrency for each benchmark result.
The “Max # of max concurrency. (Both)” column represents the highest concurrency level that satisfies both TTFT and TPOT constraints simultaneously.
This value is typically used in capacity planning and sizing guides.
| # | Configuration | Max # of max concurrency. (TTFT ≤ 10000 ms) | Max # of max concurrency. (TPOT ≤ 100 ms) | Max # of max concurrency. (Both) | Output Tput @ Both (tok/s) | TTFT @ Both (ms) | TPOT @ Both (ms) |
| - | -------------- | ------------------------------------------- | ----------------------------------------- | -------------------------------- | -------------------------- | ---------------- | ---------------- |
| 0 | results-a | 128.00 | 12.00 | 12.00 | 127.76 | 3000.82 | 93.24 |
| 1 | results-b | 128.00 | 32.00 | 32.00 | 371.42 | 2261.53 | 81.74 |
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md). More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).
......
...@@ -129,10 +129,10 @@ vllm bench sweep serve_sla \ ...@@ -129,10 +129,10 @@ vllm bench sweep serve_sla \
The algorithm for adjusting the SLA variable is as follows: The algorithm for adjusting the SLA variable is as follows:
1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable. 1. Run the benchmark once with maximum possible QPS, and once with minimum possible QPS. For each run, calculate the distance of the SLA metrics from their targets, resulting in data points of QPS vs SLA distance.
- For example, the initial request rate is set to the concurrency under infinite QPS. 2. Perform spline interpolation between the data points to estimate the QPS that results in zero SLA distance.
2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied. 3. Run the benchmark with the estimated QPS and add the resulting data point to the history.
3. Apply binary search over the window to find the maximum value that still satisfies the SLA. 4. Repeat Steps 2 and 3 until the maximum QPS that passes SLA and the minimum QPS that fails SLA in the history are close enough to each other.
!!! important !!! important
SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`. SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
......
...@@ -6,4 +6,4 @@ ...@@ -6,4 +6,4 @@
## Arguments ## Arguments
--8<-- "docs/argparse/bench_latency.inc.md" --8<-- "docs/generated/argparse/bench_latency.inc.md"
# vllm bench mm-processor
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"
## Arguments
--8<-- "docs/generated/argparse/bench_mm_processor.inc.md"
...@@ -6,4 +6,4 @@ ...@@ -6,4 +6,4 @@
## Arguments ## Arguments
--8<-- "docs/argparse/bench_serve.inc.md" --8<-- "docs/generated/argparse/bench_serve.inc.md"
...@@ -6,4 +6,4 @@ ...@@ -6,4 +6,4 @@
## Arguments ## Arguments
--8<-- "docs/argparse/bench_sweep_plot.inc.md" --8<-- "docs/generated/argparse/bench_sweep_plot.inc.md"
...@@ -6,4 +6,4 @@ ...@@ -6,4 +6,4 @@
## Arguments ## Arguments
--8<-- "docs/argparse/bench_sweep_plot_pareto.inc.md" --8<-- "docs/generated/argparse/bench_sweep_plot_pareto.inc.md"
...@@ -6,4 +6,4 @@ ...@@ -6,4 +6,4 @@
## Arguments ## Arguments
--8<-- "docs/argparse/bench_sweep_serve.inc.md" --8<-- "docs/generated/argparse/bench_sweep_serve.inc.md"
...@@ -6,4 +6,4 @@ ...@@ -6,4 +6,4 @@
## Arguments ## Arguments
--8<-- "docs/argparse/bench_sweep_serve_sla.inc.md" --8<-- "docs/generated/argparse/bench_sweep_serve_sla.inc.md"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment