Unverified commit 7a3b15e6, authored by Dmitry Tokarev, committed by GitHub
Browse files

feat: VLLM + CUDA 13 (#4997)


Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent aaed4f3b
...@@ -510,7 +510,15 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ ...@@ -510,7 +510,15 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \ cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \ chmod +x /tmp/install_vllm.sh && \
/tmp/install_vllm.sh --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} --cuda-version $CUDA_VERSION /tmp/install_vllm.sh \
--vllm-ref $VLLM_REF \
--max-jobs $MAX_JOBS \
--arch $ARCH \
--installation-dir /opt \
${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} \
${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} \
${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} \
--cuda-version $CUDA_VERSION
ENV LD_LIBRARY_PATH=\ ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\ /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
...@@ -553,8 +561,9 @@ COPY --from=dynamo_base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbin ...@@ -553,8 +561,9 @@ COPY --from=dynamo_base /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbin
COPY --from=dynamo_base /usr/local/cuda/include/ /usr/local/cuda/include/ COPY --from=dynamo_base /usr/local/cuda/include/ /usr/local/cuda/include/
COPY --from=dynamo_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm COPY --from=dynamo_base /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=dynamo_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/ COPY --from=dynamo_base /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
RUN ln -s /usr/local/cuda/lib64/libcublas.so.12 /usr/local/cuda/lib64/libcublas.so RUN CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}" &&\
RUN ln -s /usr/local/cuda/lib64/libcublasLt.so.12 /usr/local/cuda/lib64/libcublasLt.so ln -s /usr/local/cuda/lib64/libcublas.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublas.so &&\
ln -s /usr/local/cuda/lib64/libcublasLt.so.${CUDA_VERSION_MAJOR} /usr/local/cuda/lib64/libcublasLt.so
# DeepGemm runs nvcc for JIT kernel compilation, however the CUDA include path # DeepGemm runs nvcc for JIT kernel compilation, however the CUDA include path
# is not properly set for compilation. Set CPATH to help nvcc find the headers. # is not properly set for compilation. Set CPATH to help nvcc find the headers.
...@@ -587,6 +596,8 @@ ARG PYTHON_VERSION ...@@ -587,6 +596,8 @@ ARG PYTHON_VERSION
# Install Python, build-essential and python3-dev as apt dependencies # Install Python, build-essential and python3-dev as apt dependencies
RUN apt-get update && \ RUN apt-get update && \
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*} &&\
CUDA_VERSION_MINOR=$(echo "${CUDA_VERSION#*.}" | cut -d. -f1) && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
# Python runtime - CRITICAL for virtual environment to work # Python runtime - CRITICAL for virtual environment to work
python${PYTHON_VERSION}-dev \ python${PYTHON_VERSION}-dev \
...@@ -605,7 +616,7 @@ RUN apt-get update && \ ...@@ -605,7 +616,7 @@ RUN apt-get update && \
# prometheus dependencies # prometheus dependencies
ca-certificates \ ca-certificates \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image # DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-12-9 && \ cuda-command-line-tools-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR} && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
USER dynamo USER dynamo
......
...@@ -107,6 +107,10 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" ...@@ -107,6 +107,10 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# for details and reproducer to manually test if the image # for details and reproducer to manually test if the image
# can be updated to later versions. # can be updated to later versions.
VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04" VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04"
VLLM_BASE_IMAGE_TAG_CU13="25.11-cuda13.0-devel-ubuntu24.04"
VLLM_RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
VLLM_RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
VLLM_RUNTIME_IMAGE_TAG_CU13="13.0.2-runtime-ubuntu24.04"
NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base" NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
...@@ -161,6 +165,16 @@ get_options() { ...@@ -161,6 +165,16 @@ get_options() {
missing_requirement "$1" missing_requirement "$1"
fi fi
;; ;;
--cuda-version)
if [ "$2" ]; then
echo "INFO: Setting CUDA_VERSION to $2"
CUDA_VERSION=$2
BUILD_ARGS+=" --build-arg CUDA_VERSION=$2 "
shift
else
missing_requirement "$1"
fi
;;
--tensorrtllm-pip-wheel-dir) --tensorrtllm-pip-wheel-dir)
if [ "$2" ]; then if [ "$2" ]; then
TENSORRTLLM_PIP_WHEEL_DIR=$2 TENSORRTLLM_PIP_WHEEL_DIR=$2
...@@ -316,7 +330,6 @@ get_options() { ...@@ -316,7 +330,6 @@ get_options() {
missing_requirement "$1" missing_requirement "$1"
fi fi
;; ;;
--sccache-region) --sccache-region)
if [ "$2" ]; then if [ "$2" ]; then
SCCACHE_REGION=$2 SCCACHE_REGION=$2
...@@ -379,6 +392,7 @@ get_options() { ...@@ -379,6 +392,7 @@ get_options() {
if [ -z "$BASE_IMAGE_TAG" ]; then if [ -z "$BASE_IMAGE_TAG" ]; then
BASE_IMAGE_TAG=${FRAMEWORK}_BASE_IMAGE_TAG BASE_IMAGE_TAG=${FRAMEWORK}_BASE_IMAGE_TAG
BASE_IMAGE_TAG=${!BASE_IMAGE_TAG} BASE_IMAGE_TAG=${!BASE_IMAGE_TAG}
echo "INFO: Using default base image tag for $FRAMEWORK: $BASE_IMAGE_TAG"
fi fi
if [ -z "$BASE_IMAGE" ]; then if [ -z "$BASE_IMAGE" ]; then
...@@ -386,6 +400,14 @@ get_options() { ...@@ -386,6 +400,14 @@ get_options() {
BASE_IMAGE=${!BASE_IMAGE} BASE_IMAGE=${!BASE_IMAGE}
fi fi
if [[ $FRAMEWORK == "VLLM" ]] && [[ $CUDA_VERSION == "13."* ]]; then
BASE_IMAGE_TAG=$VLLM_BASE_IMAGE_TAG_CU13
BUILD_ARGS+=" --build-arg BASE_IMAGE_TAG=${VLLM_BASE_IMAGE_TAG_CU13} "
RUNTIME_IMAGE_TAG=$VLLM_RUNTIME_IMAGE_TAG_CU13
BUILD_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=${VLLM_RUNTIME_IMAGE_TAG_CU13} "
echo "INFO: Overriding base image tag for vLLM with CUDA 13: $BASE_IMAGE_TAG AND RUNTIME_IMAGE_TAG: $RUNTIME_IMAGE_TAG"
fi
if [ -z "$BASE_IMAGE" ]; then if [ -z "$BASE_IMAGE" ]; then
error "ERROR: Framework $FRAMEWORK without BASE_IMAGE" error "ERROR: Framework $FRAMEWORK without BASE_IMAGE"
fi fi
...@@ -521,17 +543,6 @@ if [[ $FRAMEWORK == "VLLM" ]] && [[ "$PLATFORM" == *"linux/arm64"* ]]; then ...@@ -521,17 +543,6 @@ if [[ $FRAMEWORK == "VLLM" ]] && [[ "$PLATFORM" == *"linux/arm64"* ]]; then
BUILD_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=12.9.0-runtime-ubuntu24.04 " BUILD_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=12.9.0-runtime-ubuntu24.04 "
echo "INFO: Automatically setting RUNTIME_IMAGE_TAG=12.9.0-runtime-ubuntu24.04 for vLLM ARM64" echo "INFO: Automatically setting RUNTIME_IMAGE_TAG=12.9.0-runtime-ubuntu24.04 for vLLM ARM64"
fi fi
if [[ "$BUILD_ARGS" != *"CUDA_VERSION"* ]]; then
BUILD_ARGS+=" --build-arg CUDA_VERSION=129 "
echo "INFO: Automatically setting CUDA_VERSION=129 for vLLM ARM64"
fi
if [[ "$BUILD_ARGS" != *"TORCH_BACKEND"* ]]; then
BUILD_ARGS+=" --build-arg TORCH_BACKEND=cu129 "
echo "INFO: Automatically setting TORCH_BACKEND=cu129 for vLLM ARM64"
fi
fi fi
# Update DOCKERFILE if framework is VLLM # Update DOCKERFILE if framework is VLLM
......
...@@ -11,7 +11,8 @@ ...@@ -11,7 +11,8 @@
set -euo pipefail set -euo pipefail
VLLM_REF="v0.12.0" VLLM_VER="0.12.0"
VLLM_REF="v${VLLM_VER}"
# Basic Configurations # Basic Configurations
ARCH=$(uname -m) ARCH=$(uname -m)
...@@ -19,7 +20,7 @@ MAX_JOBS=16 ...@@ -19,7 +20,7 @@ MAX_JOBS=16
INSTALLATION_DIR=/tmp INSTALLATION_DIR=/tmp
# VLLM and Dependency Configurations # VLLM and Dependency Configurations
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
DEEPGEMM_REF="" DEEPGEMM_REF=""
CUDA_VERSION="12.9" CUDA_VERSION="12.9"
FLASHINF_REF="v0.5.3" FLASHINF_REF="v0.5.3"
...@@ -97,44 +98,76 @@ export CUDA_HOME=/usr/local/cuda ...@@ -97,44 +98,76 @@ export CUDA_HOME=/usr/local/cuda
# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129") # Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')" TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')"
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
echo "=== Installing prerequisites ===" echo "=== Installing prerequisites ==="
uv pip install pip cuda-python uv pip install pip cuda-python
echo "\n=== Configuration Summary ===" echo "\n=== Configuration Summary ==="
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND" echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR" echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"
echo "\n=== Installing LMCache ===" if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
if [ "$ARCH" = "amd64" ]; then echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
# LMCache installation currently fails on arm64 due to CUDA dependency issues echo "\n=== Installing LMCache ==="
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence if [ "$ARCH" = "amd64" ]; then
uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND} # LMCache installation currently fails on arm64 due to CUDA dependency issues
echo "✓ LMCache ${LMCACHE_REF} installed" # Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
echo "✓ LMCache ${LMCACHE_REF} installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
else else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)" echo " FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
fi fi
echo "\n=== Cloning vLLM repository ===" echo "\n=== Cloning vLLM repository ==="
# Clone needed for DeepGEMM and EP kernels install scripts # Clone needed for DeepGEMM and EP kernels install scripts
cd $INSTALLATION_DIR cd $INSTALLATION_DIR
git clone https://github.com/vllm-project/vllm.git vllm git clone https://github.com/vllm-project/vllm.git vllm
cd vllm cd vllm
git checkout $VLLM_REF git checkout $VLLM_REF
# TODO: remove this cherry-pick when vllm is upgraded to > 0.12.0 (when the fix is shipped)
git cherry-pick --no-commit 799804d140fc99ce3964648ba91aaa810cf28fef # nvshmem fix for CUDA 13.0
echo "✓ vLLM repository cloned"
echo "\n=== Installing vLLM & FlashInfer ==="
echo "Installing vLLM $VLLM_REF from PyPI..."
uv pip install vllm[flashinfer,runai]==$VLLM_REF --torch-backend=${TORCH_BACKEND}
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
echo "\n=== Installing vLLM & FlashInfer ==="
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
echo "Installing vLLM $VLLM_REF from PyPI..."
uv pip install vllm[flashinfer,runai]==$VLLM_REF --torch-backend=${TORCH_BACKEND}
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
if [ "$ARCH" = "amd64" ]; then
echo "Installing vLLM $VLLM_REF from GitHub since CUDA 13 x86_64 wheel is only present on GitHub..."
uv pip install \
--index-strategy=unsafe-best-match \
--extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND} \
nixl[cu13]==0.7.1 \
https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_31_x86_64.whl[flashinfer,runai] \
--torch-backend=${TORCH_BACKEND}
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
echo "✓ vLLM installation completed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues, missing aarch64 wheels)"
echo "Building vLLM from source for ${ARCH} architecture..."
echo "Try to install specific PyTorch and other dependencies first"
uv pip install --index-strategy=unsafe-best-match --index https://download.pytorch.org/whl/ -r requirements/cuda.txt
uv pip install setuptools_scm # required to build vLLM from source
MAX_JOBS=${MAX_JOBS} uv pip install -v --no-build-isolation .
fi
else
echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}"
exit 1
fi
echo "✓ vLLM installation completed" echo "✓ vLLM installation completed"
echo "\n=== Installing DeepGEMM ===" echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools cd $INSTALLATION_DIR/vllm/tools
if [ -n "$DEEPGEMM_REF" ]; then if [ -n "$DEEPGEMM_REF" ]; then
bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF" bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
else else
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment