Unverified Commit e360316a authored by Matthew Bonanni's avatar Matthew Bonanni Committed by GitHub
Browse files

Add DeepGEMM to Dockerfile in vllm-base image (#21533)


Signed-off-by: default avatarMatthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: default avatarmgoin <mgoin64@gmail.com>
Co-authored-by: default avatarmgoin <mgoin64@gmail.com>
parent c3e0e933
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
......@@ -16,6 +15,7 @@ ARG PYTHON_VERSION=3.12
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# TODO: Restore to base image after FlashInfer AOT wheel fixed
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
......@@ -289,7 +289,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
#################### vLLM installation IMAGE ####################
# image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM ${FINAL_BASE_IMAGE} AS vllm-base
ARG CUDA_VERSION
ARG PYTHON_VERSION
......@@ -435,6 +434,33 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Install DeepGEMM from source
ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1"
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
CUDA_MAJOR="${CUDA_VERSION%%.*}"
CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}"
CUDA_MINOR="${CUDA_MINOR%%.*}"
if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then
git clone --recursive --shallow-submodules \
${DEEPGEMM_GIT_REPO} deepgemm
echo "🏗️ Building DeepGEMM"
pushd deepgemm
git checkout ${DEEPGEMM_GIT_REF}
# Build DeepGEMM
# (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh)
rm -rf build dist
rm -rf *.egg-info
python3 setup.py bdist_wheel
uv pip install --system dist/*.whl
popd
rm -rf deepgemm
else
echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
fi
BASH
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
......
......@@ -20,7 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel)
from vllm.platforms import current_platform
from vllm.utils import has_deep_ep, has_deep_gemm
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used,
is_deep_gemm_supported)
from .parallel_utils import ProcessGroupInfo, parallel_launch
from .utils import make_test_weights
......@@ -46,7 +47,7 @@ requires_deep_ep = pytest.mark.skipif(
)
requires_deep_gemm = pytest.mark.skipif(
not has_deep_gemm(),
not is_deep_gemm_supported(),
reason="Requires deep_gemm kernels",
)
......
......@@ -15,13 +15,13 @@ import torch
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8)
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8
from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported,
per_block_cast_to_fp8)
BLOCK_SIZE = [128, 128]
requires_deep_gemm = pytest.mark.skipif(
not has_deep_gemm(),
not is_deep_gemm_supported(),
reason="Requires deep_gemm kernels",
)
......
......@@ -17,6 +17,17 @@ from vllm.platforms import current_platform
from vllm.utils import has_deep_gemm
@functools.cache
def is_deep_gemm_supported() -> bool:
"""Return ``True`` if DeepGEMM is supported on the current platform.
Currently, only Hopper and Blackwell GPUs are supported.
"""
supported_arch = current_platform.is_cuda() and (
current_platform.is_device_capability(90)
or current_platform.is_device_capability(100))
return has_deep_gemm() and supported_arch
@functools.cache
def is_blackwell_deep_gemm_used() -> bool:
"""Return ``True`` if vLLM is configured to use DeepGEMM on a
......@@ -142,4 +153,5 @@ __all__ = [
"fp8_m_grouped_gemm_nt_masked",
"per_block_cast_to_fp8",
"is_blackwell_deep_gemm_used",
"is_deep_gemm_supported",
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment