"vllm/vscode:/vscode.git/clone" did not exist on "93b9d9f499982b723c975ba7066af533afd04f08"
Commit 4d3a2c28 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.5' into v0.6.5-dev

parents 92ec5d8e 2d1b9baa
#!/bin/bash #!/bin/bash
set -eux
python_executable=python$1 python_executable=python$1
cuda_home=/usr/local/cuda-$2 cuda_home=/usr/local/cuda-$2
...@@ -8,13 +9,15 @@ PATH=${cuda_home}/bin:$PATH ...@@ -8,13 +9,15 @@ PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
# Install requirements # Install requirements
$python_executable -m pip install wheel packaging $python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
$python_executable -m pip install -r requirements-cuda.txt
# Limit the number of parallel jobs to avoid OOM # Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1 export MAX_JOBS=1
# Make sure release wheels are built for the following architectures # Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
bash tools/check_repo.sh
# Build # Build
$python_executable setup.py bdist_wheel --dist-dir=dist $python_executable setup.py bdist_wheel --dist-dir=dist
#!/bin/bash #!/bin/bash
# Replace '.' with '-' ex: 11.8 -> 11-8 # Replace '.' with '-' ex: 11.8 -> 11-8
cuda_version=$(echo $1 | tr "." "-") cuda_version=$(echo "$1" | tr "." "-")
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004 # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
OS=$(echo $2 | tr -d ".\-") OS=$(echo "$2" | tr -d ".\-")
# Installs CUDA # Installs CUDA
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb"
sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb rm cuda-keyring_1.1-1_all.deb
sudo apt -qq update sudo apt -qq update
sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}"
sudo apt clean sudo apt clean
# Test nvcc # Test nvcc
......
...@@ -6,7 +6,7 @@ cuda_version=$3 ...@@ -6,7 +6,7 @@ cuda_version=$3
# Install torch # Install torch
$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} $python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}"
# Print version information # Print version information
$python_executable --version $python_executable --version
......
name: Lint shell scripts
on:
push:
branches:
- "main"
paths:
- '**/*.sh'
- '.github/workflows/shellcheck.yml'
pull_request:
branches:
- "main"
paths:
- '**/*.sh'
- '.github/workflows/shellcheck.yml'
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
shellcheck:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Check shell scripts"
run: |
tools/shellcheck.sh
name: Lint documentation
on:
push:
branches:
- main
paths:
- "docs/**"
pull_request:
branches:
- main
paths:
- "docs/**"
jobs:
sphinx-lint:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
- name: Linting docs
run: tools/sphinx-lint.sh
name: 'Close inactive issues and PRs'
on:
schedule:
# Daily at 1:30 AM UTC
- cron: '30 1 * * *'
jobs:
close-issues-and-pull-requests:
permissions:
issues: write
pull-requests: write
actions: write
runs-on: ubuntu-latest
steps:
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
operations-per-run: 1000
exempt-draft-pr: true
exempt-issue-labels: 'keep-open'
exempt-pr-labels: 'keep-open'
labels-to-add-when-unstale: 'unstale'
labels-to-remove-when-stale: 'unstale'
days-before-issue-stale: 90
days-before-issue-close: 30
stale-issue-label: 'stale'
stale-issue-message: >
This issue has been automatically marked as stale because it has not
had any activity within 90 days. It will be automatically closed if no
further activity occurs within 30 days. Leave a comment if
you feel this issue should remain open. Thank you!
close-issue-message: >
This issue has been automatically closed due to inactivity. Please
feel free to reopen if you feel it is still relevant. Thank you!
days-before-pr-stale: 90
days-before-pr-close: 30
stale-pr-label: 'stale'
stale-pr-message: >
This pull request has been automatically marked as stale because it
has not had any activity within 90 days. It will be automatically
closed if no further activity occurs within 30 days. Leave a comment
if you feel this pull request should remain open. Thank you!
close-pr-message: >
This pull request has been automatically closed due to inactivity.
Please feel free to reopen if you intend to continue working on it.
Thank you!
...@@ -6,26 +6,33 @@ on: ...@@ -6,26 +6,33 @@ on:
push: push:
branches: branches:
- main - main
paths:
- "**/*.py"
- .github/workflows/yapf.yml
pull_request: pull_request:
branches: branches:
- main - main
paths:
- "**/*.py"
- .github/workflows/yapf.yml
jobs: jobs:
yapf: yapf:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] python-version: ["3.12"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2 uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install yapf==0.32.0 pip install yapf==0.32.0
pip install toml==0.10.2 pip install toml==0.10.2
- name: Running yapf - name: Running yapf
run: | run: |
yapf --diff --recursive . yapf --diff --recursive .
...@@ -33,6 +33,7 @@ share/python-wheels/ ...@@ -33,6 +33,7 @@ share/python-wheels/
.installed.cfg .installed.cfg
*.egg *.egg
MANIFEST MANIFEST
/.deps/
# PyInstaller # PyInstaller
# Usually these files are written by a python script from a template # Usually these files are written by a python script from a template
...@@ -198,3 +199,7 @@ hip_compat.h ...@@ -198,3 +199,7 @@ hip_compat.h
# Benchmark dataset # Benchmark dataset
benchmarks/*.json benchmarks/*.json
# Linting
actionlint
shellcheck*/
...@@ -6,17 +6,16 @@ version: 2 ...@@ -6,17 +6,16 @@ version: 2
build: build:
os: ubuntu-22.04 os: ubuntu-22.04
tools: tools:
python: "3.8" python: "3.12"
sphinx: sphinx:
configuration: docs/source/conf.py configuration: docs/source/conf.py
fail_on_warning: true fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF # If using Sphinx, optionally build your docs in additional formats such as PDF
formats: formats: []
- pdf
# Optionally declare the Python requirements required to build your docs # Optionally declare the Python requirements required to build your docs
python: python:
install: install:
- requirements: docs/requirements-docs.txt - requirements: docs/requirements-docs.txt
# rules currently disabled:
#
# SC1091 (info): Not following: <sourced file> was not specified as input (see shellcheck -x)
# SC2004 (style): $/${} is unnecessary on arithmetic variables.
# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects.
# SC2155 (warning): Declare and assign separately to avoid masking return values.
# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
#
disable=SC1091,SC2004,SC2129,SC2155,SC2164
...@@ -35,13 +35,13 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) ...@@ -35,13 +35,13 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the # Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py. # first match will be selected. These should be kept in sync with setup.py.
# #
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12") set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures. # Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
# Supported hcu architectures. # Supported hcu architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx926;gfx928;gfx936") set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;;gfx1101;gfx906;gfx926;gfx928;gfx936")
# #
# Supported/expected torch versions for CUDA/ROCm. # Supported/expected torch versions for CUDA/ROCm.
...@@ -53,8 +53,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 ...@@ -53,8 +53,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch # requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm # versions are derived from Dockerfile.rocm
# #
set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0") set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
# #
# Try to find python package with an executable that exactly matches # Try to find python package with an executable that exactly matches
...@@ -87,24 +87,6 @@ endif() ...@@ -87,24 +87,6 @@ endif()
# #
find_package(Torch REQUIRED) find_package(Torch REQUIRED)
#
message(STATUS "Enabling core extension.")
# Define _core_C extension
# built for (almost) every target platform, (excludes TPU and Neuron)
set(VLLM_EXT_SRC
"csrc/core/torch_bindings.cpp")
define_gpu_extension_target(
_core_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
USE_SABI 3
WITH_SOABI)
# #
# Forward the non-CUDA device extensions to external CMake scripts. # Forward the non-CUDA device extensions to external CMake scripts.
# #
...@@ -147,15 +129,33 @@ else() ...@@ -147,15 +129,33 @@ else()
message(FATAL_ERROR "Can't find CUDA or HIP installation.") message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif() endif()
#
# Override the GPU architectures detected by cmake/torch and filter them by if(VLLM_GPU_LANG STREQUAL "CUDA")
# the supported versions for the current language. #
# The final set of arches is stored in `VLLM_GPU_ARCHES`. # For cuda we want to be able to control which architectures we compile for on
# # a per-file basis in order to cut down on compile time. So here we extract
override_gpu_arches(VLLM_GPU_ARCHES # the set of architectures we want to compile for and remove the from the
${VLLM_GPU_LANG} # CMAKE_CUDA_FLAGS so that they are not applied globally.
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") #
clear_cuda_arches(CUDA_ARCH_FLAGS)
extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported supported archs
# since for some files we will build for all CUDA_ARCHS.
cuda_archs_loose_intersection(CUDA_ARCHS
"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
#
# For other GPU targets override the GPU architectures detected by cmake/torch
# and filter them by the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
${VLLM_GPU_LANG}
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
endif()
# #
# Query torch for additional GPU compilation flags for the given # Query torch for additional GPU compilation flags for the given
# `VLLM_GPU_LANG`. # `VLLM_GPU_LANG`.
...@@ -170,7 +170,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -170,7 +170,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif() endif()
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
# Each dependency that produces build artifacts should override its BINARY_DIR to avoid
# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/<dependency>.
#
include(FetchContent) include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
# #
# Define other extension targets # Define other extension targets
...@@ -182,7 +191,8 @@ include(FetchContent) ...@@ -182,7 +191,8 @@ include(FetchContent)
set(VLLM_EXT_SRC set(VLLM_EXT_SRC
"csrc/cache_kernels.cu" "csrc/cache_kernels.cu"
"csrc/attention/attention_kernels.cu" "csrc/attention/paged_attention_v1.cu"
"csrc/attention/paged_attention_v2.cu"
"csrc/pos_encoding_kernels.cu" "csrc/pos_encoding_kernels.cu"
"csrc/pos_encoding_tgi_kernels.cu" "csrc/pos_encoding_tgi_kernels.cu"
"csrc/activation_kernels.cu" "csrc/activation_kernels.cu"
...@@ -192,11 +202,13 @@ set(VLLM_EXT_SRC ...@@ -192,11 +202,13 @@ set(VLLM_EXT_SRC
"csrc/attention/attention_kernels_opt.cu" "csrc/attention/attention_kernels_opt.cu"
"csrc/attention/attention_kernels_opt_tc.cu" "csrc/attention/attention_kernels_opt_tc.cu"
"csrc/opt/layernorm_kernels_opt.cu" "csrc/opt/layernorm_kernels_opt.cu"
"csrc/layernorm_quant_kernels.cu"
# "csrc/quantization/gptq/q_gemm.cu" # "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
# "csrc/quantization/fp8/common.cu" # "csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu" "csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/prepare_inputs/advance_step.cu" "csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp" "csrc/torch_bindings.cpp"
"csrc/attention/attention_with_mask_kernels.cu" "csrc/attention/attention_with_mask_kernels.cu"
...@@ -209,7 +221,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -209,7 +221,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use") set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
FetchContent_Declare( # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR})
endif()
if(VLLM_CUTLASS_SRC_DIR)
if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR)
get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE)
endif()
message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation")
FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR})
else()
FetchContent_Declare(
cutlass cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG v3.5.1 GIT_TAG v3.5.1
...@@ -219,7 +243,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -219,7 +243,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW TRUE GIT_SHALLOW TRUE
) )
endif()
FetchContent_MakeAvailable(cutlass) FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC list(APPEND VLLM_EXT_SRC
...@@ -227,30 +252,88 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -227,30 +252,88 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu" "csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu" "csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu" "csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_SRCS}"
CUDA_ARCHS "${MARLIN_ARCHS}")
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
else()
message(STATUS "Not building Marlin kernels as no compatible archs found"
" in CUDA target architectures")
endif()
#
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 quantized models on "
"Hopper.")
else()
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
endif()
# #
# The CUTLASS kernels for Hopper require sm90a to be enabled. # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a. # kernels for the remaining archs that are not already built for 3x.
# That adds an extra 17MB to compiled binary, so instead we selectively enable it. cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
set_source_files_properties( # subtract out the archs that are already built for 3x
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
PROPERTIES if (SCALED_MM_2X_ARCHS)
COMPILE_FLAGS set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
"-gencode arch=compute_90a,code=sm_90a") set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
else()
if (SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c2x as all archs are already built"
" for and covered by scaled_mm_c3x")
else()
message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
"in CUDA target architectures")
endif()
endif() endif()
...@@ -258,47 +341,72 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -258,47 +341,72 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Machete kernels # Machete kernels
# The machete kernels only work on hopper and require CUDA 12.0 or later. # The machete kernels only work on hopper and require CUDA 12.0 or later.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) # Only build Machete kernels if we are building for something compatible with sm90a
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
# #
# For the Machete kernels we automatically generate sources for various # For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules. # preselected input type pairs and schedules.
# Generate sources: # Generate sources:
execute_process( set(MACHETE_GEN_SCRIPT
COMMAND ${CMAKE_COMMAND} -E env ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py
RESULT_VARIABLE machete_generation_result message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
OUTPUT_VARIABLE machete_generation_output message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
) OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process(
if (NOT machete_generation_result EQUAL 0) COMMAND ${CMAKE_COMMAND} -E env
message(FATAL_ERROR "Machete generation failed." PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
" Result: \"${machete_generation_result}\"" ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
"\nCheck the log for details: " RESULT_VARIABLE machete_generation_result
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") OUTPUT_VARIABLE machete_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
)
if (NOT machete_generation_result EQUAL 0)
message(FATAL_ERROR "Machete generation failed."
" Result: \"${machete_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
else()
set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
CACHE STRING "Last run machete generate script hash" FORCE)
message(STATUS "Machete generation completed successfully.")
endif()
else() else()
message(STATUS "Machete generation completed successfully.") message(STATUS "Machete generation script has not changed, skipping generation.")
endif() endif()
# Add machete generated sources # Add machete generated sources
file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu") file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES}) list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
message(STATUS "Machete generated sources: ${MACHETE_GEN_SOURCES}")
set_source_files_properties( # forward compatible
${MACHETE_GEN_SOURCES} set_gencode_flags_for_srcs(
PROPERTIES SRCS "${MACHETE_GEN_SOURCES}"
COMPILE_FLAGS CUDA_ARCHS "${MACHETE_ARCHS}")
"-gencode arch=compute_90a,code=sm_90a")
list(APPEND VLLM_EXT_SRC
csrc/quantization/machete/machete_pytorch.cu)
message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
AND MACHETE_ARCHS)
message(STATUS "Not building Machete kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running w4a16 quantized models on "
"Hopper.")
else()
message(STATUS "Not building Machete kernels as no compatible archs "
"found in CUDA target architectures")
endif()
endif() endif()
# if CUDA endif
# Add pytorch binding for machete (add on even CUDA < 12.0 so that we can
# raise an error if the user that this was built with an incompatible
# CUDA version)
list(APPEND VLLM_EXT_SRC
csrc/quantization/machete/machete_pytorch.cu)
endif() endif()
message(STATUS "Enabling C extension.") message(STATUS "Enabling C extension.")
...@@ -313,8 +421,8 @@ define_gpu_extension_target( ...@@ -313,8 +421,8 @@ define_gpu_extension_target(
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses # If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the # cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA. # driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly. # Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
...@@ -325,16 +433,36 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) ...@@ -325,16 +433,36 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp" "csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu") "csrc/moe/topk_softmax_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_MOE_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
"csrc/moe/marlin_kernels/marlin_moe_kernel.h" if (MARLIN_MOE_ARCHS)
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h" set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu" "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h" "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu" "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_moe_ops.cu") "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
"csrc/moe/marlin_moe_ops.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
" in CUDA target architectures")
endif()
endif() endif()
message(STATUS "Enabling moe extension.") message(STATUS "Enabling moe extension.")
...@@ -374,6 +502,17 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda") ...@@ -374,6 +502,17 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
return() return()
endif () endif ()
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
string(REPLACE "." "" _ARCH "${_ARCH}")
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
endforeach()
endif()
# #
# Build vLLM flash attention from source # Build vLLM flash attention from source
# #
...@@ -400,8 +539,10 @@ else() ...@@ -400,8 +539,10 @@ else()
FetchContent_Declare( FetchContent_Declare(
vllm-flash-attn vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
) )
]] ]]
endif() endif()
......
# Contributing to vLLM # Contributing to vLLM
Thank you for your interest in contributing to vLLM! You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
There are several ways you can contribute to the project:
- Identify and report any issues or bugs.
- Request or add a new model.
- Suggest or implement new features.
However, remember that contributions aren't just about code.
We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
Talk about it in your blog posts, highlighting how it's driving your incredible projects.
Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
## Setup for development
### Build from source
```bash
pip install -e . # This may take several minutes.
```
### Testing
```bash
pip install -r requirements-dev.txt
# linting and formatting
bash format.sh
# Static type checking
mypy
# Unit tests
pytest tests/
```
**Note:** Currently, the repository does not pass the mypy tests.
## Contributing Guidelines
### Issue Reporting
If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
If not, please file a new issue, providing as much relevant information as possible.
### Pull Requests & Code Reviews
Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution.
### Thank You
Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
Your contributions make vLLM a great tool for everyone!
Developer Certificate of Origin
Version 1.1
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
Everyone is permitted to copy and distribute verbatim copies of this
license document, but changing it is not allowed.
Developer's Certificate of Origin 1.1
By making a contribution to this project, I certify that:
(a) The contribution was created in whole or in part by me and I
have the right to submit it under the open source license
indicated in the file; or
(b) The contribution is based upon previous work that, to the best
of my knowledge, is covered under an appropriate open source
license and I have the right under that license to submit that
work with modifications, whether created in whole or in part
by me, under the same open source license (unless I am
permitted to submit under a different license), as indicated
in the file; or
(c) The contribution was provided directly to me by some other
person who certified (a), (b) or (c) and I have not modified
it.
(d) I understand and agree that this project and the contribution
are public and that a record of the contribution (including all
personal information I submit with it, including my sign-off) is
maintained indefinitely and may be redistributed consistent with
this project or the open source license(s) involved.
...@@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1 ...@@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1 ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies # Install Python and other dependencies
...@@ -27,6 +28,14 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -27,6 +28,14 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
# Workaround for https://github.com/openai/triton/issues/2507 and # Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image # this won't be needed for future versions of this docker image
...@@ -38,9 +47,14 @@ WORKDIR /workspace ...@@ -38,9 +47,14 @@ WORKDIR /workspace
# install build and runtime dependencies # install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt COPY requirements-cuda.txt requirements-cuda.txt
COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt python3 -m pip install -r requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install -r requirements-cuda-arm64.txt; \
fi
# cuda arch list used by torch # cuda arch list used by torch
# can be useful for both `dev` and `test` # can be useful for both `dev` and `test`
...@@ -55,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} ...@@ -55,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### WHEEL BUILD IMAGE #################### #################### WHEEL BUILD IMAGE ####################
FROM base AS build FROM base AS build
ARG TARGETPLATFORM
# install build dependencies # install build dependencies
COPY requirements-build.txt requirements-build.txt COPY requirements-build.txt requirements-build.txt
...@@ -62,15 +77,15 @@ COPY requirements-build.txt requirements-build.txt ...@@ -62,15 +77,15 @@ COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt python3 -m pip install -r requirements-build.txt
# files and directories related to build wheels RUN --mount=type=cache,target=/root/.cache/pip \
COPY csrc csrc if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
COPY setup.py setup.py python3 -m pip install -r requirements-cuda-arm64.txt; \
COPY cmake cmake fi
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt COPY . .
COPY requirements-cuda.txt requirements-cuda.txt ARG GIT_REPO_CHECK=0
COPY pyproject.toml pyproject.toml RUN --mount=type=bind,source=.git,target=.git \
COPY vllm vllm if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# max jobs used by Ninja to build extensions # max jobs used by Ninja to build extensions
ARG max_jobs=2 ARG max_jobs=2
...@@ -131,15 +146,18 @@ COPY requirements-test.txt requirements-test.txt ...@@ -131,15 +146,18 @@ COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt python3 -m pip install -r requirements-dev.txt
#################### DEV IMAGE #################### #################### DEV IMAGE ####################
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
# image with vLLM installed # image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.4.1 ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM
COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
...@@ -165,16 +183,24 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -165,16 +183,24 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
# or future versions of triton. # or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# install vllm wheel first, so that torch etc will be installed # Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose python3 -m pip install dist/*.whl --verbose
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl pip uninstall -y torch && \
#################### vLLM installation IMAGE #################### python3 -m pip install -r requirements-cuda-arm64.txt; \
fi
RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
fi
COPY examples examples
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE #################### #################### TEST IMAGE ####################
# image to run unit testing suite # image to run unit testing suite
...@@ -187,13 +213,24 @@ ADD . /vllm-workspace/ ...@@ -187,13 +213,24 @@ ADD . /vllm-workspace/
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt python3 -m pip install -r requirements-dev.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
# Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
# doc requires source code # doc requires source code
# we hide them inside `test_docs/` , so that this source code # we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests # will not be imported by other tests
RUN mkdir test_docs RUN mkdir test_docs
RUN mv docs test_docs/ RUN mv docs test_docs/
RUN mv vllm test_docs/ RUN mv vllm test_docs/
#################### TEST IMAGE #################### #################### TEST IMAGE ####################
#################### OPENAI API SERVER #################### #################### OPENAI API SERVER ####################
...@@ -202,8 +239,11 @@ FROM vllm-base AS vllm-openai ...@@ -202,8 +239,11 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server # install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10 if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \
else \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
fi
ENV VLLM_USAGE_SOURCE production-docker-image ENV VLLM_USAGE_SOURCE production-docker-image
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
......
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
FROM ubuntu:22.04 AS cpu-test-arm
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores
# Set LD_PRELOAD for tcmalloc on ARM
ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
RUN echo 'ulimit -c 0' >> ~/.bashrc
WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt
FROM cpu-test-arm AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Disabling AVX512 specific optimizations for ARM
ARG VLLM_CPU_DISABLE_AVX512="true"
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
\ No newline at end of file
...@@ -16,35 +16,23 @@ RUN --mount=type=cache,target=/var/cache/apt \ ...@@ -16,35 +16,23 @@ RUN --mount=type=cache,target=/var/cache/apt \
# intel-openmp provides additional performance improvement vs. openmp # intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects. # tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp pip install intel-openmp==2025.0.1
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so" ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
RUN echo 'ulimit -c 0' >> ~/.bashrc RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl RUN pip install intel_extension_for_pytorch==2.5.0
WORKDIR /workspace WORKDIR /workspace
ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \ pip install --upgrade pip && \
pip install -r requirements-build.txt pip install -r requirements-build.txt
# install oneDNN
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
RUN --mount=type=cache,target=/root/.cache/ccache \
cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
-DONEDNN_BUILD_DOC=OFF \
-DONEDNN_BUILD_EXAMPLES=OFF \
-DONEDNN_BUILD_TESTS=OFF \
-DONEDNN_BUILD_GRAPH=OFF \
-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
cmake --build ./oneDNN/build --target install --config Release
FROM cpu-test-1 AS build FROM cpu-test-1 AS build
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
...@@ -54,7 +42,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -54,7 +42,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt pip install -v -r requirements-cpu.txt
COPY ./ ./ COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512 ARG VLLM_CPU_DISABLE_AVX512
...@@ -71,4 +62,8 @@ WORKDIR /workspace/ ...@@ -71,4 +62,8 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements-hpu.txt
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
# default base image # default base image
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04" # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04"
FROM $BASE_IMAGE FROM $BASE_IMAGE
...@@ -17,7 +18,7 @@ RUN apt-get update && \ ...@@ -17,7 +18,7 @@ RUN apt-get update && \
# When launching the container, mount the code directory to /app # When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ] VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT} WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
...@@ -25,17 +26,20 @@ RUN python3 -m pip install sentencepiece transformers==4.36.2 -U ...@@ -25,17 +26,20 @@ RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
COPY . /app/vllm COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN cd /app/vllm \ RUN python3 -m pip install -U \
&& python3 -m pip install -U \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-neuron.txt -r requirements-neuron.txt
ENV VLLM_TARGET_DEVICE neuron ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \ RUN --mount=type=bind,source=.git,target=.git \
cd /app/vllm \ pip install --no-build-isolation -v -e .
&& pip install --no-build-isolation -v -e . \
&& cd .. # install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
CMD ["/bin/bash"] CMD ["/bin/bash"]
...@@ -9,23 +9,20 @@ RUN apt-get update -y && \ ...@@ -9,23 +9,20 @@ RUN apt-get update -y && \
ffmpeg libsm6 libxext6 libgl1 ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace WORKDIR /workspace
# copy requirements COPY . .
COPY requirements-build.txt /workspace/vllm/ ARG GIT_REPO_CHECK=0
COPY requirements-common.txt /workspace/vllm/ RUN --mount=type=bind,source=.git,target=.git \
COPY requirements-openvino.txt /workspace/vllm/ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
COPY vllm/ /workspace/vllm/vllm
COPY csrc/core /workspace/vllm/csrc/core
COPY cmake/utils.cmake /workspace/vllm/cmake/
COPY CMakeLists.txt /workspace/vllm/
COPY setup.py /workspace/vllm/
# install build requirements # install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
# build vLLM with OpenVINO backend # build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
COPY examples/ /workspace/examples
COPY benchmarks/ /workspace/benchmarks
COPY examples/ /workspace/vllm/examples # install development dependencies (for testing)
COPY benchmarks/ /workspace/vllm/benchmarks RUN python3 -m pip install -e tests/vllm_test_utils
CMD ["/bin/bash"] CMD ["/bin/bash"]
...@@ -14,11 +14,14 @@ RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p ...@@ -14,11 +14,14 @@ RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
# These packages will be in rocketce eventually # These packages will be in rocketce eventually
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
torch==2.3.1 \ torch==2.3.1 \
-r requirements-cpu.txt \ -r requirements-cpu.txt \
xformers uvloop==0.20.0 xformers uvloop==0.20.0
...@@ -26,8 +29,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -26,8 +29,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=bind,source=.git,target=.git \ RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install VLLM_TARGET_DEVICE=cpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
WORKDIR /workspace/ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment