Commit ad385667 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.6.3.post1-dev'

parents be0967c1 903593d3
name: Remove ready Label on notready Comment
on:
issue_comment:
types: [created]
jobs:
add-ready-label:
runs-on: ubuntu-latest
if: github.event.issue.pull_request && contains(github.event.comment.body, '/notready')
steps:
- name: Remove ready label
uses: actions/github-script@v5
with:
script: |
github.rest.issues.removeLabel({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
name: 'ready'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
...@@ -17,18 +17,18 @@ jobs: ...@@ -17,18 +17,18 @@ jobs:
matrix: matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2 pip install -r requirements-lint.txt
- name: Analysing the code with ruff - name: Analysing the code with ruff
run: | run: |
ruff . ruff check .
- name: Spelling check with codespell - name: Spelling check with codespell
run: | run: |
codespell --toml pyproject.toml codespell --toml pyproject.toml
......
#!/bin/bash #!/bin/bash
set -eux
python_executable=python$1 python_executable=python$1
cuda_home=/usr/local/cuda-$2 cuda_home=/usr/local/cuda-$2
...@@ -8,12 +9,15 @@ PATH=${cuda_home}/bin:$PATH ...@@ -8,12 +9,15 @@ PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
# Install requirements # Install requirements
$python_executable -m pip install wheel packaging $python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
$python_executable -m pip install -r requirements-cuda.txt
# Limit the number of parallel jobs to avoid OOM # Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1 export MAX_JOBS=1
# Make sure release wheels are built for the following architectures # Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
bash tools/check_repo.sh
# Build # Build
$python_executable setup.py bdist_wheel --dist-dir=dist $python_executable setup.py bdist_wheel --dist-dir=dist
...@@ -16,9 +16,9 @@ jobs: ...@@ -16,9 +16,9 @@ jobs:
matrix: matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2 uses: actions/setup-python@v5
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
- name: Install dependencies - name: Install dependencies
......
# vllm commit id, generated by setup.py # version file generated by setuptools-scm
vllm/commit_id.py /vllm/_version.py
# vllm-flash-attn built from source
vllm/vllm_flash_attn/
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
...@@ -12,6 +15,8 @@ __pycache__/ ...@@ -12,6 +15,8 @@ __pycache__/
# Distribution / packaging # Distribution / packaging
.Python .Python
build/ build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/ develop-eggs/
dist/ dist/
downloads/ downloads/
...@@ -28,6 +33,7 @@ share/python-wheels/ ...@@ -28,6 +33,7 @@ share/python-wheels/
.installed.cfg .installed.cfg
*.egg *.egg
MANIFEST MANIFEST
/.deps/
# PyInstaller # PyInstaller
# Usually these files are written by a python script from a template # Usually these files are written by a python script from a template
...@@ -87,6 +93,9 @@ target/ ...@@ -87,6 +93,9 @@ target/
profile_default/ profile_default/
ipython_config.py ipython_config.py
# generated files
**/generated/**
# pyenv # pyenv
# For a library or package, you might want to ignore these files since the code is # For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in: # intended to run in multiple environments; otherwise, check them in:
...@@ -189,4 +198,7 @@ _build/ ...@@ -189,4 +198,7 @@ _build/
hip_compat.h hip_compat.h
# Benchmark dataset # Benchmark dataset
*.json benchmarks/*.json
# Linting
actionlint
...@@ -13,10 +13,10 @@ sphinx: ...@@ -13,10 +13,10 @@ sphinx:
fail_on_warning: true fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF # If using Sphinx, optionally build your docs in additional formats such as PDF
formats: formats: []
- pdf
# Optionally declare the Python requirements required to build your docs # Optionally declare the Python requirements required to build your docs
python: python:
install: install:
- requirements: docs/requirements-docs.txt - requirements: docs/requirements-docs.txt
cmake_minimum_required(VERSION 3.21) cmake_minimum_required(VERSION 3.26)
# When building directly using CMake, make sure you run the install step
# (it places the .so files in the correct location).
#
# Example:
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
# cmake --build . --target install
#
# If you want to only build one target, make sure to install it manually:
# cmake --build . --target _C
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX) project(vllm_extensions LANGUAGES CXX)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
...@@ -12,6 +23,14 @@ message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") ...@@ -12,6 +23,14 @@ message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
add_compile_options(-w) add_compile_options(-w)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")
# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# #
# Supported python versions. These versions will be searched in order, the # Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py. # first match will be selected. These should be kept in sync with setup.py.
...@@ -69,19 +88,6 @@ endif() ...@@ -69,19 +88,6 @@ endif()
find_package(Torch REQUIRED) find_package(Torch REQUIRED)
# #
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
message(STATUS "Enabling core extension.") message(STATUS "Enabling core extension.")
# Define _core_C extension # Define _core_C extension
...@@ -99,8 +105,6 @@ define_gpu_extension_target( ...@@ -99,8 +105,6 @@ define_gpu_extension_target(
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
add_dependencies(default _core_C)
# #
# Forward the non-CUDA device extensions to external CMake scripts. # Forward the non-CUDA device extensions to external CMake scripts.
# #
...@@ -143,14 +147,32 @@ else() ...@@ -143,14 +147,32 @@ else()
message(FATAL_ERROR "Can't find CUDA or HIP installation.") message(FATAL_ERROR "Can't find CUDA or HIP installation.")
endif() endif()
#
# Override the GPU architectures detected by cmake/torch and filter them by if(VLLM_GPU_LANG STREQUAL "CUDA")
# the supported versions for the current language. #
# The final set of arches is stored in `VLLM_GPU_ARCHES`. # For cuda we want to be able to control which architectures we compile for on
# # a per-file basis in order to cut down on compile time. So here we extract
override_gpu_arches(VLLM_GPU_ARCHES # the set of architectures we want to compile for and remove the from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
clear_cuda_arches(CUDA_ARCH_FLAGS)
extract_unique_cuda_archs_ascending(CUDA_ARCHS "${CUDA_ARCH_FLAGS}")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported supported archs
# since for some files we will build for all CUDA_ARCHS.
cuda_archs_loose_intersection(CUDA_ARCHS
"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
#
# For other GPU targets override the GPU architectures detected by cmake/torch
# and filter them by the supported versions for the current language.
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
#
override_gpu_arches(VLLM_GPU_ARCHES
${VLLM_GPU_LANG} ${VLLM_GPU_LANG}
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
endif()
# #
# Query torch for additional GPU compilation flags for the given # Query torch for additional GPU compilation flags for the given
...@@ -166,6 +188,17 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -166,6 +188,17 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif() endif()
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
#
include(FetchContent)
get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
# #
# Define other extension targets # Define other extension targets
# #
...@@ -184,9 +217,9 @@ set(VLLM_EXT_SRC ...@@ -184,9 +217,9 @@ set(VLLM_EXT_SRC
"csrc/opt/transpose_kernels.cu" "csrc/opt/transpose_kernels.cu"
"csrc/opt/activation_kernels_opt.cu" "csrc/opt/activation_kernels_opt.cu"
"csrc/attention/attention_kernels_opt.cu" "csrc/attention/attention_kernels_opt.cu"
"csrc/attention/attention_kernels_opt_tc.cu"
"csrc/opt/layernorm_kernels_opt.cu" "csrc/opt/layernorm_kernels_opt.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu" # "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
# "csrc/quantization/fp8/common.cu" # "csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu" "csrc/cuda_utils_kernels.cu"
...@@ -195,46 +228,188 @@ set(VLLM_EXT_SRC ...@@ -195,46 +228,188 @@ set(VLLM_EXT_SRC
"csrc/torch_bindings.cpp") "csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
FetchContent_Declare( FetchContent_Declare(
cutlass cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.1 GIT_TAG v3.5.1
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW TRUE
) )
FetchContent_MakeAvailable(cutlass) FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC list(APPEND VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu" "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
"csrc/quantization/fp8/fp8_marlin.cu" set_gencode_flags_for_srcs(
"csrc/custom_all_reduce.cu" SRCS "${MARLIN_SRCS}"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" CUDA_ARCHS "${MARLIN_ARCHS}")
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
else()
message(STATUS "Not building Marlin kernels as no compatible archs found"
"in CUDA target architectures")
endif()
#
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 quantized models on "
"Hopper.")
else()
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
endif()
#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C2X=1")
message(STATUS "Building scaled_mm_c2x for archs: ${SCALED_MM_2X_ARCHS}")
else()
if (SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c2x as all archs are already built"
" for and covered by scaled_mm_c3x")
else()
message(STATUS "Not building scaled_mm_c2x as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
#
# Machete kernels
# The machete kernels only work on hopper and require CUDA 12.0 or later.
# Only build Machete kernels if we are building for something compatible with sm90a
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
# #
# The CUTLASS kernels for Hopper require sm90a to be enabled. # For the Machete kernels we automatically generate sources for various
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a. # preselected input type pairs and schedules.
# That adds an extra 17MB to compiled binary, so instead we selectively enable it. # Generate sources:
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) set(MACHETE_GEN_SCRIPT
set_source_files_properties( ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
PROPERTIES
COMPILE_FLAGS message(STATUS "Machete generation script hash: ${MACHETE_GEN_SCRIPT_HASH}")
"-gencode arch=compute_90a,code=sm_90a") message(STATUS "Last run machete generate script hash: $CACHE{MACHETE_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log
)
if (NOT machete_generation_result EQUAL 0)
message(FATAL_ERROR "Machete generation failed."
" Result: \"${machete_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
else()
set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
CACHE STRING "Last run machete generate script hash" FORCE)
message(STATUS "Machete generation completed successfully.")
endif() endif()
else()
message(STATUS "Machete generation script has not changed, skipping generation.")
endif()
# Add machete generated sources
file(GLOB MACHETE_GEN_SOURCES "csrc/quantization/machete/generated/*.cu")
list(APPEND VLLM_EXT_SRC ${MACHETE_GEN_SOURCES})
# forward compatible
set_gencode_flags_for_srcs(
SRCS "${MACHETE_GEN_SOURCES}"
CUDA_ARCHS "${MACHETE_ARCHS}")
list(APPEND VLLM_EXT_SRC
csrc/quantization/machete/machete_pytorch.cu)
message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
AND MACHETE_ARCHS)
message(STATUS "Not building Machete kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running w4a16 quantized models on "
"Hopper.")
else()
message(STATUS "Not building Machete kernels as no compatible archs "
"found in CUDA target architectures")
endif()
endif()
# if CUDA endif
endif() endif()
message(STATUS "Enabling C extension.")
define_gpu_extension_target( define_gpu_extension_target(
_C _C
DESTINATION vllm DESTINATION vllm
...@@ -246,6 +421,12 @@ define_gpu_extension_target( ...@@ -246,6 +421,12 @@ define_gpu_extension_target(
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
# #
# _moe_C extension # _moe_C extension
# #
...@@ -254,6 +435,36 @@ set(VLLM_MOE_EXT_SRC ...@@ -254,6 +435,36 @@ set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp" "csrc/moe/torch_bindings.cpp"
"csrc/moe/topk_softmax_kernels.cu") "csrc/moe/topk_softmax_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_MOE_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
"csrc/moe/marlin_moe_ops.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
"in CUDA target architectures")
endif()
endif()
message(STATUS "Enabling moe extension.")
define_gpu_extension_target( define_gpu_extension_target(
_moe_C _moe_C
DESTINATION vllm DESTINATION vllm
...@@ -264,13 +475,102 @@ define_gpu_extension_target( ...@@ -264,13 +475,102 @@ define_gpu_extension_target(
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
#[[
if(VLLM_GPU_LANG STREQUAL "HIP")
#
# _rocm_C extension
#
set(VLLM_ROCM_EXT_SRC
"csrc/rocm/torch_bindings.cpp"
"csrc/rocm/attention.cu")
define_gpu_extension_target(
_rocm_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_ROCM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
endif()
]]
# vllm-flash-attn currently only supported on CUDA
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
return()
endif ()
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
string(REPLACE "." "" _ARCH "${_ARCH}")
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
endforeach()
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") #
message(STATUS "Enabling C extension.") # Build vLLM flash attention from source
add_dependencies(default _C) #
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
# If no component is specified, vllm-flash-attn is still installed.
message(STATUS "Enabling moe extension.") # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
add_dependencies(default _moe_C) # This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()
if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
#[[
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
GIT_PROGRESS TRUE
)
]]
endif() endif()
# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
set(VLLM_PARENT_BUILD ON)
#[[
# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
# Make sure vllm-flash-attn install rules are nested under vllm/
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
# Copy over the vllm-flash-attn python files
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm/vllm_flash_attn
COMPONENT vllm_flash_attn_c
FILES_MATCHING PATTERN "*.py"
)
# Nothing after vllm-flash-attn, see comment about macros above
]]
\ No newline at end of file
# vLLM Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official email address,
posting via an official social media account, or acting as an appointed
representative at an online or offline/IRL event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement in the #code-of-conduct
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
version 2.1, available at
[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
For answers to common questions about this code of conduct, see the
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).
# Contributing to vLLM # Contributing to vLLM
Thank you for your interest in contributing to vLLM! Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:
Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
There are several ways you can contribute to the project:
- Identify and report any issues or bugs. - Identify and report any issues or bugs.
- Request or add a new model. - Request or add support for a new model.
- Suggest or implement new features. - Suggest or implement new features.
- Improve documentation or contribute a how-to guide.
However, remember that contributions aren't just about code. We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions.
We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
Talk about it in your blog posts, highlighting how it's driving your incredible projects.
Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
## Setup for development ## Developing
### Build from source Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details.
```bash
pip install -e . # This may take several minutes.
```
### Testing ## Testing
```bash ```bash
pip install -r requirements-dev.txt pip install -r requirements-dev.txt
...@@ -36,15 +29,16 @@ mypy ...@@ -36,15 +29,16 @@ mypy
# Unit tests # Unit tests
pytest tests/ pytest tests/
``` ```
**Note:** Currently, the repository does not pass the mypy tests. **Note:** Currently, the repository does not pass the ``mypy`` tests.
## Contribution Guidelines
## Contributing Guidelines ### Issues
### Issue Reporting If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. > [!IMPORTANT]
If not, please file a new issue, providing as much relevant information as possible. > If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability).
### Pull Requests & Code Reviews ### Pull Requests & Code Reviews
...@@ -53,4 +47,4 @@ Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE ...@@ -53,4 +47,4 @@ Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE
### Thank You ### Thank You
Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
Your contributions make vLLM a great tool for everyone! All of your contributions help make vLLM a great tool and community for everyone!
This diff is collapsed.
...@@ -2,37 +2,61 @@ ...@@ -2,37 +2,61 @@
FROM ubuntu:22.04 AS cpu-test-1 FROM ubuntu:22.04 AS cpu-test-1
RUN apt-get update -y \ ENV CCACHE_DIR=/root/.cache/ccache
&& apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp # intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects. # tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN pip install intel-openmp RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD" ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
RUN echo 'ulimit -c 0' >> ~/.bashrc RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl RUN pip install intel_extension_for_pytorch==2.4.0
RUN pip install --upgrade pip \ WORKDIR /workspace
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy
FROM cpu-test-1 AS build ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt
COPY ./ /workspace/vllm FROM cpu-test-1 AS build
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512 ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/ WORKDIR /workspace/
......
# default base image # default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04" ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
FROM $BASE_IMAGE FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE" RUN echo "Base image is $BASE_IMAGE"
# Install some basic utilities # Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y RUN apt-get update && \
apt-get install -y \
git \
python3 \
python3-pip \
ffmpeg libsm6 libxext6 libgl1
### Mount Point ### ### Mount Point ###
# When launching the container, mount the code directory to /app # When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ] VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT} WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
COPY ./vllm /app/vllm/vllm COPY . .
COPY ./setup.py /app/vllm/setup.py ARG GIT_REPO_CHECK=0
COPY ./requirements-common.txt /app/vllm/requirements-common.txt RUN --mount=type=bind,source=.git,target=.git \
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN cd /app/vllm \ RUN python3 -m pip install -U \
&& python3 -m pip install -U -r requirements-neuron.txt cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-neuron.txt
ENV VLLM_TARGET_DEVICE neuron ENV VLLM_TARGET_DEVICE neuron
RUN cd /app/vllm \ RUN --mount=type=bind,source=.git,target=.git \
&& pip install -e . \ pip install --no-build-isolation -v -e . \
&& cd ..
CMD ["/bin/bash"] CMD ["/bin/bash"]
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
include LICENSE include LICENSE
include requirements-adag.txt
include requirements-common.txt include requirements-common.txt
include requirements-cuda.txt include requirements-cuda.txt
include requirements-rocm.txt include requirements-rocm.txt
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment