Commit 539aa992 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.2' into v0.6.2-dev

parents 93872128 7193774b
...@@ -8,8 +8,7 @@ steps: ...@@ -8,8 +8,7 @@ steps:
containers: containers:
- image: badouralix/curl-jq - image: badouralix/curl-jq
command: command:
- sh - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait - wait
- label: "A100" - label: "A100"
agents: agents:
......
...@@ -2,9 +2,11 @@ ...@@ -2,9 +2,11 @@
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
TIMEOUT_SECONDS=10
retries=0 retries=0
while [ $retries -lt 1000 ]; do while [ $retries -lt 1000 ]; do
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
exit 0 exit 0
fi fi
......
...@@ -83,6 +83,7 @@ if [[ $commands == *" kernels "* ]]; then ...@@ -83,6 +83,7 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_encoder_decoder_attn.py \ --ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \ --ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \ --ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_gguf.py \
--ignore=kernels/test_int8_quant.py \ --ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \ --ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \ --ignore=kernels/test_mamba_ssm.py \
...@@ -93,6 +94,16 @@ if [[ $commands == *" kernels "* ]]; then ...@@ -93,6 +94,16 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_sampler.py" --ignore=kernels/test_sampler.py"
fi fi
#ignore certain Entrypoints tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_accuracy.py \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_encoder_decoder.py \
--ignore=entrypoints/openai/test_embedding.py \
--ignore=entrypoints/openai/test_oot_registration.py "}
fi
PARALLEL_JOB_COUNT=8 PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then if [[ $commands == *"--shard-id="* ]]; then
......
...@@ -22,7 +22,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" ...@@ -22,7 +22,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test # Run basic model test
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
pytest -v -s tests/models/decoder_only/language \ pytest -v -s tests/models/decoder_only/language \
--ignore=tests/models/test_fp8.py \ --ignore=tests/models/test_fp8.py \
--ignore=tests/models/decoder_only/language/test_jamba.py \ --ignore=tests/models/decoder_only/language/test_jamba.py \
......
...@@ -43,13 +43,15 @@ steps: ...@@ -43,13 +43,15 @@ steps:
fast_check: true fast_check: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/mq_llm_engine
- tests/async_engine - tests/async_engine
- tests/test_inputs - tests/test_inputs
- tests/multimodal - tests/multimodal
- tests/test_utils - tests/test_utils
- tests/worker - tests/worker
commands: commands:
- pytest -v -s async_engine # Async Engine - pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s multimodal - pytest -v -s multimodal
...@@ -68,7 +70,7 @@ steps: ...@@ -68,7 +70,7 @@ steps:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test # 10min - label: Core Test # 10min
mirror_hardwares: [amd] mirror_hardwares: [amd]
fast_check: true fast_check: true
...@@ -82,14 +84,17 @@ steps: ...@@ -82,14 +84,17 @@ steps:
- label: Entrypoints Test # 20min - label: Entrypoints Test # 20min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
fast_check: true fast_check: true
#mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
commands: commands:
- pip install -e ./plugins/vllm_add_dummy_model - pip install -e ./plugins/vllm_add_dummy_model
- pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api] - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai - pytest -v -s entrypoints/openai
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
...@@ -163,13 +168,6 @@ steps: ...@@ -163,13 +168,6 @@ steps:
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py - python3 offline_inference_encoder_decoder.py
- label: torch compile integration test
source_file_dependencies:
- vllm/
commands:
- pytest -v -s ./compile/test_full_graph.py
- pytest -v -s ./compile/test_wrapper.py
- label: Prefix Caching Test # 7min - label: Prefix Caching Test # 7min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
...@@ -212,6 +210,21 @@ steps: ...@@ -212,6 +210,21 @@ steps:
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4 parallelism: 4
- label: "PyTorch Fullgraph Smoke Test"
fast_check: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph_smoke.py
- label: "PyTorch Fullgraph Test"
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py
- label: Kernels Test %N # 30min each - label: Kernels Test %N # 30min each
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
...@@ -259,6 +272,13 @@ steps: ...@@ -259,6 +272,13 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1 - bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: Encoder Decoder tests # 5min
source_file_dependencies:
- vllm/
- tests/encoder_decoder
commands:
- pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 20 min - label: OpenAI-Compatible Tool Use # 20 min
fast_check: false fast_check: false
mirror_hardwares: [ amd ] mirror_hardwares: [ amd ]
...@@ -348,7 +368,10 @@ steps: ...@@ -348,7 +368,10 @@ steps:
- vllm/executor/ - vllm/executor/
- vllm/model_executor/models/ - vllm/model_executor/models/
- tests/distributed/ - tests/distributed/
- vllm/compilation
commands: commands:
- pytest -v -s ./compile/test_full_graph_multi_gpu.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error # Avoid importing model tests that cause CUDA reinitialization error
......
...@@ -25,10 +25,10 @@ jobs: ...@@ -25,10 +25,10 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2 pip install -r requirements-lint.txt
- name: Analysing the code with ruff - name: Analysing the code with ruff
run: | run: |
ruff . ruff check .
- name: Spelling check with codespell - name: Spelling check with codespell
run: | run: |
codespell --toml pyproject.toml codespell --toml pyproject.toml
......
...@@ -15,5 +15,6 @@ $python_executable -m pip install -r requirements-cuda.txt ...@@ -15,5 +15,6 @@ $python_executable -m pip install -r requirements-cuda.txt
export MAX_JOBS=1 export MAX_JOBS=1
# Make sure release wheels are built for the following architectures # Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
# Build # Build
$python_executable setup.py bdist_wheel --dist-dir=dist $python_executable setup.py bdist_wheel --dist-dir=dist
# vllm commit id, generated by setup.py # version file generated by setuptools-scm
vllm/commit_id.py /vllm/_version.py
# vllm-flash-attn built from source
vllm/vllm_flash_attn/
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
...@@ -12,6 +15,8 @@ __pycache__/ ...@@ -12,6 +15,8 @@ __pycache__/
# Distribution / packaging # Distribution / packaging
.Python .Python
build/ build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/ develop-eggs/
dist/ dist/
downloads/ downloads/
......
cmake_minimum_required(VERSION 3.26) cmake_minimum_required(VERSION 3.26)
# When building directly using CMake, make sure you run the install step
# (it places the .so files in the correct location).
#
# Example:
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
# cmake --build . --target install
#
# If you want to only build one target, make sure to install it manually:
# cmake --build . --target _C
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX) project(vllm_extensions LANGUAGES CXX)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
...@@ -17,6 +28,9 @@ add_compile_options(-w) ...@@ -17,6 +28,9 @@ add_compile_options(-w)
# Suppress potential warnings about unused manually-specified variables # Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}") set(ignoreMe "${VLLM_PYTHON_PATH}")
# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# #
# Supported python versions. These versions will be searched in order, the # Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py. # first match will be selected. These should be kept in sync with setup.py.
...@@ -74,19 +88,6 @@ endif() ...@@ -74,19 +88,6 @@ endif()
find_package(Torch REQUIRED) find_package(Torch REQUIRED)
# #
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)
message(STATUS "Enabling core extension.") message(STATUS "Enabling core extension.")
# Define _core_C extension # Define _core_C extension
...@@ -104,8 +105,6 @@ define_gpu_extension_target( ...@@ -104,8 +105,6 @@ define_gpu_extension_target(
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
add_dependencies(default _core_C)
# #
# Forward the non-CUDA device extensions to external CMake scripts. # Forward the non-CUDA device extensions to external CMake scripts.
# #
...@@ -171,6 +170,8 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -171,6 +170,8 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif() endif()
include(FetchContent)
# #
# Define other extension targets # Define other extension targets
# #
...@@ -199,8 +200,11 @@ set(VLLM_EXT_SRC ...@@ -199,8 +200,11 @@ set(VLLM_EXT_SRC
"csrc/torch_bindings.cpp") "csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
FetchContent_Declare( FetchContent_Declare(
cutlass cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_REPOSITORY https://github.com/nvidia/cutlass.git
...@@ -228,6 +232,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -228,6 +232,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/fp8/fp8_marlin.cu" "csrc/quantization/fp8/fp8_marlin.cu"
"csrc/custom_all_reduce.cu" "csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
...@@ -292,6 +297,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -292,6 +297,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
csrc/quantization/machete/machete_pytorch.cu) csrc/quantization/machete/machete_pytorch.cu)
endif() endif()
message(STATUS "Enabling C extension.")
define_gpu_extension_target( define_gpu_extension_target(
_C _C
DESTINATION vllm DESTINATION vllm
...@@ -319,9 +325,15 @@ set(VLLM_MOE_EXT_SRC ...@@ -319,9 +325,15 @@ set(VLLM_MOE_EXT_SRC
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC list(APPEND VLLM_MOE_EXT_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_moe_ops.cu") "csrc/moe/marlin_moe_ops.cu")
endif() endif()
message(STATUS "Enabling moe extension.")
define_gpu_extension_target( define_gpu_extension_target(
_moe_C _moe_C
DESTINATION vllm DESTINATION vllm
...@@ -332,13 +344,85 @@ define_gpu_extension_target( ...@@ -332,13 +344,85 @@ define_gpu_extension_target(
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
if(VLLM_GPU_LANG STREQUAL "HIP")
#
# _rocm_C extension
#
set(VLLM_ROCM_EXT_SRC
"csrc/rocm/torch_bindings.cpp"
"csrc/rocm/attention.cu")
define_gpu_extension_target(
_rocm_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_ROCM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
endif()
# vllm-flash-attn currently only supported on CUDA
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
return()
endif ()
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") #
message(STATUS "Enabling C extension.") # Build vLLM flash attention from source
add_dependencies(default _C) #
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
# If no component is specified, vllm-flash-attn is still installed.
message(STATUS "Enabling moe extension.") # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
add_dependencies(default _moe_C) # This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()
if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
GIT_PROGRESS TRUE
)
endif() endif()
# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
set(VLLM_PARENT_BUILD ON)
# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
# Make sure vllm-flash-attn install rules are nested under vllm/
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Restore the install prefix
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
# Copy over the vllm-flash-attn python files
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm/vllm_flash_attn
COMPONENT vllm_flash_attn_c
FILES_MATCHING PATTERN "*.py"
)
# Nothing after vllm-flash-attn, see comment about macros above
...@@ -48,6 +48,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -48,6 +48,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# see https://github.com/pytorch/pytorch/pull/123243 # see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX' ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE #################### #################### WHEEL BUILD IMAGE ####################
...@@ -76,14 +79,13 @@ ENV MAX_JOBS=${max_jobs} ...@@ -76,14 +79,13 @@ ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=8 ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads ENV NVCC_THREADS=$nvcc_threads
ARG buildkite_commit
ENV BUILDKITE_COMMIT=${buildkite_commit}
ARG USE_SCCACHE ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation # if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \ if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \ echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
...@@ -92,6 +94,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -92,6 +94,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \ && export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \ && export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \ && sccache --show-stats \
...@@ -102,6 +105,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -102,6 +105,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV CCACHE_DIR=/root/.cache/ccache ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \ RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \ if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi fi
...@@ -180,10 +184,6 @@ FROM vllm-base AS test ...@@ -180,10 +184,6 @@ FROM vllm-base AS test
ADD . /vllm-workspace/ ADD . /vllm-workspace/
# install development dependencies (for testing) # install development dependencies (for testing)
# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
# This installation must complete before the test dependencies are collected and installed.
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install "setuptools>=74.1.1"
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt python3 -m pip install -r requirements-dev.txt
...@@ -202,7 +202,7 @@ FROM vllm-base AS vllm-openai ...@@ -202,7 +202,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server # install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0' pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10
ENV VLLM_USAGE_SOURCE production-docker-image ENV VLLM_USAGE_SOURCE production-docker-image
......
...@@ -24,6 +24,8 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc ...@@ -24,6 +24,8 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
WORKDIR /workspace
ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
...@@ -60,8 +62,10 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} ...@@ -60,8 +62,10 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/ WORKDIR /workspace/
......
# default base image # default base image
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.19.1-ubuntu20.04" ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"
FROM $BASE_IMAGE FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE" RUN echo "Base image is $BASE_IMAGE"
# Install some basic utilities # Install some basic utilities
RUN apt-get update \ RUN apt-get update && \
&& apt-get install python3 python3-pip -y \ apt-get install -y \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 git \
python3 \
python3-pip \
ffmpeg libsm6 libxext6 libgl1
### Mount Point ### ### Mount Point ###
# When launching the container, mount the code directory to /app # When launching the container, mount the code directory to /app
...@@ -20,19 +23,19 @@ RUN python3 -m pip install --upgrade pip ...@@ -20,19 +23,19 @@ RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
COPY ./vllm /app/vllm/vllm COPY . /app/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
RUN cd /app/vllm \ RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt && python3 -m pip install -U \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-neuron.txt
ENV VLLM_TARGET_DEVICE neuron ENV VLLM_TARGET_DEVICE neuron
RUN cd /app/vllm \ RUN --mount=type=bind,source=.git,target=.git \
&& pip install -e . \ cd /app/vllm \
&& pip install --no-build-isolation -v -e . \
&& cd .. && cd ..
CMD ["/bin/bash"] CMD ["/bin/bash"]
...@@ -4,8 +4,9 @@ ...@@ -4,8 +4,9 @@
FROM ubuntu:22.04 AS dev FROM ubuntu:22.04 AS dev
RUN apt-get update -y && \ RUN apt-get update -y && \
apt-get install -y python3-pip git && \ apt-get install -y \
apt-get install -y ffmpeg libsm6 libxext6 libgl1 git python3-pip \
ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace WORKDIR /workspace
# copy requirements # copy requirements
......
...@@ -16,9 +16,15 @@ COPY ./ /workspace/vllm ...@@ -16,9 +16,15 @@ COPY ./ /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
# These packages will be in rocketce eventually # These packages will be in rocketce eventually
RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
torch==2.3.1 \
-r requirements-cpu.txt \
xformers uvloop==0.20.0
RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /workspace/ WORKDIR /workspace/
......
# Default ROCm 6.1 base image # Default ROCm 6.2 base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
# Default ROCm ARCHes to build vLLM for. # Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
...@@ -7,18 +7,12 @@ ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" ...@@ -7,18 +7,12 @@ ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
# Whether to install CK-based flash-attention # Whether to install CK-based flash-attention
# If 0, will not install flash-attention # If 0, will not install flash-attention
ARG BUILD_FA="1" ARG BUILD_FA="1"
# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
# If this succeeds, we use the downloaded wheel and skip building flash-attention.
# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
# architectures specified in `FA_GFX_ARCHS`
ARG TRY_FA_WHEEL="1"
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
ARG FA_GFX_ARCHS="gfx90a;gfx942" ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="23a2b1c2" ARG FA_BRANCH="3cea2fb"
# Whether to build triton on rocm # Whether to build triton on rocm
ARG BUILD_TRITON="1" ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="e0fc12c" ARG TRITON_BRANCH="e192dba"
### Base image build stage ### Base image build stage
FROM $BASE_IMAGE AS base FROM $BASE_IMAGE AS base
...@@ -50,14 +44,17 @@ RUN python3 -m pip install --upgrade pip ...@@ -50,14 +44,17 @@ RUN python3 -m pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache # Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components # TODO: implement sccache support across components
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install torch == 2.5.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ # Install torch == 2.6.0 on ROCm
*"rocm-6.1"*) \ RUN --mount=type=cache,target=/root/.cache/pip \
case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.2"*) \
python3 -m pip uninstall -y torch torchvision \ python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --no-cache-dir --pre \ && python3 -m pip install --pre \
torch==2.5.0.dev20240726 \ torch==2.6.0.dev20240918 \
torchvision==0.20.0.dev20240726 \ setuptools-scm>=8 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ torchvision==0.20.0.dev20240918 \
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
*) ;; esac *) ;; esac
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
...@@ -79,25 +76,18 @@ RUN cd /opt/rocm/share/amd_smi \ ...@@ -79,25 +76,18 @@ RUN cd /opt/rocm/share/amd_smi \
### Flash-Attention wheel build stage ### Flash-Attention wheel build stage
FROM base AS build_fa FROM base AS build_fa
ARG BUILD_FA ARG BUILD_FA
ARG TRY_FA_WHEEL
ARG FA_WHEEL_URL
ARG FA_GFX_ARCHS ARG FA_GFX_ARCHS
ARG FA_BRANCH ARG FA_BRANCH
# Build ROCm flash-attention wheel if `BUILD_FA = 1` # Build ROCm flash-attention wheel if `BUILD_FA = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \ RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_FA" = "1" ]; then \ if [ "$BUILD_FA" = "1" ]; then \
if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \ mkdir -p libs \
# If a suitable wheel exists, we download it instead of building FA && cd libs \
mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \ && git clone https://github.com/ROCm/flash-attention.git \
else \ && cd flash-attention \
mkdir -p libs \ && git checkout "${FA_BRANCH}" \
&& cd libs \ && git submodule update --init \
&& git clone https://github.com/ROCm/flash-attention.git \ && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
&& cd flash-attention \
&& git checkout "${FA_BRANCH}" \
&& git submodule update --init \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
fi; \
# Create an empty directory otherwise as later build stages expect one # Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \ else mkdir -p /install; \
fi fi
...@@ -112,6 +102,7 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \ ...@@ -112,6 +102,7 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_TRITON" = "1" ]; then \ if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \ mkdir -p libs \
&& cd libs \ && cd libs \
&& python3 -m pip install ninja cmake wheel pybind11 \
&& git clone https://github.com/OpenAI/triton.git \ && git clone https://github.com/OpenAI/triton.git \
&& cd triton \ && cd triton \
&& git checkout "${TRITON_BRANCH}" \ && git checkout "${TRITON_BRANCH}" \
...@@ -129,7 +120,7 @@ COPY . . ...@@ -129,7 +120,7 @@ COPY . .
# Package upgrades for useful functionality or to avoid dependency issues # Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
# Workaround for ray >= 2.10.0 # Workaround for ray >= 2.10.0
...@@ -138,15 +129,9 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ...@@ -138,15 +129,9 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
ENV TOKENIZERS_PARALLELISM=false ENV TOKENIZERS_PARALLELISM=false
RUN --mount=type=cache,target=${CCACHE_DIR} \ RUN --mount=type=cache,target=${CCACHE_DIR} \
--mount=type=bind,source=.git,target=.git \
--mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -Ur requirements-rocm.txt \ python3 -m pip install -Ur requirements-rocm.txt \
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
# Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
# Prevent interference if torch bundles its own HIP runtime
&& rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
*) ;; esac \
&& python3 setup.py clean --all \ && python3 setup.py clean --all \
&& python3 setup.py develop && python3 setup.py develop
......
...@@ -5,16 +5,25 @@ FROM $BASE_IMAGE ...@@ -5,16 +5,25 @@ FROM $BASE_IMAGE
WORKDIR /workspace WORKDIR /workspace
# Install some basic utilities # Install some basic utilities
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1 RUN apt-get update && apt-get install -y \
git \
ffmpeg libsm6 libxext6 libgl1
# Install the TPU and Pallas dependencies. # Install the TPU and Pallas dependencies.
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html RUN --mount=type=cache,target=/root/.cache/pip \
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
# Build vLLM. # Build vLLM.
COPY . /workspace/vllm COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu" ENV VLLM_TARGET_DEVICE="tpu"
RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
cd /workspace/vllm && \
python3 -m pip install \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-tpu.txt
RUN cd /workspace/vllm && python3 setup.py develop RUN cd /workspace/vllm && python3 setup.py develop
CMD ["/bin/bash"] CMD ["/bin/bash"]
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04 FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \ wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update -y \ RUN apt-get update -y && \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1 apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
RUN pip install -v -r requirements-xpu.txt RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-xpu.txt
RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=xpu python3 setup.py install
CMD ["/bin/bash"] CMD ["/bin/bash"]
# Security Policy
## Reporting a Vulnerability
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away.
We will investigate all legitimate reports and do our best to quickly fix the problem.
Please report security issues using https://github.com/vllm-project/vllm/security/advisories/new
---
Please see PyTorch Security for more information how to securely interact with models: https://github.com/pytorch/pytorch/blob/main/SECURITY.md
This document mostly references the recommendation from PyTorch, thank you!
...@@ -25,6 +25,7 @@ class RequestFuncInput: ...@@ -25,6 +25,7 @@ class RequestFuncInput:
best_of: int = 1 best_of: int = 1
use_beam_search: bool = False use_beam_search: bool = False
logprobs: Optional[int] = None logprobs: Optional[int] = None
multi_modal_content: Optional[dict] = None
@dataclass @dataclass
...@@ -312,12 +313,15 @@ async def async_request_openai_chat_completions( ...@@ -312,12 +313,15 @@ async def async_request_openai_chat_completions(
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search assert not request_func_input.use_beam_search
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
payload = { payload = {
"model": request_func_input.model, "model": request_func_input.model,
"messages": [ "messages": [
{ {
"role": "user", "role": "user",
"content": request_func_input.prompt, "content": content
}, },
], ],
"temperature": 0.0, "temperature": 0.0,
......
"""Benchmark offline prioritization."""
import argparse
import json
import random
import time
from typing import List, Optional, Tuple
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
#Select a equi-probable random priority
priority = 0 if random.random() < 0.5 else 1
filtered_dataset.append((prompt, prompt_len, output_len, priority))
return filtered_dataset
def run_vllm(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
quantization: Optional[str],
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
trust_remote_code: bool,
dtype: str,
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
gpu_memory_utilization: float = 0.9,
download_dir: Optional[str] = None,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
disable_log_stats=False,
)
# Add the requests to the engine.
prompts = []
sampling_params = []
priority = []
for prompt, _, output_len, _priority in requests:
prompts.append(prompt)
priority.append(_priority)
sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))
start = time.perf_counter()
llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
end = time.perf_counter()
return end - start
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length.
prompt = "hi" * (args.input_len - 1)
requests = [(prompt, args.input_len, args.output_len)
for _ in range(args.num_prompts)]
else:
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.output_len)
if args.backend == "vllm":
elapsed_time = run_vllm(
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.gpu_memory_utilization,
args.download_dir)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
for _, prompt_len, output_len, priority in requests)
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
# Output JSON results if specified
if args.output_json:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": total_num_tokens / elapsed_time,
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
default="vllm")
parser.add_argument("--dataset",
type=str,
default=None,
help="Path to the dataset.")
parser.add_argument("--input-len",
type=int,
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--num-prompts",
type=int,
default=200,
help="Number of prompts to process.")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument("--enforce-eager",
action="store_true",
help="enforce eager execution")
parser.add_argument(
'--kv-cache-dtype',
type=str,
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument("--enable-chunked-prefill",
action='store_true',
help="enable chunked prefill for vLLM backend.")
parser.add_argument('--max-num-batched-tokens',
type=int,
default=None,
help='maximum number of batched tokens per '
'iteration')
parser.add_argument('--download-dir',
type=str,
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.dataset is None:
assert args.input_len is not None
assert args.output_len is not None
else:
assert args.input_len is None
main(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment