diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 90a5e54736cf3343da62fb33e6ca81c322337d93..75ad094fa138220c1b054a1200690f388c011b44 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -1,7 +1,7 @@
import os
import zipfile
-MAX_SIZE_MB = 100
+MAX_SIZE_MB = 200
def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index c04e05a994894d769df8f5ccf95d834b3aed3791..bde8ab6184d3c645903796872a506d75bb671fdd 100644
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -1,10 +1,38 @@
-# This script build the ROCm docker image and runs test inside it.
+# This script runs test inside the corresponding ROCm docker container.
set -ex
# Print ROCm version
echo "--- ROCm info"
rocminfo
+# cleanup older docker images
+cleanup_docker() {
+ # Get Docker's root directory
+ docker_root=$(docker info -f '{{.DockerRootDir}}')
+ if [ -z "$docker_root" ]; then
+ echo "Failed to determine Docker root directory."
+ exit 1
+ fi
+ echo "Docker root directory: $docker_root"
+ # Check disk usage of the filesystem where Docker's root directory is located
+ disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+ # Define the threshold
+ threshold=70
+ if [ "$disk_usage" -gt "$threshold" ]; then
+ echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+ # Remove dangling images (those that are not tagged and not used by any container)
+ docker image prune -f
+ # Remove unused volumes
+ docker volume prune -f
+ echo "Docker images and volumes cleanup completed."
+ else
+ echo "Disk usage is below $threshold%. No cleanup needed."
+ fi
+}
+
+# Call the cleanup docker function
+cleanup_docker
+
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
@@ -19,15 +47,16 @@ done
echo "--- Building container"
sha=$(git rev-parse --short HEAD)
-container_name=rocm_${sha}
+image_name=rocm_${sha}
+container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
docker build \
- -t ${container_name} \
+ -t ${image_name} \
-f Dockerfile.rocm \
--progress plain \
.
remove_docker_container() {
- docker rm -f ${container_name} || docker image rm -f ${container_name} || true
+ docker rm -f ${container_name} || docker image rm -f ${image_name} || true
}
trap remove_docker_container EXIT
@@ -39,6 +68,6 @@ docker run \
--rm \
-e HF_TOKEN \
--name ${container_name} \
- ${container_name} \
- /bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//")
+ ${image_name} \
+ /bin/bash -c "${@}"
diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
index 7fbad1c4bd9503a9295070688efa244a683c2eb9..1efc96395933fd9d13d3004b60ea3021012651b4 100644
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
@@ -74,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
exit $bench_serving_exit_code
fi
-/workspace/buildkite-agent artifact upload openai-*.json
+rm ShareGPT_V3_unfiltered_cleaned_split.json
+/workspace/buildkite-agent artifact upload "*.json"
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index f187d1f1817242b7208b6e3f6a9deab1b48f5551..414045fe163e572b999afb99be38c3a0e601da48 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
-docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py
+docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index e49a5650c44ea0817a00c2971656b0b732a5b18b..21cbd9ba1378007823c6533857d3f779e5b448e0 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -5,13 +5,16 @@
steps:
- label: Regression Test
+ mirror_hardwares: [amd]
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
+ #mirror_hardwares: [amd]
command: pytest -v -s async_engine
- label: Basic Correctness Test
+ mirror_hardwares: [amd]
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
@@ -24,59 +27,68 @@ steps:
command: pytest -v -s core
- label: Distributed Comm Ops Test
- command: pytest -v -s test_comm_ops.py
- working_dir: "/vllm-workspace/tests/distributed"
+ #mirror_hardwares: [amd]
+ command: pytest -v -s distributed/test_comm_ops.py
+ working_dir: "/vllm-workspace/tests"
num_gpus: 2
- label: Distributed Tests
- working_dir: "/vllm-workspace/tests/distributed"
-
- num_gpus: 2 # only support 1 or 2 for now.
mirror_hardwares: [amd]
-
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
commands:
- - pytest -v -s test_pynccl_library.py
- - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
- - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
- - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - pytest -v -s spec_decode/e2e/test_integration_dist.py
- label: Distributed Tests (Multiple Groups)
- working_dir: "/vllm-workspace/tests/distributed"
+ #mirror_hardwares: [amd]
+ working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
- - pytest -v -s test_pynccl.py
+ - pytest -v -s distributed/test_pynccl.py
- label: Engine Test
mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
- label: Entrypoints Test
+ mirror_hardwares: [amd]
+
commands:
- # these tests have to be separated, because each one will allocate all posible GPU memory
- - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
- - pytest -v -s entrypoints/test_server_oot_registration.py
+ - pytest -v -s test_inputs.py
+ - pytest -v -s entrypoints -m llm
+ - pytest -v -s entrypoints -m openai
- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
- - pip install awscli
+ # install tensorizer for tensorize_vllm_model.py
+ - pip install awscli tensorizer
- python3 offline_inference.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py
+ - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- label: Kernels Test %N
+ #mirror_hardwares: [amd]
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Models Test
- mirror_hardwares: [amd]
+ #mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
+ - pytest -v -s models --ignore=models/test_llava.py
- label: Llava Test
mirror_hardwares: [amd]
@@ -90,31 +102,53 @@ steps:
- pytest -v -s prefix_caching
- label: Samplers Test
+ #mirror_hardwares: [amd]
command: pytest -v -s samplers
- label: LogitsProcessor Test
mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py
+- label: Utils Test
+ command: pytest -v -s test_utils.py
+
- label: Worker Test
mirror_hardwares: [amd]
command: pytest -v -s worker
- label: Speculative decoding tests
- mirror_hardwares: [amd]
+ #mirror_hardwares: [amd]
command: pytest -v -s spec_decode
- label: LoRA Test %N
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ #mirror_hardwares: [amd]
+ command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4
+- label: LoRA Long Context (Distributed)
+ #mirror_hardwares: [amd]
+ num_gpus: 4
+ # This test runs llama 13B, so it is required to run on 4 GPUs.
+ commands:
+ # Temporarily run this way because we cannot clean up GPU mem usage
+ # for multi GPU tests.
+ # TODO(sang): Fix it.
+ - pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
+ - pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
+ - pytest -v -s lora/test_long_context.py::test_self_consistency
+ - pytest -v -s lora/test_long_context.py::test_quality
+ - pytest -v -s lora/test_long_context.py::test_max_len
+
- label: Tensorizer Test
+ #mirror_hardwares: [amd]
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
- label: Metrics Test
+ mirror_hardwares: [amd]
command: pytest -v -s metrics
- label: Quantization Test
+ #mirror_hardwares: [amd]
command: pytest -v -s quantization
- label: Benchmarks
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index ea02b6b1e9c9ee190869eaaaae888fbda0bd8617..265833e2ccf6e55d272c211384529a4ebe7d4ffe 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -3,9 +3,8 @@
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
-
- label: ":docker: build image"
- commands:
+ commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
@@ -14,6 +13,8 @@ steps:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
+ - exit_status: -10 # Agent was lost
+ limit: 5
- wait
- group: "AMD Tests"
@@ -24,7 +25,7 @@ steps:
- label: "AMD: {{ step.label }}"
agents:
queue: amd
- command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
+ command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
{% endif %}
@@ -53,6 +54,8 @@ steps:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
+ - exit_status: -10 # Agent was lost
+ limit: 5
plugins:
- kubernetes:
podSpec:
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..7f9e6d720fae5e3881c922172fca8fdb82d39890
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,26 @@
+BasedOnStyle: Google
+UseTab: Never
+IndentWidth: 2
+ColumnLimit: 80
+
+# Force pointers to the type for C++.
+DerivePointerAlignment: false
+PointerAlignment: Left
+
+# Reordering #include statements can (and currently will) introduce errors
+SortIncludes: false
+
+# Style choices
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+IndentPPDirectives: BeforeHash
+
+IncludeCategories:
+ - Regex: '^<'
+ Priority: 4
+ - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
+ Priority: 3
+ - Regex: '^"(qoda|\.\.)/'
+ Priority: 2
+ - Regex: '.*'
+ Priority: 1
diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml
index 08120ad8e5a600b19a09d4244b4db982435c20ed..ce980c3f4a01d4df29be6feafd07922d164242f4 100644
--- a/.github/ISSUE_TEMPLATE/400-bug report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug report.yml
@@ -59,6 +59,8 @@ body:
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
+ Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues.
+
If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
placeholder: |
A clear and concise description of what the bug is.
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e9b6e28fa6bcbe39a06591319628dddb643c3112
--- /dev/null
+++ b/.github/workflows/clang-format.yml
@@ -0,0 +1,42 @@
+name: clang-format
+
+on:
+ # Trigger the workflow on push or pull request,
+ # but only for the main branch
+ push:
+ branches:
+ - main
+ pull_request:
+ branches:
+ - main
+
+jobs:
+ clang-format:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.11"]
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install clang-format==18.1.5
+ - name: Running clang-format
+ run: |
+ EXCLUDES=(
+ 'csrc/moe/topk_softmax_kernels.cu'
+ 'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
+ 'csrc/punica/bgmv/bgmv_config.h'
+ 'csrc/punica/bgmv/bgmv_impl.cuh'
+ 'csrc/punica/bgmv/vec_dtypes.cuh'
+ 'csrc/punica/punica_ops.cu'
+ 'csrc/punica/type_convert.h'
+ )
+ find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
+ | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
+ | xargs clang-format --dry-run --Werror
\ No newline at end of file
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index ac60ce0fed14aab96373d8c1388d005b17e6c526..9c35ede5f6781634c403df3c0abb10b4dd415294 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -58,6 +58,9 @@ jobs:
- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
+ with:
+ create-symlink: true
+ key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
- name: Set up Linux Env
if: ${{ runner.os == 'Linux' }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 393571d6321b867021aa2ec002d3c5f10122adad..e0c70a0d5a69598264ca9f962ce7ed2d2a225590 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -168,19 +168,47 @@ set(VLLM_EXT_SRC
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
- # "csrc/quantization/fp8/fp8_cuda_kernels.cu"
+ # "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
+ # "csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/pybind.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
+ include(FetchContent)
+ SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
+ FetchContent_Declare(
+ cutlass
+ GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+ # CUTLASS 3.5.0
+ GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+ )
+ FetchContent_MakeAvailable(cutlass)
+
list(APPEND VLLM_EXT_SRC
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
- "csrc/quantization/marlin/marlin_cuda_kernel.cu"
+ "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+ "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
- "csrc/custom_all_reduce.cu")
+ "csrc/custom_all_reduce.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
+
+ #
+ # The CUTLASS kernels for Hopper require sm90a to be enabled.
+ # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
+ # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
+ set_source_files_properties(
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
+ PROPERTIES
+ COMPILE_FLAGS
+ "-gencode arch=compute_90a,code=sm_90a")
+ endif()
+
endif()
define_gpu_extension_target(
@@ -190,6 +218,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
+ INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
WITH_SOABI)
#
@@ -220,7 +249,8 @@ set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
- "csrc/punica/punica_ops.cc")
+ "csrc/punica/punica_ops.cu"
+ "csrc/punica/punica_pybind.cpp")
#
# Copy GPU compilation flags+update for punica
@@ -244,6 +274,9 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA")
endif()
endforeach()
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
+ set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
+ message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()
if (VLLM_PUNICA_GPU_ARCHES)
@@ -278,11 +311,6 @@ add_custom_target(default)
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
-endif()
-
-if(VLLM_GPU_LANG STREQUAL "CUDA")
- message(STATUS "Enabling moe extension.")
- add_dependencies(default _moe_C)
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
@@ -293,3 +321,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
add_dependencies(default _punica_C)
endif()
endif()
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+ message(STATUS "Enabling moe extension.")
+ add_dependencies(default _moe_C)
+endif()
diff --git a/Dockerfile b/Dockerfile
index 90be3a30f89b1a023600f60395acbf1360ddc51b..eb96bf3c1db2b8609f8d22328958ba8d831d08d2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -79,31 +79,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
COPY .buildkite/check-wheel-size.py check-wheel-size.py
RUN python3 check-wheel-size.py dist
-# the `vllm_nccl` package must be installed from source distribution
-# pip is too smart to store a wheel in the cache, and other CI jobs
-# will directly use the wheel from the cache, which is not what we want.
-# we need to remove it manually
-RUN --mount=type=cache,target=/root/.cache/pip \
- pip cache remove vllm_nccl*
#################### EXTENSION Build IMAGE ####################
-#################### FLASH_ATTENTION Build IMAGE ####################
-FROM dev as flash-attn-builder
-# max jobs used for build
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# flash attention version
-ARG flash_attn_version=v2.5.8
-ENV FLASH_ATTN_VERSION=${flash_attn_version}
-
-WORKDIR /usr/src/flash-attention-v2
-
-# Download the wheel or build it if a pre-compiled release doesn't exist
-RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
- --no-build-isolation --no-deps --no-cache-dir
-
-#################### FLASH_ATTENTION Build IMAGE ####################
-
#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
@@ -122,10 +99,6 @@ RUN ldconfig /usr/local/cuda-12.4/compat/
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
pip install dist/*.whl --verbose
-
-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
- --mount=type=cache,target=/root/.cache/pip \
- pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### vLLM installation IMAGE ####################
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 4251fddd6cc3b170514a683216353ff2cc6aeee4..aec79824213f3a474a8be7d3203f30a5bbdff361 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -17,4 +17,6 @@ RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.py
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+WORKDIR /workspace/
+
CMD ["/bin/bash"]
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index d04bb9915e2ab14e46200aca97bc0b4bf8652a54..9bfe8446a519d211e6e9aecc8885cb4e4c5712ba 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -92,16 +92,23 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
WORKDIR /vllm-workspace
COPY . .
+#RUN python3 -m pip install pynvml # to be removed eventually
RUN python3 -m pip install --upgrade pip numba
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+# Workaround for ray >= 2.10.0
+ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+
+ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
+
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+ && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cd ..
-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
CMD ["/bin/bash"]
diff --git a/README.md b/README.md
index 874792a12e8531a29e612d66e4f2c7c2720bcb4b..36a6aa4ae5ef7ef644f36a2585b488f436a8a4e1 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ vLLM支持
1. 基于光源pytorch2.1.0基础镜像环境:镜像下载地址:[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch2.1.0、python、dtk及系统下载对应的镜像版本。
-2. 基于现有python环境:安装pytorch2.1.0,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch/DAS1.0](https://cancon.hpccube.com:65024/4/main/pytorch/DAS1.0),根据python、dtk版本,下载对应pytorch2.1.0的whl包。安装命令如下:
+2. 基于现有python环境:安装pytorch2.1.0,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch),根据python、dtk版本,下载对应pytorch2.1.0的whl包。安装命令如下:
```shell
pip install torch* (下载的torch的whl包)
pip install setuptools wheel
@@ -47,7 +47,7 @@ pip install setuptools wheel
#### 源码编译安装
```shell
-git clone https://developer.hpccube.com/codes/aicomponent/vllm # 根据需要的分支进行切换
+git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的分支进行切换
```
- 提供2种源码编译方式(进入vllm目录):
@@ -62,19 +62,19 @@ python3 setup.py install
```
#### 运行基础环境准备
-1、使用基于光源pytorch2.1.0基础镜像环境:docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
+1、使用上面基于光源pytorch2.1.0基础镜像环境
-2、安装对应依赖的包:
-- triton:[https://cancon.hpccube.com:65024/4/main/triton/DAS1.0](https://cancon.hpccube.com:65024/4/main/triton/DAS1.0)
-- xformers:[https://cancon.hpccube.com:65024/4/main/xformers/DAS1.0](https://cancon.hpccube.com:65024/4/main/xformers/DAS1.0)
-- flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn/DAS1.0](https://cancon.hpccube.com:65024/4/main/flash_attn/DAS1.0)
+2、根据pytorch2.1.0、python、dtk及系统下载对应的依赖包:
+- triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/)
+- xformers:[https://cancon.hpccube.com:65024/4/main/xformers](https://cancon.hpccube.com:65024/4/main/xformers)
+- flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn)
#### 注意事项
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证
-- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.4.2;
+- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.4.3;
## Known Issue
- 无
diff --git a/README_ORIGIN.md b/README_ORIGIN.md
index ce60144c85adec20a9ffd0d51369964441dc8912..971f951e24fbbd68f3e285d761f2dcb86faec4dc 100644
--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
@@ -14,6 +14,17 @@ Easy, fast, and cheap LLM serving for everyone
+---
+
+**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
+
+We are thrilled to announce our fourth vLLM Meetup!
+The vLLM team will share recent updates and roadmap.
+We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
+Please register [here](https://lu.ma/agivllm) and join us!
+
+---
+
*Latest News* 🔥
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
@@ -51,41 +62,14 @@ vLLM is flexible and easy to use with:
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support
-vLLM seamlessly supports many Hugging Face models, including the following architectures:
-
-- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
-- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
-- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
-- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
-- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
-- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct` etc.)
-- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
-- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
-- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
-- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
-- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
-- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
-- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
-- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
-- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
-- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
-- LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
-- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
-- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
-- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
-- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
-- OLMo (`allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.)
-- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
-- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
-- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
-- Phi-3 (`microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, etc.)
-- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
-- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
-- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
-- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
-- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
-- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
-- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
+vLLM seamlessly supports most popular open-source models on HuggingFace, including:
+- Transformer-like LLMs (e.g., Llama)
+- Mixture-of-Expert LLMs (e.g., Mixtral)
+- Multi-modal LLMs (e.g., LLaVA)
+
+Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
+
+## Getting Started
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
@@ -93,9 +77,7 @@ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/get
pip install vllm
```
-## Getting Started
-
-Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
+Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
@@ -105,6 +87,32 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started
We welcome and value any contributions and collaborations.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
+## Sponsors
+
+vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
+
+
+
+
+- a16z
+- AMD
+- Anyscale
+- AWS
+- Crusoe Cloud
+- Databricks
+- DeepInfra
+- Dropbox
+- Lambda Lab
+- NVIDIA
+- Replicate
+- Roblox
+- RunPod
+- Trainy
+- UC Berkeley
+- UC San Diego
+
+We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
+
## Citation
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index f9d167590fe4736d01c605e20c019bbefb547fe5..58dcc6167efa6a4a014229046b0e96849902b59e 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -89,6 +89,9 @@ async def async_request_tgi(
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
+ else:
+ output.error = response.reason or ""
+ output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
@@ -276,6 +279,9 @@ async def async_request_openai_completions(
output.generated_text = generated_text
output.success = True
output.latency = latency
+ else:
+ output.error = response.reason or ""
+ output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 44da3bad8d840699034da49ce4f9e9ef900cd748..f69d91a086a9f9cafae9f6ef2449c25818e4a88f 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,14 +1,16 @@
"""Benchmark the latency of processing a single batch of requests."""
import argparse
+import json
import time
from pathlib import Path
-from typing import Optional
+from typing import List, Optional
import numpy as np
import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams
+from vllm.inputs import PromptStrictInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -18,6 +20,8 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(model=args.model,
+ speculative_model=args.speculative_model,
+ num_speculative_tokens=args.num_speculative_tokens,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
@@ -28,9 +32,11 @@ def main(args: argparse.Namespace):
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
+ use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
- block_size=args.block_size)
+ block_size=args.block_size,
+ gpu_memory_utilization=args.gpu_memory_utilization)
sampling_params = SamplingParams(
n=args.n,
@@ -44,7 +50,9 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
- dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()
+ dummy_inputs: List[PromptStrictInputs] = [{
+ "prompt_token_ids": batch
+ } for batch in dummy_prompt_token_ids.tolist()]
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
@@ -55,13 +63,13 @@ def main(args: argparse.Namespace):
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
- llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+ llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
else:
start_time = time.perf_counter()
- llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+ llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
@@ -93,12 +101,24 @@ def main(args: argparse.Namespace):
for percentage, percentile in zip(percentages, percentiles):
print(f'{percentage}% percentile latency: {percentile} seconds')
+ # Output JSON results if specified
+ if args.output_json:
+ results = {
+ "avg_latency": np.mean(latencies),
+ "latencies": latencies.tolist(),
+ "percentiles": dict(zip(percentages, percentiles.tolist())),
+ }
+ with open(args.output_json, "w") as f:
+ json.dump(results, f, indent=4)
+
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m')
+ parser.add_argument('--speculative-model', type=str, default=None)
+ parser.add_argument('--num-speculative-tokens', type=int, default=None)
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
@@ -137,15 +157,13 @@ if __name__ == '__main__':
action='store_true',
help='enforce eager mode and disable CUDA graph')
parser.add_argument(
- "--kv-cache-dtype",
+ '--kv-cache-dtype',
type=str,
- choices=['auto', 'fp8'],
- default='auto',
- help=
- 'Data type for kv cache storage. If "auto", will use model data type. '
- 'FP8_E5M2 (without scaling) is only supported on cuda version greater '
- 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
- 'common inference criteria.')
+ choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+ default="auto",
+ help='Data type for kv cache storage. If "auto", will use model '
+ 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+ 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
@@ -181,6 +199,7 @@ if __name__ == '__main__':
action='store_true',
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
+ parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
@@ -191,5 +210,16 @@ if __name__ == '__main__':
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
+ parser.add_argument(
+ '--output-json',
+ type=str,
+ default=None,
+ help='Path to save the latency results in JSON format.')
+ parser.add_argument('--gpu-memory-utilization',
+ type=float,
+ default=0.9,
+ help='the fraction of GPU memory to be used for '
+ 'the model executor, which can range from 0 to 1.'
+ 'If unspecified, will use the default value of 0.9.')
args = parser.parse_args()
main(args)
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 2c2d69da4a7d1ceb6c832f453930c2b728c22e2d..f3d71de775f82d03cba8365e97817c58cd457f22 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -17,6 +17,10 @@ On the client side, run:
--dataset-path \
--request-rate \ # By default is inf
--num-prompts # By default is 1000
+
+ when using tgi backend, add
+ --endpoint /generate_stream
+ to the end of the command above.
"""
import argparse
import asyncio
@@ -211,6 +215,11 @@ def calculate_metrics(
else:
actual_output_lens.append(0)
+ if completed == 0:
+ warnings.warn(
+ "All requests failed. This is likely due to a misconfiguration "
+ "on the benchmark arguments.",
+ stacklevel=2)
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
@@ -222,9 +231,9 @@ def calculate_metrics(
1000, # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
- mean_tpot_ms=np.mean(tpots) * 1000,
- median_tpot_ms=np.median(tpots) * 1000,
- p99_tpot_ms=np.percentile(tpots, 99) * 1000,
+ mean_tpot_ms=np.mean(tpots or 0) * 1000,
+ median_tpot_ms=np.median(tpots or 0) * 1000,
+ p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
)
return metrics, actual_output_lens
@@ -246,6 +255,24 @@ async def benchmark(
else:
raise ValueError(f"Unknown backend: {backend}")
+ print("Starting initial single prompt test run...")
+ test_prompt, test_prompt_len, test_output_len = input_requests[0]
+ test_input = RequestFuncInput(
+ model=model_id,
+ prompt=test_prompt,
+ api_url=api_url,
+ prompt_len=test_prompt_len,
+ output_len=test_output_len,
+ best_of=best_of,
+ use_beam_search=use_beam_search,
+ )
+ test_output = await request_func(request_func_input=test_input)
+ if not test_output.success:
+ raise ValueError(
+ "Initial test run failed - Please make sure benchmark arguments "
+ f"are correctly specified. Error: {test_output.error}")
+ else:
+ print("Initial test run completed. Starting main benchmark run...")
print(f"Traffic request rate: {request_rate}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index c6d7d1f80bafbd985bd8185fd575d5ee20722b09..3b82713e1084ecc63dc5d8b684653b7b7e8c569f 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -249,6 +249,18 @@ def main(args: argparse.Namespace):
print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s")
+ # Output JSON results if specified
+ if args.output_json:
+ results = {
+ "elapsed_time": elapsed_time,
+ "num_requests": len(requests),
+ "total_num_tokens": total_num_tokens,
+ "requests_per_second": len(requests) / elapsed_time,
+ "tokens_per_second": total_num_tokens / elapsed_time,
+ }
+ with open(args.output_json, "w") as f:
+ json.dump(results, f, indent=4)
+
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
@@ -318,15 +330,13 @@ if __name__ == "__main__":
action="store_true",
help="enforce eager execution")
parser.add_argument(
- "--kv-cache-dtype",
+ '--kv-cache-dtype',
type=str,
- choices=["auto", "fp8"],
+ choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto",
- help=
- 'Data type for kv cache storage. If "auto", will use model data type. '
- 'FP8_E5M2 (without scaling) is only supported on cuda version greater '
- 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
- 'common inference criteria.')
+ help='Data type for kv cache storage. If "auto", will use model '
+ 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+ 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
@@ -360,6 +370,11 @@ if __name__ == "__main__":
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
+ parser.add_argument(
+ '--output-json',
+ type=str,
+ default=None,
+ help='Path to save the throughput results in JSON format.')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
new file mode 100644
index 0000000000000000000000000000000000000000..b771911781574594087f931696d549a3d52d6182
--- /dev/null
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -0,0 +1,233 @@
+import argparse
+
+import torch
+import torch.utils.benchmark as benchmark
+from benchmark_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+ GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+ GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
+ GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
+ GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+ MarlinWorkspace, marlin_24_quantize, marlin_quantize)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+ gptq_pack, quantize_weights, sort_weights)
+
+DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+
+ACT_ORDER_OPTS = [False, True]
+K_FULL_OPTS = [False, True]
+
+
+def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
+ size_m, size_k, size_n):
+ label = "Quant Matmul"
+
+ sub_label = ("{}, act={} k_full={}, b={}, g={}, "
+ "MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
+ group_size, size_m, size_k, size_n))
+
+ print(f"Testing: {sub_label}")
+
+ a = torch.randn(size_m, size_k).to(torch.half).cuda()
+ b = torch.rand(size_k, size_n).to(torch.half).cuda()
+
+ a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
+
+ # Marlin quant
+ (
+ marlin_w_ref,
+ marlin_q_w,
+ marlin_s,
+ marlin_g_idx,
+ marlin_sort_indices,
+ marlin_rand_perm,
+ ) = marlin_quantize(b, num_bits, group_size, act_order)
+
+ # Marlin_24 quant
+ (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
+ marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
+
+ # GPTQ quant
+ (w_ref, q_w, s, g_idx,
+ rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
+ q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
+
+ # For act_order, sort the "weights" and "g_idx"
+ # so that group ids are increasing
+ repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
+ if act_order:
+ (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
+
+ # Prepare
+ marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
+ GPTQ_MARLIN_MAX_PARALLEL)
+
+ marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
+ GPTQ_MARLIN_24_MAX_PARALLEL)
+
+ globals = {
+ # Gen params
+ "num_bits": num_bits,
+ "group_size": group_size,
+ "size_m": size_m,
+ "size_n": size_n,
+ "size_k": size_k,
+ "a": a,
+ "a_tmp": a_tmp,
+ # Marlin params
+ "marlin_w_ref": marlin_w_ref,
+ "marlin_q_w": marlin_q_w,
+ "marlin_s": marlin_s,
+ "marlin_g_idx": marlin_g_idx,
+ "marlin_sort_indices": marlin_sort_indices,
+ "marlin_rand_perm": marlin_rand_perm,
+ "marlin_workspace": marlin_workspace,
+ "is_k_full": is_k_full,
+ # Marlin_24 params
+ "marlin_24_w_ref": marlin_24_w_ref,
+ "marlin_24_q_w_comp": marlin_24_q_w_comp,
+ "marlin_24_meta": marlin_24_meta,
+ "marlin_24_s": marlin_24_s,
+ "marlin_24_workspace": marlin_24_workspace,
+ # GPTQ params
+ "q_w_gptq": q_w_gptq,
+ "repack_sort_indices": repack_sort_indices,
+ # Kernels
+ "gptq_marlin_gemm": ops.gptq_marlin_gemm,
+ "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
+ "gptq_marlin_repack": ops.gptq_marlin_repack,
+ }
+
+ min_run_time = 1
+
+ # Warmup pytorch
+ for i in range(5):
+ torch.matmul(a, marlin_w_ref)
+
+ results.append(
+ benchmark.Timer(
+ stmt="torch.matmul(a, marlin_w_ref)",
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="pytorch_gemm",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+ results.append(
+ benchmark.Timer(
+ stmt=
+ "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="gptq_marlin_gemm",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+ if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
+ and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
+ results.append(
+ benchmark.Timer(
+ stmt=
+ "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)", # noqa: E501
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="gptq_marlin_24_gemm",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+ results.append(
+ benchmark.Timer(
+ stmt=
+ "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)", # noqa: E501
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="gptq_marlin_repack",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+
+def main(args):
+ print("Benchmarking models:")
+ for i, model in enumerate(args.models):
+ print(f"[{i}] {model}")
+
+ results = []
+
+ for model in args.models:
+ for layer in WEIGHT_SHAPES[model]:
+ size_k = layer[0]
+ size_n = layer[1]
+
+ if len(args.limit_k) > 0 and size_k not in args.limit_k:
+ continue
+
+ if len(args.limit_n) > 0 and size_n not in args.limit_n:
+ continue
+
+ for act_order in ACT_ORDER_OPTS:
+ if len(args.limit_act_order
+ ) > 0 and act_order not in args.limit_act_order:
+ continue
+
+ for is_k_full in K_FULL_OPTS:
+ if len(args.limit_k_full
+ ) > 0 and is_k_full not in args.limit_k_full:
+ continue
+
+ for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
+ if len(args.limit_num_bits
+ ) > 0 and num_bits not in args.limit_num_bits:
+ continue
+
+ for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
+ if len(
+ args.limit_group_size
+ ) > 0 and group_size not in args.limit_group_size:
+ continue
+
+ # For act_order, the group_size must be less than
+ # size_k
+ if act_order and (group_size == size_k
+ or group_size == -1):
+ continue
+
+ for size_m in args.batch_sizes:
+ bench_run(results, model, act_order, is_k_full,
+ num_bits, group_size, size_m, size_k,
+ size_n)
+
+ compare = benchmark.Compare(results)
+ compare.print()
+
+
+# For quick benchmarking use:
+# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
+#
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Benchmark Marlin across specified models/shapes/batches")
+ parser.add_argument(
+ "--models",
+ nargs="+",
+ type=str,
+ default=DEFAULT_MODELS,
+ choices=WEIGHT_SHAPES.keys(),
+ )
+ parser.add_argument("--batch-sizes",
+ nargs="+",
+ type=int,
+ default=DEFAULT_BATCH_SIZES)
+ parser.add_argument("--limit-k", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-n", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
+
+ args = parser.parse_args()
+ main(args)
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index ca7967c1ab0d253b2ba824f07f00a768263402ba..e6f4e9e6b971634119c5777e15a4f7c6890384fd 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -170,7 +170,7 @@ if __name__ == '__main__':
parser.add_argument("--num-kv-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
- choices=[64, 80, 96, 112, 128, 256],
+ choices=[64, 80, 96, 112, 128, 192, 256],
default=128)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--use-alibi", action="store_true")
@@ -183,13 +183,11 @@ if __name__ == '__main__':
parser.add_argument(
"--kv-cache-dtype",
type=str,
- choices=["auto", "fp8"],
+ choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
default="auto",
- help=
- 'Data type for kv cache storage. If "auto", will use model data type. '
- 'FP8_E5M2 (without scaling) is only supported on cuda version greater '
- 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
- 'common inference criteria.')
+ help="Data type for kv cache storage. If 'auto', will use model "
+ "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
+ "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
args = parser.parse_args()
print(args)
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 9188e811e298216ed8a02159e10370f84e014468..00e55f6060b52f26ec37a6f2838d074d4dc99198 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -93,7 +93,7 @@ if __name__ == '__main__':
parser.add_argument("--num-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
- choices=[64, 80, 96, 112, 128, 256],
+ choices=[64, 80, 96, 112, 128, 192, 256],
default=128)
parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
parser.add_argument("--dtype",
diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..4eeeca35a37cc031f2a810946733d488357fd26d
--- /dev/null
+++ b/benchmarks/kernels/benchmark_shapes.py
@@ -0,0 +1,75 @@
+WEIGHT_SHAPES = {
+ "ideal": [[4 * 256 * 32, 256 * 32]],
+ "mistralai/Mistral-7B-v0.1/TP1": [
+ [4096, 6144],
+ [4096, 4096],
+ [4096, 28672],
+ [14336, 4096],
+ ],
+ "mistralai/Mistral-7B-v0.1/TP2": [
+ [4096, 3072],
+ [2048, 4096],
+ [4096, 14336],
+ [7168, 4096],
+ ],
+ "mistralai/Mistral-7B-v0.1/TP4": [
+ [4096, 1536],
+ [1024, 4096],
+ [4096, 7168],
+ [3584, 4096],
+ ],
+ "meta-llama/Llama-2-7b-hf/TP1": [
+ [4096, 12288],
+ [4096, 4096],
+ [4096, 22016],
+ [11008, 4096],
+ ],
+ "meta-llama/Llama-2-7b-hf/TP2": [
+ [4096, 6144],
+ [2048, 4096],
+ [4096, 11008],
+ [5504, 4096],
+ ],
+ "meta-llama/Llama-2-7b-hf/TP4": [
+ [4096, 3072],
+ [1024, 4096],
+ [4096, 5504],
+ [2752, 4096],
+ ],
+ "meta-llama/Llama-2-13b-hf/TP1": [
+ [5120, 15360],
+ [5120, 5120],
+ [5120, 27648],
+ [13824, 5120],
+ ],
+ "meta-llama/Llama-2-13b-hf/TP2": [
+ [5120, 7680],
+ [2560, 5120],
+ [5120, 13824],
+ [6912, 5120],
+ ],
+ "meta-llama/Llama-2-13b-hf/TP4": [
+ [5120, 3840],
+ [1280, 5120],
+ [5120, 6912],
+ [3456, 5120],
+ ],
+ "meta-llama/Llama-2-70b-hf/TP1": [
+ [8192, 10240],
+ [8192, 8192],
+ [8192, 57344],
+ [28672, 8192],
+ ],
+ "meta-llama/Llama-2-70b-hf/TP2": [
+ [8192, 5120],
+ [4096, 8192],
+ [8192, 28672],
+ [14336, 8192],
+ ],
+ "meta-llama/Llama-2-70b-hf/TP4": [
+ [8192, 2560],
+ [2048, 8192],
+ [8192, 14336],
+ [7168, 8192],
+ ],
+}
diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh
index 64d3c4f4b3889690d6b244062e9483b8a8b47ee2..f491c90d0683e6497a4b63cc62b678c1ce2f84a0 100755
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@@ -4,7 +4,7 @@ PORT=8000
MODEL=$1
TOKENS=$2
-docker run --gpus all --shm-size 1g -p $PORT:80 \
+docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
-v $PWD/data:/data \
ghcr.io/huggingface/text-generation-inference:1.4.0 \
--model-id $MODEL \
diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py
new file mode 100644
index 0000000000000000000000000000000000000000..c846e47de1fcf7b1a3d9725ec555e972c19d46f0
--- /dev/null
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -0,0 +1,63 @@
+import argparse
+import cProfile
+import pstats
+
+from vllm import LLM, SamplingParams
+
+# A very long prompt, total number of tokens is about 15k.
+LONG_PROMPT = ["You are an expert in large language models, aren't you?"
+ ] * 1000
+LONG_PROMPT = ' '.join(LONG_PROMPT)
+
+
+def main(args):
+ llm = LLM(
+ model=args.model,
+ enforce_eager=True,
+ enable_prefix_caching=True,
+ tensor_parallel_size=args.tensor_parallel_size,
+ use_v2_block_manager=args.use_v2_block_manager,
+ )
+
+ sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+ profiler = cProfile.Profile()
+
+ print("------warm up------")
+ for i in range(3):
+ output = llm.generate(LONG_PROMPT, sampling_params)
+ print(output[0].outputs[0].text)
+
+ print("------start generating------")
+ for i in range(3):
+ profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)',
+ globals(), locals())
+
+ # analyze the runtime of hashing function
+ stats = pstats.Stats(profiler)
+ stats.sort_stats('cumulative')
+ total_time = 0
+ total_calls = 0
+ for func in stats.stats:
+ if 'hash_of_block' in func[2]:
+ total_time = stats.stats[func][3]
+ total_calls = stats.stats[func][0]
+ percentage = (total_time / stats.total_tt) * 100
+ print(f"Hashing took {total_time:.2f} seconds,"
+ f"{percentage:.2f}% of the total runtime.")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description='Benchmark the performance of hashing function in'
+ 'automatic prefix caching.')
+ parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
+ parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+ parser.add_argument('--output-len', type=int, default=10)
+ parser.add_argument('--enable-prefix-caching',
+ action='store_true',
+ help='enable prefix caching')
+ parser.add_argument('--use-v2-block-manager',
+ action='store_true',
+ help='Use BlockSpaceMangerV2')
+ args = parser.parse_args()
+ main(args)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index bf8e9cf4f1ae73fe51d781d09947729fcdc8275c..2e171c7906d4d1a60a3dcd88013c0118546c416c 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -99,7 +99,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"Failed to determine torch nvcc compiler flags")
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
- list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
+ list(APPEND GPU_FLAGS "-DENABLE_FP8")
endif()
if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
list(REMOVE_ITEM GPU_FLAGS
@@ -119,7 +119,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
list(APPEND GPU_FLAGS
"-DUSE_ROCM"
- # "-DENABLE_FP8_E4M3"
+ # "-DENABLE_FP8"
"-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__"
"-fno-gpu-rdc"
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 24d972702c858b30c7993c12ec15a3aa10f95129..867f63f12de4bcf3b3d8d050800474e1eade979e 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -10,11 +10,11 @@
namespace vllm {
// Activation and gating kernel template.
-template
+template
__global__ void act_and_mul_kernel(
- scalar_t* __restrict__ out, // [..., d]
- const scalar_t* __restrict__ input, // [..., 2, d]
- const int d) {
+ scalar_t* __restrict__ out, // [..., d]
+ const scalar_t* __restrict__ input, // [..., 2, d]
+ const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
@@ -23,72 +23,66 @@ __global__ void act_and_mul_kernel(
}
}
-template
+template
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
- return (T) (((float) x) / (1.0f + expf((float) -x)));
+ return (T)(((float)x) / (1.0f + expf((float)-x)));
}
-template
+template
__device__ __forceinline__ T gelu_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'none' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
- const float f = (float) x;
+ const float f = (float)x;
constexpr float ALPHA = M_SQRT1_2;
- return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
+ return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
}
-template
+template
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'tanh' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
- const float f = (float) x;
+ const float f = (float)x;
constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
constexpr float KAPPA = 0.044715;
float x_cube = f * f * f;
float inner = BETA * (f + KAPPA * x_cube);
- return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
+ return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
}
-} // namespace vllm
+} // namespace vllm
// Launch activation and gating kernel.
-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
- int d = input.size(-1) / 2; \
- int64_t num_tokens = input.numel() / input.size(-1); \
- dim3 grid(num_tokens); \
- dim3 block(std::min(d, 1024)); \
- const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
- VLLM_DISPATCH_FLOATING_TYPES( \
- input.scalar_type(), \
- "act_and_mul_kernel", \
- [&] { \
- vllm::act_and_mul_kernel><<>>( \
- out.data_ptr(), \
- input.data_ptr(), \
- d); \
- });
-
-void silu_and_mul(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
+ int d = input.size(-1) / 2; \
+ int64_t num_tokens = input.numel() / input.size(-1); \
+ dim3 grid(num_tokens); \
+ dim3 block(std::min(d, 1024)); \
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+ VLLM_DISPATCH_FLOATING_TYPES( \
+ input.scalar_type(), "act_and_mul_kernel", [&] { \
+ vllm::act_and_mul_kernel> \
+ <<>>(out.data_ptr(), \
+ input.data_ptr(), d); \
+ });
+
+void silu_and_mul(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
}
-void gelu_and_mul(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+void gelu_and_mul(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
}
-void gelu_tanh_and_mul(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
}
@@ -96,11 +90,11 @@ void gelu_tanh_and_mul(
namespace vllm {
// Element-wise activation kernel template.
-template
+template
__global__ void activation_kernel(
- scalar_t* __restrict__ out, // [..., d]
- const scalar_t* __restrict__ input, // [..., d]
- const int d) {
+ scalar_t* __restrict__ out, // [..., d]
+ const scalar_t* __restrict__ input, // [..., d]
+ const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
@@ -108,54 +102,49 @@ __global__ void activation_kernel(
}
}
-} // namespace vllm
+} // namespace vllm
// Launch element-wise activation kernel.
-#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
- int d = input.size(-1); \
- int64_t num_tokens = input.numel() / d; \
- dim3 grid(num_tokens); \
- dim3 block(std::min(d, 1024)); \
- const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
- VLLM_DISPATCH_FLOATING_TYPES( \
- input.scalar_type(), \
- "activation_kernel", \
- [&] { \
- vllm::activation_kernel><<>>( \
- out.data_ptr(), \
- input.data_ptr(), \
- d); \
- });
+#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
+ int d = input.size(-1); \
+ int64_t num_tokens = input.numel() / d; \
+ dim3 grid(num_tokens); \
+ dim3 block(std::min(d, 1024)); \
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
+ vllm::activation_kernel> \
+ <<>>(out.data_ptr(), \
+ input.data_ptr(), d); \
+ });
namespace vllm {
-template
+template
__device__ __forceinline__ T gelu_new_kernel(const T& x) {
- const float x3 = (float) (x * x * x);
- const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
- return ((T) 0.5) * x * (((T) 1.0) + t);
+ const float x3 = (float)(x * x * x);
+ const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3))));
+ return ((T)0.5) * x * (((T)1.0) + t);
}
-template
+template
__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
- const float f = (float) x;
- const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
- return ((T) 0.5) * x * (((T) 1.0) + t);
+ const float f = (float)x;
+ const T t =
+ (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x));
+ return ((T)0.5) * x * (((T)1.0) + t);
}
-} // namespace vllm
+} // namespace vllm
-void gelu_new(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., d]
+void gelu_new(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
}
-void gelu_fast(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., d]
+void gelu_fast(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
}
diff --git a/csrc/attention/attention_generic.cuh b/csrc/attention/attention_generic.cuh
index 31fb401cbe2c158bb7e3dfad266e794e5da58abc..62409c0cce93e696cebcb69cb7b34526d6b26a47 100644
--- a/csrc/attention/attention_generic.cuh
+++ b/csrc/attention/attention_generic.cuh
@@ -1,5 +1,6 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -22,31 +23,31 @@
namespace vllm {
// A vector type to store Q, K, V elements.
-template
+template
struct Vec {};
// A vector type to store FP32 accumulators.
-template
+template
struct FloatVec {};
// Template vector operations.
-template
+template
inline __device__ Acc mul(A a, B b);
-template
+template
inline __device__ float sum(T v);
-template
+template
inline __device__ float dot(T a, T b) {
return sum(mul(a, b));
}
-template
+template
inline __device__ float dot(T a, T b) {
return sum(mul(a, b));
}
-template
+template
inline __device__ void zero(T& dst) {
constexpr int WORDS = sizeof(T) / 4;
union {
@@ -61,4 +62,4 @@ inline __device__ void zero(T& dst) {
dst = tmp.raw;
}
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu
index 8b1b5e098015ffd8666e1d86f185d9e589ecfdfe..8f89f89786c3bdd68dc5868f524aeec7ec91e323 100644
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -1,5 +1,6 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -19,27 +20,23 @@
#include
#include
#include
+#include
#include "attention_dtypes.h"
#include "attention_utils.cuh"
-#if defined(ENABLE_FP8_E5M2)
-#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh"
-#elif defined(ENABLE_FP8_E4M3)
-#include "../quantization/fp8/amd_detail/quant_utils.cuh"
-#endif
-
-#include
-
#ifdef USE_ROCM
#include
- typedef __hip_bfloat16 __nv_bfloat16;
+ #include "../quantization/fp8/amd/quant_utils.cuh"
+typedef __hip_bfloat16 __nv_bfloat16;
+#else
+ #include "../quantization/fp8/nvidia/quant_utils.cuh"
#endif
#ifndef USE_ROCM
-#define WARP_SIZE 32
+ #define WARP_SIZE 32
#else
-#define WARP_SIZE warpSize
+ #define WARP_SIZE warpSize
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -49,7 +46,7 @@
namespace vllm {
// Utility function for attention softmax.
-template
+template
inline __device__ float block_sum(float* red_smem, float sum) {
// Decompose the thread index into warp / lane.
int warp = threadIdx.x / WARP_SIZE;
@@ -86,31 +83,31 @@ inline __device__ float block_sum(float* red_smem, float sum) {
// TODO(woosuk): Merge the last two dimensions of the grid.
// Grid: (num_heads, num_seqs, max_num_partitions).
-template<
- typename scalar_t,
- typename cache_t,
- int HEAD_SIZE,
- int BLOCK_SIZE,
- int NUM_THREADS,
- bool IS_FP8_KV_CACHE,
- int PARTITION_SIZE = 0> // Zero means no partitioning.
+template // Zero means no partitioning.
__device__ void paged_attention_kernel(
- float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
- float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
- scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride,
- const int kv_block_stride,
- const int kv_head_stride,
- const float kv_scale) {
+ float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ float* __restrict__ max_logits, // [num_seqs, num_heads,
+ // max_num_partitions]
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions,
+ // head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
+ // head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
+ // head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride, const int kv_block_stride, const int kv_head_stride,
+ const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
const int seq_idx = blockIdx.y;
const int partition_idx = blockIdx.z;
const int max_num_partitions = gridDim.z;
@@ -122,22 +119,29 @@ __device__ void paged_attention_kernel(
}
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
- const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks;
+ const int num_blocks_per_partition =
+ USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks;
// [start_block_idx, end_block_idx) is the range of blocks to process.
- const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
- const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks);
+ const int start_block_idx =
+ USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
+ const int end_block_idx =
+ MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks);
const int num_blocks = end_block_idx - start_block_idx;
// [start_token_idx, end_token_idx) is the range of tokens to process.
const int start_token_idx = start_block_idx * BLOCK_SIZE;
- const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len);
+ const int end_token_idx =
+ MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len);
const int num_tokens = end_token_idx - start_token_idx;
constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
- constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS
+ constexpr int NUM_THREAD_GROUPS =
+ NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE
+ // divides NUM_THREADS
assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
- constexpr int NUM_TOKENS_PER_THREAD_GROUP = DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
+ constexpr int NUM_TOKENS_PER_THREAD_GROUP =
+ DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int thread_idx = threadIdx.x;
const int warp_idx = thread_idx / WARP_SIZE;
@@ -147,19 +151,18 @@ __device__ void paged_attention_kernel(
const int num_heads = gridDim.x;
const int num_queries_per_kv = num_heads / num_kv_heads;
const int kv_head_idx = head_idx / num_queries_per_kv;
- const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
+ const float alibi_slope =
+ alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
// A vector type to store a part of a key or a query.
- // The vector size is configured in such a way that the threads in a thread group
- // fetch or compute 16 bytes at a time.
- // For example, if the size of a thread group is 4 and the data type is half,
- // then the vector size is 16 / (4 * sizeof(half)) == 2.
+ // The vector size is configured in such a way that the threads in a thread
+ // group fetch or compute 16 bytes at a time. For example, if the size of a
+ // thread group is 4 and the data type is half, then the vector size is 16 /
+ // (4 * sizeof(half)) == 2.
constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1);
using K_vec = typename Vec::Type;
using Q_vec = typename Vec::Type;
-#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
using Quant_vec = typename Vec::Type;
-#endif
constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;
constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;
@@ -169,18 +172,21 @@ __device__ void paged_attention_kernel(
// Load the query to registers.
// Each thread in a thread group has a different part of the query.
- // For example, if the the thread group size is 4, then the first thread in the group
- // has 0, 4, 8, ... th vectors of the query, and the second thread has 1, 5, 9, ...
- // th vectors of the query, and so on.
- // NOTE(woosuk): Because q is split from a qkv tensor, it may not be contiguous.
+ // For example, if the the thread group size is 4, then the first thread in
+ // the group has 0, 4, 8, ... th vectors of the query, and the second thread
+ // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
+ // q is split from a qkv tensor, it may not be contiguous.
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
#pragma unroll
- for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) {
+ for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD;
+ i += NUM_THREAD_GROUPS) {
const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
- q_vecs[thread_group_offset][i] = *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE);
+ q_vecs[thread_group_offset][i] =
+ *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE);
}
- __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a memory wall right before we use q_vecs
+ __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a
+ // memory wall right before we use q_vecs
// Memory planning.
extern __shared__ char shared_mem[];
@@ -199,51 +205,94 @@ __device__ void paged_attention_kernel(
// Each thread group in a warp fetches a key from the block, and computes
// dot product with the query.
const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
- for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
- // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
- // because int32 can lead to overflow when this variable is multiplied by large numbers
- // (e.g., kv_block_stride).
- const int64_t physical_block_number = static_cast(block_table[block_idx]);
+
+ // blocksparse specific vars
+ int bs_block_offset;
+ int q_bs_block_id;
+ if constexpr (IS_BLOCK_SPARSE) {
+ // const int num_blocksparse_blocks = DIVIDE_ROUND_UP(seq_len,
+ // blocksparse_block_size);
+ q_bs_block_id = (seq_len - 1) / blocksparse_block_size;
+ if (blocksparse_head_sliding_step >= 0)
+ // sliding on q heads
+ bs_block_offset =
+ (tp_rank * num_heads + head_idx) * blocksparse_head_sliding_step + 1;
+ else
+ // sliding on kv heads
+ bs_block_offset = (tp_rank * num_kv_heads + kv_head_idx) *
+ (-blocksparse_head_sliding_step) +
+ 1;
+ }
+
+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
+ block_idx += NUM_WARPS) {
+ // NOTE(woosuk): The block number is stored in int32. However, we cast it to
+ // int64 because int32 can lead to overflow when this variable is multiplied
+ // by large numbers (e.g., kv_block_stride).
+ // For blocksparse attention: skip computation on blocks that are not
+ // attended
+ if constexpr (IS_BLOCK_SPARSE) {
+ const int k_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
+ const bool is_remote =
+ ((k_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0);
+ const bool is_local =
+ (k_bs_block_id > q_bs_block_id - blocksparse_local_blocks);
+ if (!is_remote && !is_local) {
+ for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
+ const int physical_block_offset =
+ (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+ const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
+
+ if (thread_group_offset == 0) {
+ // NOTE(linxihui): assign very large number to skipped tokens to
+ // avoid contribution to the sumexp softmax normalizer. This will
+ // not be used at computing sum(softmax*v) as the blocks will be
+ // skipped.
+ logits[token_idx - start_token_idx] = -FLT_MAX;
+ }
+ }
+ continue;
+ }
+ }
+ const int64_t physical_block_number =
+ static_cast(block_table[block_idx]);
// Load a key to registers.
// Each thread in a thread group has a different part of the key.
- // For example, if the the thread group size is 4, then the first thread in the group
- // has 0, 4, 8, ... th vectors of the key, and the second thread has 1, 5, 9, ... th
- // vectors of the key, and so on.
+ // For example, if the the thread group size is 4, then the first thread in
+ // the group has 0, 4, 8, ... th vectors of the key, and the second thread
+ // has 1, 5, 9, ... th vectors of the key, and so on.
for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
- const int physical_block_offset = (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+ const int physical_block_offset =
+ (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
K_vec k_vecs[NUM_VECS_PER_THREAD];
#pragma unroll
for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
- const cache_t* k_ptr = k_cache + physical_block_number * kv_block_stride
- + kv_head_idx * kv_head_stride
- + physical_block_offset * x;
+ const cache_t* k_ptr =
+ k_cache + physical_block_number * kv_block_stride +
+ kv_head_idx * kv_head_stride + physical_block_offset * x;
const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
const int offset1 = (vec_idx * VEC_SIZE) / x;
const int offset2 = (vec_idx * VEC_SIZE) % x;
- if constexpr (IS_FP8_KV_CACHE) {
-#if defined(ENABLE_FP8_E5M2)
- Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
- // Vector conversion from Quant_vec to K_vec.
- k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant);
-#elif defined(ENABLE_FP8_E4M3)
- Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
- // Vector conversion from Quant_vec to K_vec. Use scaled_vec_conversion to convert FP8_E4M3 quantized k
- // cache vec to k vec in higher precision (FP16, BFloat16, etc.)
- k_vecs[j] = fp8_e4m3::scaled_vec_conversion(k_vec_quant, kv_scale);
-#else
- assert(false);
-#endif
+
+ if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) {
+ k_vecs[j] = *reinterpret_cast(
+ k_ptr + offset1 * BLOCK_SIZE * x + offset2);
} else {
- k_vecs[j] = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+ // Vector conversion from Quant_vec to K_vec.
+ Quant_vec k_vec_quant = *reinterpret_cast(
+ k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+ k_vecs[j] = fp8::scaled_convert(
+ k_vec_quant, kv_scale);
}
}
// Compute dot product.
// This includes a reduction across the threads in the same thread group.
- float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs);
+ float qk = scale * Qk_dot::dot(
+ q_vecs[thread_group_offset], k_vecs);
// Add the ALiBi bias if slopes are given.
qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
@@ -298,13 +347,12 @@ __device__ void paged_attention_kernel(
// If partitioning is enabled, store the max logit and exp_sum.
if (USE_PARTITIONING && thread_idx == 0) {
- float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions
- + partition_idx;
+ float* max_logits_ptr = max_logits +
+ seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions + partition_idx;
*max_logits_ptr = qk_max;
- float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions
- + partition_idx;
+ float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions + partition_idx;
*exp_sums_ptr = exp_sum;
}
@@ -312,14 +360,13 @@ __device__ void paged_attention_kernel(
constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
using V_vec = typename Vec::Type;
using L_vec = typename Vec::Type;
-#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
using V_quant_vec = typename Vec::Type;
-#endif
using Float_L_vec = typename FloatVec::Type;
constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW;
- constexpr int NUM_ROWS_PER_THREAD = DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
+ constexpr int NUM_ROWS_PER_THREAD =
+ DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
// NOTE(woosuk): We use FP32 for the accumulator for better accuracy.
float accs[NUM_ROWS_PER_THREAD];
@@ -330,44 +377,51 @@ __device__ void paged_attention_kernel(
scalar_t zero_value;
zero(zero_value);
- for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
- // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
- // because int32 can lead to overflow when this variable is multiplied by large numbers
- // (e.g., kv_block_stride).
- const int64_t physical_block_number = static_cast(block_table[block_idx]);
+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
+ block_idx += NUM_WARPS) {
+ // NOTE(woosuk): The block number is stored in int32. However, we cast it to
+ // int64 because int32 can lead to overflow when this variable is multiplied
+ // by large numbers (e.g., kv_block_stride).
+ // For blocksparse attention: skip computation on blocks that are not
+ // attended
+ if constexpr (IS_BLOCK_SPARSE) {
+ int v_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
+ if (!((v_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0) &&
+ !((v_bs_block_id > q_bs_block_id - blocksparse_local_blocks))) {
+ continue;
+ }
+ }
+ const int64_t physical_block_number =
+ static_cast(block_table[block_idx]);
const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
L_vec logits_vec;
- from_float(logits_vec, *reinterpret_cast(logits + token_idx - start_token_idx));
+ from_float(logits_vec, *reinterpret_cast(logits + token_idx -
+ start_token_idx));
- const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride
- + kv_head_idx * kv_head_stride;
+ const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride +
+ kv_head_idx * kv_head_stride;
#pragma unroll
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
if (row_idx < HEAD_SIZE) {
const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
V_vec v_vec;
- if constexpr (IS_FP8_KV_CACHE) {
-#if defined(ENABLE_FP8_E5M2)
- V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset);
- // Vector conversion from V_quant_vec to V_vec.
- v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec);
-#elif defined(ENABLE_FP8_E4M3)
- V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset);
- // Vector conversion from V_quant_vec to V_vec. Use scaled_vec_conversion to convert
- // FP8_E4M3 quantized v cache vec to v vec in higher precision (FP16, BFloat16, etc.)
- v_vec = fp8_e4m3::scaled_vec_conversion(v_quant_vec, kv_scale);
-#else
- assert(false);
-#endif
- } else {
+
+ if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) {
v_vec = *reinterpret_cast(v_ptr + offset);
+ } else {
+ V_quant_vec v_quant_vec =
+ *reinterpret_cast(v_ptr + offset);
+ // Vector conversion from V_quant_vec to V_vec.
+ v_vec = fp8::scaled_convert(v_quant_vec,
+ kv_scale);
}
if (block_idx == num_seq_blocks - 1) {
- // NOTE(woosuk): When v_vec contains the tokens that are out of the context,
- // we should explicitly zero out the values since they may contain NaNs.
- // See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
+ // NOTE(woosuk): When v_vec contains the tokens that are out of the
+ // context, we should explicitly zero out the values since they may
+ // contain NaNs. See
+ // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
scalar_t* v_vec_ptr = reinterpret_cast(&v_vec);
#pragma unroll
for (int j = 0; j < V_VEC_SIZE; j++) {
@@ -390,8 +444,8 @@ __device__ void paged_attention_kernel(
accs[i] = acc;
}
- // NOTE(woosuk): A barrier is required because the shared memory space for logits
- // is reused for the output.
+ // NOTE(woosuk): A barrier is required because the shared memory space for
+ // logits is reused for the output.
__syncthreads();
// Perform reduction across warps.
@@ -428,9 +482,9 @@ __device__ void paged_attention_kernel(
// Write the final output.
if (warp_idx == 0) {
- scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
- + head_idx * max_num_partitions * HEAD_SIZE
- + partition_idx * HEAD_SIZE;
+ scalar_t* out_ptr =
+ out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+ head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE;
#pragma unroll
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
@@ -442,79 +496,84 @@ __device__ void paged_attention_kernel(
}
// Grid: (num_heads, num_seqs, 1).
-template<
- typename scalar_t,
- typename cache_t,
- int HEAD_SIZE,
- int BLOCK_SIZE,
- int NUM_THREADS,
- bool IS_FP8_KV_CACHE>
+template
__global__ void paged_attention_v1_kernel(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride,
- const int kv_block_stride,
- const int kv_head_stride,
- const float kv_scale) {
- paged_attention_kernel(
- /* exp_sums */ nullptr, /* max_logits */ nullptr,
- out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens,
- max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, kv_scale);
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
+ // head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
+ // head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride, const int kv_block_stride, const int kv_head_stride,
+ const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
+ paged_attention_kernel(
+ /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache,
+ v_cache, num_kv_heads, scale, block_tables, seq_lens,
+ max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride,
+ kv_head_stride, kv_scale, tp_rank, blocksparse_local_blocks,
+ blocksparse_vert_stride, blocksparse_block_size,
+ blocksparse_head_sliding_step);
}
// Grid: (num_heads, num_seqs, max_num_partitions).
-template<
- typename scalar_t,
- typename cache_t,
- int HEAD_SIZE,
- int BLOCK_SIZE,
- int NUM_THREADS,
- bool IS_FP8_KV_CACHE,
- int PARTITION_SIZE>
+template
__global__ void paged_attention_v2_kernel(
- float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
- float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
- scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride,
- const int kv_block_stride,
- const int kv_head_stride,
- const float kv_scale) {
- paged_attention_kernel(
- exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
- block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes,
- q_stride, kv_block_stride, kv_head_stride, kv_scale);
+ float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ float* __restrict__ max_logits, // [num_seqs, num_heads,
+ // max_num_partitions]
+ scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
+ // max_num_partitions, head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
+ // head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
+ // head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride, const int kv_block_stride, const int kv_head_stride,
+ const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
+ paged_attention_kernel(
+ exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
+ block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride,
+ kv_block_stride, kv_head_stride, kv_scale, tp_rank,
+ blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size,
+ blocksparse_head_sliding_step);
}
// Grid: (num_heads, num_seqs).
-template<
- typename scalar_t,
- int HEAD_SIZE,
- int NUM_THREADS,
- int PARTITION_SIZE>
+template
__global__ void paged_attention_v2_reduce_kernel(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
- const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
- const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
- const int* __restrict__ seq_lens, // [num_seqs]
- const int max_num_partitions) {
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
+ const float* __restrict__ exp_sums, // [num_seqs, num_heads,
+ // max_num_partitions]
+ const float* __restrict__ max_logits, // [num_seqs, num_heads,
+ // max_num_partitions]
+ const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
+ // max_num_partitions, head_size]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_partitions) {
const int num_heads = gridDim.x;
const int head_idx = blockIdx.x;
const int seq_idx = blockIdx.y;
@@ -522,9 +581,11 @@ __global__ void paged_attention_v2_reduce_kernel(
const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
if (num_partitions == 1) {
// No need to reduce. Only copy tmp_out to out.
- scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
- const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
- + head_idx * max_num_partitions * HEAD_SIZE;
+ scalar_t* out_ptr =
+ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+ const scalar_t* tmp_out_ptr =
+ tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+ head_idx * max_num_partitions * HEAD_SIZE;
for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) {
out_ptr[i] = tmp_out_ptr[i];
}
@@ -543,8 +604,9 @@ __global__ void paged_attention_v2_reduce_kernel(
// Load max logits to shared memory.
float* shared_max_logits = reinterpret_cast(shared_mem);
- const float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions;
+ const float* max_logits_ptr = max_logits +
+ seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions;
float max_logit = -FLT_MAX;
for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
const float l = max_logits_ptr[i];
@@ -573,9 +635,11 @@ __global__ void paged_attention_v2_reduce_kernel(
max_logit = VLLM_SHFL_SYNC(max_logit, 0);
// Load rescaled exp sums to shared memory.
- float* shared_exp_sums = reinterpret_cast(shared_mem + sizeof(float) * num_partitions);
- const float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions;
+ float* shared_exp_sums =
+ reinterpret_cast(shared_mem + sizeof(float) * num_partitions);
+ const float* exp_sums_ptr = exp_sums +
+ seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions;
float global_exp_sum = 0.0f;
for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
float l = shared_max_logits[i];
@@ -588,61 +652,52 @@ __global__ void paged_attention_v2_reduce_kernel(
const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f);
// Aggregate tmp_out to out.
- const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
- + head_idx * max_num_partitions * HEAD_SIZE;
- scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+ const scalar_t* tmp_out_ptr =
+ tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+ head_idx * max_num_partitions * HEAD_SIZE;
+ scalar_t* out_ptr =
+ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
#pragma unroll
for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) {
float acc = 0.0f;
for (int j = 0; j < num_partitions; ++j) {
- acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * inv_global_exp_sum;
+ acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] *
+ inv_global_exp_sum;
}
from_float(out_ptr[i], acc);
}
}
-} // namespace vllm
-
-#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
- VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
- ((void*)vllm::paged_attention_v1_kernel), shared_mem_size); \
- vllm::paged_attention_v1_kernel<<>>( \
- out_ptr, \
- query_ptr, \
- key_cache_ptr, \
- value_cache_ptr, \
- num_kv_heads, \
- scale, \
- block_tables_ptr, \
- seq_lens_ptr, \
- max_num_blocks_per_seq, \
- alibi_slopes_ptr, \
- q_stride, \
- kv_block_stride, \
- kv_head_stride, \
- kv_scale);
+} // namespace vllm
+
+#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
+ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
+ ((void*)vllm::paged_attention_v1_kernel), \
+ shared_mem_size); \
+ vllm::paged_attention_v1_kernel \
+ <<>>( \
+ out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
+ scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
+ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
+ kv_scale, tp_rank, blocksparse_local_blocks, \
+ blocksparse_vert_stride, blocksparse_block_size, \
+ blocksparse_head_sliding_step);
// TODO(woosuk): Tune NUM_THREADS.
-template<
- typename T,
- typename CACHE_T,
- int BLOCK_SIZE,
- bool IS_FP8_KV_CACHE,
- int NUM_THREADS = 128>
+template
void paged_attention_v1_launcher(
- torch::Tensor& out,
- torch::Tensor& query,
- torch::Tensor& key_cache,
- torch::Tensor& value_cache,
- int num_kv_heads,
- float scale,
- torch::Tensor& block_tables,
- torch::Tensor& seq_lens,
- int max_seq_len,
- const c10::optional& alibi_slopes,
- float kv_scale) {
+ torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
+ torch::Tensor& value_cache, int num_kv_heads, float scale,
+ torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+ const c10::optional& alibi_slopes, float kv_scale,
+ const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -655,9 +710,10 @@ void paged_attention_v1_launcher(
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr = alibi_slopes ?
- reinterpret_cast(alibi_slopes.value().data_ptr())
- : nullptr;
+ const float* alibi_slopes_ptr =
+ alibi_slopes
+ ? reinterpret_cast(alibi_slopes.value().data_ptr())
+ : nullptr;
T* out_ptr = reinterpret_cast(out.data_ptr());
T* query_ptr = reinterpret_cast(query.data_ptr());
@@ -667,7 +723,8 @@ void paged_attention_v1_launcher(
int* seq_lens_ptr = seq_lens.data_ptr();
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
- int padded_max_seq_len = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
+ int padded_max_seq_len =
+ DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
int logits_size = padded_max_seq_len * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
// Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
@@ -697,6 +754,9 @@ void paged_attention_v1_launcher(
case 128:
LAUNCH_PAGED_ATTENTION_V1(128);
break;
+ case 192:
+ LAUNCH_PAGED_ATTENTION_V1(192);
+ break;
case 256:
LAUNCH_PAGED_ATTENTION_V1(256);
break;
@@ -706,128 +766,93 @@ void paged_attention_v1_launcher(
}
}
-#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- paged_attention_v1_launcher( \
- out, \
- query, \
- key_cache, \
- value_cache, \
- num_kv_heads, \
- scale, \
- block_tables, \
- seq_lens, \
- max_seq_len, \
- alibi_slopes, \
- kv_scale);
+#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
+ paged_attention_v1_launcher( \
+ out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
+ seq_lens, max_seq_len, alibi_slopes, kv_scale, tp_rank, \
+ blocksparse_local_blocks, blocksparse_vert_stride, \
+ blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+ switch (is_block_sparse) { \
+ case true: \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ break; \
+ case false: \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
+ break; \
+ }
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \
- switch (block_size) { \
- case 8: \
- CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \
- break; \
- case 16: \
- CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \
- break; \
- case 32: \
- CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
+#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
+ switch (block_size) { \
+ case 8: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
+ break; \
+ case 16: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
+ break; \
+ case 32: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
+ break; \
+ default: \
+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+ break; \
}
void paged_attention_v1(
- torch::Tensor& out, // [num_seqs, num_heads, head_size]
- torch::Tensor& query, // [num_seqs, num_heads, head_size]
- torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
- torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size]
- int num_kv_heads, // [num_heads]
- float scale,
- torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
- torch::Tensor& seq_lens, // [num_seqs]
- int block_size,
- int max_seq_len,
- const c10::optional& alibi_slopes,
- const std::string& kv_cache_dtype,
- float kv_scale) {
- if (kv_cache_dtype == "auto") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, false);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else if (kv_cache_dtype == "fp8") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else {
- TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
- }
+ torch::Tensor& out, // [num_seqs, num_heads, head_size]
+ torch::Tensor& query, // [num_seqs, num_heads, head_size]
+ torch::Tensor&
+ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+ torch::Tensor&
+ value_cache, // [num_blocks, num_heads, head_size, block_size]
+ int num_kv_heads, // [num_heads]
+ float scale,
+ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
+ torch::Tensor& seq_lens, // [num_seqs]
+ int block_size, int max_seq_len,
+ const c10::optional& alibi_slopes,
+ const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+ const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+ const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+ const bool is_block_sparse = (blocksparse_vert_stride > 1);
+
+ DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
+ CALL_V1_LAUNCHER_BLOCK_SIZE)
}
-#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
- vllm::paged_attention_v2_kernel \
- <<>>( \
- exp_sums_ptr, \
- max_logits_ptr, \
- tmp_out_ptr, \
- query_ptr, \
- key_cache_ptr, \
- value_cache_ptr, \
- num_kv_heads, \
- scale, \
- block_tables_ptr, \
- seq_lens_ptr, \
- max_num_blocks_per_seq, \
- alibi_slopes_ptr, \
- q_stride, \
- kv_block_stride, \
- kv_head_stride, \
- kv_scale); \
- vllm::paged_attention_v2_reduce_kernel \
- <<>>( \
- out_ptr, \
- exp_sums_ptr, \
- max_logits_ptr, \
- tmp_out_ptr, \
- seq_lens_ptr, \
- max_num_partitions);
-
-template<
- typename T,
- typename CACHE_T,
- int BLOCK_SIZE,
- bool IS_FP8_KV_CACHE,
- int NUM_THREADS = 128,
- int PARTITION_SIZE = 512>
+#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
+ vllm::paged_attention_v2_kernel \
+ <<>>( \
+ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
+ value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
+ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
+ kv_block_stride, kv_head_stride, kv_scale, tp_rank, \
+ blocksparse_local_blocks, blocksparse_vert_stride, \
+ blocksparse_block_size, blocksparse_head_sliding_step); \
+ vllm::paged_attention_v2_reduce_kernel \
+ <<>>( \
+ out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \
+ max_num_partitions);
+
+template
void paged_attention_v2_launcher(
- torch::Tensor& out,
- torch::Tensor& exp_sums,
- torch::Tensor& max_logits,
- torch::Tensor& tmp_out,
- torch::Tensor& query,
- torch::Tensor& key_cache,
- torch::Tensor& value_cache,
- int num_kv_heads,
- float scale,
- torch::Tensor& block_tables,
- torch::Tensor& seq_lens,
- int max_seq_len,
- const c10::optional& alibi_slopes,
- float kv_scale) {
+ torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
+ torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
+ torch::Tensor& value_cache, int num_kv_heads, float scale,
+ torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+ const c10::optional& alibi_slopes, float kv_scale,
+ const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -840,9 +865,10 @@ void paged_attention_v2_launcher(
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr = alibi_slopes ?
- reinterpret_cast(alibi_slopes.value().data_ptr())
- : nullptr;
+ const float* alibi_slopes_ptr =
+ alibi_slopes
+ ? reinterpret_cast(alibi_slopes.value().data_ptr())
+ : nullptr;
T* out_ptr = reinterpret_cast(out.data_ptr());
float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr());
@@ -888,6 +914,9 @@ void paged_attention_v2_launcher(
case 128:
LAUNCH_PAGED_ATTENTION_V2(128);
break;
+ case 192:
+ LAUNCH_PAGED_ATTENTION_V2(192);
+ break;
case 256:
LAUNCH_PAGED_ATTENTION_V2(256);
break;
@@ -897,84 +926,68 @@ void paged_attention_v2_launcher(
}
}
-#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- paged_attention_v2_launcher( \
- out, \
- exp_sums, \
- max_logits, \
- tmp_out, \
- query, \
- key_cache, \
- value_cache, \
- num_kv_heads, \
- scale, \
- block_tables, \
- seq_lens, \
- max_seq_len, \
- alibi_slopes, \
- kv_scale);
+#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
+ paged_attention_v2_launcher( \
+ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
+ num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
+ kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, \
+ blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+ switch (is_block_sparse) { \
+ case true: \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ break; \
+ case false: \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
+ break; \
+ }
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \
- switch (block_size) { \
- case 8: \
- CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \
- break; \
- case 16: \
- CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \
- break; \
- case 32: \
- CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
+ switch (block_size) { \
+ case 8: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
+ break; \
+ case 16: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
+ break; \
+ case 32: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
+ break; \
+ default: \
+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+ break; \
}
void paged_attention_v2(
- torch::Tensor& out, // [num_seqs, num_heads, head_size]
- torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
- torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
- torch::Tensor& tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
- torch::Tensor& query, // [num_seqs, num_heads, head_size]
- torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
- torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size]
- int num_kv_heads, // [num_heads]
- float scale,
- torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
- torch::Tensor& seq_lens, // [num_seqs]
- int block_size,
- int max_seq_len,
- const c10::optional& alibi_slopes,
- const std::string& kv_cache_dtype,
- float kv_scale) {
- if (kv_cache_dtype == "auto") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, false);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else if (kv_cache_dtype == "fp8") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else {
- TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
- }
+ torch::Tensor& out, // [num_seqs, num_heads, head_size]
+ torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
+ torch::Tensor&
+ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
+ torch::Tensor& query, // [num_seqs, num_heads, head_size]
+ torch::Tensor&
+ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+ torch::Tensor&
+ value_cache, // [num_blocks, num_heads, head_size, block_size]
+ int num_kv_heads, // [num_heads]
+ float scale,
+ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
+ torch::Tensor& seq_lens, // [num_seqs]
+ int block_size, int max_seq_len,
+ const c10::optional& alibi_slopes,
+ const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+ const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+ const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+ const bool is_block_sparse = (blocksparse_vert_stride > 1);
+ DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
+ CALL_V2_LAUNCHER_BLOCK_SIZE)
}
#undef WARP_SIZE
#undef MAX
#undef MIN
-#undef DIVIDE_ROUND_UP
+#undef DIVIDE_ROUND_UP
\ No newline at end of file
diff --git a/csrc/attention/attention_utils.cuh b/csrc/attention/attention_utils.cuh
index ff64c4bd8f80c200647e688db1a74c711a9f709d..cdcee42748998486210507fe84b17b9f023430b4 100644
--- a/csrc/attention/attention_utils.cuh
+++ b/csrc/attention/attention_utils.cuh
@@ -1,5 +1,6 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -26,7 +27,7 @@
namespace vllm {
// Q*K^T operation.
-template
+template
inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
using A_vec = typename FloatVec::Type;
// Compute the parallel products for Q*K^T (treat vector lanes separately).
@@ -45,12 +46,12 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
return qk;
}
-template
+template
struct Qk_dot {
- template
+ template
static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
return qk_dot_(q, k);
}
};
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_bfloat16.cuh b/csrc/attention/dtype_bfloat16.cuh
index 75eae931f797c381a1066ee9230e36dc06eb108b..970dcef62179947641f07ee4730fafd38ca7199e 100644
--- a/csrc/attention/dtype_bfloat16.cuh
+++ b/csrc/attention/dtype_bfloat16.cuh
@@ -1,6 +1,8 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -28,8 +30,8 @@
#include
#include
- typedef __hip_bfloat162 __nv_bfloat162;
- typedef __hip_bfloat16 __nv_bfloat16;
+typedef __hip_bfloat162 __nv_bfloat162;
+typedef __hip_bfloat16 __nv_bfloat16;
#endif
#include
@@ -50,37 +52,37 @@ struct bf16_8_t {
};
// BF16 vector types for Q, K, V.
-template<>
+template <>
struct Vec<__nv_bfloat16, 1> {
using Type = __nv_bfloat16;
};
-template<>
+template <>
struct Vec<__nv_bfloat16, 2> {
using Type = __nv_bfloat162;
};
-template<>
+template <>
struct Vec<__nv_bfloat16, 4> {
using Type = bf16_4_t;
};
-template<>
+template <>
struct Vec<__nv_bfloat16, 8> {
using Type = bf16_8_t;
};
// FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
struct FloatVec<__nv_bfloat16> {
using Type = float;
};
-template<>
+template <>
struct FloatVec<__nv_bfloat162> {
using Type = float2;
};
-template<>
+template <>
struct FloatVec {
using Type = Float4_;
};
-template<>
+template <>
struct FloatVec {
using Type = Float8_;
};
@@ -108,9 +110,9 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
// assert(false);
// #else
#ifndef USE_ROCM
- return a + b;
+ return a + b;
#else
- return __hadd(a, b);
+ return __hadd(a, b);
#endif
// #endif
}
@@ -161,7 +163,7 @@ inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) {
}
// Vector multiplication.
-template<>
+template <>
inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// assert(false);
@@ -170,7 +172,7 @@ inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
// #endif
}
-template<>
+template <>
inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// assert(false);
@@ -179,12 +181,12 @@ inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
// #endif
}
-template<>
+template <>
inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
}
-template<>
+template <>
inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
bf16_4_t c;
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -192,7 +194,7 @@ inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
return c;
}
-template<>
+template <>
inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
__nv_bfloat162 s = bf162bf162(a);
bf16_4_t c;
@@ -201,7 +203,7 @@ inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
return c;
}
-template<>
+template <>
inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
bf16_8_t c;
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -211,7 +213,7 @@ inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
return c;
}
-template<>
+template <>
inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
__nv_bfloat162 s = bf162bf162(a);
bf16_8_t c;
@@ -222,26 +224,26 @@ inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
return c;
}
-template<>
+template <>
inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) {
float fa = __bfloat162float(a);
float fb = __bfloat162float(b);
return fa * fb;
}
-template<>
+template <>
inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
float2 fa = bf1622float2(a);
float2 fb = bf1622float2(b);
return mul(fa, fb);
}
-template<>
+template <>
inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
return mul(bf162bf162(a), b);
}
-template<>
+template <>
inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
Float4_ fc;
fc.x = mul(a.x, b.x);
@@ -249,7 +251,7 @@ inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
return fc;
}
-template<>
+template <>
inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
__nv_bfloat162 s = bf162bf162(a);
Float4_ fc;
@@ -258,7 +260,7 @@ inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
Float8_ fc;
fc.x = mul(a.x, b.x);
@@ -268,7 +270,7 @@ inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
__nv_bfloat162 s = bf162bf162(a);
Float8_ fc;
@@ -280,7 +282,8 @@ inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
}
// Vector fused multiply-add.
-inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b,
+ __nv_bfloat162 c) {
// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// assert(false);
// #else
@@ -288,7 +291,8 @@ inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bf
// #endif
}
-inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b,
+ __nv_bfloat162 c) {
// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// assert(false);
// #else
@@ -379,23 +383,23 @@ inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) {
}
// Vector sum.
-template<>
+template <>
inline __device__ float sum(__nv_bfloat16 v) {
return __bfloat162float(v);
}
-template<>
+template <>
inline __device__ float sum(__nv_bfloat162 v) {
float2 vf = bf1622float2(v);
return vf.x + vf.y;
}
-template<>
+template <>
inline __device__ float sum(bf16_4_t v) {
return sum(v.x) + sum(v.y);
}
-template<>
+template <>
inline __device__ float sum(bf16_8_t v) {
return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w);
}
@@ -448,4 +452,4 @@ inline __device__ void zero(__nv_bfloat16& dst) {
// #endif
}
-} // namespace vllm
+} // namespace vllm
\ No newline at end of file
diff --git a/csrc/attention/dtype_float16.cuh b/csrc/attention/dtype_float16.cuh
index d3271e69cd69d93abe03539604b17a380eb094b8..3a1815f0ed4fc4706840d0136abfe7f96b6fd48a 100644
--- a/csrc/attention/dtype_float16.cuh
+++ b/csrc/attention/dtype_float16.cuh
@@ -1,6 +1,8 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -30,37 +32,37 @@
namespace vllm {
// FP16 vector types for Q, K, V.
-template<>
+template <>
struct Vec {
using Type = uint16_t;
};
-template<>
+template <>
struct Vec {
using Type = uint32_t;
};
-template<>
+template <>
struct Vec {
using Type = uint2;
};
-template<>
+template <>
struct Vec {
using Type = uint4;
};
// FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
struct FloatVec {
using Type = float;
};
-template<>
+template <>
struct FloatVec {
using Type = float2;
};
-template<>
+template <>
struct FloatVec {
using Type = Float4_;
};
-template<>
+template <>
struct FloatVec {
using Type = Float8_;
};
@@ -73,8 +75,8 @@ inline __device__ uint32_t h0_h0(uint16_t a) {
return b;
#else
union {
- uint32_t u32;
- uint16_t u16[2];
+ uint32_t u32;
+ uint16_t u16[2];
} tmp;
tmp.u16[0] = a;
tmp.u16[1] = a;
@@ -130,10 +132,12 @@ inline __device__ uint32_t float2_to_half2(float2 f) {
} tmp;
#ifndef USE_ROCM
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
- asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
+ asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n"
+ : "=r"(tmp.u32)
+ : "f"(f.y), "f"(f.x));
#else
- asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
- asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
+ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
+ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
#endif
#else
tmp.u16[0] = float_to_half(f.x);
@@ -201,7 +205,7 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb) {
}
// Vector multiplication.
-template<>
+template <>
inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
uint16_t c;
#ifndef USE_ROCM
@@ -212,7 +216,7 @@ inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
return c;
}
-template<>
+template <>
inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
uint32_t c;
#ifndef USE_ROCM
@@ -223,12 +227,12 @@ inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
return c;
}
-template<>
+template <>
inline __device__ uint32_t mul(uint16_t a, uint32_t b) {
return mul(h0_h0(a), b);
}
-template<>
+template <>
inline __device__ uint2 mul(uint2 a, uint2 b) {
uint2 c;
c.x = mul(a.x, b.x);
@@ -236,7 +240,7 @@ inline __device__ uint2 mul(uint2 a, uint2 b) {
return c;
}
-template<>
+template <>
inline __device__ uint2 mul(uint16_t a, uint2 b) {
uint32_t s = h0_h0(a);
uint2 c;
@@ -245,7 +249,7 @@ inline __device__ uint2 mul(uint16_t a, uint2 b) {
return c;
}
-template<>
+template <>
inline __device__ uint4 mul(uint4 a, uint4 b) {
uint4 c;
c.x = mul(a.x, b.x);
@@ -255,7 +259,7 @@ inline __device__ uint4 mul(uint4 a, uint4 b) {
return c;
}
-template<>
+template <>
inline __device__ uint4 mul(uint16_t a, uint4 b) {
uint32_t s = h0_h0(a);
uint4 c;
@@ -266,26 +270,26 @@ inline __device__ uint4 mul(uint16_t a, uint4 b) {
return c;
}
-template<>
+template <>
inline __device__ float mul(uint16_t a, uint16_t b) {
float fa = half_to_float(a);
float fb = half_to_float(b);
return fa * fb;
}
-template<>
+template <>
inline __device__ float2 mul(uint32_t a, uint32_t b) {
float2 fa = half2_to_float2(a);
float2 fb = half2_to_float2(b);
return mul(fa, fb);
}
-template<>
+template <>
inline __device__ float2 mul(uint16_t a, uint32_t b) {
return mul(h0_h0(a), b);
}
-template<>
+template <>
inline __device__ Float4_ mul(uint2 a, uint2 b) {
Float4_ fc;
fc.x = mul(a.x, b.x);
@@ -293,7 +297,7 @@ inline __device__ Float4_ mul(uint2 a, uint2 b) {
return fc;
}
-template<>
+template <>
inline __device__ Float4_ mul(uint16_t a, uint2 b) {
uint32_t s = h0_h0(a);
Float4_ fc;
@@ -302,7 +306,7 @@ inline __device__ Float4_ mul(uint16_t a, uint2 b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(uint4 a, uint4 b) {
Float8_ fc;
fc.x = mul(a.x, b.x);
@@ -312,7 +316,7 @@ inline __device__ Float8_ mul(uint4 a, uint4 b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(uint16_t a, uint4 b) {
uint32_t s = h0_h0(a);
Float8_ fc;
@@ -327,9 +331,13 @@ inline __device__ Float8_ mul(uint16_t a, uint4 b) {
inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) {
uint32_t d;
#ifndef USE_ROCM
- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
+ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+ : "=r"(d)
+ : "r"(a), "r"(b), "r"(c));
#else
- asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
+ asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n"
+ : "=v"(d)
+ : "v"(a), "v"(b), "v"(c));
#endif
return d;
}
@@ -423,24 +431,24 @@ inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) {
}
// Vector sum.
-template<>
+template <>
inline __device__ float sum(uint16_t v) {
return half_to_float(v);
}
-template<>
+template <>
inline __device__ float sum(uint32_t v) {
float2 tmp = half2_to_float2(v);
return tmp.x + tmp.y;
}
-template<>
+template <>
inline __device__ float sum(uint2 v) {
uint32_t c = add(v.x, v.y);
return sum(c);
}
-template<>
+template <>
inline __device__ float sum(uint4 v) {
uint32_t c = add(v.x, v.y);
c = add(c, v.z);
@@ -470,13 +478,9 @@ inline __device__ void from_float(uint4& dst, Float8_ src) {
}
// From float16 to float32.
-inline __device__ float to_float(uint16_t u) {
- return half_to_float(u);
-}
+inline __device__ float to_float(uint16_t u) { return half_to_float(u); }
-inline __device__ float2 to_float(uint32_t u) {
- return half2_to_float2(u);
-}
+inline __device__ float2 to_float(uint32_t u) { return half2_to_float2(u); }
inline __device__ Float4_ to_float(uint2 u) {
Float4_ tmp;
@@ -495,8 +499,6 @@ inline __device__ Float8_ to_float(uint4 u) {
}
// Zero-out a variable.
-inline __device__ void zero(uint16_t& dst) {
- dst = uint16_t(0);
-}
+inline __device__ void zero(uint16_t& dst) { dst = uint16_t(0); }
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_float32.cuh b/csrc/attention/dtype_float32.cuh
index b200d2d226eb04792ec3d18a48a5210c40a2d92b..7c6a686db3ba94f114bb965b6a7c94c6a71ecdb7 100644
--- a/csrc/attention/dtype_float32.cuh
+++ b/csrc/attention/dtype_float32.cuh
@@ -1,6 +1,8 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -38,37 +40,35 @@ struct Float8_ {
};
// FP32 vector types for Q, K, V.
-template<>
+template <>
struct Vec {
using Type = float;
};
-template<>
+template <>
struct Vec {
using Type = float2;
};
-template<>
+template <>
struct Vec {
using Type = float4;
};
// FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
struct FloatVec {
using Type = float;
};
-template<>
+template <>
struct FloatVec {
using Type = float2;
};
-template<>
+template <>
struct FloatVec {
using Type = float4;
};
// Vector addition.
-inline __device__ float add(float a, float b) {
- return a + b;
-}
+inline __device__ float add(float a, float b) { return a + b; }
inline __device__ float2 add(float2 a, float2 b) {
float2 c;
@@ -87,12 +87,12 @@ inline __device__ float4 add(float4 a, float4 b) {
}
// Vector multiplication.
-template<>
+template <>
inline __device__ float mul(float a, float b) {
return a * b;
}
-template<>
+template <>
inline __device__ float2 mul(float2 a, float2 b) {
float2 c;
c.x = a.x * b.x;
@@ -100,7 +100,7 @@ inline __device__ float2 mul(float2 a, float2 b) {
return c;
}
-template<>
+template <>
inline __device__ float2 mul(float a, float2 b) {
float2 c;
c.x = a * b.x;
@@ -108,7 +108,7 @@ inline __device__ float2 mul(float a, float2 b) {
return c;
}
-template<>
+template <>
inline __device__ float4 mul(float4 a, float4 b) {
float4 c;
c.x = a.x * b.x;
@@ -118,7 +118,7 @@ inline __device__ float4 mul(float4 a, float4 b) {
return c;
}
-template<>
+template <>
inline __device__ float4 mul(float a, float4 b) {
float4 c;
c.x = a * b.x;
@@ -129,9 +129,7 @@ inline __device__ float4 mul(float a, float4 b) {
}
// Vector fused multiply-add.
-inline __device__ float fma(float a, float b, float c) {
- return a * b + c;
-}
+inline __device__ float fma(float a, float b, float c) { return a * b + c; }
inline __device__ float2 fma(float2 a, float2 b, float2 c) {
float2 d;
@@ -182,35 +180,33 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) {
}
// Vector sum.
-template<>
+template <>
inline __device__ float sum(float v) {
return v;
}
-template<>
+template <>
inline __device__ float sum(float2 v) {
return v.x + v.y;
}
-template<>
+template <>
inline __device__ float sum(float4 v) {
return v.x + v.y + v.z + v.w;
}
-template<>
+template <>
inline __device__ float sum(Float4_ v) {
return v.x.x + v.x.y + v.y.x + v.y.y;
}
-template<>
+template <>
inline __device__ float sum(Float8_ v) {
return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y;
}
// Vector dot product.
-inline __device__ float dot(float a, float b) {
- return a * b;
-}
+inline __device__ float dot(float a, float b) { return a * b; }
inline __device__ float dot(float2 a, float2 b) {
float2 c = mul(a, b);
@@ -232,42 +228,24 @@ inline __device__ float dot(Float8_ a, Float8_ b) {
}
// From float to float.
-inline __device__ void from_float(float& dst, float src) {
- dst = src;
-}
+inline __device__ void from_float(float& dst, float src) { dst = src; }
-inline __device__ void from_float(float2& dst, float2 src) {
- dst = src;
-}
+inline __device__ void from_float(float2& dst, float2 src) { dst = src; }
-inline __device__ void from_float(float4& dst, float4 src) {
- dst = src;
-}
+inline __device__ void from_float(float4& dst, float4 src) { dst = src; }
// From float to float.
-inline __device__ float to_float(float u) {
- return u;
-}
+inline __device__ float to_float(float u) { return u; }
-inline __device__ float2 to_float(float2 u) {
- return u;
-}
+inline __device__ float2 to_float(float2 u) { return u; }
-inline __device__ float4 to_float(float4 u) {
- return u;
-}
+inline __device__ float4 to_float(float4 u) { return u; }
-inline __device__ Float4_ to_float(Float4_ u) {
- return u;
-}
+inline __device__ Float4_ to_float(Float4_ u) { return u; }
-inline __device__ Float8_ to_float(Float8_ u) {
- return u;
-}
+inline __device__ Float8_ to_float(Float8_ u) { return u; }
// Zero-out a variable.
-inline __device__ void zero(float& dst) {
- dst = 0.f;
-}
+inline __device__ void zero(float& dst) { dst = 0.f; }
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_fp8.cuh b/csrc/attention/dtype_fp8.cuh
index d11dee91ebe87d724e9ae16744168f29ebd37607..e714e321b0beb2bd4b03bdabbdcd118502ccea46 100644
--- a/csrc/attention/dtype_fp8.cuh
+++ b/csrc/attention/dtype_fp8.cuh
@@ -3,33 +3,39 @@
#include "attention_generic.cuh"
#include
-#ifdef ENABLE_FP8_E5M2
-#include
-#endif
+#ifdef ENABLE_FP8
+ #ifndef USE_ROCM
+ #include
+ #endif // USE_ROCM
+#endif // ENABLE_FP8
namespace vllm {
-#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
-// fp8 vector types for quantization of kv cache
-template<>
+enum class Fp8KVCacheDataType {
+ kAuto = 0,
+ kFp8E4M3 = 1,
+ kFp8E5M2 = 2,
+};
+
+// fp8 vector types for quantization of kv cache
+template <>
struct Vec {
- using Type = uint8_t;
+ using Type = uint8_t;
};
-template<>
+template <>
struct Vec {
- using Type = uint16_t;
+ using Type = uint16_t;
};
-template<>
+template <>
struct Vec {
- using Type = uint32_t;
+ using Type = uint32_t;
};
-template<>
+template <>
struct Vec {
- using Type = uint2;
+ using Type = uint2;
};
-#endif // ENABLE_FP8_E5M2
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/cache.h b/csrc/cache.h
index 4c142ce17f1b95739d1524443a63b51f0ebb045b..435ae3e57f555ed9e333dde6dce4c9ae02abc5ec 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -5,34 +5,24 @@
#include