Commit ec5e299c authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.3' into v0.7.3-dev

parents 47bd229c ed6e9075
...@@ -10,10 +10,11 @@ jobs: ...@@ -10,10 +10,11 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
with: with:
python-version: "3.12" python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
with: with:
extra_args: --all-files --hook-stage manual extra_args: --all-files --hook-stage manual
...@@ -13,7 +13,7 @@ jobs: ...@@ -13,7 +13,7 @@ jobs:
actions: write actions: write
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
with: with:
# Increasing this value ensures that changes to this workflow # Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months # propagate to all issues and PRs in days rather than months
......
...@@ -8,36 +8,42 @@ repos: ...@@ -8,36 +8,42 @@ repos:
- id: yapf - id: yapf
args: [--in-place, --verbose] args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
exclude: 'vllm/third_party/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.3 rev: v0.9.3
hooks: hooks:
- id: ruff - id: ruff
args: [--output-format, github] args: [--output-format, github, --fix]
exclude: 'vllm/third_party/.*'
- repo: https://github.com/codespell-project/codespell - repo: https://github.com/codespell-project/codespell
rev: v2.4.0 rev: v2.4.0
hooks: hooks:
- id: codespell - id: codespell
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
- repo: https://github.com/PyCQA/isort - repo: https://github.com/PyCQA/isort
rev: 5.13.2 rev: 5.13.2
hooks: hooks:
- id: isort - id: isort
exclude: 'vllm/third_party/.*'
- repo: https://github.com/pre-commit/mirrors-clang-format - repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.7 rev: v19.1.7
hooks: hooks:
- id: clang-format - id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda] types_or: [c++, cuda]
args: [--style=file, --verbose] args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown - repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.27 rev: v0.9.27
hooks: hooks:
- id: pymarkdown - id: pymarkdown
files: docs/.* args: [fix]
exclude: 'vllm/third_party/.*'
- repo: https://github.com/rhysd/actionlint - repo: https://github.com/rhysd/actionlint
rev: v1.7.7 rev: v1.7.7
hooks: hooks:
- id: actionlint - id: actionlint
exclude: 'vllm/third_party/.*'
- repo: local - repo: local
hooks: hooks:
- id: mypy-local - id: mypy-local
...@@ -47,6 +53,7 @@ repos: ...@@ -47,6 +53,7 @@ repos:
types: [python] types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
stages: [pre-commit] # Don't run in CI stages: [pre-commit] # Don't run in CI
exclude: 'vllm/third_party/.*'
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9 name: Run mypy for Python 3.9
entry: tools/mypy.sh 1 "3.9" entry: tools/mypy.sh 1 "3.9"
...@@ -54,6 +61,7 @@ repos: ...@@ -54,6 +61,7 @@ repos:
types: [python] types: [python]
additional_dependencies: *mypy_deps additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI stages: [manual] # Only run in CI
exclude: 'vllm/third_party/.*'
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10 name: Run mypy for Python 3.10
entry: tools/mypy.sh 1 "3.10" entry: tools/mypy.sh 1 "3.10"
...@@ -61,6 +69,7 @@ repos: ...@@ -61,6 +69,7 @@ repos:
types: [python] types: [python]
additional_dependencies: *mypy_deps additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI stages: [manual] # Only run in CI
exclude: 'vllm/third_party/.*'
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11 name: Run mypy for Python 3.11
entry: tools/mypy.sh 1 "3.11" entry: tools/mypy.sh 1 "3.11"
...@@ -68,6 +77,7 @@ repos: ...@@ -68,6 +77,7 @@ repos:
types: [python] types: [python]
additional_dependencies: *mypy_deps additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI stages: [manual] # Only run in CI
exclude: 'vllm/third_party/.*'
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12 name: Run mypy for Python 3.12
entry: tools/mypy.sh 1 "3.12" entry: tools/mypy.sh 1 "3.12"
...@@ -75,16 +85,19 @@ repos: ...@@ -75,16 +85,19 @@ repos:
types: [python] types: [python]
additional_dependencies: *mypy_deps additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI stages: [manual] # Only run in CI
exclude: 'vllm/third_party/.*'
- id: shellcheck - id: shellcheck
name: Lint shell scripts name: Lint shell scripts
entry: tools/shellcheck.sh entry: tools/shellcheck.sh
language: script language: script
types: [shell] types: [shell]
exclude: 'vllm/third_party/.*'
- id: png-lint - id: png-lint
name: Lint PNG exports from excalidraw name: Lint PNG exports from excalidraw
entry: tools/png-lint.sh entry: tools/png-lint.sh
language: script language: script
types: [png] types: [png]
exclude: 'vllm/third_party/.*'
- id: signoff-commit - id: signoff-commit
name: Sign-off Commit name: Sign-off Commit
entry: bash entry: bash
...@@ -97,14 +110,29 @@ repos: ...@@ -97,14 +110,29 @@ repos:
language: system language: system
verbose: true verbose: true
stages: [commit-msg] stages: [commit-msg]
exclude: 'vllm/third_party/.*'
- id: check-spdx-header - id: check-spdx-header
name: Check SPDX headers name: Check SPDX headers
entry: python tools/check_spdx_header.py entry: python tools/check_spdx_header.py
language: python language: python
types: [python] types: [python]
exclude: 'vllm/third_party/.*'
- id: check-filenames
name: Check for spaces in all filenames
entry: bash
args:
- -c
- 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
language: system
always_run: true
pass_filenames: false
exclude: 'vllm/third_party/.*'
# Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
language: system language: system
verbose: true verbose: true
pass_filenames: false pass_filenames: false
exclude: 'vllm/third_party/.*'
# Insert new entries above the `suggestion` entry
...@@ -38,7 +38,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") ...@@ -38,7 +38,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
# Supported hcu architectures. # Supported hcu architectures.
set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;;gfx1101;gfx906;gfx926;gfx928;gfx936") set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx906;gfx926;gfx928;gfx936")
# #
# Supported/expected torch versions for CUDA/ROCm. # Supported/expected torch versions for CUDA/ROCm.
...@@ -196,7 +196,7 @@ set_gencode_flags_for_srcs( ...@@ -196,7 +196,7 @@ set_gencode_flags_for_srcs(
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Enabling cumem allocator extension.") message(STATUS "Enabling cumem allocator extension.")
# link against cuda driver library # link against cuda driver library
list(APPEND CUMEM_LIBS cuda) list(APPEND CUMEM_LIBS CUDA::cuda_driver)
define_gpu_extension_target( define_gpu_extension_target(
cumem_allocator cumem_allocator
DESTINATION vllm DESTINATION vllm
...@@ -241,7 +241,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -241,7 +241,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use") # Please keep this in sync with FetchContent_Declare line below.
set(CUTLASS_REVISION "v3.7.0" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
...@@ -258,6 +259,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -258,6 +259,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare( FetchContent_Declare(
cutlass cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG v3.7.0 GIT_TAG v3.7.0
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
...@@ -277,8 +279,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -277,8 +279,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/custom_all_reduce.cu" "csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu" "csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_compressor_entry.cu"
"csrc/cutlass_extensions/common.cpp") "csrc/cutlass_extensions/common.cpp")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
...@@ -371,8 +373,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -371,8 +373,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
# require CUDA 12.2 or later (and only work on Hopper, 9.0a for now). # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu" set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
...@@ -390,6 +391,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -390,6 +391,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
# FP4 Archs and flags
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")
# clear FP4_ARCHS
set(FP4_ARCHS)
endif()
# #
# Machete kernels # Machete kernels
...@@ -471,7 +489,7 @@ define_gpu_extension_target( ...@@ -471,7 +489,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC} SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS} COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES} ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
...@@ -597,7 +615,7 @@ else() ...@@ -597,7 +615,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
vllm-flash-attn vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types # Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
......
...@@ -125,4 +125,3 @@ Community Impact Guidelines were inspired by ...@@ -125,4 +125,3 @@ Community Impact Guidelines were inspired by
For answers to common questions about this code of conduct, see the For answers to common questions about this code of conduct, see the
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at [Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
[Contributor Covenant translations](https://www.contributor-covenant.org/translations). [Contributor Covenant translations](https://www.contributor-covenant.org/translations).
...@@ -27,6 +27,9 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -27,6 +27,9 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install uv
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels # as it was causing spam when compiling the CUTLASS kernels
...@@ -52,13 +55,13 @@ WORKDIR /workspace ...@@ -52,13 +55,13 @@ WORKDIR /workspace
# after this step # after this step
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \ uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
fi fi
COPY requirements-common.txt requirements-common.txt COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt uv pip install --system -r requirements-cuda.txt
# cuda arch list used by torch # cuda arch list used by torch
# can be useful for both `dev` and `test` # can be useful for both `dev` and `test`
...@@ -79,7 +82,7 @@ ARG TARGETPLATFORM ...@@ -79,7 +82,7 @@ ARG TARGETPLATFORM
COPY requirements-build.txt requirements-build.txt COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt uv pip install --system -r requirements-build.txt
COPY . . COPY . .
ARG GIT_REPO_CHECK=0 ARG GIT_REPO_CHECK=0
...@@ -144,7 +147,7 @@ COPY requirements-lint.txt requirements-lint.txt ...@@ -144,7 +147,7 @@ COPY requirements-lint.txt requirements-lint.txt
COPY requirements-test.txt requirements-test.txt COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt uv pip install --system -r requirements-dev.txt
#################### DEV IMAGE #################### #################### DEV IMAGE ####################
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
...@@ -174,6 +177,9 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -174,6 +177,9 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install uv
# Workaround for https://github.com/openai/triton/issues/2507 and # Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
...@@ -187,27 +193,30 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ ...@@ -187,27 +193,30 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# after this step # after this step
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
fi fi
# Install vllm wheel first, so that torch etc will be installed. # Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose uv pip install --system dist/*.whl --verbose
# How to build this FlashInfer wheel: # If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1 # $ export FLASHINFER_ENABLE_AOT=1
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ # $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' # $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive # $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer # $ cd flashinfer
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4 # $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
# $ rm -rf build
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose # $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
# $ ls dist
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \ . /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \ uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
fi fi
COPY examples examples COPY examples examples
...@@ -217,7 +226,7 @@ COPY examples examples ...@@ -217,7 +226,7 @@ COPY examples examples
# TODO: Remove this once FlashInfer AOT wheel is fixed # TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements-build.txt requirements-build.txt COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt uv pip install --system -r requirements-build.txt
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
...@@ -230,15 +239,15 @@ ADD . /vllm-workspace/ ...@@ -230,15 +239,15 @@ ADD . /vllm-workspace/
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt uv pip install --system -r requirements-dev.txt
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -e tests/vllm_test_utils uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing) # enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install hf_transfer uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1 ENV HF_HUB_ENABLE_HF_TRANSFER 1
# Copy in the v1 package for testing (it isn't distributed yet) # Copy in the v1 package for testing (it isn't distributed yet)
...@@ -259,9 +268,9 @@ FROM vllm-base AS vllm-openai-base ...@@ -259,9 +268,9 @@ FROM vllm-base AS vllm-openai-base
# install additional dependencies for openai api server # install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
else \ else \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
fi fi
ENV VLLM_USAGE_SOURCE production-docker-image ENV VLLM_USAGE_SOURCE production-docker-image
......
...@@ -23,10 +23,12 @@ WORKDIR ${APP_MOUNT}/vllm ...@@ -23,10 +23,12 @@ WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest RUN python3 -m pip install pytest
# uninstall transformers-neuronx package explicitly to avoid version conflict
RUN python3 -m pip uninstall -y transformers-neuronx
COPY . . COPY . .
ARG GIT_REPO_CHECK=0 ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \ RUN --mount=type=bind,source=.git,target=.git \
...@@ -43,6 +45,10 @@ RUN --mount=type=bind,source=.git,target=.git \ ...@@ -43,6 +45,10 @@ RUN --mount=type=bind,source=.git,target=.git \
# install development dependencies (for testing) # install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils RUN python3 -m pip install -e tests/vllm_test_utils
# install transformers-neuronx package as an optional dependencies (for V0)
# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
# overwrite entrypoint to run bash script # overwrite entrypoint to run bash script
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
......
...@@ -6,7 +6,7 @@ ARG RCCL_BRANCH="648a58d" ...@@ -6,7 +6,7 @@ ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006" ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git" ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="8d4926e" ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.19.1" ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
......
...@@ -85,7 +85,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install (若调试,可使用V ...@@ -85,7 +85,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install (若调试,可使用V
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.7.2; - python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.7.3;
## Known Issue ## Known Issue
- -
......
...@@ -15,9 +15,14 @@ Easy, fast, and cheap LLM serving for everyone ...@@ -15,9 +15,14 @@ Easy, fast, and cheap LLM serving for everyone
--- ---
We are excited to invite you to our Menlo Park meetup with Meta, evening of Thursday, February 27! Meta engineers will discuss the improvements on top of vLLM, and vLLM contributors will share updates from the v0.7.x series of releases. [Register Now](https://lu.ma/h7g3kuj9)
---
*Latest News* 🔥 *Latest News* 🔥
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing). - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
...@@ -33,7 +38,9 @@ Easy, fast, and cheap LLM serving for everyone ...@@ -33,7 +38,9 @@ Easy, fast, and cheap LLM serving for everyone
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
--- ---
## About ## About
vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM is a fast and easy-to-use library for LLM inference and serving.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
...@@ -127,6 +134,7 @@ We also have an official fundraising venue through [OpenCollective](https://open ...@@ -127,6 +134,7 @@ We also have an official fundraising venue through [OpenCollective](https://open
## Citation ## Citation
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
```bibtex ```bibtex
@inproceedings{kwon2023efficient, @inproceedings{kwon2023efficient,
title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
...@@ -138,11 +146,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ...@@ -138,11 +146,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us ## Contact Us
* For technical questions and feature requests, please use Github issues or discussions. - For technical questions and feature requests, please use Github issues or discussions.
* For discussing with fellow users and coordinating contributions and development, please use Slack. - For discussing with fellow users and coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature. - For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. - For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
## Media Kit ## Media Kit
* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). - If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
\ No newline at end of file \ No newline at end of file
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
## Downloading the ShareGPT dataset ## Downloading the ShareGPT dataset
You can download the dataset by running: You can download the dataset by running:
```bash ```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
``` ```
...@@ -11,9 +12,18 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r ...@@ -11,9 +12,18 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
will ignore a datapoint if the referred image is missing. will ignore a datapoint if the referred image is missing.
```bash ```bash
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
mkdir coco -p mkdir coco -p
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
unzip coco/train2017.zip -d coco/ unzip coco/train2017.zip -d coco/
``` ```
# Downloading the BurstGPT dataset
You can download the BurstGPT v1.1 dataset by running:
```bash
wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv
```
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Benchmark the latency of processing a single batch of requests.""" """Benchmark the latency of processing a single batch of requests."""
import argparse import argparse
import dataclasses import dataclasses
import json import json
import os
import time import time
from pathlib import Path from pathlib import Path
from typing import List, Optional from typing import Any, Dict, List, Optional
import numpy as np import numpy as np
import torch import torch
from benchmark_utils import convert_to_pytorch_benchmark_format
from tqdm import tqdm from tqdm import tqdm
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
...@@ -18,6 +21,19 @@ from vllm.sampling_params import BeamSearchParams ...@@ -18,6 +21,19 @@ from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: Dict[str, Any]) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={"latency": results["latencies"]},
extra_info={k: results[k]
for k in ["avg_latency", "percentiles"]})
if pt_records:
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
with open(pt_file, "w") as f:
json.dump(pt_records, f)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
print(args) print(args)
...@@ -54,7 +70,8 @@ def main(args: argparse.Namespace): ...@@ -54,7 +70,8 @@ def main(args: argparse.Namespace):
beam_width=args.n, beam_width=args.n,
max_tokens=args.output_len, max_tokens=args.output_len,
ignore_eos=True, ignore_eos=True,
)) ),
)
def run_to_completion(profile_dir: Optional[str] = None): def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir: if profile_dir:
...@@ -64,7 +81,8 @@ def main(args: argparse.Namespace): ...@@ -64,7 +81,8 @@ def main(args: argparse.Namespace):
torch.profiler.ProfilerActivity.CUDA, torch.profiler.ProfilerActivity.CUDA,
], ],
on_trace_ready=torch.profiler.tensorboard_trace_handler( on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p: str(profile_dir)),
) as p:
llm_generate() llm_generate()
print(p.key_averages().table(sort_by="self_cuda_time_total")) print(p.key_averages().table(sort_by="self_cuda_time_total"))
else: else:
...@@ -81,9 +99,8 @@ def main(args: argparse.Namespace): ...@@ -81,9 +99,8 @@ def main(args: argparse.Namespace):
if args.profile: if args.profile:
profile_dir = args.profile_result_dir profile_dir = args.profile_result_dir
if not profile_dir: if not profile_dir:
profile_dir = Path( profile_dir = (Path(".") / "vllm_benchmark_result" /
"." f"latency_result_{time.time()}")
) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
print(f"Profiling (results will be saved to '{profile_dir}')...") print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir) run_to_completion(profile_dir=profile_dir)
return return
...@@ -95,9 +112,9 @@ def main(args: argparse.Namespace): ...@@ -95,9 +112,9 @@ def main(args: argparse.Namespace):
latencies = np.array(latencies) latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90, 99] percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages) percentiles = np.percentile(latencies, percentages)
print(f'Avg latency: {np.mean(latencies)} seconds') print(f"Avg latency: {np.mean(latencies)} seconds")
for percentage, percentile in zip(percentages, percentiles): for percentage, percentile in zip(percentages, percentiles):
print(f'{percentage}% percentile latency: {percentile} seconds') print(f"{percentage}% percentile latency: {percentile} seconds")
# Output JSON results if specified # Output JSON results if specified
if args.output_json: if args.output_json:
...@@ -108,43 +125,51 @@ def main(args: argparse.Namespace): ...@@ -108,43 +125,51 @@ def main(args: argparse.Namespace):
} }
with open(args.output_json, "w") as f: with open(args.output_json, "w") as f:
json.dump(results, f, indent=4) json.dump(results, f, indent=4)
save_to_pytorch_benchmark_format(args, results)
if __name__ == '__main__': if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Benchmark the latency of processing a single batch of ' description="Benchmark the latency of processing a single batch of "
'requests till completion.') "requests till completion.")
parser.add_argument('--input-len', type=int, default=32) parser.add_argument("--input-len", type=int, default=32)
parser.add_argument('--output-len', type=int, default=128) parser.add_argument("--output-len", type=int, default=128)
parser.add_argument('--batch-size', type=int, default=8) parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument('--n', parser.add_argument(
type=int, "--n",
default=1, type=int,
help='Number of generated sequences per prompt.') default=1,
parser.add_argument('--use-beam-search', action='store_true') help="Number of generated sequences per prompt.",
parser.add_argument('--num-iters-warmup', )
type=int, parser.add_argument("--use-beam-search", action="store_true")
default=10, parser.add_argument(
help='Number of iterations to run for warmup.') "--num-iters-warmup",
parser.add_argument('--num-iters', type=int,
default=10,
help="Number of iterations to run for warmup.",
)
parser.add_argument("--num-iters",
type=int, type=int,
default=30, default=30,
help='Number of iterations to run.') help="Number of iterations to run.")
parser.add_argument( parser.add_argument(
'--profile', "--profile",
action='store_true', action="store_true",
help='profile the generation process of a single batch') help="profile the generation process of a single batch",
)
parser.add_argument( parser.add_argument(
'--profile-result-dir', "--profile-result-dir",
type=str, type=str,
default=None, default=None,
help=('path to save the pytorch profiler output. Can be visualized ' help=("path to save the pytorch profiler output. Can be visualized "
'with ui.perfetto.dev or Tensorboard.')) "with ui.perfetto.dev or Tensorboard."),
)
parser.add_argument( parser.add_argument(
'--output-json', "--output-json",
type=str, type=str,
default=None, default=None,
help='Path to save the latency results in JSON format.') help="Path to save the latency results in JSON format.",
)
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args() args = parser.parse_args()
......
...@@ -38,6 +38,7 @@ from datetime import datetime ...@@ -38,6 +38,7 @@ from datetime import datetime
from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
import numpy as np import numpy as np
import pandas as pd
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
RequestFuncOutput) RequestFuncOutput)
from datasets import load_dataset from datasets import load_dataset
...@@ -55,6 +56,8 @@ try: ...@@ -55,6 +56,8 @@ try:
except ImportError: except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser from argparse import ArgumentParser as FlexibleArgumentParser
from benchmark_utils import convert_to_pytorch_benchmark_format
MILLISECONDS_TO_SECONDS_CONVERSION = 1000 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
...@@ -131,6 +134,35 @@ def sample_sharegpt_requests( ...@@ -131,6 +134,35 @@ def sample_sharegpt_requests(
return filtered_dataset return filtered_dataset
def sample_burstgpt_requests(
dataset_path: str,
num_requests: int,
random_seed: int,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int, None]]:
df = pd.read_csv(dataset_path)
gpt4_df = df[df["Model"] == "GPT-4"]
# Remove the failed requests (i.e., response length is 0)
gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
# Randomly sample num_requests from the dataset
if num_requests <= len(gpt4_df):
gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed)
else:
gpt4_df = gpt4_df.sample(n=num_requests,
random_state=random_seed,
replace=True)
# Convert the dataframe to a list of tuples
dataset = gpt4_df.values.tolist()
input_requests = []
for i in range(num_requests):
input_len = int(dataset[i][2])
output_len = int(dataset[i][3])
prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size
for j in range(input_len)])
input_requests.append((prompt, input_len, output_len, None))
return input_requests
def sample_sonnet_requests( def sample_sonnet_requests(
dataset_path: str, dataset_path: str,
num_requests: int, num_requests: int,
...@@ -372,21 +404,21 @@ async def get_request( ...@@ -372,21 +404,21 @@ async def get_request(
burstiness: float = 1.0, burstiness: float = 1.0,
) -> AsyncGenerator[Tuple[str, int, int], None]: ) -> AsyncGenerator[Tuple[str, int, int], None]:
""" """
Asynchronously generates requests at a specified rate Asynchronously generates requests at a specified rate
with OPTIONAL burstiness. with OPTIONAL burstiness.
Args: Args:
input_requests: input_requests:
A list of input requests, each represented as a tuple. A list of input requests, each represented as a tuple.
request_rate: request_rate:
The rate at which requests are generated (requests/s). The rate at which requests are generated (requests/s).
burstiness (optional): burstiness (optional):
The burstiness factor of the request generation. The burstiness factor of the request generation.
Only takes effect when request_rate is not inf. Only takes effect when request_rate is not inf.
Default value is 1, which follows a Poisson process. Default value is 1, which follows a Poisson process.
Otherwise, the request intervals follow a gamma distribution. Otherwise, the request intervals follow a gamma distribution.
A lower burstiness value (0 < burstiness < 1) results A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value in more bursty requests, while a higher burstiness value
(burstiness > 1) results in a more uniform arrival of requests. (burstiness > 1) results in a more uniform arrival of requests.
""" """
input_requests = iter(input_requests) input_requests = iter(input_requests)
...@@ -537,6 +569,7 @@ async def benchmark( ...@@ -537,6 +569,7 @@ async def benchmark(
ignore_eos: bool, ignore_eos: bool,
goodput_config_dict: Dict[str, float], goodput_config_dict: Dict[str, float],
max_concurrency: Optional[int], max_concurrency: Optional[int],
lora_modules: Optional[List[str]],
): ):
if backend in ASYNC_REQUEST_FUNCS: if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend] request_func = ASYNC_REQUEST_FUNCS[backend]
...@@ -562,6 +595,7 @@ async def benchmark( ...@@ -562,6 +595,7 @@ async def benchmark(
multi_modal_content=test_mm_content, multi_modal_content=test_mm_content,
ignore_eos=ignore_eos, ignore_eos=ignore_eos,
) )
test_output = await request_func(request_func_input=test_input) test_output = await request_func(request_func_input=test_input)
if not test_output.success: if not test_output.success:
raise ValueError( raise ValueError(
...@@ -570,6 +604,11 @@ async def benchmark( ...@@ -570,6 +604,11 @@ async def benchmark(
else: else:
print("Initial test run completed. Starting main benchmark run...") print("Initial test run completed. Starting main benchmark run...")
if lora_modules:
# For each input request, choose a LoRA module at random.
lora_modules = iter(
[random.choice(lora_modules) for _ in range(len(input_requests))])
if profile: if profile:
print("Starting profiler...") print("Starting profiler...")
profile_input = RequestFuncInput(model=model_id, profile_input = RequestFuncInput(model=model_id,
...@@ -616,8 +655,13 @@ async def benchmark( ...@@ -616,8 +655,13 @@ async def benchmark(
tasks: List[asyncio.Task] = [] tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate, burstiness): async for request in get_request(input_requests, request_rate, burstiness):
prompt, prompt_len, output_len, mm_content = request prompt, prompt_len, output_len, mm_content = request
request_func_input = RequestFuncInput(model=model_id, req_model_id, req_model_name = model_id, model_name
model_name=model_name, if lora_modules:
req_lora_module = next(lora_modules)
req_model_id, req_model_name = req_lora_module, req_lora_module
request_func_input = RequestFuncInput(model=req_model_id,
model_name=req_model_name,
prompt=prompt, prompt=prompt,
api_url=api_url, api_url=api_url,
prompt_len=prompt_len, prompt_len=prompt_len,
...@@ -775,6 +819,32 @@ def parse_goodput(slo_pairs): ...@@ -775,6 +819,32 @@ def parse_goodput(slo_pairs):
return goodput_config_dict return goodput_config_dict
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: Dict[str, Any],
file_name: str) -> None:
metrics = [
"median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
"mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
]
# These raw data might be useful, but they are rather big. They can be added
# later if needed
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={k: [results[k]]
for k in metrics},
extra_info={
k: results[k]
for k in results if k not in metrics and k not in ignored_metrics
})
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
with open(pt_file, "w") as f:
json.dump(pt_records, f)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
print(args) print(args)
random.seed(args.seed) random.seed(args.seed)
...@@ -818,6 +888,14 @@ def main(args: argparse.Namespace): ...@@ -818,6 +888,14 @@ def main(args: argparse.Namespace):
fixed_output_len=args.sharegpt_output_len, fixed_output_len=args.sharegpt_output_len,
) )
elif args.dataset_name == "burstgpt":
input_requests = sample_burstgpt_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
random_seed=args.seed,
tokenizer=tokenizer,
)
elif args.dataset_name == "sonnet": elif args.dataset_name == "sonnet":
# Do not format the prompt, pass to message directly # Do not format the prompt, pass to message directly
if args.backend == "openai-chat": if args.backend == "openai-chat":
...@@ -900,6 +978,7 @@ def main(args: argparse.Namespace): ...@@ -900,6 +978,7 @@ def main(args: argparse.Namespace):
ignore_eos=args.ignore_eos, ignore_eos=args.ignore_eos,
goodput_config_dict=goodput_config_dict, goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency, max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
)) ))
# Save config and results to json # Save config and results to json
...@@ -946,6 +1025,7 @@ def main(args: argparse.Namespace): ...@@ -946,6 +1025,7 @@ def main(args: argparse.Namespace):
file_name = os.path.join(args.result_dir, file_name) file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w", encoding='utf-8') as outfile: with open(file_name, "w", encoding='utf-8') as outfile:
json.dump(result_json, outfile) json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name)
if __name__ == "__main__": if __name__ == "__main__":
...@@ -963,7 +1043,8 @@ if __name__ == "__main__": ...@@ -963,7 +1043,8 @@ if __name__ == "__main__":
default=None, default=None,
help="Server or API base url if not using http host and port.", help="Server or API base url if not using http host and port.",
) )
parser.add_argument("--host", type=str, default="localhost") # Use 127.0.0.1 here instead of localhost to force the use of ipv4
parser.add_argument("--host", type=str, default="127.0.0.1")
parser.add_argument("--port", type=int, default=8000) parser.add_argument("--port", type=int, default=8000)
parser.add_argument( parser.add_argument(
"--endpoint", "--endpoint",
...@@ -982,7 +1063,7 @@ if __name__ == "__main__": ...@@ -982,7 +1063,7 @@ if __name__ == "__main__":
"--dataset-name", "--dataset-name",
type=str, type=str,
default="sharegpt", default="sharegpt",
choices=["sharegpt", "sonnet", "random", "hf"], choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
help="Name of the dataset to benchmark on.", help="Name of the dataset to benchmark on.",
) )
parser.add_argument("--dataset-path", parser.add_argument("--dataset-path",
...@@ -1224,11 +1305,12 @@ if __name__ == "__main__": ...@@ -1224,11 +1305,12 @@ if __name__ == "__main__":
'--tokenizer-mode', '--tokenizer-mode',
type=str, type=str,
default="auto", default="auto",
choices=['auto', 'slow', 'mistral'], choices=['auto', 'slow', 'mistral', 'custom'],
help='The tokenizer mode.\n\n* "auto" will use the ' help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will ' 'fast tokenizer if available.\n* "slow" will '
'always use the slow tokenizer. \n* ' 'always use the slow tokenizer. \n* '
'"mistral" will always use the `mistral_common` tokenizer.') '"mistral" will always use the `mistral_common` tokenizer. \n*'
'"custom" will use --tokenizer to select the preregistered tokenizer.')
parser.add_argument("--served-model-name", parser.add_argument("--served-model-name",
type=str, type=str,
...@@ -1237,5 +1319,12 @@ if __name__ == "__main__": ...@@ -1237,5 +1319,12 @@ if __name__ == "__main__":
"If not specified, the model name will be the " "If not specified, the model name will be the "
"same as the ``--model`` argument. ") "same as the ``--model`` argument. ")
parser.add_argument("--lora-modules",
nargs='+',
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
...@@ -731,7 +731,8 @@ if __name__ == "__main__": ...@@ -731,7 +731,8 @@ if __name__ == "__main__":
default=None, default=None,
help="Server or API base url if not using http host and port.", help="Server or API base url if not using http host and port.",
) )
parser.add_argument("--host", type=str, default="localhost") # Use 127.0.0.1 here instead of localhost to force the use of ipv4
parser.add_argument("--host", type=str, default="127.0.0.1")
parser.add_argument("--port", type=int, default=8000) parser.add_argument("--port", type=int, default=8000)
parser.add_argument( parser.add_argument(
"--endpoint", "--endpoint",
......
...@@ -3,14 +3,16 @@ ...@@ -3,14 +3,16 @@
import argparse import argparse
import dataclasses import dataclasses
import json import json
import os
import random import random
import time import time
from functools import cache from functools import cache
from typing import Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import numpy as np import numpy as np
import torch import torch
import uvloop import uvloop
from benchmark_utils import convert_to_pytorch_benchmark_format
from PIL import Image from PIL import Image
from tqdm import tqdm from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer, from transformers import (AutoModelForCausalLM, AutoTokenizer,
...@@ -361,6 +363,25 @@ def run_mii( ...@@ -361,6 +363,25 @@ def run_mii(
return end - start return end - start
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: Dict[str, Any]) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={
"requests_per_second": [results["requests_per_second"]],
"tokens_per_second": [results["tokens_per_second"]],
},
extra_info={
k: results[k]
for k in ["elapsed_time", "num_requests", "total_num_tokens"]
})
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
with open(pt_file, "w") as f:
json.dump(pt_records, f)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
print(args) print(args)
random.seed(args.seed) random.seed(args.seed)
...@@ -459,6 +480,7 @@ def main(args: argparse.Namespace): ...@@ -459,6 +480,7 @@ def main(args: argparse.Namespace):
} }
with open(args.output_json, "w") as f: with open(args.output_json, "w") as f:
json.dump(results, f, indent=4) json.dump(results, f, indent=4)
save_to_pytorch_benchmark_format(args, results)
if __name__ == "__main__": if __name__ == "__main__":
......
# SPDX-License-Identifier: Apache-2.0
import argparse
import os
from typing import Any, Dict, List
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: Dict[str, List],
extra_info: Dict[str, Any]) -> List:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records = []
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
return records
for name, benchmark_values in metrics.items():
record = {
"benchmark": {
"name": "vLLM benchmark",
"extra_info": {
"args": vars(args),
},
},
"model": {
"name": args.model,
},
"metric": {
"name": name,
"benchmark_values": benchmark_values,
"extra_info": extra_info,
},
}
records.append(record)
return records
...@@ -262,9 +262,9 @@ endmacro() ...@@ -262,9 +262,9 @@ endmacro()
# where `<=` is the version comparison operator. # where `<=` is the version comparison operator.
# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version # In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is # We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is
# in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add # in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add
# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS). # x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS).
# The result is stored in `OUT_CUDA_ARCHS`. # The result is stored in `OUT_CUDA_ARCHS`.
# #
# Example: # Example:
...@@ -277,8 +277,8 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR ...@@ -277,8 +277,8 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS}) set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS})
# if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
# remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
set(_CUDA_ARCHS) set(_CUDA_ARCHS)
if ("9.0a" IN_LIST SRC_CUDA_ARCHS) if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
...@@ -288,6 +288,14 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR ...@@ -288,6 +288,14 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
endif() endif()
endif() endif()
if ("10.0a" IN_LIST SRC_CUDA_ARCHS)
list(REMOVE_ITEM SRC_CUDA_ARCHS "10.0a")
if ("10.0" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM TGT_CUDA_ARCHS_ "10.0")
set(_CUDA_ARCHS "10.0a")
endif()
endif()
list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
# for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
......
#pragma once #pragma once
#include <stdio.h>
#if defined(__CUDACC__) || defined(_NVHPC_CUDA) #if defined(__CUDACC__) || defined(_NVHPC_CUDA)
#define HOST_DEVICE_INLINE __forceinline__ __host__ __device__ #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__
#define DEVICE_INLINE __forceinline__ __device__ #define DEVICE_INLINE __forceinline__ __device__
...@@ -10,6 +12,16 @@ ...@@ -10,6 +12,16 @@
#define HOST_INLINE inline #define HOST_INLINE inline
#endif #endif
#define CUDA_CHECK(cmd) \
do { \
cudaError_t e = cmd; \
if (e != cudaSuccess) { \
printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \
cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while (0)
int64_t get_device_attribute(int64_t attribute, int64_t device_id); int64_t get_device_attribute(int64_t attribute, int64_t device_id);
int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
#include "cuda_utils.h"
#ifdef USE_ROCM #ifdef USE_ROCM
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h> #include <hip/hip_runtime_api.h>
#endif #endif
int64_t get_device_attribute(int64_t attribute, int64_t device_id) { int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
int device, value; // Return the cached value on subsequent calls
if (device_id < 0) { static int value = [=]() {
cudaGetDevice(&device); int device = static_cast<int>(device_id);
} else { if (device < 0) {
device = device_id; CUDA_CHECK(cudaGetDevice(&device));
} }
cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute), int value;
device); CUDA_CHECK(cudaDeviceGetAttribute(
&value, static_cast<cudaDeviceAttr>(attribute), device));
return static_cast<int>(value);
}();
return value; return value;
} }
......
...@@ -12,15 +12,21 @@ extern "C" { ...@@ -12,15 +12,21 @@ extern "C" {
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#include <cuda.h> #include <cuda.h>
#define CUDA_CHECK(condition) \ char error_msg[10240]; // 10KB buffer to store error messages
do { \ CUresult no_error = CUresult(0);
CUresult error = condition; \ CUresult error_code = no_error; // store error code
if (error != 0) { \
char* error_string; \ #define CUDA_CHECK(condition) \
cuGetErrorString(error, (const char**)&error_string); \ do { \
std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \ CUresult error = condition; \
<< __LINE__ << std::endl; \ if (error != 0) { \
} \ error_code = error; \
char* error_string; \
cuGetErrorString(error, (const char**)&error_string); \
snprintf(error_msg, sizeof(error_msg), "CUDA Error: %s at %s:%d", \
error_string, __FILE__, __LINE__); \
std::cerr << error_msg << std::endl; \
} \
} while (0) } while (0)
// Global references to Python callables // Global references to Python callables
...@@ -54,14 +60,22 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, ...@@ -54,14 +60,22 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
// Allocate memory using cuMemCreate // Allocate memory using cuMemCreate
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
if (error_code != 0) {
return;
}
CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0)); CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0));
if (error_code != 0) {
return;
}
CUmemAccessDesc accessDesc = {}; CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = device; accessDesc.location.id = device;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1)); CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1));
if (error_code != 0) {
return;
}
// std::cout << "create_and_map: device=" << device << ", size=" << size << ", // std::cout << "create_and_map: device=" << device << ", size=" << size << ",
// d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; // d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
} }
...@@ -73,7 +87,13 @@ void unmap_and_release(unsigned long long device, ssize_t size, ...@@ -73,7 +87,13 @@ void unmap_and_release(unsigned long long device, ssize_t size,
// ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
ensure_context(device); ensure_context(device);
CUDA_CHECK(cuMemUnmap(d_mem, size)); CUDA_CHECK(cuMemUnmap(d_mem, size));
if (error_code != 0) {
return;
}
CUDA_CHECK(cuMemRelease(*p_memHandle)); CUDA_CHECK(cuMemRelease(*p_memHandle));
if (error_code != 0) {
return;
}
} }
PyObject* create_tuple_from_c_integers(unsigned long long a, PyObject* create_tuple_from_c_integers(unsigned long long a,
...@@ -121,12 +141,16 @@ void* my_malloc(ssize_t size, int device, CUstream stream) { ...@@ -121,12 +141,16 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
size_t granularity; size_t granularity;
CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop,
CU_MEM_ALLOC_GRANULARITY_MINIMUM)); CU_MEM_ALLOC_GRANULARITY_MINIMUM));
if (error_code != 0) {
return nullptr;
}
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity; size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
CUdeviceptr d_mem; CUdeviceptr d_mem;
CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0)); CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0));
if (error_code != 0) {
return nullptr;
}
// allocate the CUmemGenericAllocationHandle // allocate the CUmemGenericAllocationHandle
CUmemGenericAllocationHandle* p_memHandle = CUmemGenericAllocationHandle* p_memHandle =
(CUmemGenericAllocationHandle*)malloc( (CUmemGenericAllocationHandle*)malloc(
...@@ -208,6 +232,9 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) { ...@@ -208,6 +232,9 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) {
// free address and the handle // free address and the handle
CUDA_CHECK(cuMemAddressFree(d_mem, size)); CUDA_CHECK(cuMemAddressFree(d_mem, size));
if (error_code != 0) {
return;
}
free(p_memHandle); free(p_memHandle);
} }
...@@ -258,6 +285,12 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) { ...@@ -258,6 +285,12 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle); unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
if (error_code != 0) {
error_code = no_error;
PyErr_SetString(PyExc_RuntimeError, error_msg);
return nullptr;
}
Py_RETURN_NONE; Py_RETURN_NONE;
} }
...@@ -282,6 +315,12 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) { ...@@ -282,6 +315,12 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle); create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
if (error_code != 0) {
error_code = no_error;
PyErr_SetString(PyExc_RuntimeError, error_msg);
return nullptr;
}
Py_RETURN_NONE; Py_RETURN_NONE;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment