Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
96ae75ad
Commit
96ae75ad
authored
Jan 04, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev
parents
f9f4a735
2339d59f
Changes
374
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
793 additions
and
183 deletions
+793
-183
.buildkite/generate_index.py
.buildkite/generate_index.py
+24
-0
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+3
-3
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+15
-0
.buildkite/run-gh200-test.sh
.buildkite/run-gh200-test.sh
+3
-0
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+5
-1
.buildkite/upload-wheels.sh
.buildkite/upload-wheels.sh
+29
-1
.github/workflows/publish.yml
.github/workflows/publish.yml
+62
-61
.gitignore
.gitignore
+2
-0
CMakeLists.txt
CMakeLists.txt
+33
-6
Dockerfile
Dockerfile
+24
-23
Dockerfile.cpu
Dockerfile.cpu
+3
-3
README.md
README.md
+1
-1
README_ORIGIN.md
README_ORIGIN.md
+1
-1
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+92
-43
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+384
-0
benchmarks/cutlass_benchmarks/utils.py
benchmarks/cutlass_benchmarks/utils.py
+96
-0
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+2
-26
benchmarks/cutlass_benchmarks/weight_shapes.py
benchmarks/cutlass_benchmarks/weight_shapes.py
+1
-1
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+7
-6
benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+6
-7
No files found.
.buildkite/generate_index.py
0 → 100644
View file @
96ae75ad
import
argparse
import
os
# Minimal "simple index" style page that links to a single wheel.
# `{wheel}` is the link text (the wheel's file name) and
# `{wheel_html_escaped}` is the same name made safe for use in the href.
template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1>
<a href="../{wheel_html_escaped}">{wheel}</a><br/>
</body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

# Only the file name is used in the page; the directory part of the
# supplied path is dropped.
filename = os.path.basename(args.wheel)

with open("index.html", "w") as f:
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(wheel=filename,
                        wheel_html_escaped=filename.replace("+", "%2B")))

# Report success only once index.html has actually been written and closed.
print(f"Generated index.html for {args.wheel}")
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
View file @
96ae75ad
...
@@ -65,9 +65,9 @@ steps:
...
@@ -65,9 +65,9 @@ steps:
-
VLLM_USAGE_SOURCE
-
VLLM_USAGE_SOURCE
-
HF_TOKEN
-
HF_TOKEN
-
block
:
"
Run
H100
Benchmark"
#
- block: "Run H100 Benchmark"
key
:
block-h100
#
key: block-h100
depends_on
:
~
#
depends_on: ~
-
label
:
"
H100"
-
label
:
"
H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
...
...
.buildkite/release-pipeline.yaml
View file @
96ae75ad
...
@@ -55,3 +55,18 @@ steps:
...
@@ -55,3 +55,18 @@ steps:
password-env
:
DOCKERHUB_TOKEN
password-env
:
DOCKERHUB_TOKEN
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
-
block
:
"
Build
CPU
release
image"
key
:
block-cpu-release-image-build
depends_on
:
~
-
label
:
"
Build
and
publish
CPU
release
image"
depends_on
:
block-cpu-release-image-build
agents
:
queue
:
cpu_queue_postmerge
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION
--progress
plain
-f
Dockerfile.cpu
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
env
:
DOCKER_BUILDKIT
:
"
1"
.buildkite/run-gh200-test.sh
View file @
96ae75ad
...
@@ -4,6 +4,9 @@
...
@@ -4,6 +4,9 @@
# It serves as a sanity check for compilation and basic model usage.
# It serves as a sanity check for compilation and basic model usage.
set
-ex
set
-ex
# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
python3 use_existing_torch.py
# Try building the docker image
# Try building the docker image
DOCKER_BUILDKIT
=
1 docker build
.
\
DOCKER_BUILDKIT
=
1 docker build
.
\
--target
vllm-openai
\
--target
vllm-openai
\
...
...
.buildkite/test-pipeline.yaml
View file @
96ae75ad
...
@@ -224,8 +224,12 @@ steps:
...
@@ -224,8 +224,12 @@ steps:
mirror_hardwares
:
[
amd
]
mirror_hardwares
:
[
amd
]
source_file_dependencies
:
source_file_dependencies
:
-
vllm/model_executor/layers
-
vllm/model_executor/layers
-
vllm/model_executor/guided_decoding
-
tests/test_logits_processor
-
tests/test_logits_processor
command
:
pytest -v -s test_logits_processor.py
-
tests/model_executor/test_guided_processors
commands
:
-
pytest -v -s test_logits_processor.py
-
pytest -v -s model_executor/test_guided_processors.py
-
label
:
Speculative decoding tests
# 30min
-
label
:
Speculative decoding tests
# 30min
source_file_dependencies
:
source_file_dependencies
:
...
...
.buildkite/upload-wheels.sh
View file @
96ae75ad
...
@@ -23,6 +23,8 @@ wheel="$new_wheel"
...
@@ -23,6 +23,8 @@ wheel="$new_wheel"
version
=
$(
unzip
-p
"
$wheel
"
'**/METADATA'
|
grep
'^Version: '
|
cut
-d
' '
-f2
)
version
=
$(
unzip
-p
"
$wheel
"
'**/METADATA'
|
grep
'^Version: '
|
cut
-d
' '
-f2
)
echo
"Version:
$version
"
echo
"Version:
$version
"
normal_wheel
=
"
$wheel
"
# Save the original wheel filename
# If the version contains "dev", rename it to v1.0.0.dev for consistency
# If the version contains "dev", rename it to v1.0.0.dev for consistency
if
[[
$version
==
*
dev
*
]]
;
then
if
[[
$version
==
*
dev
*
]]
;
then
suffix
=
"
${
version
##*.
}
"
suffix
=
"
${
version
##*.
}
"
...
@@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
...
@@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
new_version
=
"1.0.0.dev"
new_version
=
"1.0.0.dev"
fi
fi
new_wheel
=
"
${
wheel
/
$version
/
$new_version
}
"
new_wheel
=
"
${
wheel
/
$version
/
$new_version
}
"
mv
--
"
$wheel
"
"
$new_wheel
"
# use cp to keep both files in the artifacts directory
cp
--
"
$wheel
"
"
$new_wheel
"
wheel
=
"
$new_wheel
"
wheel
=
"
$new_wheel
"
version
=
"
$new_version
"
version
=
"
$new_version
"
fi
fi
# Upload the wheel to S3
# Upload the wheel to S3
python3 .buildkite/generate_index.py
--wheel
"
$normal_wheel
"
# generate index for this commit
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/"
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/"
aws s3
cp
"
$normal_wheel
"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/"
if
[[
$normal_wheel
==
*
"cu118"
*
]]
;
then
# if $normal_wheel matches cu118, do not upload the index.html
echo
"Skipping index files for cu118 wheels"
else
# only upload index.html for cu12 wheels (default wheels)
aws s3
cp
index.html
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/vllm/index.html"
aws s3
cp
"s3://vllm-wheels/nightly/index.html"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/index.html"
fi
# generate index for nightly
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/nightly/"
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/nightly/"
aws s3
cp
"
$normal_wheel
"
"s3://vllm-wheels/nightly/"
if
[[
$normal_wheel
==
*
"cu118"
*
]]
;
then
# if $normal_wheel matches cu118, do not upload the index.html
echo
"Skipping index files for cu118 wheels"
else
# only upload index.html for cu12 wheels (default wheels)
aws s3
cp
index.html
"s3://vllm-wheels/nightly/vllm/index.html"
fi
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/
$version
/"
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/
$version
/"
\ No newline at end of file
.github/workflows/publish.yml
View file @
96ae75ad
...
@@ -39,67 +39,68 @@ jobs:
...
@@ -39,67 +39,68 @@ jobs:
const script = require('.github/workflows/scripts/create_release.js')
const script = require('.github/workflows/scripts/create_release.js')
await script(github, context, core)
await script(github, context, core)
wheel
:
# NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow.
name
:
Build Wheel
# wheel:
runs-on
:
${{ matrix.os }}
# name: Build Wheel
needs
:
release
# runs-on: ${{ matrix.os }}
# needs: release
strategy
:
fail-fast
:
false
# strategy:
matrix
:
# fail-fast: false
os
:
[
'
ubuntu-20.04'
]
# matrix:
python-version
:
[
'
3.9'
,
'
3.10'
,
'
3.11'
,
'
3.12'
]
# os: ['ubuntu-20.04']
pytorch-version
:
[
'
2.4.0'
]
# Must be the most recent version that meets requirements-cuda.txt.
# python-version: ['3.9', '3.10', '3.11', '3.12']
cuda-version
:
[
'
11.8'
,
'
12.1'
]
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
# cuda-version: ['11.8', '12.1']
steps
:
-
name
:
Checkout
# steps:
uses
:
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
# v4.2.2
# - name: Checkout
# uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
name
:
Setup ccache
uses
:
hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9
# v1.2.14
# - name: Setup ccache
with
:
# uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
create-symlink
:
true
# with:
key
:
${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
# create-symlink: true
# key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
-
name
:
Set up Linux Env
if
:
${{ runner.os == 'Linux' }}
# - name: Set up Linux Env
run
:
|
# if: ${{ runner.os == 'Linux' }}
bash -x .github/workflows/scripts/env.sh
# run: |
# bash -x .github/workflows/scripts/env.sh
-
name
:
Set up Python
uses
:
actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b
# v5.3.0
# - name: Set up Python
with
:
# uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
python-version
:
${{ matrix.python-version }}
# with:
# python-version: ${{ matrix.python-version }}
-
name
:
Install CUDA ${{ matrix.cuda-version }}
run
:
|
# - name: Install CUDA ${{ matrix.cuda-version }}
bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
# run: |
# bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
-
name
:
Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
run
:
|
# - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
# run: |
# bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
-
name
:
Build wheel
shell
:
bash
# - name: Build wheel
env
:
# shell: bash
CMAKE_BUILD_TYPE
:
Release
# do not compile with debug symbol to reduce wheel size
# env:
run
:
|
# CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
# run: |
wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
# bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
asset_name=${wheel_name//"linux"/"manylinux1"}
# wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
# asset_name=${wheel_name//"linux"/"manylinux1"}
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
# echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
# echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
-
name
:
Upload Release Asset
uses
:
actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5
# v1.0.2
# - name: Upload Release Asset
env
:
# uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
GITHUB_TOKEN
:
${{ secrets.GITHUB_TOKEN }}
# env:
with
:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
upload_url
:
${{ needs.release.outputs.upload_url }}
# with:
asset_path
:
./dist/${{ env.wheel_name }}
# upload_url: ${{ needs.release.outputs.upload_url }}
asset_name
:
${{ env.asset_name }}
# asset_path: ./dist/${{ env.wheel_name }}
asset_content_type
:
application/*
# asset_name: ${{ env.asset_name }}
# asset_content_type: application/*
# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
# - name: Publish package
# - name: Publish package
...
...
.gitignore
View file @
96ae75ad
...
@@ -81,6 +81,8 @@ instance/
...
@@ -81,6 +81,8 @@ instance/
docs/_build/
docs/_build/
docs/source/getting_started/examples/*.rst
docs/source/getting_started/examples/*.rst
!**/*.template.rst
!**/*.template.rst
docs/source/getting_started/examples/*.md
!**/*.template.md
# PyBuilder
# PyBuilder
.pybuilder/
.pybuilder/
...
...
CMakeLists.txt
View file @
96ae75ad
...
@@ -219,7 +219,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -219,7 +219,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET
(
CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL
"Enable only the header library"
)
SET
(
CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL
"Enable only the header library"
)
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set
(
CUTLASS_REVISION
"v3.
5.1
"
CACHE STRING
"CUTLASS revision to use"
)
set
(
CUTLASS_REVISION
"v3.
6.0
"
CACHE STRING
"CUTLASS revision to use"
)
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if
(
DEFINED ENV{VLLM_CUTLASS_SRC_DIR}
)
if
(
DEFINED ENV{VLLM_CUTLASS_SRC_DIR}
)
...
@@ -236,13 +236,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -236,13 +236,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare
(
FetchContent_Declare
(
cutlass
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG
v3.5.1
GIT_TAG
8aa95dbb888be6d81c6fbf7169718c5244b53227
GIT_PROGRESS TRUE
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW
TRU
E
GIT_SHALLOW
FALS
E
)
)
endif
()
endif
()
FetchContent_MakeAvailable
(
cutlass
)
FetchContent_MakeAvailable
(
cutlass
)
...
@@ -254,7 +254,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -254,7 +254,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
)
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_compressor_entry.cu"
"csrc/cutlass_extensions/common.cpp"
)
set_gencode_flags_for_srcs
(
set_gencode_flags_for_srcs
(
SRCS
"
${
VLLM_EXT_SRC
}
"
SRCS
"
${
VLLM_EXT_SRC
}
"
...
@@ -283,7 +286,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -283,7 +286,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
" in CUDA target architectures"
)
" in CUDA target architectures"
)
endif
()
endif
()
#
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection
(
SCALED_MM_3X_ARCHS
"9.0;9.0a"
"
${
CUDA_ARCHS
}
"
)
cuda_archs_loose_intersection
(
SCALED_MM_3X_ARCHS
"9.0;9.0a"
"
${
CUDA_ARCHS
}
"
)
...
@@ -336,6 +338,31 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -336,6 +338,31 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif
()
endif
()
endif
()
endif
()
#
# 2:4 Sparse Kernels
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
# require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS
)
set
(
SRCS
"csrc/sparse/cutlass/sparse_compressor_c3x.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
SRCS
}
"
CUDA_ARCHS
"
${
SCALED_MM_3X_ARCHS
}
"
)
list
(
APPEND VLLM_EXT_SRC
"
${
SRCS
}
"
)
list
(
APPEND VLLM_GPU_FLAGS
"-DENABLE_SPARSE_SCALED_MM_C3X=1"
)
message
(
STATUS
"Building sparse_scaled_mm_c3x for archs:
${
SCALED_MM_3X_ARCHS
}
"
)
else
()
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS
)
message
(
STATUS
"Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
"if you intend on running FP8 sparse quantized models on Hopper."
)
else
()
message
(
STATUS
"Not building sparse_scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures"
)
endif
()
endif
()
#
#
# Machete kernels
# Machete kernels
...
@@ -417,7 +444,7 @@ define_gpu_extension_target(
...
@@ -417,7 +444,7 @@ define_gpu_extension_target(
SOURCES
${
VLLM_EXT_SRC
}
SOURCES
${
VLLM_EXT_SRC
}
COMPILE_FLAGS
${
VLLM_GPU_FLAGS
}
COMPILE_FLAGS
${
VLLM_GPU_FLAGS
}
ARCHITECTURES
${
VLLM_GPU_ARCHES
}
ARCHITECTURES
${
VLLM_GPU_ARCHES
}
INCLUDE_DIRECTORIES
${
CUTLASS_INCLUDE_DIR
}
INCLUDE_DIRECTORIES
${
CUTLASS_INCLUDE_DIR
}
;
${
CUTLASS_TOOLS_UTIL_INCLUDE_DIR
}
USE_SABI 3
USE_SABI 3
WITH_SOABI
)
WITH_SOABI
)
...
...
Dockerfile
View file @
96ae75ad
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
# to run the OpenAI compatible server.
# to run the OpenAI compatible server.
# Please update any changes made here to
# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.
rst
and
# docs/source/dev/dockerfile/dockerfile.
md
and
# docs/source/assets/dev/dockerfile-stages-dependency.png
# docs/source/assets/dev/dockerfile-stages-dependency.png
ARG
CUDA_VERSION=12.4.1
ARG
CUDA_VERSION=12.4.1
...
@@ -45,17 +45,21 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
...
@@ -45,17 +45,21 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR
/workspace
WORKDIR
/workspace
# install build and runtime dependencies
# install build and runtime dependencies
COPY
requirements-common.txt requirements-common.txt
COPY
requirements-cuda.txt requirements-cuda.txt
COPY
requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
python3
-m
pip
install
-r
requirements-cuda.txt
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
python3
-m
pip
install
-
r
requirements-cuda-arm64.txt
;
\
python3
-m
pip
install
-
-index-url
https://download.pytorch.org/whl/nightly/cu124
"torch==2.6.0.dev20241210+cu124"
"torchvision==0.22.0.dev20241215"
;
\
fi
fi
COPY
requirements-common.txt requirements-common.txt
COPY
requirements-cuda.txt requirements-cuda.txt
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
python3
-m
pip
install
-r
requirements-cuda.txt
# cuda arch list used by torch
# cuda arch list used by torch
# can be useful for both `dev` and `test`
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# explicitly set the list to avoid issues with torch 2.2
...
@@ -77,11 +81,6 @@ COPY requirements-build.txt requirements-build.txt
...
@@ -77,11 +81,6 @@ COPY requirements-build.txt requirements-build.txt
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
python3
-m
pip
install
-r
requirements-build.txt
python3
-m
pip
install
-r
requirements-build.txt
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
python3
-m
pip
install
-r
requirements-cuda-arm64.txt
;
\
fi
COPY
. .
COPY
. .
ARG
GIT_REPO_CHECK=0
ARG
GIT_REPO_CHECK=0
RUN
--mount
=
type
=
bind
,source
=
.git,target
=
.git
\
RUN
--mount
=
type
=
bind
,source
=
.git,target
=
.git
\
...
@@ -157,8 +156,6 @@ WORKDIR /vllm-workspace
...
@@ -157,8 +156,6 @@ WORKDIR /vllm-workspace
ENV
DEBIAN_FRONTEND=noninteractive
ENV
DEBIAN_FRONTEND=noninteractive
ARG
TARGETPLATFORM
ARG
TARGETPLATFORM
COPY
requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN
PYTHON_VERSION_STR
=
$(
echo
${
PYTHON_VERSION
}
|
sed
's/\.//g'
)
&&
\
RUN
PYTHON_VERSION_STR
=
$(
echo
${
PYTHON_VERSION
}
|
sed
's/\.//g'
)
&&
\
echo
"export PYTHON_VERSION_STR=
${
PYTHON_VERSION_STR
}
"
>>
/etc/environment
echo
"export PYTHON_VERSION_STR=
${
PYTHON_VERSION_STR
}
"
>>
/etc/environment
...
@@ -166,7 +163,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
...
@@ -166,7 +163,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
RUN
echo
'tzdata tzdata/Areas select America'
| debconf-set-selections
\
RUN
echo
'tzdata tzdata/Areas select America'
| debconf-set-selections
\
&&
echo
'tzdata tzdata/Zones/America select Los_Angeles'
| debconf-set-selections
\
&&
echo
'tzdata tzdata/Zones/America select Los_Angeles'
| debconf-set-selections
\
&&
apt-get update
-y
\
&&
apt-get update
-y
\
&&
apt-get
install
-y
ccache software-properties-common git curl
sudo
vim python3-pip
\
&&
apt-get
install
-y
ccache software-properties-common git curl
wget
sudo
vim python3-pip
\
&&
apt-get
install
-y
ffmpeg libsm6 libxext6 libgl1
\
&&
apt-get
install
-y
ffmpeg libsm6 libxext6 libgl1
\
&&
add-apt-repository ppa:deadsnakes/ppa
\
&&
add-apt-repository ppa:deadsnakes/ppa
\
&&
apt-get update
-y
\
&&
apt-get update
-y
\
...
@@ -183,17 +180,20 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
...
@@ -183,17 +180,20 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
# or future versions of triton.
# or future versions of triton.
RUN
ldconfig /usr/local/cuda-
$(
echo
$CUDA_VERSION
|
cut
-d
.
-f1
,2
)
/compat/
RUN
ldconfig /usr/local/cuda-
$(
echo
$CUDA_VERSION
|
cut
-d
.
-f1
,2
)
/compat/
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
python3
-m
pip
install
--index-url
https://download.pytorch.org/whl/nightly/cu124
"torch==2.6.0.dev20241210+cu124"
"torchvision==0.22.0.dev20241215"
;
\
fi
# Install vllm wheel first, so that torch etc will be installed.
# Install vllm wheel first, so that torch etc will be installed.
RUN
--mount
=
type
=
bind
,from
=
build,src
=
/workspace/dist,target
=
/vllm-workspace/dist
\
RUN
--mount
=
type
=
bind
,from
=
build,src
=
/workspace/dist,target
=
/vllm-workspace/dist
\
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
python3
-m
pip
install
dist/
*
.whl
--verbose
python3
-m
pip
install
dist/
*
.whl
--verbose
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
pip uninstall
-y
torch
&&
\
python3
-m
pip
install
-r
requirements-cuda-arm64.txt
;
\
fi
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
.
/etc/environment
&&
\
.
/etc/environment
&&
\
if
[
"
$TARGETPLATFORM
"
!=
"linux/arm64"
]
;
then
\
if
[
"
$TARGETPLATFORM
"
!=
"linux/arm64"
]
;
then
\
...
@@ -240,10 +240,11 @@ FROM vllm-base AS vllm-openai
...
@@ -240,10 +240,11 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
# install additional dependencies for openai api server
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
pip
install
accelerate hf_transfer
'modelscope!=1.15.0'
'bitsandbytes>=0.42.0'
'timm==0.9.10'
;
\
pip
install
accelerate hf_transfer
'modelscope!=1.15.0'
'bitsandbytes>=0.42.0'
'timm==0.9.10'
boto3 runai-model-streamer runai-model-streamer[s3]
;
\
else
\
else
\
pip
install
accelerate hf_transfer
'modelscope!=1.15.0'
'bitsandbytes>=0.45.0'
'timm==0.9.10'
;
\
pip
install
accelerate hf_transfer
'modelscope!=1.15.0'
'bitsandbytes>=0.45.0'
'timm==0.9.10'
boto3 runai-model-streamer runai-model-streamer[s3]
;
\
fi
fi
ENV
VLLM_USAGE_SOURCE production-docker-image
ENV
VLLM_USAGE_SOURCE production-docker-image
ENTRYPOINT
["python3", "-m", "vllm.entrypoints.openai.api_server"]
ENTRYPOINT
["python3", "-m", "vllm.entrypoints.openai.api_server"]
...
...
Dockerfile.cpu
View file @
96ae75ad
...
@@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0
...
@@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0
WORKDIR /workspace
WORKDIR /workspace
COPY requirements-build.txt requirements-build.txt
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install --upgrade pip && \
pip install -r requirements-build.txt
pip install -r requirements-build.txt
...
@@ -37,9 +37,9 @@ FROM cpu-test-1 AS build
...
@@ -37,9 +37,9 @@ FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
WORKDIR /workspace/vllm
COPY requirements-common.txt requirements-common.txt
COPY requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
pip install -v -r requirements-cpu.txt
COPY . .
COPY . .
...
...
README.md
View file @
96ae75ad
...
@@ -84,7 +84,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install (若调试,可使用V
...
@@ -84,7 +84,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install (若调试,可使用V
+
若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
+
若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证
## 验证
-
python -c "import vllm; print(vllm.
\_\_
version__)",版本号与官方版本同步,查询该软件的版本号,例如0.6.
5
;
-
python -c "import vllm; print(vllm.
\_\_
version__)",版本号与官方版本同步,查询该软件的版本号,例如0.6.
6.post1
;
## Known Issue
## Known Issue
-
无
-
无
...
...
README_ORIGIN.md
View file @
96ae75ad
...
@@ -60,7 +60,7 @@ vLLM is flexible and easy to use with:
...
@@ -60,7 +60,7 @@ vLLM is flexible and easy to use with:
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
-
Transformer-like LLMs (e.g., Llama)
-
Transformer-like LLMs (e.g., Llama)
-
Mixture-of-Expert LLMs (e.g., Mixtral)
-
Mixture-of-Expert LLMs (e.g., Mixtral
, Deepseek-V2 and V3
)
-
Embedding Models (e.g. E5-Mistral)
-
Embedding Models (e.g. E5-Mistral)
-
Multi-modal LLMs (e.g., LLaVA)
-
Multi-modal LLMs (e.g., LLaVA)
...
...
benchmarks/benchmark_throughput.py
View file @
96ae75ad
...
@@ -4,7 +4,8 @@ import dataclasses
...
@@ -4,7 +4,8 @@ import dataclasses
import
json
import
json
import
random
import
random
import
time
import
time
from
typing
import
List
,
Optional
from
functools
import
cache
from
typing
import
Dict
,
List
,
Optional
,
Tuple
import
numpy
as
np
import
numpy
as
np
import
torch
import
torch
...
@@ -20,8 +21,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
...
@@ -20,8 +21,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from
vllm.entrypoints.openai.api_server
import
(
from
vllm.entrypoints.openai.api_server
import
(
build_async_engine_client_from_engine_args
)
build_async_engine_client_from_engine_args
)
from
vllm.inputs
import
TextPrompt
from
vllm.inputs
import
TextPrompt
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.utils
import
get_adapter_absolute_path
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
get_lora_tokenizer
from
vllm.utils
import
FlexibleArgumentParser
,
merge_async_iterators
from
vllm.utils
import
FlexibleArgumentParser
,
merge_async_iterators
...
@@ -31,15 +35,17 @@ class SampleRequest:
...
@@ -31,15 +35,17 @@ class SampleRequest:
Attributes:
Attributes:
prompt: The input text prompt for the model.
prompt: The input text prompt for the model.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
prompt_len: The length of the prompt in tokens.
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
expected_output_len: The expected length of the output in tokens.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
lora_request: Optional LoRARequest specifying the LoRA to use.
"""
"""
prompt
:
str
prompt
:
str
prompt_len
:
int
prompt_len
:
int
expected_output_len
:
int
expected_output_len
:
int
multi_modal_data
:
Optional
[
MultiModalDataDict
]
=
None
multi_modal_data
:
Optional
[
MultiModalDataDict
]
=
None
lora_request
:
Optional
[
LoRARequest
]
=
None
def
_get_prompt_for_image_model
(
question
:
str
,
*
,
model
:
str
)
->
str
:
def
_get_prompt_for_image_model
(
question
:
str
,
*
,
model
:
str
)
->
str
:
...
@@ -63,8 +69,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
...
@@ -63,8 +69,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
raise
ValueError
(
f
"Unsupported model
{
model
}
"
)
raise
ValueError
(
f
"Unsupported model
{
model
}
"
)
@cache
def lora_path_on_disk(lora_path: str) -> str:
    """Return ``get_adapter_absolute_path(lora_path)``.

    The function is wrapped in ``functools.cache``, so the lookup is
    performed once per distinct ``lora_path`` and reused afterwards.
    """
    resolved_path = get_adapter_absolute_path(lora_path)
    return resolved_path
lora_tokenizer_cache
:
Dict
[
int
,
AnyTokenizer
]
=
{}
def get_random_lora_request(
        args: argparse.Namespace
) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
    """Build a LoRA request for a randomly chosen adapter id.

    An id is drawn uniformly from ``1..args.max_loras`` (inclusive) and used
    both as the request's name (stringified) and integer id; the adapter
    path is ``args.lora_path`` passed through ``lora_path_on_disk``.

    Returns:
        A ``(lora_request, tokenizer)`` pair. The tokenizer comes from
        ``get_lora_tokenizer`` and is stored in the module-level
        ``lora_tokenizer_cache`` keyed by adapter id, so it is fetched at
        most once per id.
    """
    adapter_id = random.randint(1, args.max_loras)
    request = LoRARequest(
        lora_name=str(adapter_id),
        lora_int_id=adapter_id,
        lora_path=lora_path_on_disk(args.lora_path),
    )

    # The cache dict is only mutated, never rebound, so no `global` is needed.
    try:
        adapter_tokenizer = lora_tokenizer_cache[adapter_id]
    except KeyError:
        adapter_tokenizer = get_lora_tokenizer(request)
        lora_tokenizer_cache[adapter_id] = adapter_tokenizer

    return request, adapter_tokenizer
def
sample_requests
(
tokenizer
:
PreTrainedTokenizerBase
,
def
sample_requests
(
tokenizer
:
PreTrainedTokenizerBase
,
args
:
argparse
.
Namespace
)
->
List
[
SampleRequest
]:
args
:
argparse
.
Namespace
)
->
List
[
SampleRequest
]:
dataset_path
:
str
=
args
.
dataset
dataset_path
:
str
=
args
.
dataset
num_requests
:
int
=
args
.
num_prompts
num_requests
:
int
=
args
.
num_prompts
fixed_output_len
:
Optional
[
int
]
=
args
.
output_len
fixed_output_len
:
Optional
[
int
]
=
args
.
output_len
...
@@ -82,7 +110,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
...
@@ -82,7 +110,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
# Filter out sequences that are too long or too short
# Filter out sequences that are too long or too short
filtered_dataset
:
List
[
SampleRequest
]
=
[]
filtered_dataset
:
List
[
SampleRequest
]
=
[]
for
data
in
dataset
:
for
data
in
tqdm
(
dataset
,
total
=
len
(
filtered_dataset
),
desc
=
"sampling requests"
):
if
len
(
filtered_dataset
)
==
num_requests
:
if
len
(
filtered_dataset
)
==
num_requests
:
break
break
...
@@ -105,9 +135,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
...
@@ -105,9 +135,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
continue
continue
prompt
=
_get_prompt_for_image_model
(
question
=
prompt
,
model
=
model
)
prompt
=
_get_prompt_for_image_model
(
question
=
prompt
,
model
=
model
)
request_tokenizer
=
tokenizer
lora_request
:
Optional
[
LoRARequest
]
=
None
if
args
.
enable_lora
:
lora_request
,
lora_tokenizer
=
get_random_lora_request
(
args
)
if
lora_tokenizer
:
request_tokenizer
=
lora_tokenizer
# Tokenize the prompts and completions.
# Tokenize the prompts and completions.
prompt_token_ids
=
tokenizer
(
prompt
).
input_ids
prompt_token_ids
=
request_
tokenizer
(
prompt
).
input_ids
completion_token_ids
=
tokenizer
(
completion
).
input_ids
completion_token_ids
=
request_
tokenizer
(
completion
).
input_ids
prompt_len
=
len
(
prompt_token_ids
)
prompt_len
=
len
(
prompt_token_ids
)
output_len
=
len
(
completion_token_ids
output_len
=
len
(
completion_token_ids
)
if
fixed_output_len
is
None
else
fixed_output_len
)
if
fixed_output_len
is
None
else
fixed_output_len
...
@@ -121,7 +158,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
...
@@ -121,7 +158,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
SampleRequest
(
prompt
=
prompt
,
SampleRequest
(
prompt
=
prompt
,
prompt_len
=
prompt_len
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
multi_modal_data
))
multi_modal_data
=
multi_modal_data
,
lora_request
=
lora_request
))
return
filtered_dataset
return
filtered_dataset
...
@@ -150,11 +188,14 @@ def run_vllm(
...
@@ -150,11 +188,14 @@ def run_vllm(
ignore_eos
=
True
,
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
max_tokens
=
request
.
expected_output_len
,
))
))
lora_requests
:
Optional
[
List
[
LoRARequest
]]
=
None
if
engine_args
.
enable_lora
:
lora_requests
=
[
request
.
lora_request
for
request
in
requests
]
# warmup
# warmup
warmup_prompts
:
List
[
TextPrompt
]
=
[]
warmup_prompts
:
List
[
TextPrompt
]
=
[]
warmup_sampling_params
:
List
[
SamplingParams
]
=
[]
warmup_sampling_params
:
List
[
SamplingParams
]
=
[]
for
request
in
warmup_
promp
ts
:
for
request
in
warmup_
reques
ts
:
warmup_prompts
.
append
(
warmup_prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
TextPrompt
(
prompt
=
request
.
prompt
,
multi_modal_data
=
request
.
multi_modal_data
))
multi_modal_data
=
request
.
multi_modal_data
))
...
@@ -191,9 +232,13 @@ def run_vllm(
...
@@ -191,9 +232,13 @@ def run_vllm(
if
not
use_beam_search
:
if
not
use_beam_search
:
start
=
time
.
perf_counter
()
start
=
time
.
perf_counter
()
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
llm
.
generate
(
prompts
,
sampling_params
,
lora_request
=
lora_requests
,
use_tqdm
=
True
)
end
=
time
.
perf_counter
()
end
=
time
.
perf_counter
()
else
:
else
:
assert
lora_requests
is
None
,
"BeamSearch API does not support LoRA"
prompts
=
[
request
.
prompt
for
request
in
requests
]
prompts
=
[
request
.
prompt
for
request
in
requests
]
# output_len should be the same for all requests.
# output_len should be the same for all requests.
output_len
=
requests
[
0
][
2
]
output_len
=
requests
[
0
][
2
]
...
@@ -225,6 +270,7 @@ async def run_vllm_async(
...
@@ -225,6 +270,7 @@ async def run_vllm_async(
# Add the requests to the engine.
# Add the requests to the engine.
prompts
:
List
[
TextPrompt
]
=
[]
prompts
:
List
[
TextPrompt
]
=
[]
sampling_params
:
List
[
SamplingParams
]
=
[]
sampling_params
:
List
[
SamplingParams
]
=
[]
lora_requests
:
List
[
Optional
[
LoRARequest
]]
=
[]
for
request
in
requests
:
for
request
in
requests
:
prompts
.
append
(
prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
TextPrompt
(
prompt
=
request
.
prompt
,
...
@@ -237,11 +283,16 @@ async def run_vllm_async(
...
@@ -237,11 +283,16 @@ async def run_vllm_async(
ignore_eos
=
True
,
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
max_tokens
=
request
.
expected_output_len
,
))
))
lora_requests
.
append
(
request
.
lora_request
)
generators
=
[]
generators
=
[]
start
=
time
.
perf_counter
()
start
=
time
.
perf_counter
()
for
i
,
(
prompt
,
sp
)
in
enumerate
(
zip
(
prompts
,
sampling_params
)):
for
i
,
(
prompt
,
sp
,
generator
=
llm
.
generate
(
prompt
,
sp
,
request_id
=
f
"test
{
i
}
"
)
lr
)
in
enumerate
(
zip
(
prompts
,
sampling_params
,
lora_requests
)):
generator
=
llm
.
generate
(
prompt
,
sp
,
lora_request
=
lr
,
request_id
=
f
"test
{
i
}
"
)
generators
.
append
(
generator
)
generators
.
append
(
generator
)
all_gens
=
merge_async_iterators
(
*
generators
)
all_gens
=
merge_async_iterators
(
*
generators
)
async
for
i
,
res
in
all_gens
:
async
for
i
,
res
in
all_gens
:
...
@@ -340,6 +391,14 @@ def main(args: argparse.Namespace):
...
@@ -340,6 +391,14 @@ def main(args: argparse.Namespace):
vocab_size
=
tokenizer
.
vocab_size
vocab_size
=
tokenizer
.
vocab_size
requests
=
[]
requests
=
[]
for
_
in
range
(
args
.
num_prompts
):
for
_
in
range
(
args
.
num_prompts
):
request_tokenizer
=
tokenizer
lora_request
:
Optional
[
LoRARequest
]
=
None
if
args
.
enable_lora
:
lora_request
,
lora_tokenizer
=
get_random_lora_request
(
args
)
if
lora_tokenizer
:
request_tokenizer
=
lora_tokenizer
# Synthesize a prompt with the given input length.
# Synthesize a prompt with the given input length.
candidate_ids
=
[
candidate_ids
=
[
random
.
randint
(
0
,
vocab_size
-
1
)
random
.
randint
(
0
,
vocab_size
-
1
)
...
@@ -348,8 +407,8 @@ def main(args: argparse.Namespace):
...
@@ -348,8 +407,8 @@ def main(args: argparse.Namespace):
# As tokenizer may add additional tokens like BOS, we need to try
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
# different lengths to get the desired input length.
for
_
in
range
(
5
):
# Max attempts to correct
for
_
in
range
(
5
):
# Max attempts to correct
candidate_prompt
=
tokenizer
.
decode
(
candidate_ids
)
candidate_prompt
=
request_
tokenizer
.
decode
(
candidate_ids
)
tokenized_len
=
len
(
tokenizer
.
encode
(
candidate_prompt
))
tokenized_len
=
len
(
request_
tokenizer
.
encode
(
candidate_prompt
))
if
tokenized_len
==
args
.
input_len
:
if
tokenized_len
==
args
.
input_len
:
break
break
...
@@ -366,40 +425,14 @@ def main(args: argparse.Namespace):
...
@@ -366,40 +425,14 @@ def main(args: argparse.Namespace):
requests
.
append
(
requests
.
append
(
SampleRequest
(
prompt
=
candidate_prompt
,
SampleRequest
(
prompt
=
candidate_prompt
,
prompt_len
=
args
.
input_len
,
prompt_len
=
args
.
input_len
,
expected_output_len
=
args
.
output_len
))
expected_output_len
=
args
.
output_len
,
lora_request
=
lora_request
))
else
:
else
:
requests
=
sample_requests
(
tokenizer
,
args
)
requests
=
sample_requests
(
tokenizer
,
args
)
is_multi_modal
=
any
(
request
.
multi_modal_data
is
not
None
is_multi_modal
=
any
(
request
.
multi_modal_data
is
not
None
for
request
in
requests
)
for
request
in
requests
)
if
args
.
backend
==
"vllm"
:
if
args
.
backend
==
"vllm"
:
# if args.async_engine:
# run_args = [
# requests, args.model, args.tokenizer, args.quantization,
# args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
# args.trust_remote_code, args.dtype, args.max_model_len,
# args.enforce_eager, args.kv_cache_dtype,
# args.quantization_param_path, args.device,
# args.enable_prefix_caching, args.enable_chunked_prefill,
# args.max_num_batched_tokens, args.distributed_executor_backend,
# args.gpu_memory_utilization, args.num_scheduler_steps,
# args.use_v2_block_manager, args.download_dir, args.load_format,
# args.disable_async_output_proc
# ]
# else:
# run_args = [
# warmup_requests, requests, args.model, args.tokenizer, args.quantization,
# args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
# args.trust_remote_code, args.dtype, args.max_model_len,
# args.enforce_eager, args.kv_cache_dtype,
# args.quantization_param_path, args.device,
# args.enable_prefix_caching, args.enable_chunked_prefill,
# args.max_num_batched_tokens, args.distributed_executor_backend,
# args.gpu_memory_utilization, args.num_scheduler_steps,
# args.use_v2_block_manager, args.download_dir, args.load_format,
# args.disable_async_output_proc
# ]
if
args
.
async_engine
:
if
args
.
async_engine
:
elapsed_time
=
uvloop
.
run
(
elapsed_time
=
uvloop
.
run
(
run_vllm_async
(
run_vllm_async
(
...
@@ -409,7 +442,7 @@ def main(args: argparse.Namespace):
...
@@ -409,7 +442,7 @@ def main(args: argparse.Namespace):
args
.
disable_frontend_multiprocessing
,
args
.
disable_frontend_multiprocessing
,
))
))
else
:
else
:
elapsed_time
=
run_vllm
(
requests
,
args
.
n
,
elapsed_time
=
run_vllm
(
warmup_requests
,
requests
,
args
.
n
,
EngineArgs
.
from_cli_args
(
args
))
EngineArgs
.
from_cli_args
(
args
))
elif
args
.
backend
==
"hf"
:
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
assert
args
.
tensor_parallel_size
==
1
...
@@ -496,6 +529,14 @@ if __name__ == "__main__":
...
@@ -496,6 +529,14 @@ if __name__ == "__main__":
action
=
'store_true'
,
action
=
'store_true'
,
default
=
False
,
default
=
False
,
help
=
"Disable decoupled async engine frontend."
)
help
=
"Disable decoupled async engine frontend."
)
# LoRA
parser
.
add_argument
(
"--lora-path"
,
type
=
str
,
default
=
None
,
help
=
"Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier."
)
parser
=
AsyncEngineArgs
.
add_cli_args
(
parser
)
parser
=
AsyncEngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
if
args
.
tokenizer
is
None
:
if
args
.
tokenizer
is
None
:
...
@@ -505,6 +546,8 @@ if __name__ == "__main__":
...
@@ -505,6 +546,8 @@ if __name__ == "__main__":
assert
args
.
output_len
is
not
None
assert
args
.
output_len
is
not
None
else
:
else
:
assert
args
.
input_len
is
None
assert
args
.
input_len
is
None
if
args
.
enable_lora
:
assert
args
.
lora_path
is
not
None
if
args
.
backend
==
"vllm"
:
if
args
.
backend
==
"vllm"
:
if
args
.
hf_max_batch_size
is
not
None
:
if
args
.
hf_max_batch_size
is
not
None
:
...
@@ -514,6 +557,9 @@ if __name__ == "__main__":
...
@@ -514,6 +557,9 @@ if __name__ == "__main__":
raise
ValueError
(
"HF max batch size is required for HF backend."
)
raise
ValueError
(
"HF max batch size is required for HF backend."
)
if
args
.
quantization
is
not
None
:
if
args
.
quantization
is
not
None
:
raise
ValueError
(
"Quantization is only for vLLM backend."
)
raise
ValueError
(
"Quantization is only for vLLM backend."
)
if
args
.
enable_lora
is
not
None
:
raise
ValueError
(
"LoRA benchmarking is only supported for vLLM"
" backend"
)
elif
args
.
backend
==
"mii"
:
elif
args
.
backend
==
"mii"
:
if
args
.
dtype
!=
"auto"
:
if
args
.
dtype
!=
"auto"
:
raise
ValueError
(
"dtype must be auto for MII backend."
)
raise
ValueError
(
"dtype must be auto for MII backend."
)
...
@@ -526,4 +572,7 @@ if __name__ == "__main__":
...
@@ -526,4 +572,7 @@ if __name__ == "__main__":
if
args
.
tokenizer
!=
args
.
model
:
if
args
.
tokenizer
!=
args
.
model
:
raise
ValueError
(
"Tokenizer must be the same as the model for MII "
raise
ValueError
(
"Tokenizer must be the same as the model for MII "
"backend."
)
"backend."
)
main
(
args
)
if
args
.
enable_lora
is
not
None
:
\ No newline at end of file
raise
ValueError
(
"LoRA benchmarking is only supported for vLLM"
" backend"
)
main
(
args
)
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
0 → 100644
View file @
96ae75ad
import
argparse
import
copy
import
itertools
import
pickle
as
pkl
import
time
from
typing
import
Callable
,
Iterable
,
List
,
Tuple
import
torch
import
torch.utils.benchmark
as
TBenchmark
from
torch.utils.benchmark
import
Measurement
as
TMeasurement
from
utils
import
make_rand_sparse_tensors
from
weight_shapes
import
WEIGHT_SHAPES
from
vllm
import
_custom_ops
as
ops
from
vllm.utils
import
FlexibleArgumentParser
DEFAULT_MODELS
=
list
(
WEIGHT_SHAPES
.
keys
())
DEFAULT_BATCH_SIZES
=
[
1
,
16
,
32
,
64
,
128
,
256
,
512
]
DEFAULT_TP_SIZES
=
[
1
]
# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
             **kwargs) -> TMeasurement:
    """Time ``fn(*args, **kwargs)`` with ``torch.utils.benchmark.Timer``.

    Args:
        label: Label attached to the measurement.
        sub_label: Sub-label attached to the measurement.
        description: Description attached to the measurement.
        fn: Callable to benchmark.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        The measurement produced by ``Timer.blocked_autorange``.
    """
    min_run_time = 1

    # Names visible to the timed statement. Deliberately not called
    # ``globals`` so the builtin of that name is not shadowed.
    timer_globals = {
        "args": args,
        "kwargs": kwargs,
        "fn": fn,
    }
    timer = TBenchmark.Timer(
        stmt="fn(*args, **kwargs)",
        globals=timer_globals,
        label=label,
        sub_label=sub_label,
        description=description,
    )
    return timer.blocked_autorange(min_run_time=min_run_time)
def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
               sub_label: str) -> Iterable[TMeasurement]:
    """Benchmark int8 GEMM variants for one (m, k, n) problem.

    Compares ``torch.mm`` on bf16/fp16 casts, ``ops.cutlass_scaled_mm`` and
    ``ops.cutlass_scaled_sparse_mm`` (each with and without bias). Before
    timing, the sparse result is compared against the dense result and the
    outcome is printed.

    Returns:
        A list with one measurement per benchmarked variant, in run order.
    """
    assert dtype == torch.int8
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

    # Sanity check: sparse kernel output vs. dense kernel output.
    out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
                                       torch.bfloat16)
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if torch.allclose(out, out_ref):
        print("Correct results")
    else:
        print("Incorrect results")
        print(out)
        print(out_ref)

    def timed(description, fn, *fn_args, **fn_kwargs):
        # Every variant shares the same label / sub_label.
        return bench_fn(label, sub_label, description, fn, *fn_args,
                        **fn_kwargs)

    # List elements are evaluated in order, so the benchmarks run in the
    # sequence written here.
    return [
        # pytorch impl - bfloat16
        timed("pytorch_bf16_bf16_bf16_matmul-no-scales", torch.mm,
              a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)),
        # pytorch impl - float16
        timed("pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
              a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
        # cutlass impl
        timed("cutlass_i8_i8_bf16_scaled_mm", ops.cutlass_scaled_mm, a, b,
              scale_a, scale_b, torch.bfloat16),
        # cutlass with bias
        timed("cutlass_i8_i8_bf16_scaled_mm_bias", ops.cutlass_scaled_mm, a,
              b, scale_a, scale_b, torch.bfloat16, bias),
        # cutlass sparse impl
        timed("cutlass_i8_i8_bf16_scaled_sparse_mm",
              ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
              scale_b, torch.bfloat16),
        # cutlass sparse with bias
        timed("cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
              ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
              scale_b, torch.bfloat16, bias),
    ]
def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
              sub_label: str) -> Iterable[TMeasurement]:
    """Benchmark fp8 (e4m3fn) GEMM variants for one (m, k, n) problem.

    Compares ``torch.mm`` on bf16 casts, ``torch._scaled_mm`` (bf16/fp16
    output, with and without fast accumulation), ``ops.cutlass_scaled_mm``
    and ``ops.cutlass_scaled_sparse_mm`` (bf16/fp16 output, with and without
    bias). Before timing, the sparse result is compared against the dense
    result and the outcome is printed.

    Returns:
        A list with one measurement per benchmarked variant, in run order.
    """
    assert dtype == torch.float8_e4m3fn
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n,
                                                     k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

    # Sanity check: sparse kernel output vs. dense kernel output.
    out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
                                       torch.bfloat16)
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if torch.allclose(out, out_ref):
        print("Correct results")
    else:
        print("Incorrect results")
        print(out)
        print(out_ref)

    def timed(description, fn, *fn_args, **fn_kwargs):
        # Every variant shares the same label / sub_label.
        return bench_fn(label, sub_label, description, fn, *fn_args,
                        **fn_kwargs)

    # List elements are evaluated in order, so the benchmarks run in the
    # sequence written here.
    return [
        # pytorch impl w. bf16
        timed("pytorch_bf16_bf16_bf16_matmul-no-scales", torch.mm,
              a.to(dtype=torch.bfloat16, device="cuda"),
              b.to(dtype=torch.bfloat16, device="cuda")),
        # pytorch impl: bf16 output, without fp8 fast accum
        timed("pytorch_fp8_fp8_bf16_scaled_mm",
              torch._scaled_mm,
              a,
              b,
              scale_a=scale_a,
              scale_b=scale_b,
              out_dtype=torch.bfloat16),
        # pytorch impl: bf16 output, with fp8 fast accum
        timed("pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
              torch._scaled_mm,
              a,
              b,
              scale_a=scale_a,
              scale_b=scale_b,
              out_dtype=torch.bfloat16,
              use_fast_accum=True),
        # pytorch impl: fp16 output, without fp8 fast accum
        timed("pytorch_fp8_fp8_fp16_scaled_mm",
              torch._scaled_mm,
              a,
              b,
              scale_a=scale_a,
              scale_b=scale_b,
              out_dtype=torch.float16),
        # pytorch impl: fp16 output, with fp8 fast accum
        timed("pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
              torch._scaled_mm,
              a,
              b,
              scale_a=scale_a,
              scale_b=scale_b,
              out_dtype=torch.float16,
              use_fast_accum=True),
        # cutlass dense impl: bf16 output
        timed("cutlass_fp8_fp8_bf16_scaled_mm", ops.cutlass_scaled_mm, a, b,
              scale_a, scale_b, torch.bfloat16),
        # cutlass sparse impl: bf16 output
        timed("cutlass_fp8_fp8_bf16_scaled_sparse_mm",
              ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
              scale_b, torch.bfloat16),
        # cutlass sparse impl: fp16 output
        timed("cutlass_fp8_fp8_fp16_scaled_sparse_mm",
              ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
              scale_b, torch.float16),
        # cutlass sparse impl: bf16 output, with bias
        timed("cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
              ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
              scale_b, torch.bfloat16, bias),
        # cutlass sparse impl: fp16 output, with bias
        timed("cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
              ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a,
              scale_b, torch.float16, bias.to(dtype=torch.float16)),
    ]
def
bench
(
dtype
:
torch
.
dtype
,
m
:
int
,
k
:
int
,
n
:
int
,
label
:
str
,
sub_label
:
str
)
->
Iterable
[
TMeasurement
]:
if
dtype
==
torch
.
int8
:
return
bench_int8
(
dtype
,
m
,
k
,
n
,
label
,
sub_label
)
if
dtype
==
torch
.
float8_e4m3fn
:
return
bench_fp8
(
dtype
,
m
,
k
,
n
,
label
,
sub_label
)
raise
ValueError
(
"unsupported type"
)
# runner
def print_timers(timers: Iterable[TMeasurement]):
    """Print a ``torch.utils.benchmark.Compare`` table for ``timers``."""
    TBenchmark.Compare(timers).print()
def run(dtype: torch.dtype,
        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
    """Benchmark every (m, k, n) shape in ``MKNs`` for ``dtype``.

    The comparison table for each shape is printed as soon as that shape has
    been benchmarked.

    Returns:
        A flat list of all measurements, in the order the shapes were given.
    """
    collected = []
    for m, k, n in MKNs:
        gemm_label = f"scaled-{dtype}-gemm"
        shape_label = f"MKN=({m}x{k}x{n})"
        shape_timers = bench(dtype, m, k, n, gemm_label, shape_label)
        print_timers(shape_timers)
        collected.extend(shape_timers)

    return collected
# output makers
def make_output(data: Iterable[TMeasurement],
                MKNs: Iterable[Tuple[int, int, int]],
                base_description: str,
                timestamp=None):
    """Print all measurements and pickle them to the working directory.

    Args:
        data: Measurements to print and serialise.
        MKNs: Shapes that were benchmarked (accepted but not used here).
        base_description: Used in the printed header and as the file-name
            prefix.
        timestamp: File-name suffix; defaults to the current Unix time in
            whole seconds when ``None``.
    """
    print(f"== All Results {base_description} ====")
    print_timers(data)

    # pickle all the results
    if timestamp is None:
        timestamp = int(time.time())
    out_path = f"{base_description}-{timestamp}.pkl"
    with open(out_path, "wb") as out_file:
        pkl.dump(data, out_file)
# argparse runners
def run_square_bench(args):
    """Benchmark square GEMMs (m == k == n) over an inclusive size range.

    Sizes run from ``args.dim_start`` to ``args.dim_end`` (inclusive) in steps
    of ``args.dim_increment``; results are printed and pickled via
    ``make_output``.
    """
    MKNs = [(dim, dim, dim) for dim in range(
        args.dim_start, args.dim_end + 1, args.dim_increment)]
    measurements = run(args.dtype, MKNs)

    make_output(measurements, MKNs, f"square_bench-{args.dtype}")
def run_range_bench(args):
    """Benchmark GEMMs while sweeping some dimensions and pinning others.

    Each of M, K and N either follows the swept sizes or, when the matching
    ``args.*_constant`` is set, stays fixed at that value for every step.
    Results are printed and pickled via ``make_output``.
    """
    # NOTE(review): the end bound is exclusive here, whereas run_square_bench
    # uses ``dim_end + 1`` -- confirm whether the difference is intended.
    swept = list(range(args.dim_start, args.dim_end, args.dim_increment))
    steps = len(swept)

    def axis(constant):
        # A pinned dimension repeats its constant once per sweep step.
        if constant is not None:
            return [constant] * steps
        return swept

    MKNs = list(
        zip(axis(args.m_constant), axis(args.k_constant),
            axis(args.n_constant)))
    measurements = run(args.dtype, MKNs)

    make_output(measurements, MKNs, f"range_bench-{args.dtype}")
def run_model_bench(args):
    """Benchmark the GEMM shapes of each selected model at each TP size.

    For every (model, tp_size) pair the weight shapes from ``WEIGHT_SHAPES``
    are combined with every batch size in ``args.batch_sizes``. Per-pair
    results are printed at the end, and all measurements are pickled into a
    single timestamped file in the working directory.
    """
    print("Benchmarking models:")
    for idx, name in enumerate(args.models):
        print(f"[{idx}]  {name}")

    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
        # Work on a deep copy so the shared WEIGHT_SHAPES table is not
        # modified when the split dimension is divided by the TP size.
        shapes = []
        for shape, split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            shape[split_dim] //= tp_size
            shapes.append(shape)
        return shapes

    models_tps = list(itertools.product(args.models, args.tp_sizes))
    model_bench_data = []
    for model, tp_size in models_tps:
        KNs = model_shapes(model, tp_size)
        MKNs = [(m, k, n) for m in args.batch_sizes for k, n in KNs]
        model_bench_data.append(run(args.dtype, MKNs))

    # Print all results
    for data, (model, tp_size) in zip(model_bench_data, models_tps):
        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
        print_timers(data)

    timestamp = int(time.time())

    # Flatten the per-pair lists, then pickle all data
    all_data = list(itertools.chain.from_iterable(model_bench_data))
    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as out_file:
        pkl.dump(all_data, out_file)
if
__name__
==
'__main__'
:
def
to_torch_dtype
(
dt
):
if
dt
==
"int8"
:
return
torch
.
int8
if
dt
==
"fp8"
:
return
torch
.
float8_e4m3fn
raise
ValueError
(
"unsupported dtype"
)
parser
=
FlexibleArgumentParser
(
description
=
"""
Benchmark Cutlass GEMM.
To run square GEMMs:
python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
To run constant N and K and sweep M:
python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
To run dimensions from a model:
python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
"""
,
# noqa: E501
formatter_class
=
argparse
.
RawTextHelpFormatter
)
parser
.
add_argument
(
"--dtype"
,
type
=
to_torch_dtype
,
required
=
True
,
help
=
"Available options are ['int8', 'fp8']"
)
subparsers
=
parser
.
add_subparsers
(
dest
=
"cmd"
)
square_parser
=
subparsers
.
add_parser
(
"square_bench"
)
square_parser
.
add_argument
(
"--dim-start"
,
type
=
int
,
required
=
True
)
square_parser
.
add_argument
(
"--dim-end"
,
type
=
int
,
required
=
True
)
square_parser
.
add_argument
(
"--dim-increment"
,
type
=
int
,
required
=
True
)
square_parser
.
set_defaults
(
func
=
run_square_bench
)
range_parser
=
subparsers
.
add_parser
(
"range_bench"
)
range_parser
.
add_argument
(
"--dim-start"
,
type
=
int
,
required
=
True
)
range_parser
.
add_argument
(
"--dim-end"
,
type
=
int
,
required
=
True
)
range_parser
.
add_argument
(
"--dim-increment"
,
type
=
int
,
required
=
True
)
range_parser
.
add_argument
(
"--m-constant"
,
type
=
int
,
default
=
None
)
range_parser
.
add_argument
(
"--n-constant"
,
type
=
int
,
default
=
None
)
range_parser
.
add_argument
(
"--k-constant"
,
type
=
int
,
default
=
None
)
range_parser
.
set_defaults
(
func
=
run_range_bench
)
model_parser
=
subparsers
.
add_parser
(
"model_bench"
)
model_parser
.
add_argument
(
"--models"
,
nargs
=
"+"
,
type
=
str
,
default
=
DEFAULT_MODELS
,
choices
=
WEIGHT_SHAPES
.
keys
())
model_parser
.
add_argument
(
"--tp-sizes"
,
nargs
=
"+"
,
type
=
int
,
default
=
DEFAULT_TP_SIZES
)
model_parser
.
add_argument
(
"--batch-sizes"
,
nargs
=
"+"
,
type
=
int
,
default
=
DEFAULT_BATCH_SIZES
)
model_parser
.
set_defaults
(
func
=
run_model_bench
)
args
=
parser
.
parse_args
()
args
.
func
(
args
)
benchmarks/cutlass_benchmarks/utils.py
0 → 100644
View file @
96ae75ad
# Cutlass bench utils
from
typing
import
Iterable
,
Tuple
import
torch
import
vllm._custom_ops
as
ops
def
to_fp8
(
tensor
:
torch
.
Tensor
)
->
torch
.
Tensor
:
finfo
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
return
torch
.
round
(
tensor
.
clamp
(
min
=
finfo
.
min
,
max
=
finfo
.
max
)).
to
(
dtype
=
torch
.
float8_e4m3fn
)
def to_int8(tensor: torch.Tensor) -> torch.Tensor:
    """Clamp to [-128, 127], round, and cast to int8."""
    clamped = tensor.clamp(min=-128, max=127)
    rounded = torch.round(clamped)
    return rounded.to(dtype=torch.int8)
def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
    """Cast ``tensor`` to bfloat16."""
    target_dtype = torch.bfloat16
    return tensor.to(dtype=target_dtype)
def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
    """Cast ``tensor`` to float16."""
    target_dtype = torch.float16
    return tensor.to(dtype=target_dtype)
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Create random CUDA GEMM operands quantised to ``dtype``.

    ``a`` has shape (m, k); ``b`` is the transpose of an (n, k) tensor, i.e.
    shape (k, n). Both are drawn from ``randn`` and scaled by 5 before being
    converted.

    Raises:
        ValueError: If ``dtype`` is neither ``torch.int8`` nor
            ``torch.float8_e4m3fn``.
    """
    lhs = torch.randn((m, k), device='cuda') * 5
    rhs = torch.randn((n, k), device='cuda').t() * 5

    if dtype == torch.int8:
        quantize = to_int8
    elif dtype == torch.float8_e4m3fn:
        quantize = to_fp8
    else:
        raise ValueError("unsupported dtype")

    return quantize(lhs), quantize(rhs)
def prune_to_2_4(tensor):
    """Zero all but the two largest-magnitude values in each group of four.

    The tensor is viewed as consecutive groups of four elements (via
    ``reshape(-1, 4)``), pruned group by group, and returned in its original
    shape.
    """
    groups = tensor.reshape(-1, 4)

    # Column positions of the two largest absolute values per group.
    keep_idx = groups.abs().topk(2, dim=1).indices

    # Binary mask: 1 at kept positions, 0 elsewhere.
    keep_mask = torch.zeros_like(groups)
    keep_mask.scatter_(dim=1,
                       index=keep_idx,
                       src=torch.ones_like(keep_idx, dtype=keep_mask.dtype))

    sparse_groups = groups * keep_mask

    # A negative value times 0 yields -0.0; normalise those to +0.0.
    sparse_groups[sparse_groups == -0.0] = 0.0

    return sparse_groups.reshape(tensor.shape)
def make_rand_sparse_tensors(
    dtype: torch.dtype, m: int, n: int, k: int
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Create random CUDA GEMM operands with a 2:4-pruned ``b``.

    ``a`` has shape (m, k). ``b`` starts as an (n, k) tensor, is pruned with
    ``prune_to_2_4`` in that layout, and is returned transposed, i.e. with
    shape (k, n). Both are converted to ``dtype`` before ``b.t()`` is passed
    to ``ops.cutlass_sparse_compress``.

    Returns:
        ``(b_compressed, e, a, b)``: the two outputs of
        ``ops.cutlass_sparse_compress`` followed by the original operands.
        The previous annotation declared only two tensors; four are returned.

    Raises:
        ValueError: If ``dtype`` is not int8, float8_e4m3fn, float16 or
            bfloat16.
    """
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5

    # Prune in the (n, k) layout, then restore the (k, n) orientation.
    b = prune_to_2_4(b.t()).t()

    if dtype == torch.int8:
        a, b = to_int8(a), to_int8(b)
    elif dtype == torch.float8_e4m3fn:
        a, b = to_fp8(a), to_fp8(b)
    elif dtype == torch.float16:
        a, b = to_fp16(a), to_fp16(b)
    elif dtype == torch.bfloat16:
        a, b = to_bf16(a), to_bf16(b)
    else:
        raise ValueError("unsupported dtype")

    b_compressed, e = ops.cutlass_sparse_compress(b.t())

    # Compressed B, Metadata, Original A, B
    return b_compressed, e, a, b
def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
                        m: int, n: int, k: int) -> \
    Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor],
          Iterable[torch.Tensor], Iterable[torch.Tensor]]:
    """Generate ``num_tensors`` independent random sparse operand sets.

    Each set comes from one call to ``make_rand_sparse_tensors``. Previously
    every iteration generated a set, discarded it, and generated a second
    set to store; now the set that was checked is the one that is kept.

    Returns:
        Four lists ``(BComps, Es, As, Bs)`` holding, per set, the compressed
        B, the metadata, A and B. All four lists are empty when no set was
        produced (for example ``num_tensors <= 0``) rather than failing while
        unpacking.
    """
    samples = []
    for _ in range(num_tensors):
        sample = make_rand_sparse_tensors(dtype, m, n, k)
        # Keep the original guard: skip sets with no compressed B.
        if sample[0] is not None:
            samples.append(sample)

    if not samples:
        # zip(*[]) yields nothing, so it cannot be unpacked into four names.
        return [], [], [], []

    BComps, Es, As, Bs = zip(*samples)
    return list(BComps), list(Es), list(As), list(Bs)
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
View file @
96ae75ad
...
@@ -8,6 +8,7 @@ from typing import Callable, Iterable, List, Tuple
...
@@ -8,6 +8,7 @@ from typing import Callable, Iterable, List, Tuple
import
torch
import
torch
import
torch.utils.benchmark
as
TBenchmark
import
torch.utils.benchmark
as
TBenchmark
from
torch.utils.benchmark
import
Measurement
as
TMeasurement
from
torch.utils.benchmark
import
Measurement
as
TMeasurement
from
utils
import
make_rand_tensors
from
weight_shapes
import
WEIGHT_SHAPES
from
weight_shapes
import
WEIGHT_SHAPES
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
...
@@ -17,31 +18,6 @@ DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
...
@@ -17,31 +18,6 @@ DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES
=
[
1
,
16
,
32
,
64
,
128
,
256
,
512
]
DEFAULT_BATCH_SIZES
=
[
1
,
16
,
32
,
64
,
128
,
256
,
512
]
DEFAULT_TP_SIZES
=
[
1
]
DEFAULT_TP_SIZES
=
[
1
]
# helpers
def
to_fp8
(
tensor
:
torch
.
Tensor
)
->
torch
.
Tensor
:
finfo
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
return
torch
.
round
(
tensor
.
clamp
(
min
=
finfo
.
min
,
max
=
finfo
.
max
)).
to
(
dtype
=
torch
.
float8_e4m3fn
)
def
to_int8
(
tensor
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
torch
.
round
(
tensor
.
clamp
(
min
=-
128
,
max
=
127
)).
to
(
dtype
=
torch
.
int8
)
def
make_rand_tensors
(
dtype
:
torch
.
dtype
,
m
:
int
,
n
:
int
,
k
:
int
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
a
=
torch
.
randn
((
m
,
k
),
device
=
'cuda'
)
*
5
b
=
torch
.
randn
((
n
,
k
),
device
=
'cuda'
).
t
()
*
5
if
dtype
==
torch
.
int8
:
return
to_int8
(
a
),
to_int8
(
b
)
if
dtype
==
torch
.
float8_e4m3fn
:
return
to_fp8
(
a
),
to_fp8
(
b
)
raise
ValueError
(
"unsupported dtype"
)
# bench
# bench
def
bench_fn
(
label
:
str
,
sub_label
:
str
,
description
:
str
,
fn
:
Callable
,
*
args
,
def
bench_fn
(
label
:
str
,
sub_label
:
str
,
description
:
str
,
fn
:
Callable
,
*
args
,
...
@@ -386,4 +362,4 @@ Benchmark Cutlass GEMM.
...
@@ -386,4 +362,4 @@ Benchmark Cutlass GEMM.
model_parser
.
set_defaults
(
func
=
run_model_bench
)
model_parser
.
set_defaults
(
func
=
run_model_bench
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
args
.
func
(
args
)
args
.
func
(
args
)
\ No newline at end of file
benchmarks/cutlass_benchmarks/weight_shapes.py
View file @
96ae75ad
...
@@ -40,4 +40,4 @@ WEIGHT_SHAPES = {
...
@@ -40,4 +40,4 @@ WEIGHT_SHAPES = {
([
8192
,
57344
],
1
),
([
8192
,
57344
],
1
),
([
28672
,
8192
],
0
),
([
28672
,
8192
],
0
),
],
],
}
}
\ No newline at end of file
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
View file @
96ae75ad
...
@@ -10,7 +10,8 @@ set -ex
...
@@ -10,7 +10,8 @@ set -ex
kill_gpu_processes
()
{
kill_gpu_processes
()
{
# kill all processes on GPU.
# kill all processes on GPU.
pkill
-f
pt_main_thread
pgrep pt_main_thread | xargs
-r
kill
-9
pgrep python3 | xargs
-r
kill
-9
sleep
10
sleep
10
# remove vllm config file
# remove vllm config file
...
@@ -54,7 +55,7 @@ benchmark() {
...
@@ -54,7 +55,7 @@ benchmark() {
CUDA_VISIBLE_DEVICES
=
0 python3
\
CUDA_VISIBLE_DEVICES
=
0 python3
\
-m
vllm.entrypoints.openai.api_server
\
-m
vllm.entrypoints.openai.api_server
\
--model
meta-llama/Meta-Llama-3.1-8B-Instruct
\
--model
$model
\
--port
8100
\
--port
8100
\
--max-model-len
10000
\
--max-model-len
10000
\
--gpu-memory-utilization
0.6
\
--gpu-memory-utilization
0.6
\
...
@@ -64,7 +65,7 @@ benchmark() {
...
@@ -64,7 +65,7 @@ benchmark() {
CUDA_VISIBLE_DEVICES
=
1 python3
\
CUDA_VISIBLE_DEVICES
=
1 python3
\
-m
vllm.entrypoints.openai.api_server
\
-m
vllm.entrypoints.openai.api_server
\
--model
meta-llama/Meta-Llama-3.1-8B-Instruct
\
--model
$model
\
--port
8200
\
--port
8200
\
--max-model-len
10000
\
--max-model-len
10000
\
--gpu-memory-utilization
0.6
\
--gpu-memory-utilization
0.6
\
...
@@ -87,7 +88,7 @@ benchmark() {
...
@@ -87,7 +88,7 @@ benchmark() {
--port
8100
\
--port
8100
\
--save-result
\
--save-result
\
--result-dir
$results_folder
\
--result-dir
$results_folder
\
--result-filename
disagg_prefill_
2x
tp
4
.json
\
--result-filename
disagg_prefill_tp
1
.json
\
--request-rate
"inf"
--request-rate
"inf"
...
@@ -105,7 +106,7 @@ benchmark() {
...
@@ -105,7 +106,7 @@ benchmark() {
--port
8200
\
--port
8200
\
--save-result
\
--save-result
\
--result-dir
$results_folder
\
--result-dir
$results_folder
\
--result-filename
disagg_prefill_
2xtp4
.json
\
--result-filename
disagg_prefill_
tp1_overhead
.json
\
--request-rate
"
$qps
"
--request-rate
"
$qps
"
kill_gpu_processes
kill_gpu_processes
...
@@ -118,7 +119,7 @@ main() {
...
@@ -118,7 +119,7 @@ main() {
(
which jq
)
||
(
apt-get
-y
install
jq
)
(
which jq
)
||
(
apt-get
-y
install
jq
)
(
which socat
)
||
(
apt-get
-y
install
socat
)
(
which socat
)
||
(
apt-get
-y
install
socat
)
pip
install
quart httpx
pip
install
quart httpx
datasets
cd
"
$(
dirname
"
$0
"
)
"
cd
"
$(
dirname
"
$0
"
)
"
...
...
benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
View file @
96ae75ad
#!/bin/bash
#!/bin/bash
# Requirement:
8x H100
GPUs.
# Requirement:
2x
GPUs.
# Model:
neuralmagic
/Meta-Llama-3
-70
B-Instruct
-FP8-KV
# Model:
meta-llama
/Meta-Llama-3
.1-8
B-Instruct
# Query:
2048
input tokens,
11
output tokens, QPS
4
,
5
00 requests
# Query:
1024
input tokens,
6
output tokens, QPS
2/4/6/8
,
1
00 requests
# Resource:
8
x
H100
# Resource:
2
x
GPU
# Approaches:
# Approaches:
# 1. Chunked prefill: 1 vllm instance with tp=8
# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
# Prefilling instance: max_output_token=1
# Prefilling instance: max_output_token=1
...
@@ -114,7 +113,6 @@ benchmark() {
...
@@ -114,7 +113,6 @@ benchmark() {
--request-rate
"
$qps
"
--request-rate
"
$qps
"
sleep
2
sleep
2
}
}
...
@@ -123,8 +121,9 @@ main() {
...
@@ -123,8 +121,9 @@ main() {
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which jq
)
||
(
apt-get
-y
install
jq
)
(
which jq
)
||
(
apt-get
-y
install
jq
)
(
which socat
)
||
(
apt-get
-y
install
socat
)
(
which socat
)
||
(
apt-get
-y
install
socat
)
(
which lsof
)
||
(
apt-get
-y
install
lsof
)
pip
install
quart httpx matplotlib aiohttp
pip
install
quart httpx matplotlib aiohttp
datasets
cd
"
$(
dirname
"
$0
"
)
"
cd
"
$(
dirname
"
$0
"
)
"
...
...
Prev
1
2
3
4
5
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment