Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
96ae75ad
Commit
96ae75ad
authored
Jan 04, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev
parents
f9f4a735
2339d59f
Changes
374
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
793 additions
and
183 deletions
+793
-183
.buildkite/generate_index.py
.buildkite/generate_index.py
+24
-0
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+3
-3
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+15
-0
.buildkite/run-gh200-test.sh
.buildkite/run-gh200-test.sh
+3
-0
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+5
-1
.buildkite/upload-wheels.sh
.buildkite/upload-wheels.sh
+29
-1
.github/workflows/publish.yml
.github/workflows/publish.yml
+62
-61
.gitignore
.gitignore
+2
-0
CMakeLists.txt
CMakeLists.txt
+33
-6
Dockerfile
Dockerfile
+24
-23
Dockerfile.cpu
Dockerfile.cpu
+3
-3
README.md
README.md
+1
-1
README_ORIGIN.md
README_ORIGIN.md
+1
-1
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+92
-43
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+384
-0
benchmarks/cutlass_benchmarks/utils.py
benchmarks/cutlass_benchmarks/utils.py
+96
-0
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+2
-26
benchmarks/cutlass_benchmarks/weight_shapes.py
benchmarks/cutlass_benchmarks/weight_shapes.py
+1
-1
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+7
-6
benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+6
-7
No files found.
.buildkite/generate_index.py
0 → 100644
View file @
96ae75ad
import
argparse
import
os
# HTML page containing a single link to the wheel. The link is relative to
# the parent directory of the page ("../<wheel file name>").
template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1>
<a href="../{wheel_html_escaped}">{wheel}</a><br/>
</body>
</html>
"""


def render_index(wheel_path: str) -> str:
    """Return the index.html content that links to the given wheel.

    Args:
        wheel_path: Path to the wheel file; only its base name is used.

    Returns:
        The filled-in HTML page as a string.
    """
    filename = os.path.basename(wheel_path)
    # cloudfront requires escaping the '+' character
    return template.format(wheel=filename,
                           wheel_html_escaped=filename.replace("+", "%2B"))


def main() -> None:
    """Parse ``--wheel`` and write ``index.html`` into the current directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--wheel", help="The wheel path.", required=True)
    args = parser.parse_args()

    with open("index.html", "w") as f:
        f.write(render_index(args.wheel))
    # Report success only once the file has actually been written.
    print(f"Generated index.html for {args.wheel}")


if __name__ == "__main__":
    main()
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
View file @
96ae75ad
...
...
@@ -65,9 +65,9 @@ steps:
-
VLLM_USAGE_SOURCE
-
HF_TOKEN
-
block
:
"
Run
H100
Benchmark"
key
:
block-h100
depends_on
:
~
#
- block: "Run H100 Benchmark"
#
key: block-h100
#
depends_on: ~
-
label
:
"
H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
...
...
.buildkite/release-pipeline.yaml
View file @
96ae75ad
...
...
@@ -55,3 +55,18 @@ steps:
password-env
:
DOCKERHUB_TOKEN
env
:
DOCKER_BUILDKIT
:
"
1"
-
block
:
"
Build
CPU
release
image"
key
:
block-cpu-release-image-build
depends_on
:
~
-
label
:
"
Build
and
publish
CPU
release
image"
depends_on
:
block-cpu-release-image-build
agents
:
queue
:
cpu_queue_postmerge
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION
--progress
plain
-f
Dockerfile.cpu
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
env
:
DOCKER_BUILDKIT
:
"
1"
.buildkite/run-gh200-test.sh
View file @
96ae75ad
...
...
@@ -4,6 +4,9 @@
# It serves a sanity check for compilation and basic model usage.
set
-ex
# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
python3 use_existing_torch.py
# Try building the docker image
DOCKER_BUILDKIT
=
1 docker build
.
\
--target
vllm-openai
\
...
...
.buildkite/test-pipeline.yaml
View file @
96ae75ad
...
...
@@ -224,8 +224,12 @@ steps:
mirror_hardwares
:
[
amd
]
source_file_dependencies
:
-
vllm/model_executor/layers
-
vllm/model_executor/guided_decoding
-
tests/test_logits_processor
command
:
pytest -v -s test_logits_processor.py
-
tests/model_executor/test_guided_processors
commands
:
-
pytest -v -s test_logits_processor.py
-
pytest -v -s model_executor/test_guided_processors.py
-
label
:
Speculative decoding tests
# 30min
source_file_dependencies
:
...
...
.buildkite/upload-wheels.sh
View file @
96ae75ad
...
...
@@ -23,6 +23,8 @@ wheel="$new_wheel"
version
=
$(
unzip
-p
"
$wheel
"
'**/METADATA'
|
grep
'^Version: '
|
cut
-d
' '
-f2
)
echo
"Version:
$version
"
normal_wheel
=
"
$wheel
"
# Save the original wheel filename
# If the version contains "dev", rename it to v1.0.0.dev for consistency
if
[[
$version
==
*
dev
*
]]
;
then
suffix
=
"
${
version
##*.
}
"
...
...
@@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
new_version
=
"1.0.0.dev"
fi
new_wheel
=
"
${
wheel
/
$version
/
$new_version
}
"
mv
--
"
$wheel
"
"
$new_wheel
"
# use cp to keep both files in the artifacts directory
cp
--
"
$wheel
"
"
$new_wheel
"
wheel
=
"
$new_wheel
"
version
=
"
$new_version
"
fi
# Upload the wheel to S3
python3 .buildkite/generate_index.py
--wheel
"
$normal_wheel
"
# generate index for this commit
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/"
aws s3
cp
"
$normal_wheel
"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/"
if
[[
$normal_wheel
==
*
"cu118"
*
]]
;
then
# if $normal_wheel matches cu118, do not upload the index.html
echo
"Skipping index files for cu118 wheels"
else
# only upload index.html for cu12 wheels (default wheels)
aws s3
cp
index.html
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/vllm/index.html"
aws s3
cp
"s3://vllm-wheels/nightly/index.html"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/index.html"
fi
# generate index for nightly
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/nightly/"
aws s3
cp
"
$normal_wheel
"
"s3://vllm-wheels/nightly/"
if
[[
$normal_wheel
==
*
"cu118"
*
]]
;
then
# if $normal_wheel matches cu118, do not upload the index.html
echo
"Skipping index files for cu118 wheels"
else
# only upload index.html for cu12 wheels (default wheels)
aws s3
cp
index.html
"s3://vllm-wheels/nightly/vllm/index.html"
fi
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/
$version
/"
\ No newline at end of file
.github/workflows/publish.yml
View file @
96ae75ad
...
...
@@ -39,67 +39,68 @@ jobs:
const script = require('.github/workflows/scripts/create_release.js')
await script(github, context, core)
wheel
:
name
:
Build Wheel
runs-on
:
${{ matrix.os }}
needs
:
release
strategy
:
fail-fast
:
false
matrix
:
os
:
[
'
ubuntu-20.04'
]
python-version
:
[
'
3.9'
,
'
3.10'
,
'
3.11'
,
'
3.12'
]
pytorch-version
:
[
'
2.4.0'
]
# Must be the most recent version that meets requirements-cuda.txt.
cuda-version
:
[
'
11.8'
,
'
12.1'
]
steps
:
-
name
:
Checkout
uses
:
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
# v4.2.2
-
name
:
Setup ccache
uses
:
hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9
# v1.2.14
with
:
create-symlink
:
true
key
:
${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
-
name
:
Set up Linux Env
if
:
${{ runner.os == 'Linux' }}
run
:
|
bash -x .github/workflows/scripts/env.sh
-
name
:
Set up Python
uses
:
actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b
# v5.3.0
with
:
python-version
:
${{ matrix.python-version }}
-
name
:
Install CUDA ${{ matrix.cuda-version }}
run
:
|
bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
# NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow.
# wheel:
# name: Build Wheel
# runs-on: ${{ matrix.os }}
# needs: release
# strategy:
# fail-fast: false
# matrix:
# os: ['ubuntu-20.04']
# python-version: ['3.9', '3.10', '3.11', '3.12']
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
# cuda-version: ['11.8', '12.1']
# steps:
# - name: Checkout
# uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
# - name: Setup ccache
# uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
# with:
# create-symlink: true
# key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
-
name
:
Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
run
:
|
bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
# - name: Set up Linux Env
# if: ${{ runner.os == 'Linux' }}
# run: |
# bash -x .github/workflows/scripts/env.sh
-
name
:
Build wheel
shell
:
bash
env
:
CMAKE_BUILD_TYPE
:
Release
# do not compile with debug symbol to reduce wheel size
run
:
|
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
asset_name=${wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
-
name
:
Upload Release Asset
uses
:
actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5
# v1.0.2
env
:
GITHUB_TOKEN
:
${{ secrets.GITHUB_TOKEN }}
with
:
upload_url
:
${{ needs.release.outputs.upload_url }}
asset_path
:
./dist/${{ env.wheel_name }}
asset_name
:
${{ env.asset_name }}
asset_content_type
:
application/*
# - name: Set up Python
# uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
# with:
# python-version: ${{ matrix.python-version }}
# - name: Install CUDA ${{ matrix.cuda-version }}
# run: |
# bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
# - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
# run: |
# bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
# - name: Build wheel
# shell: bash
# env:
# CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
# run: |
# bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
# wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
# asset_name=${wheel_name//"linux"/"manylinux1"}
# echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
# echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
# - name: Upload Release Asset
# uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
# env:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# with:
# upload_url: ${{ needs.release.outputs.upload_url }}
# asset_path: ./dist/${{ env.wheel_name }}
# asset_name: ${{ env.asset_name }}
# asset_content_type: application/*
# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
# - name: Publish package
...
...
.gitignore
View file @
96ae75ad
...
...
@@ -81,6 +81,8 @@ instance/
docs/_build/
docs/source/getting_started/examples/*.rst
!**/*.template.rst
docs/source/getting_started/examples/*.md
!**/*.template.md
# PyBuilder
.pybuilder/
...
...
CMakeLists.txt
View file @
96ae75ad
...
...
@@ -219,7 +219,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET
(
CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL
"Enable only the header library"
)
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set
(
CUTLASS_REVISION
"v3.
5.1
"
CACHE STRING
"CUTLASS revision to use"
)
set
(
CUTLASS_REVISION
"v3.
6.0
"
CACHE STRING
"CUTLASS revision to use"
)
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if
(
DEFINED ENV{VLLM_CUTLASS_SRC_DIR}
)
...
...
@@ -236,13 +236,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare
(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG
v3.5.1
GIT_TAG
8aa95dbb888be6d81c6fbf7169718c5244b53227
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW
TRU
E
GIT_SHALLOW
FALS
E
)
endif
()
FetchContent_MakeAvailable
(
cutlass
)
...
...
@@ -254,7 +254,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
)
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_compressor_entry.cu"
"csrc/cutlass_extensions/common.cpp"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
VLLM_EXT_SRC
}
"
...
...
@@ -283,7 +286,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
" in CUDA target architectures"
)
endif
()
#
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection
(
SCALED_MM_3X_ARCHS
"9.0;9.0a"
"
${
CUDA_ARCHS
}
"
)
...
...
@@ -336,6 +338,31 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif
()
endif
()
#
# 2:4 Sparse Kernels
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
# require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS
)
set
(
SRCS
"csrc/sparse/cutlass/sparse_compressor_c3x.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
SRCS
}
"
CUDA_ARCHS
"
${
SCALED_MM_3X_ARCHS
}
"
)
list
(
APPEND VLLM_EXT_SRC
"
${
SRCS
}
"
)
list
(
APPEND VLLM_GPU_FLAGS
"-DENABLE_SPARSE_SCALED_MM_C3X=1"
)
message
(
STATUS
"Building sparse_scaled_mm_c3x for archs:
${
SCALED_MM_3X_ARCHS
}
"
)
else
()
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS
)
message
(
STATUS
"Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
"if you intend on running FP8 sparse quantized models on Hopper."
)
else
()
message
(
STATUS
"Not building sparse_scaled_mm_c3x as no compatible archs found "
"in CUDA target architectures"
)
endif
()
endif
()
#
# Machete kernels
...
...
@@ -417,7 +444,7 @@ define_gpu_extension_target(
SOURCES
${
VLLM_EXT_SRC
}
COMPILE_FLAGS
${
VLLM_GPU_FLAGS
}
ARCHITECTURES
${
VLLM_GPU_ARCHES
}
INCLUDE_DIRECTORIES
${
CUTLASS_INCLUDE_DIR
}
INCLUDE_DIRECTORIES
${
CUTLASS_INCLUDE_DIR
}
;
${
CUTLASS_TOOLS_UTIL_INCLUDE_DIR
}
USE_SABI 3
WITH_SOABI
)
...
...
Dockerfile
View file @
96ae75ad
...
...
@@ -2,7 +2,7 @@
# to run the OpenAI compatible server.
# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.
rst
and
# docs/source/dev/dockerfile/dockerfile.
md
and
# docs/source/assets/dev/dockerfile-stages-dependency.png
ARG
CUDA_VERSION=12.4.1
...
...
@@ -45,17 +45,21 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR
/workspace
# install build and runtime dependencies
COPY
requirements-common.txt requirements-common.txt
COPY
requirements-cuda.txt requirements-cuda.txt
COPY
requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
python3
-m
pip
install
-r
requirements-cuda.txt
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
python3
-m
pip
install
-
r
requirements-cuda-arm64.txt
;
\
python3
-m
pip
install
-
-index-url
https://download.pytorch.org/whl/nightly/cu124
"torch==2.6.0.dev20241210+cu124"
"torchvision==0.22.0.dev20241215"
;
\
fi
COPY
requirements-common.txt requirements-common.txt
COPY
requirements-cuda.txt requirements-cuda.txt
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
python3
-m
pip
install
-r
requirements-cuda.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
...
...
@@ -77,11 +81,6 @@ COPY requirements-build.txt requirements-build.txt
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
python3
-m
pip
install
-r
requirements-build.txt
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
python3
-m
pip
install
-r
requirements-cuda-arm64.txt
;
\
fi
COPY
. .
ARG
GIT_REPO_CHECK=0
RUN
--mount
=
type
=
bind
,source
=
.git,target
=
.git
\
...
...
@@ -157,8 +156,6 @@ WORKDIR /vllm-workspace
ENV
DEBIAN_FRONTEND=noninteractive
ARG
TARGETPLATFORM
COPY
requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN
PYTHON_VERSION_STR
=
$(
echo
${
PYTHON_VERSION
}
|
sed
's/\.//g'
)
&&
\
echo
"export PYTHON_VERSION_STR=
${
PYTHON_VERSION_STR
}
"
>>
/etc/environment
...
...
@@ -166,7 +163,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
RUN
echo
'tzdata tzdata/Areas select America'
| debconf-set-selections
\
&&
echo
'tzdata tzdata/Zones/America select Los_Angeles'
| debconf-set-selections
\
&&
apt-get update
-y
\
&&
apt-get
install
-y
ccache software-properties-common git curl
sudo
vim python3-pip
\
&&
apt-get
install
-y
ccache software-properties-common git curl
wget
sudo
vim python3-pip
\
&&
apt-get
install
-y
ffmpeg libsm6 libxext6 libgl1
\
&&
add-apt-repository ppa:deadsnakes/ppa
\
&&
apt-get update
-y
\
...
...
@@ -183,17 +180,20 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
# or future versions of triton.
RUN
ldconfig /usr/local/cuda-
$(
echo
$CUDA_VERSION
|
cut
-d
.
-f1
,2
)
/compat/
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
python3
-m
pip
install
--index-url
https://download.pytorch.org/whl/nightly/cu124
"torch==2.6.0.dev20241210+cu124"
"torchvision==0.22.0.dev20241215"
;
\
fi
# Install vllm wheel first, so that torch etc will be installed.
RUN
--mount
=
type
=
bind
,from
=
build,src
=
/workspace/dist,target
=
/vllm-workspace/dist
\
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
python3
-m
pip
install
dist/
*
.whl
--verbose
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
pip uninstall
-y
torch
&&
\
python3
-m
pip
install
-r
requirements-cuda-arm64.txt
;
\
fi
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
.
/etc/environment
&&
\
if
[
"
$TARGETPLATFORM
"
!=
"linux/arm64"
]
;
then
\
...
...
@@ -240,10 +240,11 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
if
[
"
$TARGETPLATFORM
"
=
"linux/arm64"
]
;
then
\
pip
install
accelerate hf_transfer
'modelscope!=1.15.0'
'bitsandbytes>=0.42.0'
'timm==0.9.10'
;
\
pip
install
accelerate hf_transfer
'modelscope!=1.15.0'
'bitsandbytes>=0.42.0'
'timm==0.9.10'
boto3 runai-model-streamer runai-model-streamer[s3]
;
\
else
\
pip
install
accelerate hf_transfer
'modelscope!=1.15.0'
'bitsandbytes>=0.45.0'
'timm==0.9.10'
;
\
pip
install
accelerate hf_transfer
'modelscope!=1.15.0'
'bitsandbytes>=0.45.0'
'timm==0.9.10'
boto3 runai-model-streamer runai-model-streamer[s3]
;
\
fi
ENV
VLLM_USAGE_SOURCE production-docker-image
ENTRYPOINT
["python3", "-m", "vllm.entrypoints.openai.api_server"]
...
...
Dockerfile.cpu
View file @
96ae75ad
...
...
@@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0
WORKDIR /workspace
COPY requirements-build.txt requirements-build.txt
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt
...
...
@@ -37,9 +37,9 @@ FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
COPY requirements-common.txt requirements-common.txt
COPY requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
COPY . .
...
...
README.md
View file @
96ae75ad
...
...
@@ -84,7 +84,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install (若调试,可使用V
+
若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证
-
python -c "import vllm; print(vllm.
\_\_
version__)",版本号与官方版本同步,查询该软件的版本号,例如0.6.
5
;
-
python -c "import vllm; print(vllm.
\_\_
version__)",版本号与官方版本同步,查询该软件的版本号,例如0.6.
6.post1
;
## Known Issue
-
无
...
...
README_ORIGIN.md
View file @
96ae75ad
...
...
@@ -60,7 +60,7 @@ vLLM is flexible and easy to use with:
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
-
Transformer-like LLMs (e.g., Llama)
-
Mixture-of-Expert LLMs (e.g., Mixtral)
-
Mixture-of-Expert LLMs (e.g., Mixtral
, Deepseek-V2 and V3
)
-
Embedding Models (e.g. E5-Mistral)
-
Multi-modal LLMs (e.g., LLaVA)
...
...
benchmarks/benchmark_throughput.py
View file @
96ae75ad
...
...
@@ -4,7 +4,8 @@ import dataclasses
import
json
import
random
import
time
from
typing
import
List
,
Optional
from
functools
import
cache
from
typing
import
Dict
,
List
,
Optional
,
Tuple
import
numpy
as
np
import
torch
...
...
@@ -20,8 +21,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from
vllm.entrypoints.openai.api_server
import
(
build_async_engine_client_from_engine_args
)
from
vllm.inputs
import
TextPrompt
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.utils
import
get_adapter_absolute_path
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
get_lora_tokenizer
from
vllm.utils
import
FlexibleArgumentParser
,
merge_async_iterators
...
...
@@ -31,15 +35,17 @@ class SampleRequest:
Attributes:
prompt: The input text prompt for the model.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
lora_request: Optional LoRARequest specifying the LoRA to use.
"""
prompt
:
str
prompt_len
:
int
expected_output_len
:
int
multi_modal_data
:
Optional
[
MultiModalDataDict
]
=
None
lora_request
:
Optional
[
LoRARequest
]
=
None
def
_get_prompt_for_image_model
(
question
:
str
,
*
,
model
:
str
)
->
str
:
...
...
@@ -63,8 +69,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
raise
ValueError
(
f
"Unsupported model
{
model
}
"
)
@cache
def lora_path_on_disk(lora_path: str) -> str:
    """Resolve ``lora_path`` through ``get_adapter_absolute_path``.

    The result is memoized with ``functools.cache``, so each distinct
    ``lora_path`` value is resolved at most once per process.
    """
    resolved_path = get_adapter_absolute_path(lora_path)
    return resolved_path
lora_tokenizer_cache
:
Dict
[
int
,
AnyTokenizer
]
=
{}
def get_random_lora_request(
        args: argparse.Namespace
) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
    """Build a LoRARequest for a randomly chosen adapter id.

    The id is drawn uniformly from ``1..args.max_loras`` (inclusive) and is
    used both as the request name (stringified) and as its integer id. The
    adapter path comes from ``args.lora_path`` via ``lora_path_on_disk``.

    Returns:
        A ``(lora_request, tokenizer)`` pair. The tokenizer is whatever
        ``get_lora_tokenizer`` returned the first time this id was drawn;
        it is kept in the module-level ``lora_tokenizer_cache``.
    """
    chosen_id = random.randint(1, args.max_loras)
    request = LoRARequest(lora_name=str(chosen_id),
                          lora_int_id=chosen_id,
                          lora_path=lora_path_on_disk(args.lora_path))
    try:
        tokenizer = lora_tokenizer_cache[chosen_id]
    except KeyError:
        # First time this id is drawn: load its tokenizer and remember it.
        tokenizer = get_lora_tokenizer(request)
        lora_tokenizer_cache[chosen_id] = tokenizer
    return request, tokenizer
def
sample_requests
(
tokenizer
:
PreTrainedTokenizerBase
,
args
:
argparse
.
Namespace
)
->
List
[
SampleRequest
]:
dataset_path
:
str
=
args
.
dataset
num_requests
:
int
=
args
.
num_prompts
fixed_output_len
:
Optional
[
int
]
=
args
.
output_len
...
...
@@ -82,7 +110,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
# Filter out sequences that are too long or too short
filtered_dataset
:
List
[
SampleRequest
]
=
[]
for
data
in
dataset
:
for
data
in
tqdm
(
dataset
,
total
=
len
(
filtered_dataset
),
desc
=
"sampling requests"
):
if
len
(
filtered_dataset
)
==
num_requests
:
break
...
...
@@ -105,9 +135,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
continue
prompt
=
_get_prompt_for_image_model
(
question
=
prompt
,
model
=
model
)
request_tokenizer
=
tokenizer
lora_request
:
Optional
[
LoRARequest
]
=
None
if
args
.
enable_lora
:
lora_request
,
lora_tokenizer
=
get_random_lora_request
(
args
)
if
lora_tokenizer
:
request_tokenizer
=
lora_tokenizer
# Tokenize the prompts and completions.
prompt_token_ids
=
tokenizer
(
prompt
).
input_ids
completion_token_ids
=
tokenizer
(
completion
).
input_ids
prompt_token_ids
=
request_
tokenizer
(
prompt
).
input_ids
completion_token_ids
=
request_
tokenizer
(
completion
).
input_ids
prompt_len
=
len
(
prompt_token_ids
)
output_len
=
len
(
completion_token_ids
)
if
fixed_output_len
is
None
else
fixed_output_len
...
...
@@ -121,7 +158,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
SampleRequest
(
prompt
=
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
multi_modal_data
))
multi_modal_data
=
multi_modal_data
,
lora_request
=
lora_request
))
return
filtered_dataset
...
...
@@ -150,11 +188,14 @@ def run_vllm(
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
))
lora_requests
:
Optional
[
List
[
LoRARequest
]]
=
None
if
engine_args
.
enable_lora
:
lora_requests
=
[
request
.
lora_request
for
request
in
requests
]
# warmup
warmup_prompts
:
List
[
TextPrompt
]
=
[]
warmup_sampling_params
:
List
[
SamplingParams
]
=
[]
for
request
in
warmup_
promp
ts
:
for
request
in
warmup_
reques
ts
:
warmup_prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
multi_modal_data
=
request
.
multi_modal_data
))
...
...
@@ -191,9 +232,13 @@ def run_vllm(
if
not
use_beam_search
:
start
=
time
.
perf_counter
()
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
llm
.
generate
(
prompts
,
sampling_params
,
lora_request
=
lora_requests
,
use_tqdm
=
True
)
end
=
time
.
perf_counter
()
else
:
assert
lora_requests
is
None
,
"BeamSearch API does not support LoRA"
prompts
=
[
request
.
prompt
for
request
in
requests
]
# output_len should be the same for all requests.
output_len
=
requests
[
0
][
2
]
...
...
@@ -225,6 +270,7 @@ async def run_vllm_async(
# Add the requests to the engine.
prompts
:
List
[
TextPrompt
]
=
[]
sampling_params
:
List
[
SamplingParams
]
=
[]
lora_requests
:
List
[
Optional
[
LoRARequest
]]
=
[]
for
request
in
requests
:
prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
...
...
@@ -237,11 +283,16 @@ async def run_vllm_async(
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
))
lora_requests
.
append
(
request
.
lora_request
)
generators
=
[]
start
=
time
.
perf_counter
()
for
i
,
(
prompt
,
sp
)
in
enumerate
(
zip
(
prompts
,
sampling_params
)):
generator
=
llm
.
generate
(
prompt
,
sp
,
request_id
=
f
"test
{
i
}
"
)
for
i
,
(
prompt
,
sp
,
lr
)
in
enumerate
(
zip
(
prompts
,
sampling_params
,
lora_requests
)):
generator
=
llm
.
generate
(
prompt
,
sp
,
lora_request
=
lr
,
request_id
=
f
"test
{
i
}
"
)
generators
.
append
(
generator
)
all_gens
=
merge_async_iterators
(
*
generators
)
async
for
i
,
res
in
all_gens
:
...
...
@@ -340,6 +391,14 @@ def main(args: argparse.Namespace):
vocab_size
=
tokenizer
.
vocab_size
requests
=
[]
for
_
in
range
(
args
.
num_prompts
):
request_tokenizer
=
tokenizer
lora_request
:
Optional
[
LoRARequest
]
=
None
if
args
.
enable_lora
:
lora_request
,
lora_tokenizer
=
get_random_lora_request
(
args
)
if
lora_tokenizer
:
request_tokenizer
=
lora_tokenizer
# Synthesize a prompt with the given input length.
candidate_ids
=
[
random
.
randint
(
0
,
vocab_size
-
1
)
...
...
@@ -348,8 +407,8 @@ def main(args: argparse.Namespace):
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
for
_
in
range
(
5
):
# Max attempts to correct
candidate_prompt
=
tokenizer
.
decode
(
candidate_ids
)
tokenized_len
=
len
(
tokenizer
.
encode
(
candidate_prompt
))
candidate_prompt
=
request_
tokenizer
.
decode
(
candidate_ids
)
tokenized_len
=
len
(
request_
tokenizer
.
encode
(
candidate_prompt
))
if
tokenized_len
==
args
.
input_len
:
break
...
...
@@ -366,40 +425,14 @@ def main(args: argparse.Namespace):
requests
.
append
(
SampleRequest
(
prompt
=
candidate_prompt
,
prompt_len
=
args
.
input_len
,
expected_output_len
=
args
.
output_len
))
expected_output_len
=
args
.
output_len
,
lora_request
=
lora_request
))
else
:
requests
=
sample_requests
(
tokenizer
,
args
)
is_multi_modal
=
any
(
request
.
multi_modal_data
is
not
None
for
request
in
requests
)
if
args
.
backend
==
"vllm"
:
# if args.async_engine:
# run_args = [
# requests, args.model, args.tokenizer, args.quantization,
# args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
# args.trust_remote_code, args.dtype, args.max_model_len,
# args.enforce_eager, args.kv_cache_dtype,
# args.quantization_param_path, args.device,
# args.enable_prefix_caching, args.enable_chunked_prefill,
# args.max_num_batched_tokens, args.distributed_executor_backend,
# args.gpu_memory_utilization, args.num_scheduler_steps,
# args.use_v2_block_manager, args.download_dir, args.load_format,
# args.disable_async_output_proc
# ]
# else:
# run_args = [
# warmup_requests, requests, args.model, args.tokenizer, args.quantization,
# args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
# args.trust_remote_code, args.dtype, args.max_model_len,
# args.enforce_eager, args.kv_cache_dtype,
# args.quantization_param_path, args.device,
# args.enable_prefix_caching, args.enable_chunked_prefill,
# args.max_num_batched_tokens, args.distributed_executor_backend,
# args.gpu_memory_utilization, args.num_scheduler_steps,
# args.use_v2_block_manager, args.download_dir, args.load_format,
# args.disable_async_output_proc
# ]
if
args
.
async_engine
:
elapsed_time
=
uvloop
.
run
(
run_vllm_async
(
...
...
@@ -409,7 +442,7 @@ def main(args: argparse.Namespace):
args
.
disable_frontend_multiprocessing
,
))
else
:
elapsed_time
=
run_vllm
(
requests
,
args
.
n
,
elapsed_time
=
run_vllm
(
warmup_requests
,
requests
,
args
.
n
,
EngineArgs
.
from_cli_args
(
args
))
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
...
...
@@ -496,6 +529,14 @@ if __name__ == "__main__":
action
=
'store_true'
,
default
=
False
,
help
=
"Disable decoupled async engine frontend."
)
# LoRA
parser
.
add_argument
(
"--lora-path"
,
type
=
str
,
default
=
None
,
help
=
"Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier."
)
parser
=
AsyncEngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
if
args
.
tokenizer
is
None
:
...
...
@@ -505,6 +546,8 @@ if __name__ == "__main__":
assert
args
.
output_len
is
not
None
else
:
assert
args
.
input_len
is
None
if
args
.
enable_lora
:
assert
args
.
lora_path
is
not
None
if
args
.
backend
==
"vllm"
:
if
args
.
hf_max_batch_size
is
not
None
:
...
...
@@ -514,6 +557,9 @@ if __name__ == "__main__":
raise
ValueError
(
"HF max batch size is required for HF backend."
)
if
args
.
quantization
is
not
None
:
raise
ValueError
(
"Quantization is only for vLLM backend."
)
if
args
.
enable_lora
is
not
None
:
raise
ValueError
(
"LoRA benchmarking is only supported for vLLM"
" backend"
)
elif
args
.
backend
==
"mii"
:
if
args
.
dtype
!=
"auto"
:
raise
ValueError
(
"dtype must be auto for MII backend."
)
...
...
@@ -526,4 +572,7 @@ if __name__ == "__main__":
if
args
.
tokenizer
!=
args
.
model
:
raise
ValueError
(
"Tokenizer must be the same as the model for MII "
"backend."
)
if
args
.
enable_lora
is
not
None
:
raise
ValueError
(
"LoRA benchmarking is only supported for vLLM"
" backend"
)
main
(
args
)
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
0 → 100644
View file @
96ae75ad
import
argparse
import
copy
import
itertools
import
pickle
as
pkl
import
time
from
typing
import
Callable
,
Iterable
,
List
,
Tuple
import
torch
import
torch.utils.benchmark
as
TBenchmark
from
torch.utils.benchmark
import
Measurement
as
TMeasurement
from
utils
import
make_rand_sparse_tensors
from
weight_shapes
import
WEIGHT_SHAPES
from
vllm
import
_custom_ops
as
ops
from
vllm.utils
import
FlexibleArgumentParser
DEFAULT_MODELS
=
list
(
WEIGHT_SHAPES
.
keys
())
DEFAULT_BATCH_SIZES
=
[
1
,
16
,
32
,
64
,
128
,
256
,
512
]
DEFAULT_TP_SIZES
=
[
1
]
# bench
def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
             **kwargs) -> TMeasurement:
    """Time ``fn(*args, **kwargs)`` with ``torch.utils.benchmark``.

    Args:
        label: Top-level label attached to the measurement.
        sub_label: Secondary label attached to the measurement.
        description: Name of the implementation being timed.
        fn: Callable to benchmark.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        The measurement produced by ``blocked_autorange`` with a minimum
        run time of one second.
    """
    # The statement is evaluated against this namespace, so the callable and
    # its arguments are handed over by name.
    timer_namespace = {"args": args, "kwargs": kwargs, "fn": fn}
    timer = TBenchmark.Timer(
        stmt="fn(*args, **kwargs)",
        globals=timer_namespace,
        label=label,
        sub_label=sub_label,
        description=description,
    )
    return timer.blocked_autorange(min_run_time=1)
def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
               sub_label: str) -> Iterable[TMeasurement]:
    """Benchmark int8 dense and 2:4-sparse GEMM implementations.

    Random operands are built with ``make_rand_sparse_tensors``; the sparse
    CUTLASS result is compared once against the dense CUTLASS result (the
    outcome is only printed), then each implementation is timed.

    Args:
        dtype: Must be ``torch.int8``.
        m, k, n: GEMM problem dimensions.
        label: Label shared by all measurements of this run.
        sub_label: Sub-label shared by all measurements of this run.

    Returns:
        A list with one measurement per benchmarked implementation.
    """
    assert dtype == torch.int8
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

    # Sanity check: sparse kernel output versus dense kernel output.
    out = ops.cutlass_scaled_sparse_mm(a, b_compressed, e, scale_a, scale_b,
                                       torch.bfloat16)
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)
    if torch.allclose(out, out_ref):
        print("Correct results")
    else:
        print("Incorrect results")
        print(out)
        print(out_ref)

    measurements = []

    def record(description, fn, *fn_args):
        # Time one implementation and keep its measurement.
        measurements.append(
            bench_fn(label, sub_label, description, fn, *fn_args))

    # pytorch reference matmuls (no scales)
    record("pytorch_bf16_bf16_bf16_matmul-no-scales", torch.mm,
           a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16))
    record("pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
           a.to(dtype=torch.float16), b.to(dtype=torch.float16))

    # cutlass dense, without and with bias
    record("cutlass_i8_i8_bf16_scaled_mm", ops.cutlass_scaled_mm, a, b,
           scale_a, scale_b, torch.bfloat16)
    record("cutlass_i8_i8_bf16_scaled_mm_bias", ops.cutlass_scaled_mm, a, b,
           scale_a, scale_b, torch.bfloat16, bias)

    # cutlass sparse, without and with bias
    record("cutlass_i8_i8_bf16_scaled_sparse_mm",
           ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, scale_b,
           torch.bfloat16)
    record("cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
           ops.cutlass_scaled_sparse_mm, a, b_compressed, e, scale_a, scale_b,
           torch.bfloat16, bias)

    return measurements
def
bench_fp8
(
dtype
:
torch
.
dtype
,
m
:
int
,
k
:
int
,
n
:
int
,
label
:
str
,
sub_label
:
str
)
->
Iterable
[
TMeasurement
]:
assert
dtype
==
torch
.
float8_e4m3fn
b_compressed
,
e
,
a
,
b
=
make_rand_sparse_tensors
(
torch
.
float8_e4m3fn
,
m
,
n
,
k
)
scale_a
=
torch
.
tensor
(
1.0
,
device
=
"cuda"
,
dtype
=
torch
.
float32
)
scale_b
=
torch
.
tensor
(
1.0
,
device
=
"cuda"
,
dtype
=
torch
.
float32
)
bias
=
torch
.
zeros
((
n
,
),
device
=
"cuda"
,
dtype
=
torch
.
bfloat16
)
out
=
ops
.
cutlass_scaled_sparse_mm
(
a
,
b_compressed
,
e
,
scale_a
,
scale_b
,
torch
.
bfloat16
)
out_ref
=
ops
.
cutlass_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
torch
.
bfloat16
)
if
not
torch
.
allclose
(
out
,
out_ref
):
print
(
"Incorrect results"
)
print
(
out
)
print
(
out_ref
)
else
:
print
(
"Correct results"
)
timers
=
[]
# pytorch impl w. bf16
timers
.
append
(
bench_fn
(
label
,
sub_label
,
"pytorch_bf16_bf16_bf16_matmul-no-scales"
,
torch
.
mm
,
a
.
to
(
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
),
b
.
to
(
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
)))
# pytorch impl: bf16 output, without fp8 fast accum
timers
.
append
(
bench_fn
(
label
,
sub_label
,
"pytorch_fp8_fp8_bf16_scaled_mm"
,
torch
.
_scaled_mm
,
a
,
b
,
scale_a
=
scale_a
,
scale_b
=
scale_b
,
out_dtype
=
torch
.
bfloat16
))
# pytorch impl: bf16 output, with fp8 fast accum
timers
.
append
(
bench_fn
(
label
,
sub_label
,
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"
,
torch
.
_scaled_mm
,
a
,
b
,
scale_a
=
scale_a
,
scale_b
=
scale_b
,
out_dtype
=
torch
.
bfloat16
,
use_fast_accum
=
True
))
# pytorch impl: fp16 output, without fp8 fast accum
timers
.
append
(
bench_fn
(
label
,
sub_label
,
"pytorch_fp8_fp8_fp16_scaled_mm"
,
torch
.
_scaled_mm
,
a
,
b
,
scale_a
=
scale_a
,
scale_b
=
scale_b
,
out_dtype
=
torch
.
float16
))
# pytorch impl: fp16 output, with fp8 fast accum
timers
.
append
(
bench_fn
(
label
,
sub_label
,
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"
,
torch
.
_scaled_mm
,
a
,
b
,
scale_a
=
scale_a
,
scale_b
=
scale_b
,
out_dtype
=
torch
.
float16
,
use_fast_accum
=
True
))
# cutlass impl: bf16 output
timers
.
append
(
bench_fn
(
label
,
sub_label
,
"cutlass_fp8_fp8_bf16_scaled_mm"
,
ops
.
cutlass_scaled_mm
,
a
,
b
,
scale_a
,
scale_b
,
torch
.
bfloat16
))
# cutlass impl: bf16 output
timers
.
append
(
bench_fn
(
label
,
sub_label
,
"cutlass_fp8_fp8_bf16_scaled_sparse_mm"
,
ops
.
cutlass_scaled_sparse_mm
,
a
,
b_compressed
,
e
,
scale_a
,
scale_b
,
torch
.
bfloat16
))
# cutlass impl: fp16 output
timers
.
append
(
bench_fn
(
label
,
sub_label
,
"cutlass_fp8_fp8_fp16_scaled_sparse_mm"
,
ops
.
cutlass_scaled_sparse_mm
,
a
,
b_compressed
,
e
,
scale_a
,
scale_b
,
torch
.
float16
))
# cutlass impl: bf16 output, with bias
timers
.
append
(
bench_fn
(
label
,
sub_label
,
"cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias"
,
ops
.
cutlass_scaled_sparse_mm
,
a
,
b_compressed
,
e
,
scale_a
,
scale_b
,
torch
.
bfloat16
,
bias
))
# cutlass impl: fp16 output, with bias
timers
.
append
(
bench_fn
(
label
,
sub_label
,
"cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias"
,
ops
.
cutlass_scaled_sparse_mm
,
a
,
b_compressed
,
e
,
scale_a
,
scale_b
,
torch
.
float16
,
bias
.
to
(
dtype
=
torch
.
float16
)))
return
timers
def
bench
(
dtype
:
torch
.
dtype
,
m
:
int
,
k
:
int
,
n
:
int
,
label
:
str
,
sub_label
:
str
)
->
Iterable
[
TMeasurement
]:
if
dtype
==
torch
.
int8
:
return
bench_int8
(
dtype
,
m
,
k
,
n
,
label
,
sub_label
)
if
dtype
==
torch
.
float8_e4m3fn
:
return
bench_fp8
(
dtype
,
m
,
k
,
n
,
label
,
sub_label
)
raise
ValueError
(
"unsupported type"
)
# runner
def print_timers(timers: Iterable[TMeasurement]):
    """Print a ``torch.utils.benchmark.Compare`` table for ``timers``."""
    TBenchmark.Compare(timers).print()
def run(dtype: torch.dtype,
        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
    """Benchmark every (M, K, N) shape and collect all measurements.

    The comparison table of each shape is printed as soon as that shape has
    been benchmarked.

    Args:
        dtype: Operand dtype forwarded to ``bench``.
        MKNs: GEMM shapes to benchmark, as ``(m, k, n)`` tuples.

    Returns:
        A flat list with the measurements of all shapes, in input order.
    """
    gemm_label = f"scaled-{dtype}-gemm"
    collected = []
    for shape in MKNs:
        m, k, n = shape
        shape_timers = bench(dtype, m, k, n, gemm_label,
                             f"MKN=({m}x{k}x{n})")
        print_timers(shape_timers)
        collected += shape_timers
    return collected
# output makers
def make_output(data: Iterable[TMeasurement],
                MKNs: Iterable[Tuple[int, int, int]],
                base_description: str,
                timestamp=None):
    """Print the combined results and pickle them to disk.

    Args:
        data: Measurements to print and serialize.
        MKNs: Benchmarked shapes (not used by this function).
        base_description: Heading suffix and file-name prefix.
        timestamp: File-name suffix; defaults to the current Unix time in
            whole seconds.

    The pickle is written to ``{base_description}-{timestamp}.pkl`` in the
    current working directory.
    """
    print(f"== All Results {base_description} ====")
    print_timers(data)

    # pickle all the results
    if timestamp is None:
        timestamp = int(time.time())
    out_path = f"{base_description}-{timestamp}.pkl"
    with open(out_path, "wb") as out_file:
        pkl.dump(data, out_file)
# argparse runners
def run_square_bench(args):
    """Benchmark square GEMMs (M == K == N) over a range of sizes.

    Sizes run from ``args.dim_start`` to ``args.dim_end`` inclusive, in steps
    of ``args.dim_increment``. Results are printed and pickled through
    ``make_output``.
    """
    sizes = range(args.dim_start, args.dim_end + 1, args.dim_increment)
    MKNs = [(size, size, size) for size in sizes]
    data = run(args.dtype, MKNs)
    make_output(data, MKNs, f"square_bench-{args.dtype}")
def run_range_bench(args):
    """Benchmark GEMMs while sweeping some dimensions and pinning others.

    Each of M, K and N is held at ``args.m_constant`` / ``args.k_constant`` /
    ``args.n_constant`` when that value is given; otherwise it follows the
    sweep from ``args.dim_start`` up to (excluding) ``args.dim_end`` in steps
    of ``args.dim_increment``. Results are printed and pickled through
    ``make_output``.
    """
    # NOTE(review): unlike run_square_bench, dim_end is exclusive here --
    # confirm whether that difference is intended.
    swept = list(range(args.dim_start, args.dim_end, args.dim_increment))

    def axis_values(constant):
        # A pinned axis repeats its constant once per sweep point.
        if constant is None:
            return swept
        return [constant] * len(swept)

    MKNs = list(
        zip(axis_values(args.m_constant), axis_values(args.k_constant),
            axis_values(args.n_constant)))
    data = run(args.dtype, MKNs)
    make_output(data, MKNs, f"range_bench-{args.dtype}")
def run_model_bench(args):
    """Benchmark the GEMM shapes of the selected models.

    For every (model, tensor-parallel size) pair, the model's weight shapes
    from ``WEIGHT_SHAPES`` are divided along their split dimension by the TP
    size and benchmarked for each batch size in ``args.batch_sizes``. All
    results are printed per pair and then pickled together into
    ``model_bench-{dtype}-{timestamp}.pkl``.
    """
    print("Benchmarking models:")
    for index, name in enumerate(args.models):
        print(f"[{index}]  {name}")

    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
        # Work on a deep copy so the shared shape table is never modified.
        shapes = []
        for kn, split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            kn[split_dim] //= tp_size
            shapes.append(kn)
        return shapes

    models_tps = list(itertools.product(args.models, args.tp_sizes))
    model_bench_data = []
    for model, tp_size in models_tps:
        KNs = model_shapes(model, tp_size)
        MKNs = [(m, k, n) for m in args.batch_sizes for k, n in KNs]
        model_bench_data.append(run(args.dtype, MKNs))

    # Print all results
    for data, (model, tp_size) in zip(model_bench_data, models_tps):
        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
        print_timers(data)

    timestamp = int(time.time())
    all_data = list(itertools.chain.from_iterable(model_bench_data))

    # pickle all data
    out_path = f"model_bench-{args.dtype}-{timestamp}.pkl"
    with open(out_path, "wb") as out_file:
        pkl.dump(all_data, out_file)
if __name__ == '__main__':

    def to_torch_dtype(dt):
        """Map the ``--dtype`` command-line string to a torch dtype."""
        if dt == "int8":
            return torch.int8
        if dt == "fp8":
            return torch.float8_e4m3fn
        raise ValueError("unsupported dtype")

    parser = FlexibleArgumentParser(
        description="""
Benchmark Cutlass GEMM.
To run square GEMMs:
python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
To run constant N and K and sweep M:
python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
To run dimensions from a model:
python3 ./benchmarks/cutlass_benchmarks/sparse_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
""",  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument("--dtype",
                        type=to_torch_dtype,
                        required=True,
                        help="Available options are ['int8', 'fp8']")

    # A sub-command is mandatory: each one installs the `func` default that is
    # invoked below. Without `required=True`, omitting the sub-command would
    # surface as an AttributeError on `args.func` instead of a usage message.
    subparsers = parser.add_subparsers(dest="cmd", required=True)

    # square_bench: M == K == N swept over a range
    square_parser = subparsers.add_parser("square_bench")
    square_parser.add_argument("--dim-start", type=int, required=True)
    square_parser.add_argument("--dim-end", type=int, required=True)
    square_parser.add_argument("--dim-increment", type=int, required=True)
    square_parser.set_defaults(func=run_square_bench)

    # range_bench: sweep some dimensions, optionally pin M / N / K
    range_parser = subparsers.add_parser("range_bench")
    range_parser.add_argument("--dim-start", type=int, required=True)
    range_parser.add_argument("--dim-end", type=int, required=True)
    range_parser.add_argument("--dim-increment", type=int, required=True)
    range_parser.add_argument("--m-constant", type=int, default=None)
    range_parser.add_argument("--n-constant", type=int, default=None)
    range_parser.add_argument("--k-constant", type=int, default=None)
    range_parser.set_defaults(func=run_range_bench)

    # model_bench: shapes taken from WEIGHT_SHAPES
    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument("--models",
                              nargs="+",
                              type=str,
                              default=DEFAULT_MODELS,
                              choices=WEIGHT_SHAPES.keys())
    model_parser.add_argument("--tp-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_TP_SIZES)
    model_parser.add_argument("--batch-sizes",
                              nargs="+",
                              type=int,
                              default=DEFAULT_BATCH_SIZES)
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
    args.func(args)
benchmarks/cutlass_benchmarks/utils.py
0 → 100644
View file @
96ae75ad
# Cutlass bench utils
from
typing
import
Iterable
,
Tuple
import
torch
import
vllm._custom_ops
as
ops
def
to_fp8
(
tensor
:
torch
.
Tensor
)
->
torch
.
Tensor
:
finfo
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
return
torch
.
round
(
tensor
.
clamp
(
min
=
finfo
.
min
,
max
=
finfo
.
max
)).
to
(
dtype
=
torch
.
float8_e4m3fn
)
def to_int8(tensor: torch.Tensor) -> torch.Tensor:
    """Clamp to [-128, 127], round, and cast to int8."""
    clipped = tensor.clamp(min=-128, max=127)
    return clipped.round().to(dtype=torch.int8)
def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
    """Cast ``tensor`` to bfloat16."""
    return tensor.bfloat16()
def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
    """Cast ``tensor`` to float16."""
    return tensor.half()
def
make_rand_tensors
(
dtype
:
torch
.
dtype
,
m
:
int
,
n
:
int
,
k
:
int
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
a
=
torch
.
randn
((
m
,
k
),
device
=
'cuda'
)
*
5
b
=
torch
.
randn
((
n
,
k
),
device
=
'cuda'
).
t
()
*
5
if
dtype
==
torch
.
int8
:
return
to_int8
(
a
),
to_int8
(
b
)
if
dtype
==
torch
.
float8_e4m3fn
:
return
to_fp8
(
a
),
to_fp8
(
b
)
raise
ValueError
(
"unsupported dtype"
)
def prune_to_2_4(tensor):
    """Zero all but the two largest-magnitude values in each group of four.

    Groups are formed by reshaping ``tensor`` to ``[-1, 4]``, so the number
    of elements must be divisible by 4. The result has the input's shape.
    """
    groups = tensor.reshape(-1, 4)

    # Positions of the two largest absolute values within every group.
    keep_idx = groups.abs().topk(k=2, dim=1).indices

    # Binary mask: 1 at kept positions, 0 elsewhere.
    keep = torch.zeros_like(groups)
    keep.scatter_(dim=1, index=keep_idx, value=1)

    result = groups * keep

    # Multiplying a negative value by 0 yields -0.0; normalize it to 0.0.
    result[result == -0.0] = 0.0

    return result.reshape(tensor.shape)
def
make_rand_sparse_tensors
(
dtype
:
torch
.
dtype
,
m
:
int
,
n
:
int
,
k
:
int
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
a
=
torch
.
randn
((
m
,
k
),
device
=
'cuda'
)
*
5
b
=
torch
.
randn
((
n
,
k
),
device
=
'cuda'
).
t
()
*
5
b
=
prune_to_2_4
(
b
.
t
()).
t
()
if
dtype
==
torch
.
int8
:
a
,
b
=
to_int8
(
a
),
to_int8
(
b
)
elif
dtype
==
torch
.
float8_e4m3fn
:
a
,
b
=
to_fp8
(
a
),
to_fp8
(
b
)
elif
dtype
==
torch
.
float16
:
a
,
b
=
to_fp16
(
a
),
to_fp16
(
b
)
elif
dtype
==
torch
.
bfloat16
:
a
,
b
=
to_bf16
(
a
),
to_bf16
(
b
)
else
:
raise
ValueError
(
"unsupported dtype"
)
b_compressed
,
e
=
ops
.
cutlass_sparse_compress
(
b
.
t
())
# Compressed B, Metadata, Original A, B
return
b_compressed
,
e
,
a
,
b
def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
                        m: int, n: int, k: int) -> \
                        Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor],
                              Iterable[torch.Tensor], Iterable[torch.Tensor]]:
    """Create ``num_tensors`` independent sets of sparse GEMM operands.

    Args:
        num_tensors: Number of operand sets to generate.
        dtype: Dtype forwarded to ``make_rand_sparse_tensors``.
        m, n, k: GEMM problem dimensions.

    Returns:
        Four lists ``(BComps, Es, As, Bs)`` holding, per generated set, the
        compressed ``b``, its compression metadata, ``a`` and ``b``. All four
        lists are empty when no set was generated.
    """
    ABs = []
    for _ in range(num_tensors):
        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
        if b_comp is not None:
            # Keep the set that was just generated; generating a second one
            # here would double the work and discard the validated result.
            ABs.append((b_comp, e, a, b))

    if not ABs:
        # zip(*[]) yields nothing, so unpacking it into four names would fail.
        return [], [], [], []

    BComps, Es, As, Bs = zip(*ABs)
    return list(BComps), list(Es), list(As), list(Bs)
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
View file @
96ae75ad
...
...
@@ -8,6 +8,7 @@ from typing import Callable, Iterable, List, Tuple
import
torch
import
torch.utils.benchmark
as
TBenchmark
from
torch.utils.benchmark
import
Measurement
as
TMeasurement
from
utils
import
make_rand_tensors
from
weight_shapes
import
WEIGHT_SHAPES
from
vllm
import
_custom_ops
as
ops
...
...
@@ -17,31 +18,6 @@ DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES
=
[
1
,
16
,
32
,
64
,
128
,
256
,
512
]
DEFAULT_TP_SIZES
=
[
1
]
# helpers
def
to_fp8
(
tensor
:
torch
.
Tensor
)
->
torch
.
Tensor
:
finfo
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
return
torch
.
round
(
tensor
.
clamp
(
min
=
finfo
.
min
,
max
=
finfo
.
max
)).
to
(
dtype
=
torch
.
float8_e4m3fn
)
def
to_int8
(
tensor
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
torch
.
round
(
tensor
.
clamp
(
min
=-
128
,
max
=
127
)).
to
(
dtype
=
torch
.
int8
)
def
make_rand_tensors
(
dtype
:
torch
.
dtype
,
m
:
int
,
n
:
int
,
k
:
int
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
a
=
torch
.
randn
((
m
,
k
),
device
=
'cuda'
)
*
5
b
=
torch
.
randn
((
n
,
k
),
device
=
'cuda'
).
t
()
*
5
if
dtype
==
torch
.
int8
:
return
to_int8
(
a
),
to_int8
(
b
)
if
dtype
==
torch
.
float8_e4m3fn
:
return
to_fp8
(
a
),
to_fp8
(
b
)
raise
ValueError
(
"unsupported dtype"
)
# bench
def
bench_fn
(
label
:
str
,
sub_label
:
str
,
description
:
str
,
fn
:
Callable
,
*
args
,
...
...
benchmarks/cutlass_benchmarks/weight_shapes.py
View file @
96ae75ad
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
View file @
96ae75ad
...
...
@@ -10,7 +10,8 @@ set -ex
kill_gpu_processes
()
{
# kill all processes on GPU.
pkill
-f
pt_main_thread
pgrep pt_main_thread | xargs
-r
kill
-9
pgrep python3 | xargs
-r
kill
-9
sleep
10
# remove vllm config file
...
...
@@ -54,7 +55,7 @@ benchmark() {
CUDA_VISIBLE_DEVICES
=
0 python3
\
-m
vllm.entrypoints.openai.api_server
\
--model
meta-llama/Meta-Llama-3.1-8B-Instruct
\
--model
$model
\
--port
8100
\
--max-model-len
10000
\
--gpu-memory-utilization
0.6
\
...
...
@@ -64,7 +65,7 @@ benchmark() {
CUDA_VISIBLE_DEVICES
=
1 python3
\
-m
vllm.entrypoints.openai.api_server
\
--model
meta-llama/Meta-Llama-3.1-8B-Instruct
\
--model
$model
\
--port
8200
\
--max-model-len
10000
\
--gpu-memory-utilization
0.6
\
...
...
@@ -87,7 +88,7 @@ benchmark() {
--port
8100
\
--save-result
\
--result-dir
$results_folder
\
--result-filename
disagg_prefill_
2x
tp
4
.json
\
--result-filename
disagg_prefill_tp
1
.json
\
--request-rate
"inf"
...
...
@@ -105,7 +106,7 @@ benchmark() {
--port
8200
\
--save-result
\
--result-dir
$results_folder
\
--result-filename
disagg_prefill_
2xtp4
.json
\
--result-filename
disagg_prefill_
tp1_overhead
.json
\
--request-rate
"
$qps
"
kill_gpu_processes
...
...
@@ -118,7 +119,7 @@ main() {
(
which jq
)
||
(
apt-get
-y
install
jq
)
(
which socat
)
||
(
apt-get
-y
install
socat
)
pip
install
quart httpx
pip
install
quart httpx
datasets
cd
"
$(
dirname
"
$0
"
)
"
...
...
benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
View file @
96ae75ad
#!/bin/bash
# Requirement:
8x H100
GPUs.
# Requirement:
2x
GPUs.
# Model:
neuralmagic
/Meta-Llama-3
-70
B-Instruct
-FP8-KV
# Query:
2048
input tokens,
11
output tokens, QPS
4
,
5
00 requests
# Resource:
8
x
H100
# Model:
meta-llama
/Meta-Llama-3
.1-8
B-Instruct
# Query:
1024
input tokens,
6
output tokens, QPS
2/4/6/8
,
1
00 requests
# Resource:
2
x
GPU
# Approaches:
# 1. Chunked prefill: 1 vllm instance with tp=8
# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
# Prefilling instance: max_output_token=1
...
...
@@ -114,7 +113,6 @@ benchmark() {
--request-rate
"
$qps
"
sleep
2
}
...
...
@@ -123,8 +121,9 @@ main() {
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which jq
)
||
(
apt-get
-y
install
jq
)
(
which socat
)
||
(
apt-get
-y
install
socat
)
(
which lsof
)
||
(
apt-get
-y
install
lsof
)
pip
install
quart httpx matplotlib aiohttp
pip
install
quart httpx matplotlib aiohttp
datasets
cd
"
$(
dirname
"
$0
"
)
"
...
...
Prev
1
2
3
4
5
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment