Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2216a4e5
"tests/vscode:/vscode.git/clone" did not exist on "8a297115e2367d463b781adb86b55ac740594cf6"
Commit
2216a4e5
authored
Oct 23, 2024
by
zhuwenwen
Browse files
Merge remote-tracking branch 'mirror/main'
parents
ad385667
51c24c97
Changes
239
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
252 additions
and
356 deletions
+252
-356
.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
...s/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
+11
-0
.buildkite/lm-eval-harness/configs/models-small.txt
.buildkite/lm-eval-harness/configs/models-small.txt
+1
-1
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+17
-10
.github/workflows/actionlint.yml
.github/workflows/actionlint.yml
+1
-0
.github/workflows/add_label_automerge.yml
.github/workflows/add_label_automerge.yml
+1
-1
.github/workflows/clang-format.yml
.github/workflows/clang-format.yml
+3
-3
.github/workflows/matchers/mypy.json
.github/workflows/matchers/mypy.json
+16
-0
.github/workflows/matchers/ruff.json
.github/workflows/matchers/ruff.json
+17
-0
.github/workflows/mypy.yaml
.github/workflows/mypy.yaml
+4
-3
.github/workflows/publish.yml
.github/workflows/publish.yml
+6
-6
.github/workflows/reminder_comment.yml
.github/workflows/reminder_comment.yml
+1
-1
.github/workflows/ruff.yml
.github/workflows/ruff.yml
+4
-3
.github/workflows/yapf.yml
.github/workflows/yapf.yml
+2
-2
CMakeLists.txt
CMakeLists.txt
+8
-24
Dockerfile.openvino
Dockerfile.openvino
+4
-4
Dockerfile.ppc64le
Dockerfile.ppc64le
+1
-1
benchmarks/benchmark_latency.py
benchmarks/benchmark_latency.py
+7
-148
benchmarks/benchmark_prefix_caching.py
benchmarks/benchmark_prefix_caching.py
+7
-17
benchmarks/benchmark_prioritization.py
benchmarks/benchmark_prioritization.py
+9
-125
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+132
-7
No files found.
.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
0 → 100644
View file @
2216a4e5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name
:
"
neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks
:
-
name
:
"
gsm8k"
metrics
:
-
name
:
"
exact_match,strict-match"
value
:
0.356
-
name
:
"
exact_match,flexible-extract"
value
:
0.358
limit
:
1000
num_fewshot
:
5
.buildkite/lm-eval-harness/configs/models-small.txt
View file @
2216a4e5
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3
-8
B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3
.2-1
B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
...
...
.buildkite/test-pipeline.yaml
View file @
2216a4e5
...
@@ -230,14 +230,12 @@ steps:
...
@@ -230,14 +230,12 @@ steps:
commands
:
commands
:
-
pytest -v -s compile/test_basic_correctness.py
-
pytest -v -s compile/test_basic_correctness.py
# TODO: re-write in comparison tests, and fix symbolic shape
-
label
:
"
PyTorch
Fullgraph
Test"
# 18min
# for quantization ops.
source_file_dependencies
:
# - label: "PyTorch Fullgraph Test" # 18min
-
vllm/
# source_file_dependencies:
-
tests/compile
# - vllm/
commands
:
# - tests/compile
-
pytest -v -s compile/test_full_graph.py
# commands:
# - pytest -v -s compile/test_full_graph.py
-
label
:
Kernels Test %N
# 1h each
-
label
:
Kernels Test %N
# 1h each
mirror_hardwares
:
[
amd
]
mirror_hardwares
:
[
amd
]
...
@@ -312,13 +310,22 @@ steps:
...
@@ -312,13 +310,22 @@ steps:
-
pytest -v -s models/test_oot_registration.py
# it needs a clean process
-
pytest -v -s models/test_oot_registration.py
# it needs a clean process
-
pytest -v -s models/*.py --ignore=models/test_oot_registration.py
-
pytest -v -s models/*.py --ignore=models/test_oot_registration.py
-
label
:
Decoder-only Language Models Test
# 1h36
min
-
label
:
Decoder-only Language Models Test
(Standard)
# 35
min
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
source_file_dependencies
:
source_file_dependencies
:
-
vllm/
-
vllm/
-
tests/models/decoder_only/language
-
tests/models/decoder_only/language
commands
:
commands
:
-
pytest -v -s models/decoder_only/language
-
pytest -v -s models/decoder_only/language/test_models.py
-
pytest -v -s models/decoder_only/language/test_big_models.py
-
label
:
Decoder-only Language Models Test (Extended)
# 1h20min
nightly
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/decoder_only/language
commands
:
-
pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
-
label
:
Decoder-only Multi-Modal Models Test
# 1h31min
-
label
:
Decoder-only Multi-Modal Models Test
# 1h31min
#mirror_hardwares: [amd]
#mirror_hardwares: [amd]
...
...
.github/workflows/actionlint.yml
View file @
2216a4e5
...
@@ -34,4 +34,5 @@ jobs:
...
@@ -34,4 +34,5 @@ jobs:
-
name
:
"
Run
actionlint"
-
name
:
"
Run
actionlint"
run
:
|
run
:
|
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
tools/actionlint.sh -color
tools/actionlint.sh -color
.github/workflows/add_label_automerge.yml
View file @
2216a4e5
...
@@ -8,7 +8,7 @@ jobs:
...
@@ -8,7 +8,7 @@ jobs:
runs-on
:
ubuntu-latest
runs-on
:
ubuntu-latest
steps
:
steps
:
-
name
:
Add label
-
name
:
Add label
uses
:
actions/github-script@
v7
uses
:
actions/github-script@
60a0d83039c74a4aee543508d2ffcb1c3799cdea
# v7.0.1
with
:
with
:
script
:
|
script
:
|
github.rest.issues.addLabels({
github.rest.issues.addLabels({
...
...
.github/workflows/clang-format.yml
View file @
2216a4e5
...
@@ -17,9 +17,9 @@ jobs:
...
@@ -17,9 +17,9 @@ jobs:
matrix
:
matrix
:
python-version
:
[
"
3.11"
]
python-version
:
[
"
3.11"
]
steps
:
steps
:
-
uses
:
actions/checkout@
v4
-
uses
:
actions/checkout@
eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
# v4.2.1
-
name
:
Set up Python ${{ matrix.python-version }}
-
name
:
Set up Python ${{ matrix.python-version }}
uses
:
actions/setup-python@
v5
uses
:
actions/setup-python@
f677139bbe7f9c59b41e40162b753c062f5d49a3
# v5.2.0
with
:
with
:
python-version
:
${{ matrix.python-version }}
python-version
:
${{ matrix.python-version }}
-
name
:
Install dependencies
-
name
:
Install dependencies
...
@@ -38,4 +38,4 @@ jobs:
...
@@ -38,4 +38,4 @@ jobs:
)
)
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
| xargs clang-format --dry-run --Werror
| xargs clang-format --dry-run --Werror
\ No newline at end of file
.github/workflows/matchers/mypy.json
0 → 100644
View file @
2216a4e5
{
"problemMatcher"
:
[
{
"owner"
:
"mypy"
,
"pattern"
:
[
{
"regexp"
:
"^(.+):(
\\
d+):
\\
s(error|warning):
\\
s(.+)$"
,
"file"
:
1
,
"line"
:
2
,
"severity"
:
3
,
"message"
:
4
}
]
}
]
}
.github/workflows/matchers/ruff.json
0 → 100644
View file @
2216a4e5
{
"problemMatcher"
:
[
{
"owner"
:
"ruff"
,
"pattern"
:
[
{
"regexp"
:
"^(.+?):(
\\
d+):(
\\
d+): (
\\
w+): (.+)$"
,
"file"
:
1
,
"line"
:
2
,
"column"
:
3
,
"code"
:
4
,
"message"
:
5
}
]
}
]
}
.github/workflows/mypy.yaml
View file @
2216a4e5
...
@@ -17,9 +17,9 @@ jobs:
...
@@ -17,9 +17,9 @@ jobs:
matrix
:
matrix
:
python-version
:
[
"
3.8"
,
"
3.9"
,
"
3.10"
,
"
3.11"
,
"
3.12"
]
python-version
:
[
"
3.8"
,
"
3.9"
,
"
3.10"
,
"
3.11"
,
"
3.12"
]
steps
:
steps
:
-
uses
:
actions/checkout@
v4
-
uses
:
actions/checkout@
eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
# v4.2.1
-
name
:
Set up Python ${{ matrix.python-version }}
-
name
:
Set up Python ${{ matrix.python-version }}
uses
:
actions/setup-python@
v5
uses
:
actions/setup-python@
f677139bbe7f9c59b41e40162b753c062f5d49a3
# v5.2.0
with
:
with
:
python-version
:
${{ matrix.python-version }}
python-version
:
${{ matrix.python-version }}
-
name
:
Install dependencies
-
name
:
Install dependencies
...
@@ -32,4 +32,5 @@ jobs:
...
@@ -32,4 +32,5 @@ jobs:
pip install types-setuptools
pip install types-setuptools
-
name
:
Mypy
-
name
:
Mypy
run
:
|
run
:
|
tools/mypy.sh
echo "::add-matcher::.github/workflows/matchers/mypy.json"
tools/mypy.sh 1
.github/workflows/publish.yml
View file @
2216a4e5
...
@@ -21,7 +21,7 @@ jobs:
...
@@ -21,7 +21,7 @@ jobs:
upload_url
:
${{ steps.create_release.outputs.upload_url }}
upload_url
:
${{ steps.create_release.outputs.upload_url }}
steps
:
steps
:
-
name
:
Checkout
-
name
:
Checkout
uses
:
actions/checkout@
v4
uses
:
actions/checkout@
eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
# v4.2.1
-
name
:
Extract branch info
-
name
:
Extract branch info
shell
:
bash
shell
:
bash
...
@@ -30,7 +30,7 @@ jobs:
...
@@ -30,7 +30,7 @@ jobs:
-
name
:
Create Release
-
name
:
Create Release
id
:
create_release
id
:
create_release
uses
:
"
actions/github-script@
v7"
uses
:
actions/github-script@
60a0d83039c74a4aee543508d2ffcb1c3799cdea
# v7.0.1
env
:
env
:
RELEASE_TAG
:
${{ env.release_tag }}
RELEASE_TAG
:
${{ env.release_tag }}
with
:
with
:
...
@@ -54,10 +54,10 @@ jobs:
...
@@ -54,10 +54,10 @@ jobs:
steps
:
steps
:
-
name
:
Checkout
-
name
:
Checkout
uses
:
actions/checkout@
v4
uses
:
actions/checkout@
eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
# v4.2.1
-
name
:
Setup ccache
-
name
:
Setup ccache
uses
:
hendrikmuhs/ccache-action@v1.2
uses
:
hendrikmuhs/ccache-action@
ed74d11c0b343532753ecead8a951bb09bb34bc9
#
v1.2
.14
with
:
with
:
create-symlink
:
true
create-symlink
:
true
key
:
${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
key
:
${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
...
@@ -68,7 +68,7 @@ jobs:
...
@@ -68,7 +68,7 @@ jobs:
bash -x .github/workflows/scripts/env.sh
bash -x .github/workflows/scripts/env.sh
-
name
:
Set up Python
-
name
:
Set up Python
uses
:
actions/setup-python@
v5
uses
:
actions/setup-python@
f677139bbe7f9c59b41e40162b753c062f5d49a3
# v5.2.0
with
:
with
:
python-version
:
${{ matrix.python-version }}
python-version
:
${{ matrix.python-version }}
...
@@ -92,7 +92,7 @@ jobs:
...
@@ -92,7 +92,7 @@ jobs:
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
-
name
:
Upload Release Asset
-
name
:
Upload Release Asset
uses
:
actions/upload-release-asset@
v1
uses
:
actions/upload-release-asset@
e8f9f06c4b078e705bd2ea027f0926603fc9b4d5
# v1.0.2
env
:
env
:
GITHUB_TOKEN
:
${{ secrets.GITHUB_TOKEN }}
GITHUB_TOKEN
:
${{ secrets.GITHUB_TOKEN }}
with
:
with
:
...
...
.github/workflows/reminder_comment.yml
View file @
2216a4e5
...
@@ -8,7 +8,7 @@ jobs:
...
@@ -8,7 +8,7 @@ jobs:
runs-on
:
ubuntu-latest
runs-on
:
ubuntu-latest
steps
:
steps
:
-
name
:
Remind to run full CI on PR
-
name
:
Remind to run full CI on PR
uses
:
actions/github-script@
v7
uses
:
actions/github-script@
60a0d83039c74a4aee543508d2ffcb1c3799cdea
# v7.0.1
with
:
with
:
script
:
|
script
:
|
github.rest.issues.createComment({
github.rest.issues.createComment({
...
...
.github/workflows/ruff.yml
View file @
2216a4e5
...
@@ -17,9 +17,9 @@ jobs:
...
@@ -17,9 +17,9 @@ jobs:
matrix
:
matrix
:
python-version
:
[
"
3.8"
,
"
3.9"
,
"
3.10"
,
"
3.11"
,
"
3.12"
]
python-version
:
[
"
3.8"
,
"
3.9"
,
"
3.10"
,
"
3.11"
,
"
3.12"
]
steps
:
steps
:
-
uses
:
actions/checkout@
v4
-
uses
:
actions/checkout@
eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
# v4.2.1
-
name
:
Set up Python ${{ matrix.python-version }}
-
name
:
Set up Python ${{ matrix.python-version }}
uses
:
actions/setup-python@
v5
uses
:
actions/setup-python@
f677139bbe7f9c59b41e40162b753c062f5d49a3
# v5.2.0
with
:
with
:
python-version
:
${{ matrix.python-version }}
python-version
:
${{ matrix.python-version }}
-
name
:
Install dependencies
-
name
:
Install dependencies
...
@@ -28,7 +28,8 @@ jobs:
...
@@ -28,7 +28,8 @@ jobs:
pip install -r requirements-lint.txt
pip install -r requirements-lint.txt
-
name
:
Analysing the code with ruff
-
name
:
Analysing the code with ruff
run
:
|
run
:
|
ruff check .
echo "::add-matcher::.github/workflows/matchers/ruff.json"
ruff check --output-format github .
-
name
:
Spelling check with codespell
-
name
:
Spelling check with codespell
run
:
|
run
:
|
codespell --toml pyproject.toml
codespell --toml pyproject.toml
...
...
.github/workflows/yapf.yml
View file @
2216a4e5
...
@@ -16,9 +16,9 @@ jobs:
...
@@ -16,9 +16,9 @@ jobs:
matrix
:
matrix
:
python-version
:
[
"
3.8"
,
"
3.9"
,
"
3.10"
,
"
3.11"
,
"
3.12"
]
python-version
:
[
"
3.8"
,
"
3.9"
,
"
3.10"
,
"
3.11"
,
"
3.12"
]
steps
:
steps
:
-
uses
:
actions/checkout@
v4
-
uses
:
actions/checkout@
eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
# v4.2.1
-
name
:
Set up Python ${{ matrix.python-version }}
-
name
:
Set up Python ${{ matrix.python-version }}
uses
:
actions/setup-python@
v5
uses
:
actions/setup-python@
f677139bbe7f9c59b41e40162b753c062f5d49a3
# v5.2.0
with
:
with
:
python-version
:
${{ matrix.python-version }}
python-version
:
${{ matrix.python-version }}
-
name
:
Install dependencies
-
name
:
Install dependencies
...
...
CMakeLists.txt
View file @
2216a4e5
...
@@ -87,24 +87,6 @@ endif()
...
@@ -87,24 +87,6 @@ endif()
#
#
find_package
(
Torch REQUIRED
)
find_package
(
Torch REQUIRED
)
#
message
(
STATUS
"Enabling core extension."
)
# Define _core_C extension
# built for (almost) every target platform, (excludes TPU and Neuron)
set
(
VLLM_EXT_SRC
"csrc/core/torch_bindings.cpp"
)
define_gpu_extension_target
(
_core_C
DESTINATION vllm
LANGUAGE CXX
SOURCES
${
VLLM_EXT_SRC
}
COMPILE_FLAGS
${
CXX_COMPILE_FLAGS
}
USE_SABI 3
WITH_SOABI
)
#
#
# Forward the non-CUDA device extensions to external CMake scripts.
# Forward the non-CUDA device extensions to external CMake scripts.
#
#
...
@@ -191,12 +173,12 @@ endif()
...
@@ -191,12 +173,12 @@ endif()
#
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
# Each dependency that produces build artifacts should override its BINARY_DIR to avoid
# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/<dependency>.
#
#
include
(
FetchContent
)
include
(
FetchContent
)
get_filename_component
(
PROJECT_ROOT_DIR
"
${
CMAKE_CURRENT_SOURCE_DIR
}
"
ABSOLUTE
)
file
(
MAKE_DIRECTORY
${
FETCHCONTENT_BASE_DIR
}
)
# Ensure the directory exists
file
(
MAKE_DIRECTORY
"
${
FETCHCONTENT_BASE_DIR
}
"
)
set
(
FETCHCONTENT_BASE_DIR
"
${
PROJECT_ROOT_DIR
}
/.deps"
)
message
(
STATUS
"FetchContent base directory:
${
FETCHCONTENT_BASE_DIR
}
"
)
message
(
STATUS
"FetchContent base directory:
${
FETCHCONTENT_BASE_DIR
}
"
)
#
#
...
@@ -280,7 +262,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -280,7 +262,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message
(
STATUS
"Building Marlin kernels for archs:
${
MARLIN_ARCHS
}
"
)
message
(
STATUS
"Building Marlin kernels for archs:
${
MARLIN_ARCHS
}
"
)
else
()
else
()
message
(
STATUS
"Not building Marlin kernels as no compatible archs found"
message
(
STATUS
"Not building Marlin kernels as no compatible archs found"
"in CUDA target architectures"
)
"
in CUDA target architectures"
)
endif
()
endif
()
#
#
...
@@ -460,7 +442,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -460,7 +442,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message
(
STATUS
"Building Marlin MOE kernels for archs:
${
MARLIN_MOE_ARCHS
}
"
)
message
(
STATUS
"Building Marlin MOE kernels for archs:
${
MARLIN_MOE_ARCHS
}
"
)
else
()
else
()
message
(
STATUS
"Not building Marlin MOE kernels as no compatible archs found"
message
(
STATUS
"Not building Marlin MOE kernels as no compatible archs found"
"in CUDA target architectures"
)
"
in CUDA target architectures"
)
endif
()
endif
()
endif
()
endif
()
...
@@ -540,6 +522,8 @@ else()
...
@@ -540,6 +522,8 @@ else()
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
GIT_PROGRESS TRUE
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
)
]]
]]
endif
()
endif
()
...
...
Dockerfile.openvino
View file @
2216a4e5
...
@@ -15,11 +15,11 @@ RUN --mount=type=bind,source=.git,target=.git \
...
@@ -15,11 +15,11 @@ RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# install build requirements
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/
vllm/
requirements-build.txt
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
# build vLLM with OpenVINO backend
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
/vllm/
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
COPY examples/ /workspace/
vllm/
examples
COPY examples/ /workspace/examples
COPY benchmarks/ /workspace/
vllm/
benchmarks
COPY benchmarks/ /workspace/benchmarks
CMD ["/bin/bash"]
CMD ["/bin/bash"]
Dockerfile.ppc64le
View file @
2216a4e5
...
@@ -33,4 +33,4 @@ WORKDIR /workspace/
...
@@ -33,4 +33,4 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
ENTRYPOINT ["
/opt/conda/bin/
python3", "-m", "vllm.entrypoints.openai.api_server"]
benchmarks/benchmark_latency.py
View file @
2216a4e5
"""Benchmark the latency of processing a single batch of requests."""
"""Benchmark the latency of processing a single batch of requests."""
import
argparse
import
argparse
import
dataclasses
import
json
import
json
import
time
import
time
from
pathlib
import
Path
from
pathlib
import
Path
...
@@ -10,43 +11,19 @@ import torch
...
@@ -10,43 +11,19 @@ import torch
from
tqdm
import
tqdm
from
tqdm
import
tqdm
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.engine.arg_utils
import
DEVICE_OPTIONS
,
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.inputs
import
PromptType
from
vllm.inputs
import
PromptType
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
def
main
(
args
:
argparse
.
Namespace
):
def
main
(
args
:
argparse
.
Namespace
):
print
(
args
)
print
(
args
)
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
# NOTE(woosuk): If the request cannot be processed in a single batch,
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
# the engine will automatically process the request in multiple batches.
llm
=
LLM
(
llm
=
LLM
(
**
dataclasses
.
asdict
(
engine_args
))
model
=
args
.
model
,
speculative_model
=
args
.
speculative_model
,
num_speculative_tokens
=
args
.
num_speculative_tokens
,
speculative_draft_tensor_parallel_size
=
\
args
.
speculative_draft_tensor_parallel_size
,
tokenizer
=
args
.
tokenizer
,
quantization
=
args
.
quantization
,
tensor_parallel_size
=
args
.
tensor_parallel_size
,
trust_remote_code
=
args
.
trust_remote_code
,
dtype
=
args
.
dtype
,
max_model_len
=
args
.
max_model_len
,
enforce_eager
=
args
.
enforce_eager
,
kv_cache_dtype
=
args
.
kv_cache_dtype
,
quantization_param_path
=
args
.
quantization_param_path
,
device
=
args
.
device
,
ray_workers_use_nsight
=
args
.
ray_workers_use_nsight
,
enable_chunked_prefill
=
args
.
enable_chunked_prefill
,
download_dir
=
args
.
download_dir
,
block_size
=
args
.
block_size
,
gpu_memory_utilization
=
args
.
gpu_memory_utilization
,
load_format
=
args
.
load_format
,
distributed_executor_backend
=
args
.
distributed_executor_backend
,
otlp_traces_endpoint
=
args
.
otlp_traces_endpoint
,
enable_prefix_caching
=
args
.
enable_prefix_caching
,
)
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
n
=
args
.
n
,
n
=
args
.
n
,
...
@@ -125,19 +102,6 @@ if __name__ == '__main__':
...
@@ -125,19 +102,6 @@ if __name__ == '__main__':
parser
=
FlexibleArgumentParser
(
parser
=
FlexibleArgumentParser
(
description
=
'Benchmark the latency of processing a single batch of '
description
=
'Benchmark the latency of processing a single batch of '
'requests till completion.'
)
'requests till completion.'
)
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'facebook/opt-125m'
)
parser
.
add_argument
(
'--speculative-model'
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
'--num-speculative-tokens'
,
type
=
int
,
default
=
None
)
parser
.
add_argument
(
'--speculative-draft-tensor-parallel-size'
,
'-spec-draft-tp'
,
type
=
int
,
default
=
None
)
parser
.
add_argument
(
'--tokenizer'
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
'--quantization'
,
'-q'
,
choices
=
[
*
QUANTIZATION_METHODS
,
None
],
default
=
None
)
parser
.
add_argument
(
'--tensor-parallel-size'
,
'-tp'
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
'--input-len'
,
type
=
int
,
default
=
32
)
parser
.
add_argument
(
'--input-len'
,
type
=
int
,
default
=
32
)
parser
.
add_argument
(
'--output-len'
,
type
=
int
,
default
=
128
)
parser
.
add_argument
(
'--output-len'
,
type
=
int
,
default
=
128
)
parser
.
add_argument
(
'--batch-size'
,
type
=
int
,
default
=
8
)
parser
.
add_argument
(
'--batch-size'
,
type
=
int
,
default
=
8
)
...
@@ -154,45 +118,6 @@ if __name__ == '__main__':
...
@@ -154,45 +118,6 @@ if __name__ == '__main__':
type
=
int
,
type
=
int
,
default
=
30
,
default
=
30
,
help
=
'Number of iterations to run.'
)
help
=
'Number of iterations to run.'
)
parser
.
add_argument
(
'--trust-remote-code'
,
action
=
'store_true'
,
help
=
'trust remote code from huggingface'
)
parser
.
add_argument
(
'--max-model-len'
,
type
=
int
,
default
=
None
,
help
=
'Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.'
)
parser
.
add_argument
(
'--dtype'
,
type
=
str
,
default
=
'auto'
,
choices
=
[
'auto'
,
'half'
,
'float16'
,
'bfloat16'
,
'float'
,
'float32'
],
help
=
'data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.'
)
parser
.
add_argument
(
'--enforce-eager'
,
action
=
'store_true'
,
help
=
'enforce eager mode and disable CUDA graph'
)
parser
.
add_argument
(
'--kv-cache-dtype'
,
type
=
str
,
choices
=
[
'auto'
,
'fp8'
,
'fp8_e5m2'
,
'fp8_e4m3'
],
default
=
"auto"
,
help
=
'Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)'
)
parser
.
add_argument
(
'--quantization-param-path'
,
type
=
str
,
default
=
None
,
help
=
'Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--profile'
,
'--profile'
,
action
=
'store_true'
,
action
=
'store_true'
,
...
@@ -203,78 +128,12 @@ if __name__ == '__main__':
...
@@ -203,78 +128,12 @@ if __name__ == '__main__':
default
=
None
,
default
=
None
,
help
=
(
'path to save the pytorch profiler output. Can be visualized '
help
=
(
'path to save the pytorch profiler output. Can be visualized '
'with ui.perfetto.dev or Tensorboard.'
))
'with ui.perfetto.dev or Tensorboard.'
))
parser
.
add_argument
(
"--device"
,
type
=
str
,
default
=
"auto"
,
choices
=
DEVICE_OPTIONS
,
help
=
'device type for vLLM execution'
)
parser
.
add_argument
(
'--block-size'
,
type
=
int
,
default
=
16
,
help
=
'block size of key/value cache'
)
parser
.
add_argument
(
'--enable-chunked-prefill'
,
action
=
'store_true'
,
help
=
'If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens'
)
parser
.
add_argument
(
"--enable-prefix-caching"
,
action
=
'store_true'
,
help
=
"Enable automatic prefix caching"
)
parser
.
add_argument
(
"--ray-workers-use-nsight"
,
action
=
'store_true'
,
help
=
"If specified, use nsight to profile ray workers"
,
)
parser
.
add_argument
(
'--download-dir'
,
type
=
str
,
default
=
None
,
help
=
'directory to download and load the weights, '
'default to the default cache dir of huggingface'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--output-json'
,
'--output-json'
,
type
=
str
,
type
=
str
,
default
=
None
,
default
=
None
,
help
=
'Path to save the latency results in JSON format.'
)
help
=
'Path to save the latency results in JSON format.'
)
parser
.
add_argument
(
'--gpu-memory-utilization'
,
type
=
float
,
parser
=
EngineArgs
.
add_cli_args
(
parser
)
default
=
0.9
,
help
=
'the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.'
)
parser
.
add_argument
(
'--load-format'
,
type
=
str
,
default
=
EngineArgs
.
load_format
,
choices
=
[
'auto'
,
'pt'
,
'safetensors'
,
'npcache'
,
'dummy'
,
'tensorizer'
,
'bitsandbytes'
],
help
=
'The format of the model weights to load.
\n\n
'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.
\n
'
'* "pt" will load the weights in the pytorch bin format.
\n
'
'* "safetensors" will load the weights in the safetensors format.
\n
'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.
\n
'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.
\n
'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.
\n
'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.
\n
'
)
parser
.
add_argument
(
'--distributed-executor-backend'
,
choices
=
[
'ray'
,
'mp'
],
default
=
None
,
help
=
'Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.'
)
parser
.
add_argument
(
'--otlp-traces-endpoint'
,
type
=
str
,
default
=
None
,
help
=
'Target URL to which OpenTelemetry traces will be sent.'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
main
(
args
)
main
(
args
)
benchmarks/benchmark_prefix_caching.py
View file @
2216a4e5
...
@@ -25,6 +25,7 @@ ShareGPT example usage:
...
@@ -25,6 +25,7 @@ ShareGPT example usage:
--input-length-range 128:256
--input-length-range 128:256
"""
"""
import
dataclasses
import
json
import
json
import
random
import
random
import
time
import
time
...
@@ -33,6 +34,7 @@ from typing import List, Optional, Tuple
...
@@ -33,6 +34,7 @@ from typing import List, Optional, Tuple
from
transformers
import
PreTrainedTokenizerBase
from
transformers
import
PreTrainedTokenizerBase
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
try
:
try
:
...
@@ -129,12 +131,9 @@ def main(args):
...
@@ -129,12 +131,9 @@ def main(args):
filtered_datasets
=
[(
PROMPT
,
prompt_len
,
args
.
output_len
)
filtered_datasets
=
[(
PROMPT
,
prompt_len
,
args
.
output_len
)
]
*
args
.
num_prompts
]
*
args
.
num_prompts
llm
=
LLM
(
model
=
args
.
model
,
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
tokenizer_mode
=
'auto'
,
trust_remote_code
=
True
,
llm
=
LLM
(
**
dataclasses
.
asdict
(
engine_args
))
enforce_eager
=
True
,
tensor_parallel_size
=
args
.
tensor_parallel_size
,
enable_prefix_caching
=
args
.
enable_prefix_caching
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
args
.
output_len
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
args
.
output_len
)
...
@@ -162,18 +161,11 @@ if __name__ == "__main__":
...
@@ -162,18 +161,11 @@ if __name__ == "__main__":
parser
=
FlexibleArgumentParser
(
parser
=
FlexibleArgumentParser
(
description
=
description
=
'Benchmark the performance with or without automatic prefix caching.'
)
'Benchmark the performance with or without automatic prefix caching.'
)
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'baichuan-inc/Baichuan2-13B-Chat'
)
parser
.
add_argument
(
"--dataset-path"
,
parser
.
add_argument
(
"--dataset-path"
,
type
=
str
,
type
=
str
,
default
=
None
,
default
=
None
,
help
=
"Path to the dataset."
)
help
=
"Path to the dataset."
)
parser
.
add_argument
(
'--tensor-parallel-size'
,
'-tp'
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
'--output-len'
,
type
=
int
,
default
=
10
)
parser
.
add_argument
(
'--output-len'
,
type
=
int
,
default
=
10
)
parser
.
add_argument
(
'--enable-prefix-caching'
,
action
=
'store_true'
,
help
=
'enable prefix caching'
)
parser
.
add_argument
(
'--num-prompts'
,
parser
.
add_argument
(
'--num-prompts'
,
type
=
int
,
type
=
int
,
default
=
1
,
default
=
1
,
...
@@ -190,9 +182,7 @@ if __name__ == "__main__":
...
@@ -190,9 +182,7 @@ if __name__ == "__main__":
default
=
'128:256'
,
default
=
'128:256'
,
help
=
'Range of input lengths for sampling prompts,'
help
=
'Range of input lengths for sampling prompts,'
'specified as "min:max" (e.g., "128:256").'
)
'specified as "min:max" (e.g., "128:256").'
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
parser
=
EngineArgs
.
add_cli_args
(
parser
)
default
=
0
,
help
=
'Random seed for reproducibility'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
main
(
args
)
main
(
args
)
benchmarks/benchmark_prioritization.py
View file @
2216a4e5
"""Benchmark offline prioritization."""
"""Benchmark offline prioritization."""
import
argparse
import
argparse
import
dataclasses
import
json
import
json
import
random
import
random
import
time
import
time
...
@@ -7,7 +8,8 @@ from typing import List, Optional, Tuple
...
@@ -7,7 +8,8 @@ from typing import List, Optional, Tuple
from
transformers
import
AutoTokenizer
,
PreTrainedTokenizerBase
from
transformers
import
AutoTokenizer
,
PreTrainedTokenizerBase
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.utils
import
FlexibleArgumentParser
def
sample_requests
(
def
sample_requests
(
...
@@ -62,46 +64,11 @@ def sample_requests(
...
@@ -62,46 +64,11 @@ def sample_requests(
def
run_vllm
(
def
run_vllm
(
requests
:
List
[
Tuple
[
str
,
int
,
int
]],
requests
:
List
[
Tuple
[
str
,
int
,
int
]],
model
:
str
,
tokenizer
:
str
,
quantization
:
Optional
[
str
],
tensor_parallel_size
:
int
,
seed
:
int
,
n
:
int
,
n
:
int
,
trust_remote_code
:
bool
,
engine_args
:
EngineArgs
,
dtype
:
str
,
max_model_len
:
Optional
[
int
],
enforce_eager
:
bool
,
kv_cache_dtype
:
str
,
quantization_param_path
:
Optional
[
str
],
device
:
str
,
enable_prefix_caching
:
bool
,
enable_chunked_prefill
:
bool
,
max_num_batched_tokens
:
int
,
gpu_memory_utilization
:
float
=
0.9
,
download_dir
:
Optional
[
str
]
=
None
,
)
->
float
:
)
->
float
:
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
llm
=
LLM
(
llm
=
LLM
(
**
dataclasses
.
asdict
(
engine_args
))
model
=
model
,
tokenizer
=
tokenizer
,
quantization
=
quantization
,
tensor_parallel_size
=
tensor_parallel_size
,
seed
=
seed
,
trust_remote_code
=
trust_remote_code
,
dtype
=
dtype
,
max_model_len
=
max_model_len
,
gpu_memory_utilization
=
gpu_memory_utilization
,
enforce_eager
=
enforce_eager
,
kv_cache_dtype
=
kv_cache_dtype
,
quantization_param_path
=
quantization_param_path
,
device
=
device
,
enable_prefix_caching
=
enable_prefix_caching
,
download_dir
=
download_dir
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
disable_log_stats
=
False
,
)
# Add the requests to the engine.
# Add the requests to the engine.
prompts
=
[]
prompts
=
[]
...
@@ -142,16 +109,8 @@ def main(args: argparse.Namespace):
...
@@ -142,16 +109,8 @@ def main(args: argparse.Namespace):
args
.
output_len
)
args
.
output_len
)
if
args
.
backend
==
"vllm"
:
if
args
.
backend
==
"vllm"
:
elapsed_time
=
run_vllm
(
requests
,
args
.
model
,
args
.
tokenizer
,
elapsed_time
=
run_vllm
(
requests
,
args
.
n
,
args
.
quantization
,
args
.
tensor_parallel_size
,
EngineArgs
.
from_cli_args
(
args
))
args
.
seed
,
args
.
n
,
args
.
trust_remote_code
,
args
.
dtype
,
args
.
max_model_len
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
quantization_param_path
,
args
.
device
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
max_num_batched_tokens
,
args
.
gpu_memory_utilization
,
args
.
download_dir
)
else
:
else
:
raise
ValueError
(
f
"Unknown backend:
{
args
.
backend
}
"
)
raise
ValueError
(
f
"Unknown backend:
{
args
.
backend
}
"
)
total_num_tokens
=
sum
(
prompt_len
+
output_len
total_num_tokens
=
sum
(
prompt_len
+
output_len
...
@@ -173,7 +132,7 @@ def main(args: argparse.Namespace):
...
@@ -173,7 +132,7 @@ def main(args: argparse.Namespace):
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
description
=
"Benchmark the throughput."
)
parser
=
Flexible
ArgumentParser
(
description
=
"Benchmark the throughput."
)
parser
.
add_argument
(
"--backend"
,
parser
.
add_argument
(
"--backend"
,
type
=
str
,
type
=
str
,
choices
=
[
"vllm"
,
"hf"
,
"mii"
],
choices
=
[
"vllm"
,
"hf"
,
"mii"
],
...
@@ -191,13 +150,6 @@ if __name__ == "__main__":
...
@@ -191,13 +150,6 @@ if __name__ == "__main__":
default
=
None
,
default
=
None
,
help
=
"Output length for each request. Overrides the "
help
=
"Output length for each request. Overrides the "
"output length from the dataset."
)
"output length from the dataset."
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
"facebook/opt-125m"
)
parser
.
add_argument
(
"--tokenizer"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
'--quantization'
,
'-q'
,
choices
=
[
*
QUANTIZATION_METHODS
,
None
],
default
=
None
)
parser
.
add_argument
(
"--tensor-parallel-size"
,
"-tp"
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
"--n"
,
parser
.
add_argument
(
"--n"
,
type
=
int
,
type
=
int
,
default
=
1
,
default
=
1
,
...
@@ -206,81 +158,13 @@ if __name__ == "__main__":
...
@@ -206,81 +158,13 @@ if __name__ == "__main__":
type
=
int
,
type
=
int
,
default
=
200
,
default
=
200
,
help
=
"Number of prompts to process."
)
help
=
"Number of prompts to process."
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
0
)
parser
.
add_argument
(
'--trust-remote-code'
,
action
=
'store_true'
,
help
=
'trust remote code from huggingface'
)
parser
.
add_argument
(
'--max-model-len'
,
type
=
int
,
default
=
None
,
help
=
'Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.'
)
parser
.
add_argument
(
'--dtype'
,
type
=
str
,
default
=
'auto'
,
choices
=
[
'auto'
,
'half'
,
'float16'
,
'bfloat16'
,
'float'
,
'float32'
],
help
=
'data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.'
)
parser
.
add_argument
(
'--gpu-memory-utilization'
,
type
=
float
,
default
=
0.9
,
help
=
'the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.'
)
parser
.
add_argument
(
"--enforce-eager"
,
action
=
"store_true"
,
help
=
"enforce eager execution"
)
parser
.
add_argument
(
'--kv-cache-dtype'
,
type
=
str
,
choices
=
[
'auto'
,
'fp8'
,
'fp8_e5m2'
,
'fp8_e4m3'
],
default
=
"auto"
,
help
=
'Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)'
)
parser
.
add_argument
(
'--quantization-param-path'
,
type
=
str
,
default
=
None
,
help
=
'Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.'
)
parser
.
add_argument
(
"--device"
,
type
=
str
,
default
=
"cuda"
,
choices
=
[
"cuda"
,
"cpu"
],
help
=
'device type for vLLM execution, supporting CUDA and CPU.'
)
parser
.
add_argument
(
"--enable-prefix-caching"
,
action
=
'store_true'
,
help
=
"enable automatic prefix caching for vLLM backend."
)
parser
.
add_argument
(
"--enable-chunked-prefill"
,
action
=
'store_true'
,
help
=
"enable chunked prefill for vLLM backend."
)
parser
.
add_argument
(
'--max-num-batched-tokens'
,
type
=
int
,
default
=
None
,
help
=
'maximum number of batched tokens per '
'iteration'
)
parser
.
add_argument
(
'--download-dir'
,
type
=
str
,
default
=
None
,
help
=
'directory to download and load the weights, '
'default to the default cache dir of huggingface'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--output-json'
,
'--output-json'
,
type
=
str
,
type
=
str
,
default
=
None
,
default
=
None
,
help
=
'Path to save the throughput results in JSON format.'
)
help
=
'Path to save the throughput results in JSON format.'
)
parser
=
EngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
if
args
.
tokenizer
is
None
:
if
args
.
tokenizer
is
None
:
args
.
tokenizer
=
args
.
model
args
.
tokenizer
=
args
.
model
...
...
benchmarks/benchmark_serving.py
View file @
2216a4e5
...
@@ -53,6 +53,8 @@ try:
...
@@ -53,6 +53,8 @@ try:
except
ImportError
:
except
ImportError
:
from
argparse
import
ArgumentParser
as
FlexibleArgumentParser
from
argparse
import
ArgumentParser
as
FlexibleArgumentParser
MILLISECONDS_TO_SECONDS_CONVERSION
=
1000
@
dataclass
@
dataclass
class
BenchmarkMetrics
:
class
BenchmarkMetrics
:
...
@@ -60,6 +62,7 @@ class BenchmarkMetrics:
...
@@ -60,6 +62,7 @@ class BenchmarkMetrics:
total_input
:
int
total_input
:
int
total_output
:
int
total_output
:
int
request_throughput
:
float
request_throughput
:
float
request_goodput
:
float
output_throughput
:
float
output_throughput
:
float
total_token_throughput
:
float
total_token_throughput
:
float
mean_ttft_ms
:
float
mean_ttft_ms
:
float
...
@@ -202,6 +205,7 @@ def sample_hf_requests(
...
@@ -202,6 +205,7 @@ def sample_hf_requests(
dataset_split
:
str
,
dataset_split
:
str
,
num_requests
:
int
,
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
tokenizer
:
PreTrainedTokenizerBase
,
random_seed
:
int
,
fixed_output_len
:
Optional
[
int
]
=
None
,
fixed_output_len
:
Optional
[
int
]
=
None
,
)
->
List
[
Tuple
[
str
,
str
,
int
,
Optional
[
Dict
[
str
,
Collection
[
str
]]]]]:
)
->
List
[
Tuple
[
str
,
str
,
int
,
Optional
[
Dict
[
str
,
Collection
[
str
]]]]]:
dataset
=
load_dataset
(
dataset_path
,
dataset
=
load_dataset
(
dataset_path
,
...
@@ -210,8 +214,8 @@ def sample_hf_requests(
...
@@ -210,8 +214,8 @@ def sample_hf_requests(
streaming
=
True
)
streaming
=
True
)
assert
"conversations"
in
dataset
.
features
,
(
assert
"conversations"
in
dataset
.
features
,
(
"HF Dataset must have 'conversations' column."
)
"HF Dataset must have 'conversations' column."
)
filter
ed_dataset
=
dataset
.
shuffle
().
filter
(
filter
_func
=
lambda
x
:
len
(
x
[
"conversations"
])
>=
2
lambda
x
:
len
(
x
[
"conversations"
])
>=
2
)
filtered_dataset
=
dataset
.
shuffle
(
seed
=
random_seed
).
filter
(
filter_func
)
sampled_requests
:
List
[
Tuple
[
str
,
int
,
int
,
Dict
[
str
,
sampled_requests
:
List
[
Tuple
[
str
,
int
,
int
,
Dict
[
str
,
Collection
[
str
]]]]
=
[]
Collection
[
str
]]]]
=
[]
for
data
in
filtered_dataset
:
for
data
in
filtered_dataset
:
...
@@ -315,12 +319,15 @@ def calculate_metrics(
...
@@ -315,12 +319,15 @@ def calculate_metrics(
tokenizer
:
PreTrainedTokenizerBase
,
tokenizer
:
PreTrainedTokenizerBase
,
selected_percentile_metrics
:
List
[
str
],
selected_percentile_metrics
:
List
[
str
],
selected_percentiles
:
List
[
float
],
selected_percentiles
:
List
[
float
],
gootput_config_dict
:
Dict
[
str
,
float
],
)
->
Tuple
[
BenchmarkMetrics
,
List
[
int
]]:
)
->
Tuple
[
BenchmarkMetrics
,
List
[
int
]]:
actual_output_lens
:
List
[
int
]
=
[]
actual_output_lens
:
List
[
int
]
=
[]
total_input
=
0
total_input
=
0
completed
=
0
completed
=
0
good_completed
=
0
itls
:
List
[
float
]
=
[]
itls
:
List
[
float
]
=
[]
tpots
:
List
[
float
]
=
[]
tpots
:
List
[
float
]
=
[]
all_tpots
:
List
[
float
]
=
[]
ttfts
:
List
[
float
]
=
[]
ttfts
:
List
[
float
]
=
[]
e2els
:
List
[
float
]
=
[]
e2els
:
List
[
float
]
=
[]
for
i
in
range
(
len
(
outputs
)):
for
i
in
range
(
len
(
outputs
)):
...
@@ -334,9 +341,13 @@ def calculate_metrics(
...
@@ -334,9 +341,13 @@ def calculate_metrics(
add_special_tokens
=
False
).
input_ids
)
add_special_tokens
=
False
).
input_ids
)
actual_output_lens
.
append
(
output_len
)
actual_output_lens
.
append
(
output_len
)
total_input
+=
input_requests
[
i
][
1
]
total_input
+=
input_requests
[
i
][
1
]
tpot
=
0
if
output_len
>
1
:
if
output_len
>
1
:
tpots
.
append
(
tpot
=
(
outputs
[
i
].
latency
-
outputs
[
i
].
ttft
)
/
(
output_len
-
(
outputs
[
i
].
latency
-
outputs
[
i
].
ttft
)
/
(
output_len
-
1
))
1
)
tpots
.
append
(
tpot
)
# Note: if output_len <= 1, we regard tpot as 0 for goodput
all_tpots
.
append
(
tpot
)
itls
+=
outputs
[
i
].
itl
itls
+=
outputs
[
i
].
itl
ttfts
.
append
(
outputs
[
i
].
ttft
)
ttfts
.
append
(
outputs
[
i
].
ttft
)
e2els
.
append
(
outputs
[
i
].
latency
)
e2els
.
append
(
outputs
[
i
].
latency
)
...
@@ -344,6 +355,28 @@ def calculate_metrics(
...
@@ -344,6 +355,28 @@ def calculate_metrics(
else
:
else
:
actual_output_lens
.
append
(
0
)
actual_output_lens
.
append
(
0
)
if
gootput_config_dict
:
valid_metrics
=
[]
slo_values
=
[]
if
"ttft"
in
gootput_config_dict
:
valid_metrics
.
append
(
ttfts
)
slo_values
.
append
(
gootput_config_dict
[
"ttft"
]
/
MILLISECONDS_TO_SECONDS_CONVERSION
)
if
"tpot"
in
gootput_config_dict
:
valid_metrics
.
append
(
all_tpots
)
slo_values
.
append
(
gootput_config_dict
[
"tpot"
]
/
MILLISECONDS_TO_SECONDS_CONVERSION
)
if
"e2el"
in
gootput_config_dict
:
valid_metrics
.
append
(
e2els
)
slo_values
.
append
(
gootput_config_dict
[
"e2el"
]
/
MILLISECONDS_TO_SECONDS_CONVERSION
)
for
req_metric
in
zip
(
*
valid_metrics
):
is_good_req
=
all
([
s
>=
r
for
s
,
r
in
zip
(
slo_values
,
req_metric
)])
if
is_good_req
:
good_completed
+=
1
if
completed
==
0
:
if
completed
==
0
:
warnings
.
warn
(
warnings
.
warn
(
"All requests failed. This is likely due to a misconfiguration "
"All requests failed. This is likely due to a misconfiguration "
...
@@ -354,6 +387,7 @@ def calculate_metrics(
...
@@ -354,6 +387,7 @@ def calculate_metrics(
total_input
=
total_input
,
total_input
=
total_input
,
total_output
=
sum
(
actual_output_lens
),
total_output
=
sum
(
actual_output_lens
),
request_throughput
=
completed
/
dur_s
,
request_throughput
=
completed
/
dur_s
,
request_goodput
=
good_completed
/
dur_s
,
output_throughput
=
sum
(
actual_output_lens
)
/
dur_s
,
output_throughput
=
sum
(
actual_output_lens
)
/
dur_s
,
total_token_throughput
=
(
total_input
+
sum
(
actual_output_lens
))
/
dur_s
,
total_token_throughput
=
(
total_input
+
sum
(
actual_output_lens
))
/
dur_s
,
mean_ttft_ms
=
np
.
mean
(
ttfts
or
0
)
*
mean_ttft_ms
=
np
.
mean
(
ttfts
or
0
)
*
...
@@ -397,6 +431,8 @@ async def benchmark(
...
@@ -397,6 +431,8 @@ async def benchmark(
selected_percentile_metrics
:
List
[
str
],
selected_percentile_metrics
:
List
[
str
],
selected_percentiles
:
List
[
str
],
selected_percentiles
:
List
[
str
],
ignore_eos
:
bool
,
ignore_eos
:
bool
,
gootput_config_dict
:
Dict
[
str
,
float
],
max_concurrency
:
Optional
[
int
],
):
):
if
backend
in
ASYNC_REQUEST_FUNCS
:
if
backend
in
ASYNC_REQUEST_FUNCS
:
request_func
=
ASYNC_REQUEST_FUNCS
[
backend
]
request_func
=
ASYNC_REQUEST_FUNCS
[
backend
]
...
@@ -445,9 +481,25 @@ async def benchmark(
...
@@ -445,9 +481,25 @@ async def benchmark(
print
(
"Profiler started"
)
print
(
"Profiler started"
)
print
(
f
"Traffic request rate:
{
request_rate
}
"
)
print
(
f
"Traffic request rate:
{
request_rate
}
"
)
print
(
f
"Maximum request concurrency:
{
max_concurrency
}
"
)
pbar
=
None
if
disable_tqdm
else
tqdm
(
total
=
len
(
input_requests
))
pbar
=
None
if
disable_tqdm
else
tqdm
(
total
=
len
(
input_requests
))
# This can be used once the minimum Python version is 3.10 or higher,
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore
=
(
asyncio
.
Semaphore
(
max_concurrency
)
if
max_concurrency
else
None
)
async
def
limited_request_func
(
request_func_input
,
pbar
):
if
semaphore
is
None
:
return
await
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)
async
with
semaphore
:
return
await
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)
benchmark_start_time
=
time
.
perf_counter
()
benchmark_start_time
=
time
.
perf_counter
()
tasks
:
List
[
asyncio
.
Task
]
=
[]
tasks
:
List
[
asyncio
.
Task
]
=
[]
async
for
request
in
get_request
(
input_requests
,
request_rate
):
async
for
request
in
get_request
(
input_requests
,
request_rate
):
...
@@ -463,8 +515,8 @@ async def benchmark(
...
@@ -463,8 +515,8 @@ async def benchmark(
ignore_eos
=
ignore_eos
)
ignore_eos
=
ignore_eos
)
tasks
.
append
(
tasks
.
append
(
asyncio
.
create_task
(
asyncio
.
create_task
(
request_func
(
request_func_input
=
request_func_input
,
limited_
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)))
pbar
=
pbar
)))
outputs
:
List
[
RequestFuncOutput
]
=
await
asyncio
.
gather
(
*
tasks
)
outputs
:
List
[
RequestFuncOutput
]
=
await
asyncio
.
gather
(
*
tasks
)
if
profile
:
if
profile
:
...
@@ -494,6 +546,7 @@ async def benchmark(
...
@@ -494,6 +546,7 @@ async def benchmark(
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
selected_percentile_metrics
=
selected_percentile_metrics
,
selected_percentile_metrics
=
selected_percentile_metrics
,
selected_percentiles
=
selected_percentiles
,
selected_percentiles
=
selected_percentiles
,
gootput_config_dict
=
gootput_config_dict
,
)
)
print
(
"{s:{c}^{n}}"
.
format
(
s
=
' Serving Benchmark Result '
,
n
=
50
,
c
=
'='
))
print
(
"{s:{c}^{n}}"
.
format
(
s
=
' Serving Benchmark Result '
,
n
=
50
,
c
=
'='
))
...
@@ -505,6 +558,9 @@ async def benchmark(
...
@@ -505,6 +558,9 @@ async def benchmark(
metrics
.
total_output
))
metrics
.
total_output
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request throughput (req/s):"
,
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request throughput (req/s):"
,
metrics
.
request_throughput
))
metrics
.
request_throughput
))
if
gootput_config_dict
:
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request goodput (req/s):"
,
metrics
.
request_goodput
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Output token throughput (tok/s):"
,
print
(
"{:<40} {:<10.2f}"
.
format
(
"Output token throughput (tok/s):"
,
metrics
.
output_throughput
))
metrics
.
output_throughput
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Total Token throughput (tok/s):"
,
print
(
"{:<40} {:<10.2f}"
.
format
(
"Total Token throughput (tok/s):"
,
...
@@ -516,6 +572,8 @@ async def benchmark(
...
@@ -516,6 +572,8 @@ async def benchmark(
"total_input_tokens"
:
metrics
.
total_input
,
"total_input_tokens"
:
metrics
.
total_input
,
"total_output_tokens"
:
metrics
.
total_output
,
"total_output_tokens"
:
metrics
.
total_output
,
"request_throughput"
:
metrics
.
request_throughput
,
"request_throughput"
:
metrics
.
request_throughput
,
"request_goodput:"
:
metrics
.
request_goodput
if
gootput_config_dict
else
None
,
"output_throughput"
:
metrics
.
output_throughput
,
"output_throughput"
:
metrics
.
output_throughput
,
"total_token_throughput"
:
metrics
.
total_token_throughput
,
"total_token_throughput"
:
metrics
.
total_token_throughput
,
"input_lens"
:
[
output
.
prompt_len
for
output
in
outputs
],
"input_lens"
:
[
output
.
prompt_len
for
output
in
outputs
],
...
@@ -569,6 +627,41 @@ async def benchmark(
...
@@ -569,6 +627,41 @@ async def benchmark(
return
result
return
result
def check_goodput_args(args):
    """Parse and validate the goodput service level objectives (SLOs).

    Args:
        args: Parsed command-line namespace. Only ``args.goodput`` is read;
            when it is falsy no objectives are configured.

    Returns:
        A dict mapping each metric name to its SLO value, as produced by
        ``parse_goodput``. Empty when ``args.goodput`` is falsy.

    Raises:
        ValueError: If a metric name is not one of "ttft", "tpot", "e2el",
            or if a value is negative or NaN.
    """
    VALID_NAMES = ["ttft", "tpot", "e2el"]

    # Nothing requested on the command line: no objectives to validate.
    if not args.goodput:
        return {}

    goodput_config_dict = parse_goodput(args.goodput)
    for slo_name, slo_val in goodput_config_dict.items():
        if slo_name not in VALID_NAMES:
            raise ValueError(
                f"Invalid metric name found, {slo_name}: {slo_val}. "
                "The service level objective name should be one of "
                f"{VALID_NAMES}. ")
        # Written as "not >= 0" rather than "< 0" so that NaN, which
        # compares False against everything, is rejected as well.
        if not slo_val >= 0:
            raise ValueError(
                f"Invalid value found, {slo_name}: {slo_val}. "
                "The service level objective value should be "
                "non-negative.")
    return goodput_config_dict
def parse_goodput(slo_pairs):
    """Parse goodput objectives given as "KEY:VALUE" strings.

    Args:
        slo_pairs: Iterable of strings of the form "KEY:VALUE", where KEY is
            a metric name and VALUE is a number in milliseconds.

    Returns:
        A dict mapping each metric name to its value as a float. If a name
        is given more than once, the last value wins.

    Raises:
        argparse.ArgumentTypeError: If a pair does not contain exactly one
            ":" separator, or its value is not a number (NaN included).
    """
    import math

    goodput_config_dict = {}
    try:
        for slo_pair in slo_pairs:
            # Tuple unpacking raises ValueError unless there is exactly
            # one ":" in the pair.
            slo_name, slo_val = slo_pair.split(":")
            value = float(slo_val)
            # float() accepts "nan", but NaN compares False against every
            # bound, so a later range check cannot catch it. Report it
            # here as a malformed value instead.
            if math.isnan(value):
                raise ValueError(f"SLO value for {slo_name!r} is NaN")
            goodput_config_dict[slo_name] = value
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
            "Specify service level objectives for goodput as \"KEY:VALUE\" "
            "pairs, where the key is a metric name, and the value is a "
            "number in milliseconds.") from err
    return goodput_config_dict
def
main
(
args
:
argparse
.
Namespace
):
def
main
(
args
:
argparse
.
Namespace
):
print
(
args
)
print
(
args
)
random
.
seed
(
args
.
seed
)
random
.
seed
(
args
.
seed
)
...
@@ -646,6 +739,7 @@ def main(args: argparse.Namespace):
...
@@ -646,6 +739,7 @@ def main(args: argparse.Namespace):
dataset_split
=
args
.
hf_split
,
dataset_split
=
args
.
hf_split
,
num_requests
=
args
.
num_prompts
,
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
random_seed
=
args
.
seed
,
fixed_output_len
=
args
.
hf_output_len
,
fixed_output_len
=
args
.
hf_output_len
,
)
)
...
@@ -662,6 +756,8 @@ def main(args: argparse.Namespace):
...
@@ -662,6 +756,8 @@ def main(args: argparse.Namespace):
else
:
else
:
raise
ValueError
(
f
"Unknown dataset:
{
args
.
dataset_name
}
"
)
raise
ValueError
(
f
"Unknown dataset:
{
args
.
dataset_name
}
"
)
gootput_config_dict
=
check_goodput_args
(
args
)
benchmark_result
=
asyncio
.
run
(
benchmark_result
=
asyncio
.
run
(
benchmark
(
benchmark
(
backend
=
backend
,
backend
=
backend
,
...
@@ -680,6 +776,8 @@ def main(args: argparse.Namespace):
...
@@ -680,6 +776,8 @@ def main(args: argparse.Namespace):
float
(
p
)
for
p
in
args
.
metric_percentiles
.
split
(
","
)
float
(
p
)
for
p
in
args
.
metric_percentiles
.
split
(
","
)
],
],
ignore_eos
=
args
.
ignore_eos
,
ignore_eos
=
args
.
ignore_eos
,
gootput_config_dict
=
gootput_config_dict
,
max_concurrency
=
args
.
max_concurrency
,
))
))
# Save config and results to json
# Save config and results to json
...
@@ -709,13 +807,16 @@ def main(args: argparse.Namespace):
...
@@ -709,13 +807,16 @@ def main(args: argparse.Namespace):
# Traffic
# Traffic
result_json
[
"request_rate"
]
=
(
result_json
[
"request_rate"
]
=
(
args
.
request_rate
if
args
.
request_rate
<
float
(
"inf"
)
else
"inf"
)
args
.
request_rate
if
args
.
request_rate
<
float
(
"inf"
)
else
"inf"
)
result_json
[
"max_concurrency"
]
=
args
.
max_concurrency
# Merge with benchmark result
# Merge with benchmark result
result_json
=
{
**
result_json
,
**
benchmark_result
}
result_json
=
{
**
result_json
,
**
benchmark_result
}
# Save to file
# Save to file
base_model_id
=
model_id
.
split
(
"/"
)[
-
1
]
base_model_id
=
model_id
.
split
(
"/"
)[
-
1
]
file_name
=
f
"
{
backend
}
-
{
args
.
request_rate
}
qps-
{
base_model_id
}
-
{
current_dt
}
.json"
#noqa
max_concurrency_str
=
(
f
"-concurrency
{
args
.
max_concurrency
}
"
if
args
.
max_concurrency
is
not
None
else
""
)
file_name
=
f
"
{
backend
}
-
{
args
.
request_rate
}
qps
{
max_concurrency_str
}
-
{
base_model_id
}
-
{
current_dt
}
.json"
#noqa
if
args
.
result_filename
:
if
args
.
result_filename
:
file_name
=
args
.
result_filename
file_name
=
args
.
result_filename
if
args
.
result_dir
:
if
args
.
result_dir
:
...
@@ -766,6 +867,19 @@ if __name__ == "__main__":
...
@@ -766,6 +867,19 @@ if __name__ == "__main__":
default
=
None
,
default
=
None
,
help
=
"Path to the sharegpt/sonnet dataset. "
help
=
"Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset."
)
"Or the huggingface dataset ID if using HF dataset."
)
parser
.
add_argument
(
"--max-concurrency"
,
type
=
int
,
default
=
None
,
help
=
"Maximum number of concurrent requests. This can be used "
"to help simulate an environment where a higher level component "
"is enforcing a maximum number of concurrent requests. While the "
"--request-rate argument controls the rate at which requests are "
"initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, "
"if the server is not processing requests fast enough to keep up."
)
parser
.
add_argument
(
parser
.
add_argument
(
"--model"
,
"--model"
,
type
=
str
,
type
=
str
,
...
@@ -879,6 +993,17 @@ if __name__ == "__main__":
...
@@ -879,6 +993,17 @@ if __name__ == "__main__":
"Default value is
\"
99
\"
. "
"Default value is
\"
99
\"
. "
"Use
\"
--percentile-metrics
\"
to select metrics."
,
"Use
\"
--percentile-metrics
\"
to select metrics."
,
)
)
parser
.
add_argument
(
"--goodput"
,
nargs
=
"+"
,
required
=
False
,
help
=
"Specify service level objectives for goodput as
\"
KEY:VALUE
\"
"
"pairs, where the key is a metric name, and the value is in "
"milliseconds. Multiple
\"
KEY:VALUE
\"
pairs can be provided, "
"separated by spaces. Allowed request level metric names are "
"
\"
ttft
\"
,
\"
tpot
\"
,
\"
e2el
\"
. For more context on the definition of "
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve"
)
# group for dataset specific arguments
# group for dataset specific arguments
sonnet_group
=
parser
.
add_argument_group
(
"sonnet dataset options"
)
sonnet_group
=
parser
.
add_argument_group
(
"sonnet dataset options"
)
...
...
Prev
1
2
3
4
5
…
12
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment