Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1591c68f
"ssh:/git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "0f8a98612d2879807039a5c679b7da42500df8d6"
Commit
1591c68f
authored
May 25, 2024
by
zhuwenwen
Browse files
merge v0.4.2
parents
09bcf00b
c7f2cf2b
Changes
265
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
273 additions
and
100 deletions
+273
-100
.buildkite/check-wheel-size.py
.buildkite/check-wheel-size.py
+36
-0
.buildkite/run-amd-test.sh
.buildkite/run-amd-test.sh
+35
-29
.buildkite/run-benchmarks.sh
.buildkite/run-benchmarks.sh
+5
-0
.buildkite/run-neuron-test.sh
.buildkite/run-neuron-test.sh
+14
-0
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+21
-2
.buildkite/test-template.j2
.buildkite/test-template.j2
+20
-5
.github/ISSUE_TEMPLATE/750-RFC.yml
.github/ISSUE_TEMPLATE/750-RFC.yml
+49
-0
.github/workflows/mypy.yaml
.github/workflows/mypy.yaml
+5
-6
.github/workflows/publish.yml
.github/workflows/publish.yml
+3
-1
.github/workflows/scripts/create_release.js
.github/workflows/scripts/create_release.js
+1
-1
CMakeLists.txt
CMakeLists.txt
+3
-1
Dockerfile
Dockerfile
+13
-5
Dockerfile.rocm
Dockerfile.rocm
+7
-6
MANIFEST.in
MANIFEST.in
+3
-0
README.md
README.md
+1
-1
README_ORIGIN.md
README_ORIGIN.md
+2
-1
benchmarks/benchmark_prefix_caching.py
benchmarks/benchmark_prefix_caching.py
+13
-3
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+28
-22
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+13
-16
benchmarks/kernels/benchmark_aqlm.py
benchmarks/kernels/benchmark_aqlm.py
+1
-1
No files found.
.buildkite/check-wheel-size.py
0 → 100644
View file @
1591c68f
import
os
import
zipfile
MAX_SIZE_MB
=
100
def print_top_10_largest_files(zip_file):
    """Print the ten largest members of a zip archive, biggest first.

    One line is printed per member in the form
    ``<name>: <size> MBs uncompressed.``, where ``<size>`` is the member's
    uncompressed ``file_size`` divided by 1024 * 1024.

    Args:
        zip_file: Anything ``zipfile.ZipFile`` can open for reading (a path
            or a binary file-like object).
    """
    with zipfile.ZipFile(zip_file, 'r') as archive:
        # Read the ZipInfo records directly instead of mapping every name
        # back through getinfo(): a name lookup yields a single record per
        # name, so duplicate member names would all report the same size.
        largest = sorted(archive.infolist(),
                         key=lambda info: info.file_size,
                         reverse=True)[:10]
        for info in largest:
            print(f"{info.filename}: {info.file_size/(1024*1024)} "
                  "MBs uncompressed.")
def check_wheel_size(directory):
    """Check every ``.whl`` file under *directory* against ``MAX_SIZE_MB``.

    The tree is walked with ``os.walk``. For each wheel, its on-disk size in
    units of 1024 * 1024 bytes is compared with the module-level
    ``MAX_SIZE_MB``. The walk stops at the first wheel that exceeds the
    limit: the violation is reported, the wheel's largest members are listed
    via ``print_top_10_largest_files``, and 1 is returned.

    Args:
        directory: Root directory to search for wheels.

    Returns:
        1 if an oversized wheel was found, otherwise 0 (including the case
        where no wheel exists at all, which is reported as a warning).
    """
    wheels_seen = 0
    for root, _, files in os.walk(directory):
        for f in files:
            if not f.endswith(".whl"):
                continue
            wheels_seen += 1
            wheel_path = os.path.join(root, f)
            wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
            if wheel_size_mb > MAX_SIZE_MB:
                print(
                    f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
                    f"compare to the allowed size ({MAX_SIZE_MB} MB).")
                print_top_10_largest_files(wheel_path)
                return 1
            print(f"Wheel {wheel_path} is within the allowed size "
                  f"({wheel_size_mb} MB).")
    if not wheels_seen:
        # Make a vacuous pass visible: with nothing to measure the check
        # still returns 0, but the log should say that no wheel was examined.
        print(f"Warning: no .whl files found under {directory}; "
              "nothing was size-checked.")
    return 0
if __name__ == "__main__":
    import sys

    # Exit with a usage hint instead of an IndexError traceback when the
    # directory argument is missing; sys.exit() with a string writes it to
    # stderr and exits with status 1.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <directory-containing-wheels>")
    # check_wheel_size() returns 0 or 1, which becomes the exit status.
    sys.exit(check_wheel_size(sys.argv[1]))
.buildkite/run-amd-test.sh
View file @
1591c68f
# This script build the ROCm docker image and run the API server inside the container.
# This script builds the ROCm docker image and runs tests inside it.
# It serves a sanity check for compilation and basic model usage.
set
-ex
set
-ex
# Print ROCm version
# Print ROCm version
echo
"--- ROCm info"
rocminfo
rocminfo
# Try building the docker image
echo
"--- Resetting GPUs"
docker build
-t
rocm
-f
Dockerfile.rocm
.
# Setup cleanup
echo
"reset"
>
/opt/amdgpu/etc/gpu_state
remove_docker_container
()
{
docker
rm
-f
rocm
||
true
;
}
trap
remove_docker_container EXIT
while
true
;
do
remove_docker_container
sleep
3
if
grep
-q
clean /opt/amdgpu/etc/gpu_state
;
then
# Run the image
echo
"GPUs state is
\"
clean
\"
"
docker run
--device
/dev/kfd
--device
/dev/dri
--network
host
--name
rocm rocm python3
-m
vllm.entrypoints.api_server &
break
# Wait for the server to start
wait_for_server_to_start
()
{
timeout
=
300
counter
=
0
while
[
"
$(
curl
-s
-o
/dev/null
-w
''
%
{
http_code
}
''
localhost:8000/health
)
"
!=
"200"
]
;
do
sleep
1
counter
=
$((
counter
+
1
))
if
[
$counter
-ge
$timeout
]
;
then
echo
"Timeout after
$timeout
seconds"
break
fi
fi
done
done
echo
"--- Building container"
sha
=
$(
git rev-parse
--short
HEAD
)
container_name
=
rocm_
${
sha
}
docker build
\
-t
${
container_name
}
\
-f
Dockerfile.rocm
\
--progress
plain
\
.
remove_docker_container
()
{
docker
rm
-f
${
container_name
}
||
docker image
rm
-f
${
container_name
}
||
true
}
}
wait_for_server_to_start
trap
remove_docker_container EXIT
echo
"--- Running container"
docker run
\
--device
/dev/kfd
--device
/dev/dri
\
--network
host
\
--rm
\
-e
HF_TOKEN
\
--name
${
container_name
}
\
${
container_name
}
\
/bin/bash
-c
$(
echo
$1
|
sed
"s/^'//"
|
sed
"s/'
$/
/"
)
# Test a simple prompt
curl
-X
POST
-H
"Content-Type: application/json"
\
localhost:8000/generate
\
-d
'{"prompt": "San Francisco is a"}'
.buildkite/run-benchmarks.sh
View file @
1591c68f
...
@@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md
...
@@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md
tail
-n
20 benchmark_serving.txt
>>
benchmark_results.md
# last 20 lines
tail
-n
20 benchmark_serving.txt
>>
benchmark_results.md
# last 20 lines
echo
'```'
>>
benchmark_results.md
echo
'```'
>>
benchmark_results.md
# if the agent binary is not found, skip uploading the results, exit 0
if
[
!
-f
/workspace/buildkite-agent
]
;
then
exit
0
fi
# upload the results to buildkite
# upload the results to buildkite
/workspace/buildkite-agent annotate
--style
"info"
--context
"benchmark-results"
< benchmark_results.md
/workspace/buildkite-agent annotate
--style
"info"
--context
"benchmark-results"
< benchmark_results.md
...
...
.buildkite/run-neuron-test.sh
View file @
1591c68f
...
@@ -4,6 +4,20 @@ set -e
...
@@ -4,6 +4,20 @@ set -e
# Try building the docker image
# Try building the docker image
aws ecr get-login-password
--region
us-west-2 | docker login
--username
AWS
--password-stdin
763104351884.dkr.ecr.us-west-2.amazonaws.com
aws ecr get-login-password
--region
us-west-2 | docker login
--username
AWS
--password-stdin
763104351884.dkr.ecr.us-west-2.amazonaws.com
# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if
[
-f
/tmp/neuron-docker-build-timestamp
]
;
then
last_build
=
$(
cat
/tmp/neuron-docker-build-timestamp
)
current_time
=
$(
date
+%s
)
if
[
$((
current_time
-
last_build
))
-gt
86400
]
;
then
docker system prune
-f
echo
$current_time
>
/tmp/neuron-docker-build-timestamp
fi
else
echo
$(
date
+%s
)
>
/tmp/neuron-docker-build-timestamp
fi
docker build
-t
neuron
-f
Dockerfile.neuron
.
docker build
-t
neuron
-f
Dockerfile.neuron
.
# Setup cleanup
# Setup cleanup
...
...
.buildkite/test-pipeline.yaml
View file @
1591c68f
...
@@ -17,27 +17,38 @@ steps:
...
@@ -17,27 +17,38 @@ steps:
-
VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
-
VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
-
VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-
VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-
VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-
VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
label
:
Core Test
-
label
:
Core Test
mirror_hardwares
:
[
amd
]
command
:
pytest -v -s core
command
:
pytest -v -s core
-
label
:
Distributed Comm Ops Test
-
label
:
Distributed Comm Ops Test
command
:
pytest -v -s test_comm_ops.py
command
:
pytest -v -s test_comm_ops.py
working_dir
:
"
/vllm-workspace/tests/distributed"
working_dir
:
"
/vllm-workspace/tests/distributed"
num_gpus
:
2
# only support 1 or 2 for now.
num_gpus
:
2
-
label
:
Distributed Tests
-
label
:
Distributed Tests
working_dir
:
"
/vllm-workspace/tests/distributed"
working_dir
:
"
/vllm-workspace/tests/distributed"
num_gpus
:
2
# only support 1 or 2 for now.
num_gpus
:
2
# only support 1 or 2 for now.
mirror_hardwares
:
[
amd
]
commands
:
commands
:
-
pytest -v -s test_pynccl.py
-
pytest -v -s test_pynccl_library.py
-
pytest -v -s test_pynccl_library.py
-
TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
-
label
:
Distributed Tests (Multiple Groups)
working_dir
:
"
/vllm-workspace/tests/distributed"
num_gpus
:
4
commands
:
-
pytest -v -s test_pynccl.py
-
label
:
Engine Test
-
label
:
Engine Test
mirror_hardwares
:
[
amd
]
command
:
pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
command
:
pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
-
label
:
Entrypoints Test
-
label
:
Entrypoints Test
...
@@ -48,6 +59,7 @@ steps:
...
@@ -48,6 +59,7 @@ steps:
-
label
:
Examples Test
-
label
:
Examples Test
working_dir
:
"
/vllm-workspace/examples"
working_dir
:
"
/vllm-workspace/examples"
mirror_hardwares
:
[
amd
]
commands
:
commands
:
# install aws cli for llava_example.py
# install aws cli for llava_example.py
-
pip install awscli
-
pip install awscli
...
@@ -61,16 +73,19 @@ steps:
...
@@ -61,16 +73,19 @@ steps:
parallelism
:
4
parallelism
:
4
-
label
:
Models Test
-
label
:
Models Test
mirror_hardwares
:
[
amd
]
commands
:
commands
:
-
bash ../.buildkite/download-images.sh
-
bash ../.buildkite/download-images.sh
-
pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
-
pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
-
label
:
Llava Test
-
label
:
Llava Test
mirror_hardwares
:
[
amd
]
commands
:
commands
:
-
bash ../.buildkite/download-images.sh
-
bash ../.buildkite/download-images.sh
-
pytest -v -s models/test_llava.py
-
pytest -v -s models/test_llava.py
-
label
:
Prefix Caching Test
-
label
:
Prefix Caching Test
mirror_hardwares
:
[
amd
]
commands
:
commands
:
-
pytest -v -s prefix_caching
-
pytest -v -s prefix_caching
...
@@ -78,12 +93,15 @@ steps:
...
@@ -78,12 +93,15 @@ steps:
command
:
pytest -v -s samplers
command
:
pytest -v -s samplers
-
label
:
LogitsProcessor Test
-
label
:
LogitsProcessor Test
mirror_hardwares
:
[
amd
]
command
:
pytest -v -s test_logits_processor.py
command
:
pytest -v -s test_logits_processor.py
-
label
:
Worker Test
-
label
:
Worker Test
mirror_hardwares
:
[
amd
]
command
:
pytest -v -s worker
command
:
pytest -v -s worker
-
label
:
Speculative decoding tests
-
label
:
Speculative decoding tests
mirror_hardwares
:
[
amd
]
command
:
pytest -v -s spec_decode
command
:
pytest -v -s spec_decode
-
label
:
LoRA Test %N
-
label
:
LoRA Test %N
...
@@ -101,6 +119,7 @@ steps:
...
@@ -101,6 +119,7 @@ steps:
-
label
:
Benchmarks
-
label
:
Benchmarks
working_dir
:
"
/vllm-workspace/.buildkite"
working_dir
:
"
/vllm-workspace/.buildkite"
mirror_hardwares
:
[
amd
]
commands
:
commands
:
-
pip install aiohttp
-
pip install aiohttp
-
bash run-benchmarks.sh
-
bash run-benchmarks.sh
...
...
.buildkite/test-template.j2
View file @
1591c68f
...
@@ -16,17 +16,29 @@ steps:
...
@@ -16,17 +16,29 @@ steps:
limit: 5
limit: 5
- wait
- wait
- label: "AMD Test"
- group: "AMD Tests"
agents:
depends_on: ~
queue: amd
steps:
command: bash .buildkite/run-amd-test.sh
{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
env:
DOCKER_BUILDKIT: "1"
{% endif %}
{% endfor %}
- label: "Neuron Test"
- label: "Neuron Test"
depends_on: ~
agents:
agents:
queue: neuron
queue: neuron
command: bash .buildkite/run-neuron-test.sh
command: bash .buildkite/run-neuron-test.sh
soft_fail: true
- label: "CPU Test"
- label: "Intel Test"
depends_on: ~
command: bash .buildkite/run-cpu-test.sh
command: bash .buildkite/run-cpu-test.sh
{% for step in steps %}
{% for step in steps %}
...
@@ -44,6 +56,9 @@ steps:
...
@@ -44,6 +56,9 @@ steps:
plugins:
plugins:
- kubernetes:
- kubernetes:
podSpec:
podSpec:
{% if step.num_gpus %}
priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
{% endif %}
volumes:
volumes:
- name: dshm
- name: dshm
emptyDir:
emptyDir:
...
...
.github/ISSUE_TEMPLATE/750-RFC.yml
0 → 100644
View file @
1591c68f
name
:
💬 Request for comments (RFC).
description
:
Ask for feedback on major architectural changes or design choices.
title
:
"
[RFC]:
"
labels
:
[
"
RFC"
]
body
:
-
type
:
markdown
attributes
:
value
:
>
#### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
-
type
:
textarea
attributes
:
label
:
Motivation.
description
:
>
The motivation of the RFC.
validations
:
required
:
true
-
type
:
textarea
attributes
:
label
:
Proposed Change.
description
:
>
The proposed change of the RFC.
validations
:
required
:
true
-
type
:
textarea
attributes
:
label
:
Feedback Period.
description
:
>
The feedback period of the RFC. Usually at least one week.
validations
:
required
:
false
-
type
:
textarea
attributes
:
label
:
CC List.
description
:
>
The list of people you want to CC.
validations
:
required
:
false
-
type
:
textarea
attributes
:
label
:
Any Other Things.
description
:
>
Any other things you would like to mention.
validations
:
required
:
false
-
type
:
markdown
attributes
:
value
:
>
Thanks for contributing 🎉!
.github/workflows/mypy.yaml
View file @
1591c68f
...
@@ -33,8 +33,7 @@ jobs:
...
@@ -33,8 +33,7 @@ jobs:
-
name
:
Mypy
-
name
:
Mypy
run
:
|
run
:
|
mypy vllm/attention --config-file pyproject.toml
mypy vllm/attention --config-file pyproject.toml
# TODO(sang): Fix nested dir
mypy vllm/core --config-file pyproject.toml
mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
...
@@ -44,8 +43,8 @@ jobs:
...
@@ -44,8 +43,8 @@ jobs:
mypy vllm/engine --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml
# TODO(sang): Fix nested dir
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/
model_executor/*.py
--config-file pyproject.toml
mypy vllm/
lora
--config-file pyproject.toml
# TODO(sang): Fix nested dir
mypy vllm/logging --config-file pyproject.toml
#
mypy vllm/
lora/*.py
--config-file pyproject.toml
mypy vllm/
model_executor
--config-file pyproject.toml
.github/workflows/publish.yml
View file @
1591c68f
...
@@ -49,7 +49,7 @@ jobs:
...
@@ -49,7 +49,7 @@ jobs:
matrix
:
matrix
:
os
:
[
'
ubuntu-20.04'
]
os
:
[
'
ubuntu-20.04'
]
python-version
:
[
'
3.8'
,
'
3.9'
,
'
3.10'
,
'
3.11'
]
python-version
:
[
'
3.8'
,
'
3.9'
,
'
3.10'
,
'
3.11'
]
pytorch-version
:
[
'
2.
2.1
'
]
# Must be the most recent version that meets requirements-cuda.txt.
pytorch-version
:
[
'
2.
3.0
'
]
# Must be the most recent version that meets requirements-cuda.txt.
cuda-version
:
[
'
11.8'
,
'
12.1'
]
cuda-version
:
[
'
11.8'
,
'
12.1'
]
steps
:
steps
:
...
@@ -79,6 +79,8 @@ jobs:
...
@@ -79,6 +79,8 @@ jobs:
-
name
:
Build wheel
-
name
:
Build wheel
shell
:
bash
shell
:
bash
env
:
CMAKE_BUILD_TYPE
:
Release
# do not compile with debug symbol to reduce wheel size
run
:
|
run
:
|
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename)
wheel_name=$(ls dist/*whl | xargs -n 1 basename)
...
...
.github/workflows/scripts/create_release.js
View file @
1591c68f
...
@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
...
@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
generate_release_notes
:
true
,
generate_release_notes
:
true
,
name
:
process
.
env
.
RELEASE_TAG
,
name
:
process
.
env
.
RELEASE_TAG
,
owner
:
context
.
repo
.
owner
,
owner
:
context
.
repo
.
owner
,
prerelease
:
fals
e
,
prerelease
:
tru
e
,
repo
:
context
.
repo
.
repo
,
repo
:
context
.
repo
.
repo
,
tag_name
:
process
.
env
.
RELEASE_TAG
,
tag_name
:
process
.
env
.
RELEASE_TAG
,
});
});
...
...
CMakeLists.txt
View file @
1591c68f
...
@@ -32,7 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
...
@@ -32,7 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
# versions are derived from Dockerfile.rocm
#
#
set
(
TORCH_SUPPORTED_VERSION_CUDA
"2.
2.1
"
)
set
(
TORCH_SUPPORTED_VERSION_CUDA
"2.
3.0
"
)
set
(
TORCH_SUPPORTED_VERSION_ROCM_5X
"2.0.1"
)
set
(
TORCH_SUPPORTED_VERSION_ROCM_5X
"2.0.1"
)
set
(
TORCH_SUPPORTED_VERSION_ROCM_6X
"2.1.1"
)
set
(
TORCH_SUPPORTED_VERSION_ROCM_6X
"2.1.1"
)
...
@@ -178,6 +178,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -178,6 +178,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/marlin_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/custom_all_reduce.cu"
)
"csrc/custom_all_reduce.cu"
)
endif
()
endif
()
...
...
Dockerfile
View file @
1591c68f
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
# to run the OpenAI compatible server.
# Please update any changes made here to
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png
#################### BASE BUILD IMAGE ####################
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
# prepare basic build environment
FROM
nvidia/cuda:12.
1.0
-devel-ubuntu22.04 AS dev
FROM
nvidia/cuda:12.
4.1
-devel-ubuntu22.04 AS dev
RUN
apt-get update
-y
\
RUN
apt-get update
-y
\
&&
apt-get
install
-y
python3-pip git
&&
apt-get
install
-y
python3-pip git
...
@@ -12,7 +16,7 @@ RUN apt-get update -y \
...
@@ -12,7 +16,7 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# this won't be needed for future versions of this docker image
# or future versions of triton.
# or future versions of triton.
RUN
ldconfig /usr/local/cuda-12.
1
/compat/
RUN
ldconfig /usr/local/cuda-12.
4
/compat/
WORKDIR
/workspace
WORKDIR
/workspace
...
@@ -71,6 +75,10 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
...
@@ -71,6 +75,10 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
--mount
=
type
=
cache,target
=
/root/.cache/pip
\
python3 setup.py bdist_wheel
--dist-dir
=
dist
python3 setup.py bdist_wheel
--dist-dir
=
dist
# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY
.buildkite/check-wheel-size.py check-wheel-size.py
RUN
python3 check-wheel-size.py dist
# the `vllm_nccl` package must be installed from source distribution
# the `vllm_nccl` package must be installed from source distribution
# pip is too smart to store a wheel in the cache, and other CI jobs
# pip is too smart to store a wheel in the cache, and other CI jobs
# will directly use the wheel from the cache, which is not what we want.
# will directly use the wheel from the cache, which is not what we want.
...
@@ -85,7 +93,7 @@ FROM dev as flash-attn-builder
...
@@ -85,7 +93,7 @@ FROM dev as flash-attn-builder
ARG
max_jobs=2
ARG
max_jobs=2
ENV
MAX_JOBS=${max_jobs}
ENV
MAX_JOBS=${max_jobs}
# flash attention version
# flash attention version
ARG
flash_attn_version=v2.5.
6
ARG
flash_attn_version=v2.5.
8
ENV
FLASH_ATTN_VERSION=${flash_attn_version}
ENV
FLASH_ATTN_VERSION=${flash_attn_version}
WORKDIR
/usr/src/flash-attention-v2
WORKDIR
/usr/src/flash-attention-v2
...
@@ -98,7 +106,7 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
...
@@ -98,7 +106,7 @@ RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
#################### vLLM installation IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
# image with vLLM installed
FROM
nvidia/cuda:12.
1.0
-base-ubuntu22.04 AS vllm-base
FROM
nvidia/cuda:12.
4.1
-base-ubuntu22.04 AS vllm-base
WORKDIR
/vllm-workspace
WORKDIR
/vllm-workspace
RUN
apt-get update
-y
\
RUN
apt-get update
-y
\
...
@@ -108,7 +116,7 @@ RUN apt-get update -y \
...
@@ -108,7 +116,7 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# this won't be needed for future versions of this docker image
# or future versions of triton.
# or future versions of triton.
RUN
ldconfig /usr/local/cuda-12.
1
/compat/
RUN
ldconfig /usr/local/cuda-12.
4
/compat/
# install vllm wheel first, so that torch etc will be installed
# install vllm wheel first, so that torch etc will be installed
RUN
--mount
=
type
=
bind
,from
=
build,src
=
/workspace/dist,target
=
/vllm-workspace/dist
\
RUN
--mount
=
type
=
bind
,from
=
build,src
=
/workspace/dist,target
=
/vllm-workspace/dist
\
...
...
Dockerfile.rocm
View file @
1591c68f
...
@@ -46,7 +46,7 @@ RUN apt-get update && apt-get install -y \
...
@@ -46,7 +46,7 @@ RUN apt-get update && apt-get install -y \
### Mount Point ###
### Mount Point ###
# When launching the container, mount the code directory to /app
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/
app
ARG APP_MOUNT=/
vllm-workspace
VOLUME [ ${APP_MOUNT} ]
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
WORKDIR ${APP_MOUNT}
...
@@ -89,15 +89,16 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
...
@@ -89,15 +89,16 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
&& cd ../..; \
&& cd ../..; \
fi
fi
COPY ./ /app/vllm
WORKDIR /vllm-workspace
COPY . .
RUN python3 -m pip install --upgrade pip numba
RUN python3 -m pip install --upgrade pip numba
RUN cd /app \
RUN --mount=type=cache,target=/root/.cache/pip \
&& cd vllm \
pip install -U -r requirements-rocm.txt \
&& pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \
&& python3 setup.py install \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cd ..
&& cd ..
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --upgrade pip
...
...
MANIFEST.in
View file @
1591c68f
include LICENSE
include LICENSE
include requirements-common.txt
include requirements-common.txt
include requirements-cuda.txt
include requirements-cuda.txt
include requirements-rocm.txt
include requirements-neuron.txt
include requirements-cpu.txt
include CMakeLists.txt
include CMakeLists.txt
recursive-include cmake *
recursive-include cmake *
...
...
README.md
View file @
1591c68f
...
@@ -74,7 +74,7 @@ python3 setup.py install
...
@@ -74,7 +74,7 @@ python3 setup.py install
+
若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
+
若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证
## 验证
-
python -c "import vllm; print(vllm.
\_\_
version__)",版本号与官方版本同步,查询该软件的版本号,例如0.4.
1
;
-
python -c "import vllm; print(vllm.
\_\_
version__)",版本号与官方版本同步,查询该软件的版本号,例如0.4.
2
;
## Known Issue
## Known Issue
-
无
-
无
...
...
README_ORIGIN.md
View file @
1591c68f
...
@@ -74,10 +74,11 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
...
@@ -74,10 +74,11 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
-
Mistral (
`mistralai/Mistral-7B-v0.1`
,
`mistralai/Mistral-7B-Instruct-v0.1`
, etc.)
-
Mistral (
`mistralai/Mistral-7B-v0.1`
,
`mistralai/Mistral-7B-Instruct-v0.1`
, etc.)
-
Mixtral (
`mistralai/Mixtral-8x7B-v0.1`
,
`mistralai/Mixtral-8x7B-Instruct-v0.1`
,
`mistral-community/Mixtral-8x22B-v0.1`
, etc.)
-
Mixtral (
`mistralai/Mixtral-8x7B-v0.1`
,
`mistralai/Mixtral-8x7B-Instruct-v0.1`
,
`mistral-community/Mixtral-8x22B-v0.1`
, etc.)
-
MPT (
`mosaicml/mpt-7b`
,
`mosaicml/mpt-30b`
, etc.)
-
MPT (
`mosaicml/mpt-7b`
,
`mosaicml/mpt-30b`
, etc.)
-
OLMo (
`allenai/OLMo-1B`
,
`allenai/OLMo-7B`
, etc.)
-
OLMo (
`allenai/OLMo-1B
-hf
`
,
`allenai/OLMo-7B
-hf
`
, etc.)
-
OPT (
`facebook/opt-66b`
,
`facebook/opt-iml-max-30b`
, etc.)
-
OPT (
`facebook/opt-66b`
,
`facebook/opt-iml-max-30b`
, etc.)
-
Orion (
`OrionStarAI/Orion-14B-Base`
,
`OrionStarAI/Orion-14B-Chat`
, etc.)
-
Orion (
`OrionStarAI/Orion-14B-Base`
,
`OrionStarAI/Orion-14B-Chat`
, etc.)
-
Phi (
`microsoft/phi-1_5`
,
`microsoft/phi-2`
, etc.)
-
Phi (
`microsoft/phi-1_5`
,
`microsoft/phi-2`
, etc.)
-
Phi-3 (
`microsoft/Phi-3-mini-4k-instruct`
,
`microsoft/Phi-3-mini-128k-instruct`
, etc.)
-
Qwen (
`Qwen/Qwen-7B`
,
`Qwen/Qwen-7B-Chat`
, etc.)
-
Qwen (
`Qwen/Qwen-7B`
,
`Qwen/Qwen-7B-Chat`
, etc.)
-
Qwen2 (
`Qwen/Qwen1.5-7B`
,
`Qwen/Qwen1.5-7B-Chat`
, etc.)
-
Qwen2 (
`Qwen/Qwen1.5-7B`
,
`Qwen/Qwen1.5-7B-Chat`
, etc.)
-
Qwen2MoE (
`Qwen/Qwen1.5-MoE-A2.7B`
,
`Qwen/Qwen1.5-MoE-A2.7B-Chat`
, etc.)
-
Qwen2MoE (
`Qwen/Qwen1.5-MoE-A2.7B`
,
`Qwen/Qwen1.5-MoE-A2.7B-Chat`
, etc.)
...
...
benchmarks/benchmark_prefix_caching.py
View file @
1591c68f
...
@@ -16,20 +16,22 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
...
@@ -16,20 +16,22 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
def
main
(
args
):
def
main
(
args
):
llm
=
LLM
(
model
=
"baichuan-inc/Baichuan2-13B-Chat"
,
llm
=
LLM
(
model
=
args
.
model
,
tokenizer_mode
=
'auto'
,
tokenizer_mode
=
'auto'
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
enforce_eager
=
True
,
enforce_eager
=
True
,
use_v2_block_manager
=
args
.
use_v2_block_manager
,
tensor_parallel_size
=
args
.
tensor_parallel_size
,
enable_prefix_caching
=
args
.
enable_prefix_caching
)
enable_prefix_caching
=
args
.
enable_prefix_caching
)
num_prompts
=
100
num_prompts
=
100
prompts
=
[
PROMPT
]
*
num_prompts
prompts
=
[
PROMPT
]
*
num_prompts
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
100
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
args
.
output_len
)
print
(
"------warm up------"
)
print
(
"------warm up------"
)
test_prefix
(
test_prefix
(
llm
=
llm
,
llm
=
llm
,
prompts
=
prompts
[:
1
]
,
prompts
=
prompts
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
)
)
...
@@ -45,8 +47,16 @@ if __name__ == "__main__":
...
@@ -45,8 +47,16 @@ if __name__ == "__main__":
parser
=
argparse
.
ArgumentParser
(
parser
=
argparse
.
ArgumentParser
(
description
=
'Benchmark the performance with or without automatic '
description
=
'Benchmark the performance with or without automatic '
'prefix caching.'
)
'prefix caching.'
)
parser
.
add_argument
(
'--model'
,
type
=
str
,
default
=
'baichuan-inc/Baichuan2-13B-Chat'
)
parser
.
add_argument
(
'--tensor-parallel-size'
,
'-tp'
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
'--output-len'
,
type
=
int
,
default
=
10
)
parser
.
add_argument
(
'--enable-prefix-caching'
,
parser
.
add_argument
(
'--enable-prefix-caching'
,
action
=
'store_true'
,
action
=
'store_true'
,
help
=
'enable prefix caching'
)
help
=
'enable prefix caching'
)
parser
.
add_argument
(
'--use-v2-block-manager'
,
action
=
'store_true'
,
help
=
'Use BlockSpaceMangerV2'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
main
(
args
)
main
(
args
)
benchmarks/benchmark_serving.py
View file @
1591c68f
...
@@ -27,7 +27,7 @@ import time
...
@@ -27,7 +27,7 @@ import time
import
warnings
import
warnings
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
datetime
import
datetime
from
datetime
import
datetime
from
typing
import
AsyncGenerator
,
List
,
Tuple
from
typing
import
AsyncGenerator
,
List
,
Optional
,
Tuple
import
numpy
as
np
import
numpy
as
np
from
backend_request_func
import
(
ASYNC_REQUEST_FUNCS
,
RequestFuncInput
,
from
backend_request_func
import
(
ASYNC_REQUEST_FUNCS
,
RequestFuncInput
,
...
@@ -58,7 +58,11 @@ def sample_sharegpt_requests(
...
@@ -58,7 +58,11 @@ def sample_sharegpt_requests(
dataset_path
:
str
,
dataset_path
:
str
,
num_requests
:
int
,
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
tokenizer
:
PreTrainedTokenizerBase
,
fixed_output_len
:
Optional
[
int
]
=
None
,
)
->
List
[
Tuple
[
str
,
int
,
int
]]:
)
->
List
[
Tuple
[
str
,
int
,
int
]]:
if
fixed_output_len
is
not
None
and
fixed_output_len
<
4
:
raise
ValueError
(
"output_len too small"
)
# Load the dataset.
# Load the dataset.
with
open
(
dataset_path
)
as
f
:
with
open
(
dataset_path
)
as
f
:
dataset
=
json
.
load
(
f
)
dataset
=
json
.
load
(
f
)
...
@@ -68,38 +72,32 @@ def sample_sharegpt_requests(
...
@@ -68,38 +72,32 @@ def sample_sharegpt_requests(
dataset
=
[(
data
[
"conversations"
][
0
][
"value"
],
dataset
=
[(
data
[
"conversations"
][
0
][
"value"
],
data
[
"conversations"
][
1
][
"value"
])
for
data
in
dataset
]
data
[
"conversations"
][
1
][
"value"
])
for
data
in
dataset
]
# some of these will be filtered out, so sample more than we need
# Shuffle the dataset.
sampled_indices
=
random
.
sample
(
range
(
len
(
dataset
)),
random
.
shuffle
(
dataset
)
int
(
num_requests
*
1.2
))
dataset
=
[
dataset
[
i
]
for
i
in
sampled_indices
]
# Tokenize the prompts and completions.
prompts
=
[
prompt
for
prompt
,
_
in
dataset
]
prompt_token_ids
=
tokenizer
(
prompts
).
input_ids
completions
=
[
completion
for
_
,
completion
in
dataset
]
completion_token_ids
=
tokenizer
(
completions
).
input_ids
tokenized_dataset
=
[]
for
i
in
range
(
len
(
dataset
)):
output_len
=
len
(
completion_token_ids
[
i
])
tokenized_dataset
.
append
((
prompts
[
i
],
prompt_token_ids
[
i
],
output_len
))
# Filter out
too long sequences.
# Filter out
sequences that are too long or too short
filtered_dataset
:
List
[
Tuple
[
str
,
int
,
int
]]
=
[]
filtered_dataset
:
List
[
Tuple
[
str
,
int
,
int
]]
=
[]
for
prompt
,
prompt_token_ids
,
output_len
in
tokenized_dataset
:
for
i
in
range
(
len
(
dataset
)):
if
len
(
filtered_dataset
)
==
num_requests
:
break
# Tokenize the prompts and completions.
prompt
=
dataset
[
i
][
0
]
prompt_token_ids
=
tokenizer
(
prompt
).
input_ids
completion
=
dataset
[
i
][
1
]
completion_token_ids
=
tokenizer
(
completion
).
input_ids
prompt_len
=
len
(
prompt_token_ids
)
prompt_len
=
len
(
prompt_token_ids
)
output_len
=
len
(
completion_token_ids
)
if
fixed_output_len
is
None
else
fixed_output_len
if
prompt_len
<
4
or
output_len
<
4
:
if
prompt_len
<
4
or
output_len
<
4
:
# Prune too short sequences.
# Prune too short sequences.
# This is because TGI causes errors when the input or output length
# is too short.
continue
continue
if
prompt_len
>
1024
or
prompt_len
+
output_len
>
2048
:
if
prompt_len
>
1024
or
prompt_len
+
output_len
>
2048
:
# Prune too long sequences.
# Prune too long sequences.
continue
continue
filtered_dataset
.
append
((
prompt
,
prompt_len
,
output_len
))
filtered_dataset
.
append
((
prompt
,
prompt_len
,
output_len
))
# Sample the requests.
return
filtered_dataset
sampled_requests
=
random
.
sample
(
filtered_dataset
,
num_requests
)
return
sampled_requests
def
sample_sonnet_requests
(
def
sample_sonnet_requests
(
...
@@ -361,6 +359,7 @@ def main(args: argparse.Namespace):
...
@@ -361,6 +359,7 @@ def main(args: argparse.Namespace):
dataset_path
=
args
.
dataset
,
dataset_path
=
args
.
dataset
,
num_requests
=
args
.
num_prompts
,
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
fixed_output_len
=
args
.
sharegpt_output_len
,
)
)
elif
args
.
dataset_name
==
"sharegpt"
:
elif
args
.
dataset_name
==
"sharegpt"
:
...
@@ -368,6 +367,7 @@ def main(args: argparse.Namespace):
...
@@ -368,6 +367,7 @@ def main(args: argparse.Namespace):
dataset_path
=
args
.
dataset_path
,
dataset_path
=
args
.
dataset_path
,
num_requests
=
args
.
num_prompts
,
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
fixed_output_len
=
args
.
sharegpt_output_len
,
)
)
elif
args
.
dataset_name
==
"sonnet"
:
elif
args
.
dataset_name
==
"sonnet"
:
...
@@ -524,6 +524,12 @@ if __name__ == "__main__":
...
@@ -524,6 +524,12 @@ if __name__ == "__main__":
default
=
1000
,
default
=
1000
,
help
=
"Number of prompts to process."
,
help
=
"Number of prompts to process."
,
)
)
parser
.
add_argument
(
"--sharegpt-output-len"
,
type
=
int
,
default
=
None
,
help
=
"Output length for each request. Overrides the output length "
"from the ShareGPT dataset."
)
parser
.
add_argument
(
parser
.
add_argument
(
"--sonnet-input-len"
,
"--sonnet-input-len"
,
type
=
int
,
type
=
int
,
...
...
benchmarks/benchmark_throughput.py
View file @
1591c68f
...
@@ -103,25 +103,22 @@ def run_vllm(
...
@@ -103,25 +103,22 @@ def run_vllm(
)
)
# Add the requests to the engine.
# Add the requests to the engine.
prompts
=
[]
sampling_params
=
[]
for
prompt
,
_
,
output_len
in
requests
:
for
prompt
,
_
,
output_len
in
requests
:
sampling_params
=
SamplingParams
(
prompts
.
append
(
prompt
)
n
=
n
,
sampling_params
.
append
(
temperature
=
0.0
if
use_beam_search
else
1.0
,
SamplingParams
(
top_p
=
1.0
,
n
=
n
,
use_beam_search
=
use_beam_search
,
temperature
=
0.0
if
use_beam_search
else
1.0
,
ignore_eos
=
True
,
top_p
=
1.0
,
max_tokens
=
output_len
,
use_beam_search
=
use_beam_search
,
)
ignore_eos
=
True
,
# FIXME(woosuk): Do not use internal method.
max_tokens
=
output_len
,
llm
.
_add_request
(
))
prompt
=
prompt
,
prompt_token_ids
=
None
,
sampling_params
=
sampling_params
,
)
start
=
time
.
perf_counter
()
start
=
time
.
perf_counter
()
# FIXME(woosuk): Do not use internal method.
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
llm
.
_run_engine
(
use_tqdm
=
True
)
end
=
time
.
perf_counter
()
end
=
time
.
perf_counter
()
return
end
-
start
return
end
-
start
...
...
benchmarks/kernels/benchmark_aqlm.py
View file @
1591c68f
...
@@ -6,7 +6,7 @@ from typing import Optional
...
@@ -6,7 +6,7 @@ from typing import Optional
import
torch
import
torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
vllm
._C
import
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.quantization.aqlm
import
(
from
vllm.model_executor.layers.quantization.aqlm
import
(
dequantize_weight
,
generic_dequantize_gemm
,
get_int_dtype
,
dequantize_weight
,
generic_dequantize_gemm
,
get_int_dtype
,
optimized_dequantize_gemm
)
optimized_dequantize_gemm
)
...
...
Prev
1
2
3
4
5
…
14
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment