norm / vllm · Commits

Commit 6e01e8c1 (unverified)
[CI] Add Buildkite (#2355)
Authored Jan 14, 2024 by Simon Mo; committed via GitHub on Jan 14, 2024.
Parent: 9f659bf0
Showing 13 changed files with 192 additions and 37 deletions (+192, -37).
.buildkite/run-benchmarks.sh              +24  -0
.buildkite/test-pipeline.yaml             +41  -0
.buildkite/test-template.j2               +46  -0
Dockerfile                                +24  -12
requirements-dev.txt                      +3   -1
setup.py                                  +6   -1
tests/async_engine/test_api_server.py     +10  -2
tests/async_engine/test_openai_server.py  +9   -6
tests/distributed/test_comm_ops.py        +14  -12
tests/kernels/test_attention.py           +1   -1
tests/kernels/test_cache.py               +2   -2
tests/samplers/test_logprobs.py           +1   -0
tests/samplers/test_sampler.py            +11  -0
.buildkite/run-benchmarks.sh (new file, mode 100644)

# This script is run by buildkite to run the benchmarks and upload the results to buildkite
set -ex

# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."

# run benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt

# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md

echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
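
The markdown-assembly half of the script only keeps the first and last line of each benchmark log (that is all the `sed -n '1p'` / `sed -n '$p'` calls do) and files them under a heading. A rough Python equivalent, for illustration only; it is not part of this commit, and the file names simply mirror the script above:

# Illustrative sketch of what the sed/echo pipeline above produces.
from pathlib import Path

def summarize(log_path: str, title: str) -> str:
    lines = Path(log_path).read_text().splitlines()
    # keep the first and last line of the log, matching `sed -n '1p'` and `sed -n '$p'`
    return f"### {title}\n{lines[0]}\n\n{lines[-1]}\n"

report = (summarize("benchmark_latency.txt", "Latency Benchmarks")
          + summarize("benchmark_throughput.txt", "Throughput Benchmarks"))
Path("benchmark_results.md").write_text(report)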
.buildkite/test-pipeline.yaml (new file, mode 100644)

# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This file will be fed into the Jinja template in `test-template.j2` to generate
# the final pipeline YAML file.

steps:
- label: Regression Test
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
  command: pytest -v -s async_engine

- label: Distributed Test
  command: pytest -v -s test_comm_ops.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

- label: Engine Test
  command: pytest -v -s engine

- label: Kernels Test
  command: pytest -v -s kernels
  soft_fail: true

- label: Models Test
  commands:
  - pytest -v -s models --forked
  soft_fail: true

- label: Samplers Test
  command: pytest -v -s samplers --forked

- label: Worker Test
  command: pytest -v -s worker

- label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh
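
Each step supplies a label plus either a single command or a commands list, and may override working_dir, num_gpus (1 or 2 for now), and soft_fail. A quick local sanity check of that shape, sketched under the assumption that PyYAML is installed; nothing like this runs in the CI itself:

# Sketch: validate the step schema that test-template.j2 expects.
import yaml

with open(".buildkite/test-pipeline.yaml") as f:
    pipeline = yaml.safe_load(f)

for step in pipeline["steps"]:
    assert "label" in step, "every step needs a label"
    # exactly one of `command` / `commands`
    assert ("command" in step) ^ ("commands" in step), step["label"]
    assert step.get("num_gpus", 1) in (1, 2), "only 1 or 2 GPUs are supported for now"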
.buildkite/test-template.j2 (new file, mode 100644)

{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- label: ":docker: build image"
commands:
- "docker build --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
- wait
{% for step in steps %}
- label: "{{ step.label }}"
agents:
queue: kubernetes
soft_fail: {{ step.soft_fail or false }}
plugins:
- kubernetes:
podSpec:
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- image: "{{ docker_image }}"
command: ["bash"]
args:
- "-c"
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
resources:
requests:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
limits:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumeMounts:
- mountPath: /dev/shm
name: dshm
{% endfor %}
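
The template receives the steps list from test-pipeline.yaml and expands every step into a Kubernetes pod that runs the freshly built test image. A minimal sketch of that rendering, assuming jinja2 and PyYAML are available locally (the actual pipeline-upload step is not part of this commit):

# Sketch: render the final Buildkite pipeline from the two files above.
import yaml
from jinja2 import Template

with open(".buildkite/test-pipeline.yaml") as f:
    steps = yaml.safe_load(f)["steps"]

with open(".buildkite/test-template.j2") as f:
    template = Template(f.read())

# one "docker build" block plus one kubernetes podSpec per step
print(template.render(steps=steps))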
Dockerfile

# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
-    && apt-get install -y python3-pip
+    && apt-get install -y python3-pip git

WORKDIR /workspace
...
@@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################

# image to build pytorch extensions
#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
...
@@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
...
@@ -40,18 +47,26 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

RUN python3 setup.py build_ext --inplace
#################### EXTENSION Build IMAGE ####################

#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY tests tests
-COPY vllm vllm
+WORKDIR /vllm-workspace
+# ADD is used to preserve directory structure
+ADD . /vllm-workspace/
+COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
+# ignore build dependencies installation because we are using pre-complied extensions
+RUN rm pyproject.toml
+RUN --mount=type=cache,target=/root/.cache/pip \
+    VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################

ENTRYPOINT ["python3", "-m", "pytest", "tests"]

#################### RUNTIME BASE IMAGE ####################
# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
...
@@ -63,14 +78,10 @@ WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt
#################### RUNTIME BASE IMAGE ####################

FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
...
@@ -81,3 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
requirements-dev.txt

@@ -13,4 +13,6 @@ types-setuptools
pytest
pytest-forked
pytest-asyncio
httpx
einops  # required for MPT
flash_attn  # required for HuggingFace's llama implementation
setup.py

@@ -293,6 +293,11 @@ def get_requirements() -> List[str]:
    return requirements


+package_data = {"vllm": ["py.typed"]}
+if os.environ.get("VLLM_USE_PRECOMPILED"):
+    ext_modules = []
+    package_data["vllm"].append("*.so")
+
setuptools.setup(
    name="vllm",
    version=get_vllm_version(),
...
@@ -321,5 +326,5 @@ setuptools.setup(
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension},
-    package_data={"vllm": ["py.typed"]},
+    package_data=package_data,
)
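
This is the counterpart of the Dockerfile's `VLLM_USE_PRECOMPILED=1 pip install . --verbose`: when the variable is set, setup.py builds no extensions and instead ships the prebuilt shared objects copied in from the build stage as package data. A minimal sketch of the same environment-gated pattern, using a hypothetical package name rather than vLLM's actual build code:

# Sketch: skip extension compilation when prebuilt .so files are provided.
import os
import setuptools

ext_modules = []  # normally populated with the CUDA/C++ extensions to compile
package_data = {"my_pkg": ["py.typed"]}

if os.environ.get("MY_PKG_USE_PRECOMPILED"):
    ext_modules = []                       # build nothing
    package_data["my_pkg"].append("*.so")  # package the prebuilt shared objects instead

setuptools.setup(
    name="my_pkg",
    version="0.0.1",
    packages=["my_pkg"],
    ext_modules=ext_modules,
    package_data=package_data,
)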
tests/async_engine/test_api_server.py

@@ -29,8 +29,13 @@ def api_server():
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
    uvicorn_process = subprocess.Popen([
-        sys.executable, "-u",
-        str(script_path), "--model", "facebook/opt-125m"
+        sys.executable,
+        "-u",
+        str(script_path),
+        "--model",
+        "facebook/opt-125m",
+        "--host",
+        "127.0.0.1",
    ])
    yield
    uvicorn_process.terminate()
...
@@ -81,6 +86,9 @@ def test_api_server(api_server):
    pool.join()

    # check cancellation stats
+    # give it some times to update the stats
+    time.sleep(1)
+
    num_aborted_requests = requests.get(
        "http://localhost:8000/stats").json()["num_aborted_requests"]
    assert num_aborted_requests > 0
...
tests/async_engine/test_openai_server.py

from argparse import Namespace
from dataclasses import dataclass
import os
import pathlib

import pytest
from fastapi.testclient import TestClient

from vllm.entrypoints.openai.api_server import *

chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
    __file__))).parent.parent / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()

# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
    ("facebook/opt-125m", None, True,
     "Hello</s>Hi there!</s>What is the capital of</s>"),
    ("facebook/opt-125m", None, False,
     "Hello</s>Hi there!</s>What is the capital of</s>"),
-    ("facebook/opt-125m", "../../examples/template_chatml.jinja", True,
-     """<|im_start|>user
+    ("facebook/opt-125m", chatml_jinja_path, True,
+     """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
...
@@ -21,8 +26,7 @@ Hi there!<|im_end|>
What is the capital of<|im_end|>
<|im_start|>assistant
"""),
-    ("facebook/opt-125m", "../../examples/template_chatml.jinja", False,
-     """<|im_start|>user
+    ("facebook/opt-125m", chatml_jinja_path, False,
+     """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
...
@@ -54,8 +58,7 @@ class MockTokenizer:
def test_load_chat_template():
    # Testing chatml template
-    template = "../../examples/template_chatml.jinja"
-    mock_args = Namespace(chat_template=template)
+    mock_args = Namespace(chat_template=chatml_jinja_path)
    tokenizer = MockTokenizer()

    # Call the function with the mocked args
...
tests/distributed/test_comm_ops.py

@@ -2,10 +2,9 @@
Run `pytest tests/distributed/test_comm_ops.py --forked`.
"""
-from multiprocessing import Process, set_start_method
import pytest
import torch
+import ray

from vllm.config import ParallelConfig
from vllm.utils import get_open_port
...
@@ -23,11 +22,11 @@ def init_test_distributed_environment(pipeline_parallel_size: int,
                                     tensor_parallel_size,
                                     worker_use_ray=True)
    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
    torch.cuda.set_device(rank)
    _init_distributed_environment(parallel_config, rank,
                                  distributed_init_method)


+@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
                           distributed_init_port: str):
    init_test_distributed_environment(1, tensor_parallel_size, rank,
...
@@ -43,6 +42,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
    assert torch.allclose(t, expected)


+@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tensor_parallel_size: int, rank: int,
                           distributed_init_port: str):
    init_test_distributed_environment(1, tensor_parallel_size, rank,
...
@@ -70,14 +70,16 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
@pytest.mark.parametrize("test_target",
                         [all_reduce_test_worker, all_gather_test_worker])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
-    set_start_method("spawn", force=True)
+    # Using ray helps debugging the error when it failed
+    # as compared to multiprocessing.
+    ray.init()
    distributed_init_port = get_open_port()
-    processes = []
+    refs = []
    for rank in range(tensor_parallel_size):
-        p = Process(target=test_target,
-                    args=(tensor_parallel_size, rank, distributed_init_port))
-        p.start()
-        processes.append(p)
-    for p in processes:
-        p.join()
-    assert all(p.exitcode == 0 for p in processes)
+        refs.append(
+            test_target.remote(tensor_parallel_size, rank,
+                               distributed_init_port))
+    ray.get(refs)
+
+    ray.shutdown()
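
The test now launches one Ray task per rank instead of a multiprocessing.Process: each worker reserves a GPU via the decorator, and failures propagate through ray.get instead of being inferred from exit codes. A minimal sketch of that pattern, with a hypothetical worker standing in for the real all-reduce/all-gather tests:

# Sketch of the Ray task pattern used above (needs as many GPUs as ranks).
import ray

@ray.remote(num_gpus=1, max_calls=1)  # one GPU per task, fresh worker process per call
def gpu_worker(rank: int) -> int:
    # the real tests run the collective op under test here
    return rank

ray.init()
refs = [gpu_worker.remote(rank) for rank in range(2)]  # one task per rank
print(ray.get(refs))  # blocks until all workers finish; worker exceptions re-raise here
ray.shutdown()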
tests/kernels/test_attention.py

@@ -13,7 +13,7 @@ FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
-NUM_BLOCKS = 40000  # Arbitrary values for testing
+NUM_BLOCKS = 12000  # Arbitrary values for testing
PARTITION_SIZE = 512
DTYPES = [torch.half, torch.bfloat16, torch.float]
...
tests/kernels/test_cache.py

@@ -6,12 +6,12 @@ import torch
from vllm._C import cache_ops

DTYPES = [torch.half, torch.bfloat16, torch.float]
-NUM_TOKENS = [83]  # Arbitrary values for testing
+NUM_TOKENS = [42]  # Arbitrary values for testing
NUM_LAYERS = [1]  # Arbitrary values for testing
NUM_HEADS = [8]  # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [8, 16, 32]
-NUM_BLOCKS = [1024, 36000]  # Arbitrary values for testing
+NUM_BLOCKS = [1024, 3600]  # Arbitrary values for testing
NUM_MAPPINGS = [256]  # Arbitrary values for testing
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
...
tests/samplers/test_logprobs.py

@@ -30,6 +30,7 @@ def test_get_prompt_logprobs(
                                         temperature=0.0)
    vllm_results = vllm_model.model.generate(
        example_prompts, sampling_params=vllm_sampling_params)
+    del vllm_model

    # Test whether logprobs are included in the results.
    for result in vllm_results:
...
tests/samplers/test_sampler.py

@@ -75,6 +75,8 @@ def test_sampler_all_greedy(seed: int):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token == expected[i].item()

+    del model_runner
+

@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_all_random(seed: int):
...
@@ -111,6 +113,8 @@ def test_sampler_all_random(seed: int):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token == i

+    del model_runner
+

@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_all_beam(seed: int):
...
@@ -144,6 +148,7 @@ def test_sampler_all_beam(seed: int):
    # the outputs are expected - in other words, this just tests
    # whether there are no exceptions in the sampler
    # when handling an all-beam search case.
+    del model_runner


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
...
@@ -198,6 +203,8 @@ def test_sampler_mixed(seed: int):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token in expected_tokens

+    del model_runner
+

@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_logits_processors(seed: int):
...
@@ -235,6 +242,8 @@ def test_sampler_logits_processors(seed: int):
    for idx, nth_output in enumerate(sequence_output.samples):
        assert nth_output.output_token == idx

+    del model_runner
+

@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_top_k_top_p(seed: int):
...
@@ -296,3 +305,5 @@ def test_sampler_top_k_top_p(seed: int):
    hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
    assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
    assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
+
+    del model_runner
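
The repeated `del model_runner` drops the last reference to the runner before the next parametrized case runs, so its GPU allocations can be reclaimed instead of piling up across seeds. A small illustrative sketch of the idea; the factory below is hypothetical, not part of the test suite:

# Sketch: release a GPU-heavy object between test cases.
import gc
import torch

def run_case(make_runner):
    model_runner = make_runner()  # hypothetical factory standing in for the test setup
    # ... exercise the sampler with this runner ...
    del model_runner              # drop the reference so its GPU tensors become collectable
    gc.collect()
    torch.cuda.empty_cache()      # optionally return cached blocks to the driver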