norm / vllm · Commit 6e01e8c1 (unverified)

[CI] Add Buildkite (#2355)

Authored by Simon Mo on Jan 14, 2024; committed by GitHub on Jan 14, 2024.
Parent commit: 9f659bf0
Showing 13 changed files with 192 additions and 37 deletions (+192 / -37):
.buildkite/run-benchmarks.sh              +24   -0
.buildkite/test-pipeline.yaml             +41   -0
.buildkite/test-template.j2               +46   -0
Dockerfile                                +24  -12
requirements-dev.txt                       +3   -1
setup.py                                   +6   -1
tests/async_engine/test_api_server.py     +10   -2
tests/async_engine/test_openai_server.py   +9   -6
tests/distributed/test_comm_ops.py        +14  -12
tests/kernels/test_attention.py            +1   -1
tests/kernels/test_cache.py                +2   -2
tests/samplers/test_logprobs.py            +1   -0
tests/samplers/test_sampler.py            +11   -0
.buildkite/run-benchmarks.sh (new file, mode 100644)

# This script is run by buildkite to run the benchmarks and upload the results to buildkite
set -ex

# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."

# run benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt

# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
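
For readers who prefer Python over sed, here is a rough equivalent of the summary step above (an illustrative sketch, not part of the commit; it assumes each benchmark script prints its configuration on the first line of its output and the final numbers on the last line):

    # summarize_benchmarks.py -- hypothetical helper mirroring the sed commands above
    from pathlib import Path

    def summarize(log_file: str, heading: str) -> str:
        lines = Path(log_file).read_text().splitlines()
        # keep only the first line (run configuration) and the last line (measured result)
        return f"### {heading}\n{lines[0]}\n\n{lines[-1]}\n"

    md = summarize("benchmark_latency.txt", "Latency Benchmarks")
    md += summarize("benchmark_throughput.txt", "Throughput Benchmarks")
    Path("benchmark_results.md").write_text(md)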
.buildkite/test-pipeline.yaml (new file, mode 100644)

# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This file will be fed into the Jinja template in `test-template.j2` to generate
# the final pipeline yaml file.

steps:
- label: Regression Test
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
  command: pytest -v -s async_engine

- label: Distributed Test
  command: pytest -v -s test_comm_ops.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

- label: Engine Test
  command: pytest -v -s engine

- label: Kernels Test
  command: pytest -v -s kernels
  soft_fail: true

- label: Models Test
  commands:
  - pytest -v -s models --forked
  soft_fail: true

- label: Samplers Test
  command: pytest -v -s samplers --forked

- label: Worker Test
  command: pytest -v -s worker

- label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh
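
Since new tests are added by appending steps to this file, a quick way to sanity-check an edited pipeline before pushing is to load it and confirm each step carries the fields the template consumes (a hypothetical helper, not part of the commit; requires PyYAML):

    # check_pipeline.py -- assumed local helper
    import yaml

    with open(".buildkite/test-pipeline.yaml") as f:
        pipeline = yaml.safe_load(f)

    for step in pipeline["steps"]:
        # test-template.j2 expects a label plus either `command` or `commands`
        assert "label" in step, step
        assert "command" in step or "commands" in step, step["label"]
        # only 1 or 2 GPUs are supported for now
        assert step.get("num_gpus", 1) in (1, 2), step["label"]
    print(f"{len(pipeline['steps'])} steps look well-formed")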
.buildkite/test-template.j2 (new file, mode 100644)

{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
  - label: ":docker: build image"
    commands:
      - "docker build --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
  - wait

  {% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      queue: kubernetes
    soft_fail: {{ step.soft_fail or false }}
    plugins:
      - kubernetes:
          podSpec:
            volumes:
              - name: dshm
                emptyDir:
                  medium: Memory
            containers:
              - image: "{{ docker_image }}"
                command: ["bash"]
                args:
                  - "-c"
                  - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
                resources:
                  requests:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                  limits:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                env:
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                volumeMounts:
                  - mountPath: /dev/shm
                    name: dshm
  {% endfor %}
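
The pipeline file above notes that it is fed into this template; presumably a pipeline-upload step renders the two together at CI time. A minimal local rendering sketch (assumed workflow, not shown in this commit; requires Jinja2 and PyYAML):

    # render_pipeline.py -- illustrative only; the real upload command is not part of this diff
    import yaml
    from jinja2 import Environment, FileSystemLoader

    with open(".buildkite/test-pipeline.yaml") as f:
        steps = yaml.safe_load(f)["steps"]

    env = Environment(loader=FileSystemLoader(".buildkite"))
    rendered = env.get_template("test-template.j2").render(steps=steps)
    print(rendered)  # in CI this would typically be piped to `buildkite-agent pipeline upload`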
Dockerfile

+# The vLLM Dockerfile is used to construct vLLM image that can be directly used
+# to run the OpenAI compatible server.
+
+#################### BASE BUILD IMAGE ####################
 FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

 RUN apt-get update -y \
-    && apt-get install -y python3-pip
+    && apt-get install -y python3-pip git

 WORKDIR /workspace
@@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-dev.txt
+#################### BASE BUILD IMAGE ####################

-# image to build pytorch extensions
+#################### EXTENSION BUILD IMAGE ####################
 FROM dev AS build

 # install build dependencies
@@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt
 COPY pyproject.toml pyproject.toml
 COPY vllm/__init__.py vllm/__init__.py

+# cuda arch list used by torch
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # max jobs used by Ninja to build extensions
@@ -40,18 +47,26 @@ ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads

 RUN python3 setup.py build_ext --inplace
+#################### EXTENSION Build IMAGE ####################

+#################### TEST IMAGE ####################
 # image to run unit testing suite
 FROM dev AS test

 # copy pytorch extensions separately to avoid having to rebuild
 # when python code changes
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY tests tests
-COPY vllm vllm
-ENTRYPOINT ["python3", "-m", "pytest", "tests"]
+WORKDIR /vllm-workspace
+# ADD is used to preserve directory structure
+ADD . /vllm-workspace/
+COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
+# ignore build dependencies installation because we are using pre-complied extensions
+RUN rm pyproject.toml
+RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
+#################### TEST IMAGE ####################

+#################### RUNTIME BASE IMAGE ####################
 # use CUDA base as CUDA runtime dependencies are already installed via pip
 FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
@@ -63,14 +78,10 @@ WORKDIR /workspace
 COPY requirements.txt requirements.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements.txt
+#################### RUNTIME BASE IMAGE ####################

-FROM vllm-base AS vllm
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY vllm vllm
-EXPOSE 8000
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]

+#################### OPENAI API SERVER ####################
 # openai api server alternative
 FROM vllm-base AS vllm-openai

 # install additional dependencies for openai api server
@@ -81,3 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/
 COPY vllm vllm
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+#################### OPENAI API SERVER ####################
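
As a rough local equivalent of what the generated pipeline does with the new `test` stage (a hypothetical helper, not part of the commit; the image tag and the chosen test suite are arbitrary):

    # run_test_image.py -- assumed local workflow
    import subprocess

    IMAGE = "vllm-test:local"  # hypothetical local tag

    # build only the `test` stage defined above
    subprocess.run(["docker", "build", "--target", "test", "-t", IMAGE, "."], check=True)

    # mirror the template: cd into /vllm-workspace/tests and run one suite
    subprocess.run([
        "docker", "run", "--rm", "--gpus", "all", "--shm-size", "8g", IMAGE,
        "bash", "-c", "cd /vllm-workspace/tests && pytest -v -s engine",
    ], check=True)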
requirements-dev.txt

@@ -13,4 +13,6 @@ types-setuptools
 pytest
 pytest-forked
 pytest-asyncio
+httpx
+einops  # required for MPT
+flash_attn  # required for HuggingFace's llama implementation
setup.py

@@ -293,6 +293,11 @@ def get_requirements() -> List[str]:
     return requirements


+package_data = {"vllm": ["py.typed"]}
+if os.environ.get("VLLM_USE_PRECOMPILED"):
+    ext_modules = []
+    package_data["vllm"].append("*.so")
+
 setuptools.setup(
     name="vllm",
     version=get_vllm_version(),
@@ -321,5 +326,5 @@ setuptools.setup(
     install_requires=get_requirements(),
     ext_modules=ext_modules,
     cmdclass={"build_ext": BuildExtension},
-    package_data={"vllm": ["py.typed"]},
+    package_data=package_data,
 )
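
The effect of this change is that `VLLM_USE_PRECOMPILED=1 pip install .` (as the new test image does) skips rebuilding the CUDA extensions and instead ships the prebuilt `*.so` files as package data. One hypothetical way to confirm an install picked them up (illustrative check, not part of the commit):

    # check_precompiled.py -- assumes vllm is already installed in the current environment
    import importlib.util
    import pathlib

    spec = importlib.util.find_spec("vllm")
    pkg_dir = pathlib.Path(spec.origin).parent
    print(sorted(p.name for p in pkg_dir.glob("*.so")))  # expect the compiled extension(s) here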
tests/async_engine/test_api_server.py

@@ -29,8 +29,13 @@ def api_server():
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
     uvicorn_process = subprocess.Popen([
-        sys.executable, "-u",
-        str(script_path), "--model", "facebook/opt-125m"
+        sys.executable,
+        "-u",
+        str(script_path),
+        "--model",
+        "facebook/opt-125m",
+        "--host",
+        "127.0.0.1",
     ])
     yield
     uvicorn_process.terminate()
@@ -81,6 +86,9 @@ def test_api_server(api_server):
     pool.join()

     # check cancellation stats
+    # give it some times to update the stats
+    time.sleep(1)
+
     num_aborted_requests = requests.get(
         "http://localhost:8000/stats").json()["num_aborted_requests"]
     assert num_aborted_requests > 0
tests/async_engine/test_openai_server.py

 from argparse import Namespace
 from dataclasses import dataclass
+import os
+import pathlib

 import pytest
 from fastapi.testclient import TestClient

 from vllm.entrypoints.openai.api_server import *

+chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
+    __file__))).parent.parent / "examples/template_chatml.jinja"
+assert chatml_jinja_path.exists()
+
 # Define models, templates, and their corresponding expected outputs
 MODEL_TEMPLATE_GENERATON_OUTPUT = [
     ("facebook/opt-125m", None, True,
      "Hello</s>Hi there!</s>What is the capital of</s>"),
     ("facebook/opt-125m", None, False,
      "Hello</s>Hi there!</s>What is the capital of</s>"),
-    ("facebook/opt-125m", "../../examples/template_chatml.jinja", True,
+    ("facebook/opt-125m", chatml_jinja_path, True,
      """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
@@ -21,8 +26,7 @@ Hi there!<|im_end|>
 What is the capital of<|im_end|>
 <|im_start|>assistant
 """),
-    ("facebook/opt-125m", "../../examples/template_chatml.jinja", False,
+    ("facebook/opt-125m", chatml_jinja_path, False,
      """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
@@ -54,8 +58,7 @@ class MockTokenizer:
 def test_load_chat_template():
     # Testing chatml template
-    template = "../../examples/template_chatml.jinja"
-    mock_args = Namespace(chat_template=template)
+    mock_args = Namespace(chat_template=chatml_jinja_path)
     tokenizer = MockTokenizer()

     # Call the function with the mocked args
tests/distributed/test_comm_ops.py

@@ -2,10 +2,9 @@
 Run `pytest tests/distributed/test_comm_ops.py --forked`.
 """
-from multiprocessing import Process, set_start_method
-
 import pytest
 import torch
+import ray

 from vllm.config import ParallelConfig
 from vllm.utils import get_open_port
@@ -23,11 +22,11 @@ def init_test_distributed_environment(pipeline_parallel_size: int,
                                      tensor_parallel_size,
                                      worker_use_ray=True)
     distributed_init_method = f"tcp://localhost:{distributed_init_port}"
+    torch.cuda.set_device(rank)
     _init_distributed_environment(parallel_config, rank,
                                   distributed_init_method)


+@ray.remote(num_gpus=1, max_calls=1)
 def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
                            distributed_init_port: str):
     init_test_distributed_environment(1, tensor_parallel_size, rank,
@@ -43,6 +42,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
     assert torch.allclose(t, expected)


+@ray.remote(num_gpus=1, max_calls=1)
 def all_gather_test_worker(tensor_parallel_size: int, rank: int,
                            distributed_init_port: str):
     init_test_distributed_environment(1, tensor_parallel_size, rank,
@@ -70,14 +70,16 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
 @pytest.mark.parametrize("test_target",
                          [all_reduce_test_worker, all_gather_test_worker])
 def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
-    set_start_method("spawn", force=True)
+    # Using ray helps debugging the error when it failed
+    # as compared to multiprocessing.
+    ray.init()
+
     distributed_init_port = get_open_port()
-    processes = []
+    refs = []
     for rank in range(tensor_parallel_size):
-        p = Process(target=test_target,
-                    args=(tensor_parallel_size, rank, distributed_init_port))
-        p.start()
-        processes.append(p)
-    for p in processes:
-        p.join()
-    assert all(p.exitcode == 0 for p in processes)
+        refs.append(
+            test_target.remote(tensor_parallel_size, rank,
+                               distributed_init_port))
+    ray.get(refs)
+
+    ray.shutdown()
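
The change above swaps multiprocessing for Ray: each worker becomes a remote task that reserves one GPU, and the test blocks on all of them with ray.get(). Stripped of the distributed-environment setup, the pattern looks roughly like this (illustrative sketch, not from the diff; assumes at least two GPUs are available):

    # minimal illustration of the ray.remote / ray.get pattern used in the test
    import ray

    ray.init()

    @ray.remote(num_gpus=1, max_calls=1)  # one GPU per task, fresh worker process per call
    def worker(rank: int) -> int:
        # a real worker would initialize torch.distributed here
        return rank

    refs = [worker.remote(rank) for rank in range(2)]
    print(ray.get(refs))  # [0, 1] once both tasks finish
    ray.shutdown()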
tests/kernels/test_attention.py

@@ -13,7 +13,7 @@ FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.
 # - 512 as a buffer
 MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
-NUM_BLOCKS = 40000  # Arbitrary values for testing
+NUM_BLOCKS = 12000  # Arbitrary values for testing
 PARTITION_SIZE = 512
 DTYPES = [torch.half, torch.bfloat16, torch.float]
tests/kernels/test_cache.py

@@ -6,12 +6,12 @@ import torch
 from vllm._C import cache_ops

 DTYPES = [torch.half, torch.bfloat16, torch.float]
-NUM_TOKENS = [83]  # Arbitrary values for testing
+NUM_TOKENS = [42]  # Arbitrary values for testing
 NUM_LAYERS = [1]  # Arbitrary values for testing
 NUM_HEADS = [8]  # Arbitrary values for testing
 HEAD_SIZES = [64, 80, 96, 112, 128, 256]
 BLOCK_SIZES = [8, 16, 32]
-NUM_BLOCKS = [1024, 36000]  # Arbitrary values for testing
+NUM_BLOCKS = [1024, 3600]  # Arbitrary values for testing
 NUM_MAPPINGS = [256]  # Arbitrary values for testing
 SEEDS = [0]
 DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
tests/samplers/test_logprobs.py

@@ -30,6 +30,7 @@ def test_get_prompt_logprobs(
                                          temperature=0.0)
     vllm_results = vllm_model.model.generate(
         example_prompts, sampling_params=vllm_sampling_params)
+    del vllm_model

     # Test whether logprobs are included in the results.
     for result in vllm_results:
tests/samplers/test_sampler.py

@@ -75,6 +75,8 @@ def test_sampler_all_greedy(seed: int):
         for nth_output in sequence_output.samples:
             assert nth_output.output_token == expected[i].item()

+    del model_runner
+

 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_all_random(seed: int):
@@ -111,6 +113,8 @@ def test_sampler_all_random(seed: int):
         for nth_output in sequence_output.samples:
             assert nth_output.output_token == i

+    del model_runner
+

 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_all_beam(seed: int):
@@ -144,6 +148,7 @@ def test_sampler_all_beam(seed: int):
     # the outputs are expected - in other words, this just tests
     # whether there are no exceptions in the sampler
     # when handling an all-beam search case.
+    del model_runner


 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
@@ -198,6 +203,8 @@ def test_sampler_mixed(seed: int):
         for nth_output in sequence_output.samples:
             assert nth_output.output_token in expected_tokens

+    del model_runner
+

 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_logits_processors(seed: int):
@@ -235,6 +242,8 @@ def test_sampler_logits_processors(seed: int):
         for idx, nth_output in enumerate(sequence_output.samples):
             assert nth_output.output_token == idx

+    del model_runner
+

 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_top_k_top_p(seed: int):
@@ -296,3 +305,5 @@ def test_sampler_top_k_top_p(seed: int):
     hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
     assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
     assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
+
+    del model_runner