Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
26422e47
Unverified
Commit
26422e47
authored
Mar 29, 2024
by
SangBin Cho
Committed by
GitHub
Mar 28, 2024
Browse files
[Test] Make model tests run again and remove --forked from pytest (#3631)
Co-authored-by:
Simon Mo
<
simon.mo@hey.com
>
parent
f342153b
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
101 additions
and
29 deletions
+101
-29
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+6
-7
requirements-dev.txt
requirements-dev.txt
+1
-0
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+1
-1
tests/conftest.py
tests/conftest.py
+30
-0
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+1
-1
tests/models/test_big_models.py
tests/models/test_big_models.py
+45
-0
tests/models/test_llava.py
tests/models/test_llava.py
+0
-3
tests/models/test_marlin.py
tests/models/test_marlin.py
+1
-3
tests/models/test_mistral.py
tests/models/test_mistral.py
+4
-1
tests/models/test_models.py
tests/models/test_models.py
+10
-11
tests/samplers/test_beam_search.py
tests/samplers/test_beam_search.py
+1
-1
tests/samplers/test_seeded_generate.py
tests/samplers/test_seeded_generate.py
+1
-1
No files found.
.buildkite/test-pipeline.yaml
View file @
26422e47
...
...
@@ -12,13 +12,13 @@ steps:
command
:
pytest -v -s async_engine
-
label
:
Basic Correctness Test
command
:
pytest -v -s
--forked
basic_correctness
command
:
pytest -v -s basic_correctness
-
label
:
Core Test
command
:
pytest -v -s core
-
label
:
Distributed Comm Ops Test
command
:
pytest -v -s
--forked
test_comm_ops.py
command
:
pytest -v -s test_comm_ops.py
working_dir
:
"
/vllm-workspace/tests/distributed"
num_gpus
:
2
# only support 1 or 2 for now.
...
...
@@ -26,9 +26,9 @@ steps:
working_dir
:
"
/vllm-workspace/tests/distributed"
num_gpus
:
2
# only support 1 or 2 for now.
commands
:
-
pytest -v -s
--forked
test_pynccl.py
-
TEST_DIST_MODEL=facebook/opt-125m pytest -v -s
--forked
test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s
--forked
test_basic_distributed_correctness.py
-
pytest -v -s test_pynccl.py
-
TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
-
label
:
Engine Test
command
:
pytest -v -s engine tokenization test_sequence.py test_config.py
...
...
@@ -53,8 +53,7 @@ steps:
-
label
:
Models Test
commands
:
-
bash ../.buildkite/download-images.sh
-
pytest -v -s models --ignore=models/test_llava.py --forked
soft_fail
:
true
-
pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
-
label
:
Llava Test
commands
:
...
...
requirements-dev.txt
View file @
26422e47
...
...
@@ -25,6 +25,7 @@ requests
ray
peft
awscli
ai2-olmo # required for OLMo
# Benchmarking
aiohttp
...
...
tests/basic_correctness/test_basic_correctness.py
View file @
26422e47
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/basic_correctness/test_basic_correctness.py
--forked
`.
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
"""
import
pytest
...
...
tests/conftest.py
View file @
26422e47
import
contextlib
import
gc
import
os
from
typing
import
List
,
Optional
,
Tuple
...
...
@@ -9,6 +11,8 @@ from transformers import (AutoModelForCausalLM, AutoProcessor,
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
TokenizerPoolConfig
,
VisionLanguageConfig
from
vllm.model_executor.parallel_utils.parallel_state
import
(
destroy_model_parallel
)
from
vllm.sequence
import
MultiModalData
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
...
...
@@ -43,6 +47,20 @@ def _read_prompts(filename: str) -> List[str]:
return
prompts
def
cleanup
():
destroy_model_parallel
()
with
contextlib
.
suppress
(
AssertionError
):
torch
.
distributed
.
destroy_process_group
()
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
@
pytest
.
fixture
(
autouse
=
True
)
def
cleanup_fixture
():
yield
cleanup
()
@
pytest
.
fixture
(
scope
=
"session"
)
def
hf_image_prompts
()
->
List
[
str
]:
return
_IMAGE_PROMPTS
...
...
@@ -241,6 +259,10 @@ class HfRunner:
all_logprobs
.
append
(
seq_logprobs
)
return
all_logprobs
def
__del__
(
self
):
del
self
.
model
cleanup
()
@
pytest
.
fixture
def
hf_runner
():
...
...
@@ -253,6 +275,9 @@ class VllmRunner:
self
,
model_name
:
str
,
tokenizer_name
:
Optional
[
str
]
=
None
,
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
max_model_len
=
1024
,
dtype
:
str
=
"half"
,
disable_log_stats
:
bool
=
True
,
tensor_parallel_size
:
int
=
1
,
...
...
@@ -268,6 +293,7 @@ class VllmRunner:
swap_space
=
0
,
disable_log_stats
=
disable_log_stats
,
tensor_parallel_size
=
tensor_parallel_size
,
max_model_len
=
max_model_len
,
block_size
=
block_size
,
enable_chunked_prefill
=
enable_chunked_prefill
,
**
kwargs
,
...
...
@@ -357,6 +383,10 @@ class VllmRunner:
outputs
=
self
.
generate
(
prompts
,
beam_search_params
)
return
outputs
def
__del__
(
self
):
del
self
.
model
cleanup
()
@
pytest
.
fixture
def
vllm_runner
():
...
...
tests/distributed/test_comm_ops.py
View file @
26422e47
"""Test the communication operators.
Run `pytest tests/distributed/test_comm_ops.py
--forked
`.
Run `pytest tests/distributed/test_comm_ops.py`.
"""
import
os
...
...
tests/models/test_big_models.py
0 → 100644
View file @
26422e47
"""Compare the outputs of HF and vLLM when using greedy sampling.
This tests bigger models and use half precision.
Run `pytest tests/models/test_big_models.py`.
"""
import
pytest
MODELS
=
[
"meta-llama/Llama-2-7b-hf"
,
# "mistralai/Mistral-7B-v0.1", # Broken
# "Deci/DeciLM-7b", # Broken
# "tiiuae/falcon-7b", # Broken
"EleutherAI/gpt-j-6b"
,
"mosaicml/mpt-7b"
,
# "Qwen/Qwen1.5-0.5B" # Broken,
]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
def
test_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
assert
hf_output_str
==
vllm_output_str
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
tests/models/test_llava.py
View file @
26422e47
...
...
@@ -85,9 +85,6 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
images
=
hf_images
)
del
hf_model
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
vllm_model
=
vllm_runner
(
model_id
,
dtype
=
dtype
,
worker_use_ray
=
worker_use_ray
,
...
...
tests/models/test_marlin.py
View file @
26422e47
...
...
@@ -8,7 +8,7 @@ Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.
Run `pytest tests/models/test_marlin.py
--forked
`.
Run `pytest tests/models/test_marlin.py`.
"""
from
dataclasses
import
dataclass
...
...
@@ -63,7 +63,6 @@ def test_models(
# Note: not sure why, but deleting just the model on Ada Lovelace
# does not free the GPU memory. On Ampere, deleting the just model
# frees the memory.
del
marlin_model
.
model
.
llm_engine
.
driver_worker
del
marlin_model
gptq_model
=
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
)
...
...
@@ -74,7 +73,6 @@ def test_models(
# Note: not sure why, but deleting just the model on Ada Lovelace
# does not free the GPU memory. On Ampere, deleting the just model
# frees the memory.
del
gptq_model
.
model
.
llm_engine
.
driver_worker
del
gptq_model
# loop through the prompts
...
...
tests/models/test_mistral.py
View file @
26422e47
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
Run `pytest tests/models/test_mistral.py
--forked
`.
Run `pytest tests/models/test_mistral.py`.
"""
import
pytest
...
...
@@ -12,6 +12,9 @@ MODELS = [
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
skip
(
"Two problems: 1. Failing correctness tests. 2. RuntimeError: expected "
"scalar type BFloat16 but found Half (only in CI)."
)
def
test_models
(
hf_runner
,
vllm_runner
,
...
...
tests/models/test_models.py
View file @
26422e47
"""Compare the outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/models/test_models.py --forked`.
This test only tests small models. Big models such as 7B should be tested from
test_big_models.py because it could use a larger instance to run tests.
Run `pytest tests/models/test_models.py`.
"""
import
pytest
MODELS
=
[
"facebook/opt-125m"
,
"meta-llama/Llama-2-7b-hf"
,
"mistralai/Mistral-7B-v0.1"
,
"Deci/DeciLM-7b"
,
"tiiuae/falcon-7b"
,
"gpt2"
,
"bigcode/tiny_starcoder_py"
,
"EleutherAI/gpt-j-6b"
,
"EleutherAI/pythia-70m"
,
"bigscience/bloom-560m"
,
"mosaicml/mpt-7b"
,
"microsoft/phi-2"
,
"stabilityai/stablelm-3b-4e1t"
,
"allenai/OLMo-1B"
,
#
"allenai/OLMo-1B",
# Broken
"bigcode/starcoder2-3b"
,
"Qwen/Qwen1.5-0.5B"
,
]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"
half
"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"
float
"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
def
test_models
(
hf_runner
,
vllm_runner
,
...
...
@@ -35,6 +31,9 @@ def test_models(
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
# To pass the small model tests, we need full precision.
assert
dtype
==
"float"
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
...
...
tests/samplers/test_beam_search.py
View file @
26422e47
"""Compare the outputs of HF and vLLM when using beam search.
Run `pytest tests/samplers/test_beam_search.py
--forked
`.
Run `pytest tests/samplers/test_beam_search.py`.
"""
import
gc
...
...
tests/samplers/test_seeded_generate.py
View file @
26422e47
"""Verify that seeded random sampling is deterministic.
Run `pytest tests/samplers/test_seeded_generate.py
--forked
`.
Run `pytest tests/samplers/test_seeded_generate.py`.
"""
import
copy
import
random
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment