Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a84e598e
Unverified
Commit
a84e598e
authored
Sep 14, 2024
by
Cyrus Leung
Committed by
GitHub
Sep 13, 2024
Browse files
[CI/Build] Reorganize models tests (#7820)
parent
0a4806f0
Changes
54
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
137 additions
and
69 deletions
+137
-69
tests/models/decoder_only/vision_language/test_llava_next.py
tests/models/decoder_only/vision_language/test_llava_next.py
+4
-6
tests/models/decoder_only/vision_language/test_llava_next_video.py
...els/decoder_only/vision_language/test_llava_next_video.py
+2
-4
tests/models/decoder_only/vision_language/test_minicpmv.py
tests/models/decoder_only/vision_language/test_minicpmv.py
+3
-5
tests/models/decoder_only/vision_language/test_paligemma.py
tests/models/decoder_only/vision_language/test_paligemma.py
+3
-5
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_phi3v.py
+3
-5
tests/models/decoder_only/vision_language/test_pixtral.py
tests/models/decoder_only/vision_language/test_pixtral.py
+16
-7
tests/models/decoder_only/vision_language/test_qwen.py
tests/models/decoder_only/vision_language/test_qwen.py
+3
-5
tests/models/embedding/__init__.py
tests/models/embedding/__init__.py
+0
-0
tests/models/embedding/language/__init__.py
tests/models/embedding/language/__init__.py
+0
-0
tests/models/embedding/language/test_embedding.py
tests/models/embedding/language/test_embedding.py
+0
-0
tests/models/encoder_decoder/__init__.py
tests/models/encoder_decoder/__init__.py
+0
-0
tests/models/encoder_decoder/language/__init__.py
tests/models/encoder_decoder/language/__init__.py
+0
-0
tests/models/encoder_decoder/language/test_bart.py
tests/models/encoder_decoder/language/test_bart.py
+84
-31
tests/utils.py
tests/utils.py
+19
-1
No files found.
tests/models/test_llava_next.py
→
tests/models/
decoder_only/vision_language/
test_llava_next.py
View file @
a84e598e
...
...
@@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
from
...utils
import
check_logprobs_close
_LIMIT_IMAGE_PER_PROMPT
=
4
...
...
@@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test
is under tests/images
.
All the image fixtures for the test
are from IMAGE_ASSETS
.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
...
...
tests/models/test_llava_next_video.py
→
tests/models/
decoder_only/vision_language/
test_llava_next_video.py
View file @
a84e598e
...
...
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import (rescale_video_size, resize_video,
sample_frames_from_video
)
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
VIDEO_ASSETS
,
HfRunner
,
VllmRunner
,
_VideoAssets
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
from
....conftest
import
VIDEO_ASSETS
,
HfRunner
,
VllmRunner
,
_VideoAssets
from
...utils
import
check_logprobs_close
_PREFACE
=
(
"A chat between a curious human and an artificial intelligence assistant. "
...
...
tests/models/test_minicpmv.py
→
tests/models/
decoder_only/vision_language/
test_minicpmv.py
View file @
a84e598e
...
...
@@ -9,10 +9,8 @@ from transformers import BatchEncoding
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
from
...utils
import
check_logprobs_close
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
...
...
@@ -65,7 +63,7 @@ def run_test(
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test
is under tests/images
.
All the image fixtures for the test
are from IMAGE_ASSETS
.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
...
...
tests/models/test_paligemma.py
→
tests/models/
decoder_only/vision_language/
test_paligemma.py
View file @
a84e598e
...
...
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_hip
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
...utils
import
check_logprobs_close
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
...
...
@@ -69,7 +67,7 @@ def run_test(
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test
is under tests/images
.
All the image fixtures for the test
are from IMAGE_ASSETS
.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
...
...
tests/models/test_phi3v.py
→
tests/models/
decoder_only/vision_language/
test_phi3v.py
View file @
a84e598e
...
...
@@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_cpu
,
is_hip
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
...utils
import
check_logprobs_close
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
...
...
@@ -71,7 +69,7 @@ def run_test(
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test
is under tests/images
.
All the image fixtures for the test
are from IMAGE_ASSETS
.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
...
...
tests/models/test_pixtral.py
→
tests/models/
decoder_only/vision_language/
test_pixtral.py
View file @
a84e598e
...
...
@@ -5,7 +5,7 @@ Run `pytest tests/models/test_mistral.py`.
import
json
import
uuid
from
dataclasses
import
asdict
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
from
typing
import
TYPE_CHECKING
,
Any
,
Dict
,
List
,
Optional
,
Tuple
import
pytest
from
mistral_common.protocol.instruct.messages
import
ImageURLChunk
...
...
@@ -17,9 +17,11 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
from
vllm.multimodal
import
MultiModalDataBuiltins
from
vllm.sequence
import
Logprob
,
SampleLogprobs
from
.utils
import
check_logprobs_close
from
....utils
import
VLLM_PATH
from
...utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
if
TYPE_CHECKING
:
from
_typeshed
import
StrPath
MODELS
=
[
"mistralai/Pixtral-12B-2409"
]
IMG_URLS
=
[
...
...
@@ -83,14 +85,21 @@ SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT
=
dict
(
image
=
4
)
MAX_MODEL_LEN
=
[
8192
,
65536
]
FIXTURE_LOGPROBS_CHAT
=
"tests/models/fixtures/pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE
=
"tests/models/fixtures/pixtral_chat_engine.json"
FIXTURES_PATH
=
VLLM_PATH
/
"tests/models/fixtures"
assert
FIXTURES_PATH
.
exists
()
FIXTURE_LOGPROBS_CHAT
=
FIXTURES_PATH
/
"pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE
=
FIXTURES_PATH
/
"pixtral_chat_engine.json"
OutputsLogprobs
=
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
# For the test author to store golden output in JSON
def
_dump_outputs_w_logprobs
(
outputs
:
OutputsLogprobs
,
filename
:
str
)
->
None
:
def
_dump_outputs_w_logprobs
(
outputs
:
OutputsLogprobs
,
filename
:
"StrPath"
,
)
->
None
:
json_data
=
[(
tokens
,
text
,
[{
k
:
asdict
(
v
)
for
k
,
v
in
token_logprobs
.
items
()}
...
...
@@ -101,7 +110,7 @@ def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None:
json
.
dump
(
json_data
,
f
)
def
load_outputs_w_logprobs
(
filename
:
str
)
->
OutputsLogprobs
:
def
load_outputs_w_logprobs
(
filename
:
"StrPath"
)
->
OutputsLogprobs
:
with
open
(
filename
,
"rb"
)
as
f
:
json_data
=
json
.
load
(
f
)
...
...
tests/models/test_qwen.py
→
tests/models/
decoder_only/vision_language/
test_qwen.py
View file @
a84e598e
...
...
@@ -10,11 +10,9 @@ from vllm.inputs import InputContext, LLMInputs
from
vllm.multimodal.base
import
MultiModalInputs
from
vllm.multimodal.utils
import
cached_get_tokenizer
,
rescale_image_size
from
..conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
from
.utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
from
...utils
import
check_logprobs_close
text_only_models
=
[
"Qwen/Qwen-7B-Chat"
# Has no visual component
...
...
tests/models/embedding/__init__.py
0 → 100644
View file @
a84e598e
tests/models/embedding/language/__init__.py
0 → 100644
View file @
a84e598e
tests/models/test_embedding.py
→
tests/models/
embedding/language/
test_embedding.py
View file @
a84e598e
File moved
tests/models/encoder_decoder/__init__.py
0 → 100644
View file @
a84e598e
tests/models/encoder_decoder/language/__init__.py
0 → 100644
View file @
a84e598e
tests/models/test_bart.py
→
tests/models/
encoder_decoder/language/
test_bart.py
View file @
a84e598e
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
Run `pytest tests/models/test_bart.py`.
Run `pytest tests/models/
encoder_decoder/language/
test_bart.py`.
"""
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
vllm.utils
import
is_cpu
...
...
@@ -16,8 +16,10 @@ if not is_cpu():
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
DecoderPromptType
from
.utils
import
check_logprobs_close
from
....conftest
import
(
DecoderPromptType
,
ExplicitEncoderDecoderPrompt
,
HfRunner
,
VllmRunner
)
from
....utils
import
multi_gpu_test
from
...utils
import
check_logprobs_close
MODELS
=
[
"facebook/bart-base"
,
"facebook/bart-large-cnn"
]
...
...
@@ -34,20 +36,18 @@ if not is_cpu():
return
output_ids
,
hf_output_str
,
out_logprobs
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
,
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
list
(
DecoderPromptType
))
def
test_models
(
hf_runner
,
vllm_runner
,
example_encoder_decoder_prompts
,
def
run_test
(
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
decoder_prompt_type
:
DecoderPromptType
,
model
:
str
,
*
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
decoder_prompt_type
:
DecoderPromptType
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
)
->
None
:
'''
Test the vLLM BART model for a variety of encoder/decoder input prompts,
...
...
@@ -116,8 +116,29 @@ if not is_cpu():
token during the process of validating the vLLM decoded output.
'''
test_case_prompts
=
example_encoder_decoder_prompts
[
decoder_prompt_type
]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default).
# Note: currently encoder/decoder models are only compatible with
# enforce_eager=True. Normally this is not a problem because
# for encoder/decoder models vLLM will
# default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=False (a behavior which a number of already-existing
# decoder-only unit tests expect), so when testing an encoder/decoder
# model we must explicitly specify enforce_eager=True in the VllmRunner
# constructor.
with
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_encoder_decoder_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
# Configuration settings for HF baseline
hf_kwargs
=
{
...
...
@@ -135,26 +156,12 @@ if not is_cpu():
auto_cls
=
AutoModelForSeq2SeqLM
)
as
hf_model
:
hf_outputs
=
(
hf_model
.
generate_encoder_decoder_greedy_logprobs_limit
(
test_case_
prompts
,
prompts
,
max_tokens
,
num_logprobs
,
**
hf_kwargs
,
))
# Note: currently encoder/decoder models are only compatible with
# enforce_eager=True. Normally this is not a problem because
# for encoder/decoder models vLLM will
# default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=False (a behavior which a number of already-existing
# decoder-only unit tests expect), so when testing an encoder/decoder
# model we must explicitly specify enforce_eager=True in the VllmRunner
# constructor.
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_encoder_decoder_greedy_logprobs
(
test_case_prompts
,
max_tokens
,
num_logprobs
)
hf_skip_tokens
=
(
1
if
decoder_prompt_type
==
DecoderPromptType
.
NONE
else
0
)
...
...
@@ -168,3 +175,49 @@ if not is_cpu():
name_1
=
"vllm"
,
num_outputs_0_skip_tokens
=
hf_skip_tokens
,
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
                dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:
    """Run the shared HF-vs-vLLM comparison with tensor_parallel_size=1.

    The prompts handed to ``run_test`` are the entry of
    ``example_encoder_decoder_prompts`` keyed by ``decoder_prompt_type``;
    every other argument is forwarded unchanged.
    """
    # Pick the prompt set matching the decoder prompt type under test.
    prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    run_test(
        hf_runner,
        vllm_runner,
        prompts,
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
def test_models_distributed(hf_runner, vllm_runner,
                            example_encoder_decoder_prompts,
                            distributed_executor_backend, model, dtype,
                            max_tokens, num_logprobs,
                            decoder_prompt_type) -> None:
    """Run the shared HF-vs-vLLM comparison with tensor_parallel_size=2.

    The test is parametrized over the "ray" and "mp" values of
    ``distributed_executor_backend`` and is decorated with
    ``multi_gpu_test(num_gpus=2)``. The prompts handed to ``run_test`` are
    the entry of ``example_encoder_decoder_prompts`` keyed by
    ``decoder_prompt_type``.
    """
    # Pick the prompt set matching the decoder prompt type under test.
    prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    run_test(
        hf_runner,
        vllm_runner,
        prompts,
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
tests/utils.py
View file @
a84e598e
...
...
@@ -10,6 +10,7 @@ from pathlib import Path
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
import
openai
import
pytest
import
requests
from
openai.types.completion
import
Completion
from
transformers
import
AutoTokenizer
...
...
@@ -22,7 +23,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.model_executor.model_loader.loader
import
get_model_loader
from
vllm.platforms
import
current_platform
from
vllm.utils
import
FlexibleArgumentParser
,
get_open_port
,
is_hip
from
vllm.utils
import
(
FlexibleArgumentParser
,
cuda_device_count_stateless
,
get_open_port
,
is_hip
)
if
current_platform
.
is_rocm
():
from
amdsmi
import
(
amdsmi_get_gpu_vram_usage
,
...
...
@@ -452,6 +454,22 @@ def fork_new_process_for_each_test(
return
wrapper
def multi_gpu_test(*, num_gpus: int):
    """
    Decorate a test to be run only when multiple GPUs are available.

    The returned decorator wraps the test with
    ``fork_new_process_for_each_test``, then attaches a ``skipif`` mark that
    triggers when ``cuda_device_count_stateless()`` reports fewer than
    ``num_gpus`` devices, and finally tags the test with the
    ``distributed_{num_gpus}_gpus`` pytest mark.

    Args:
        num_gpus: Minimum number of GPUs the decorated test requires
            (keyword-only).

    Returns:
        A decorator that takes the test function and returns the wrapped,
        marked test.
    """
    # Both marks are created once, when the decorator factory is called,
    # so the device count is queried here rather than per decorated test.
    gpu_count_marker = getattr(pytest.mark, f"distributed_{num_gpus}_gpus")
    skip_without_gpus = pytest.mark.skipif(
        cuda_device_count_stateless() < num_gpus,
        reason=f"Need at least {num_gpus} GPUs to run the test.",
    )

    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
        # Innermost first: process-fork wrapper, then skip guard, then the
        # selection mark on the outside.
        forked = fork_new_process_for_each_test(f)
        guarded = skip_without_gpus(forked)
        return gpu_count_marker(guarded)

    return wrapper
async
def
completions_with_server_args
(
prompts
:
List
[
str
],
model_name
:
str
,
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment