Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6984c02a
Unverified
Commit
6984c02a
authored
Jun 26, 2024
by
Cyrus Leung
Committed by
GitHub
Jun 26, 2024
Browse files
[CI/Build] Refactor image test assets (#5821)
parent
3439c5a8
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
127 additions
and
92 deletions
+127
-92
tests/conftest.py
tests/conftest.py
+70
-41
tests/models/test_llava.py
tests/models/test_llava.py
+14
-12
tests/models/test_llava_next.py
tests/models/test_llava_next.py
+16
-14
tests/models/test_phi3v.py
tests/models/test_phi3v.py
+15
-13
tests/multimodal/test_processor.py
tests/multimodal/test_processor.py
+12
-12
No files found.
tests/conftest.py
View file @
6984c02a
import
contextlib
import
gc
import
os
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
TypeVar
from
collections
import
UserList
from
dataclasses
import
dataclass
from
functools
import
cached_property
from
pathlib
import
Path
from
typing
import
(
Any
,
Dict
,
List
,
Literal
,
Optional
,
Tuple
,
TypedDict
,
TypeVar
)
import
pytest
import
torch
...
...
@@ -28,21 +33,8 @@ _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"example.txt"
)]
_LONG_PROMPTS
=
[
os
.
path
.
join
(
_TEST_DIR
,
"prompts"
,
"summary.txt"
)]
# Multi modal related
# You can use `.buildkite/download-images.sh` to download the assets
PIXEL_VALUES_FILES
=
[
os
.
path
.
join
(
_TEST_DIR
,
"images"
,
filename
)
for
filename
in
[
"stop_sign_pixel_values.pt"
,
"cherry_blossom_pixel_values.pt"
]
]
IMAGE_FEATURES_FILES
=
[
os
.
path
.
join
(
_TEST_DIR
,
"images"
,
filename
)
for
filename
in
[
"stop_sign_image_features.pt"
,
"cherry_blossom_image_features.pt"
]
]
IMAGE_FILES
=
[
os
.
path
.
join
(
_TEST_DIR
,
"images"
,
filename
)
for
filename
in
[
"stop_sign.jpg"
,
"cherry_blossom.jpg"
]
]
assert
len
(
PIXEL_VALUES_FILES
)
==
len
(
IMAGE_FEATURES_FILES
)
==
len
(
IMAGE_FILES
)
_IMAGE_DIR
=
Path
(
_TEST_DIR
)
/
"images"
"""You can use `.buildkite/download-images.sh` to download the assets."""
def
_read_prompts
(
filename
:
str
)
->
List
[
str
]:
...
...
@@ -51,6 +43,63 @@ def _read_prompts(filename: str) -> List[str]:
return
prompts
@
dataclass
(
frozen
=
True
)
class
ImageAsset
:
name
:
Literal
[
"stop_sign"
,
"cherry_blossom"
]
@
cached_property
def
pixel_values
(
self
)
->
torch
.
Tensor
:
return
torch
.
load
(
_IMAGE_DIR
/
f
"
{
self
.
name
}
_pixel_values.pt"
)
@
cached_property
def
image_features
(
self
)
->
torch
.
Tensor
:
return
torch
.
load
(
_IMAGE_DIR
/
f
"
{
self
.
name
}
_image_features.pt"
)
@
cached_property
def
pil_image
(
self
)
->
Image
.
Image
:
return
Image
.
open
(
_IMAGE_DIR
/
f
"
{
self
.
name
}
.jpg"
)
def
for_hf
(
self
)
->
Image
.
Image
:
return
self
.
pil_image
def
for_vllm
(
self
,
vision_config
:
VisionLanguageConfig
)
->
MultiModalData
:
image_input_type
=
vision_config
.
image_input_type
ImageInputType
=
VisionLanguageConfig
.
ImageInputType
if
image_input_type
==
ImageInputType
.
IMAGE_FEATURES
:
return
ImageFeatureData
(
self
.
image_features
)
if
image_input_type
==
ImageInputType
.
PIXEL_VALUES
:
return
ImagePixelData
(
self
.
pil_image
)
raise
NotImplementedError
class
_ImageAssetPrompts
(
TypedDict
):
stop_sign
:
str
cherry_blossom
:
str
class
_ImageAssets
(
UserList
[
ImageAsset
]):
def
__init__
(
self
)
->
None
:
super
().
__init__
(
[
ImageAsset
(
"stop_sign"
),
ImageAsset
(
"cherry_blossom"
)])
def
prompts
(
self
,
prompts
:
_ImageAssetPrompts
)
->
List
[
str
]:
"""
Convenience method to define the prompt for each test image.
The order of the returned prompts matches the order of the
assets when iterating through this object.
"""
return
[
prompts
[
"stop_sign"
],
prompts
[
"cherry_blossom"
]]
IMAGE_ASSETS
=
_ImageAssets
()
"""Singleton instance of :class:`_ImageAssets`."""
def
cleanup
():
destroy_model_parallel
()
destroy_distributed_environment
()
...
...
@@ -81,31 +130,6 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup
()
@
pytest
.
fixture
(
scope
=
"session"
)
def
hf_images
()
->
List
[
Image
.
Image
]:
return
[
Image
.
open
(
filename
)
for
filename
in
IMAGE_FILES
]
@
pytest
.
fixture
()
def
vllm_images
(
request
)
->
List
[
MultiModalData
]:
vision_language_config
=
request
.
getfixturevalue
(
"model_and_config"
)[
1
]
if
vision_language_config
.
image_input_type
==
(
VisionLanguageConfig
.
ImageInputType
.
IMAGE_FEATURES
):
return
[
ImageFeatureData
(
torch
.
load
(
filename
))
for
filename
in
IMAGE_FEATURES_FILES
]
else
:
return
[
ImagePixelData
(
Image
.
open
(
filename
))
for
filename
in
IMAGE_FILES
]
@
pytest
.
fixture
()
def
vllm_image_tensors
(
request
)
->
List
[
torch
.
Tensor
]:
return
[
torch
.
load
(
filename
)
for
filename
in
PIXEL_VALUES_FILES
]
@
pytest
.
fixture
def
example_prompts
()
->
List
[
str
]:
prompts
=
[]
...
...
@@ -122,6 +146,11 @@ def example_long_prompts() -> List[str]:
return
prompts
@
pytest
.
fixture
(
scope
=
"session"
)
def
image_assets
()
->
_ImageAssets
:
return
IMAGE_ASSETS
_STR_DTYPE_TO_TORCH_DTYPE
=
{
"half"
:
torch
.
half
,
"bfloat16"
:
torch
.
bfloat16
,
...
...
tests/models/test_llava.py
View file @
6984c02a
...
...
@@ -5,17 +5,17 @@ from transformers import AutoTokenizer
from
vllm.config
import
VisionLanguageConfig
from
..conftest
import
IMAGE_
FILE
S
from
..conftest
import
IMAGE_
ASSET
S
pytestmark
=
pytest
.
mark
.
vlm
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS
=
[
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<image>
\n
USER: What's the content of the image?
\n
ASSISTANT:"
,
"cherry_blossom"
:
"<image>
\n
USER: What is the season?
\n
ASSISTANT:"
,
]
assert
len
(
HF_IMAGE_PROMPTS
)
==
len
(
IMAGE_FILES
)
})
def
iter_llava_configs
(
model_name
:
str
):
...
...
@@ -49,28 +49,28 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
It also reduces `output_str` from "<image><image>bla" to "bla".
"""
in
put_ids
,
output_str
=
vllm_output
out
put_ids
,
output_str
=
vllm_output
image_token_id
=
vlm_config
.
image_token_id
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_id
)
image_token_str
=
tokenizer
.
decode
(
image_token_id
)
hf_
in
put_ids
=
[
input
_id
for
idx
,
input
_id
in
enumerate
(
in
put_ids
)
if
input
_id
!=
image_token_id
or
in
put_ids
[
idx
-
1
]
!=
image_token_id
hf_
out
put_ids
=
[
token
_id
for
idx
,
token
_id
in
enumerate
(
out
put_ids
)
if
token
_id
!=
image_token_id
or
out
put_ids
[
idx
-
1
]
!=
image_token_id
]
hf_output_str
=
output_str
\
.
replace
(
image_token_str
*
vlm_config
.
image_feature_size
,
""
)
return
hf_
in
put_ids
,
hf_output_str
return
hf_
out
put_ids
,
hf_output_str
# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
@
pytest
.
mark
.
parametrize
(
"model_and_config"
,
model_and_vl_config
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
def
test_models
(
hf_runner
,
vllm_runner
,
hf_
image
s
,
vllm_images
,
model_and_config
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
def
test_models
(
hf_runner
,
vllm_runner
,
image
_assets
,
model_and_config
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
...
...
@@ -81,6 +81,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
The text output is sanitized to be able to compare with hf.
"""
model_id
,
vlm_config
=
model_and_config
hf_images
=
[
asset
.
for_hf
()
for
asset
in
image_assets
]
vllm_images
=
[
asset
.
for_vllm
(
vlm_config
)
for
asset
in
image_assets
]
with
hf_runner
(
model_id
,
dtype
=
dtype
,
is_vision_model
=
True
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
HF_IMAGE_PROMPTS
,
...
...
tests/models/test_llava_next.py
View file @
6984c02a
...
...
@@ -5,7 +5,7 @@ from transformers import AutoTokenizer
from
vllm.config
import
VisionLanguageConfig
from
..conftest
import
IMAGE_
FILE
S
from
..conftest
import
IMAGE_
ASSET
S
pytestmark
=
pytest
.
mark
.
vlm
...
...
@@ -15,12 +15,12 @@ _PREFACE = (
"questions."
)
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS
=
[
f
"
{
_PREFACE
}
<image>
\n
USER: What's the content of the image? ASSISTANT:"
,
f
"
{
_PREFACE
}
<image>
\n
USER: What
i
s the
season?
ASSISTANT:"
,
]
assert
len
(
HF_IMAGE_PROMPTS
)
==
len
(
IMAGE_FILES
)
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
f
"
{
_PREFACE
}
<image>
\n
USER: What
'
s the
content of the image?
\n
ASSISTANT:"
,
"cherry_blossom"
:
f
"
{
_PREFACE
}
<image>
\n
USER: What is the season?
\n
ASSISTANT:"
,
}
)
def
iter_llava_next_configs
(
model_name
:
str
):
...
...
@@ -56,20 +56,20 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
It also reduces `output_str` from "<image><image>bla" to "bla".
"""
in
put_ids
,
output_str
=
vllm_output
out
put_ids
,
output_str
=
vllm_output
image_token_id
=
vlm_config
.
image_token_id
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_id
)
image_token_str
=
tokenizer
.
decode
(
image_token_id
)
hf_
in
put_ids
=
[
input
_id
for
idx
,
input
_id
in
enumerate
(
in
put_ids
)
if
input
_id
!=
image_token_id
or
in
put_ids
[
idx
-
1
]
!=
image_token_id
hf_
out
put_ids
=
[
token
_id
for
idx
,
token
_id
in
enumerate
(
out
put_ids
)
if
token
_id
!=
image_token_id
or
out
put_ids
[
idx
-
1
]
!=
image_token_id
]
hf_output_str
=
output_str
\
.
replace
(
image_token_str
*
vlm_config
.
image_feature_size
,
" "
)
return
hf_
in
put_ids
,
hf_output_str
return
hf_
out
put_ids
,
hf_output_str
@
pytest
.
mark
.
xfail
(
...
...
@@ -78,8 +78,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
@
pytest
.
mark
.
parametrize
(
"model_and_config"
,
model_and_vl_config
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
def
test_models
(
hf_runner
,
vllm_runner
,
hf_
image
s
,
vllm_images
,
model_and_config
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
def
test_models
(
hf_runner
,
vllm_runner
,
image
_assets
,
model_and_config
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
...
...
@@ -90,6 +90,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
The text output is sanitized to be able to compare with hf.
"""
model_id
,
vlm_config
=
model_and_config
hf_images
=
[
asset
.
for_hf
()
for
asset
in
image_assets
]
vllm_images
=
[
asset
.
for_vllm
(
vlm_config
)
for
asset
in
image_assets
]
with
hf_runner
(
model_id
,
dtype
=
dtype
,
is_vision_model
=
True
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
HF_IMAGE_PROMPTS
,
...
...
tests/models/test_phi3v.py
View file @
6984c02a
...
...
@@ -6,17 +6,17 @@ from transformers import AutoTokenizer
from
vllm.config
import
VisionLanguageConfig
from
vllm.utils
import
is_cpu
from
..conftest
import
IMAGE_
FILE
S
from
..conftest
import
IMAGE_
ASSET
S
pytestmark
=
pytest
.
mark
.
vlm
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS
=
[
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<|user|>
\n
<|image_1|>
\n
What's the content of the image?<|end|>
\n
<|assistant|>
\n
"
,
# noqa: E501
"<|user|>
\n
<|image_1|>
\n
What is the season?<|end|>
\n
<|assistant|>
\n
"
,
]
assert
len
(
HF_IMAGE_PROMPTS
)
==
len
(
IMAGE_FILES
)
"cherry_blossom"
:
"<|user|>
\n
<|image_1|>
\n
What is the season?<|end|>
\n
<|assistant|>
\n
"
,
# noqa: E501
})
def
iter_phi3v_configs
(
model_name
:
str
):
...
...
@@ -50,22 +50,22 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
It also reduces `output_str` from "<image><image>bla" to "bla".
"""
in
put_ids
,
output_str
=
vllm_output
out
put_ids
,
output_str
=
vllm_output
image_token_id
=
vlm_config
.
image_token_id
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_id
)
image_token_str
=
tokenizer
.
decode
(
image_token_id
)
hf_
in
put_ids
=
[
input
_id
if
input
_id
!=
image_token_id
else
0
for
idx
,
input
_id
in
enumerate
(
in
put_ids
)
hf_
out
put_ids
=
[
token
_id
if
token
_id
!=
image_token_id
else
0
for
idx
,
token
_id
in
enumerate
(
out
put_ids
)
]
hf_output_str
=
output_str
\
.
replace
(
image_token_str
*
vlm_config
.
image_feature_size
,
""
)
\
.
replace
(
"<s>"
,
" "
).
replace
(
"<|user|>"
,
""
)
\
.
replace
(
"<|end|>
\n
<|assistant|>"
,
" "
)
return
hf_
in
put_ids
,
hf_output_str
return
hf_
out
put_ids
,
hf_output_str
target_dtype
=
"half"
...
...
@@ -82,8 +82,8 @@ if is_cpu():
@
pytest
.
mark
.
parametrize
(
"model_and_config"
,
model_and_vl_config
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
def
test_models
(
hf_runner
,
vllm_runner
,
hf_
image
s
,
vllm_images
,
model_and_config
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
def
test_models
(
hf_runner
,
vllm_runner
,
image
_assets
,
model_and_config
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
...
...
@@ -94,6 +94,8 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
The text output is sanitized to be able to compare with hf.
"""
model_id
,
vlm_config
=
model_and_config
hf_images
=
[
asset
.
for_hf
()
for
asset
in
image_assets
]
vllm_images
=
[
asset
.
for_vllm
(
vlm_config
)
for
asset
in
image_assets
]
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs
=
{
"_attn_implementation"
:
"eager"
}
...
...
tests/multimodal/test_processor.py
View file @
6984c02a
...
...
@@ -10,7 +10,7 @@ from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"float"
])
def
test_clip_image_processor
(
hf_
images
,
dtype
):
def
test_clip_image_processor
(
image
_asset
s
,
dtype
):
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
IMAGE_HEIGHT
=
IMAGE_WIDTH
=
560
...
...
@@ -35,13 +35,13 @@ def test_clip_image_processor(hf_images, dtype):
image_processor_revision
=
None
,
)
for
image
in
hf_
images
:
for
asset
in
image
_asset
s
:
hf_result
=
hf_processor
.
preprocess
(
image
,
asset
.
pil_
image
,
return_tensors
=
"pt"
,
).
to
(
dtype
=
_STR_DTYPE_TO_TORCH_DTYPE
[
dtype
])
vllm_result
=
MULTIMODAL_REGISTRY
.
process_input
(
ImagePixelData
(
image
),
ImagePixelData
(
asset
.
pil_
image
),
model_config
=
model_config
,
vlm_config
=
vlm_config
,
)
...
...
@@ -59,7 +59,7 @@ def test_clip_image_processor(hf_images, dtype):
reason
=
"Inconsistent image processor being used due to lack "
"of support for dynamic image token replacement"
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"float"
])
def
test_llava_next_image_processor
(
hf_
images
,
dtype
):
def
test_llava_next_image_processor
(
image
_asset
s
,
dtype
):
MODEL_NAME
=
"llava-hf/llava-v1.6-34b-hf"
IMAGE_HEIGHT
=
IMAGE_WIDTH
=
560
...
...
@@ -84,13 +84,13 @@ def test_llava_next_image_processor(hf_images, dtype):
image_processor_revision
=
None
,
)
for
image
in
hf_
images
:
for
asset
in
image
_asset
s
:
hf_result
=
hf_processor
.
preprocess
(
image
,
asset
.
pil_
image
,
return_tensors
=
"pt"
,
).
to
(
dtype
=
_STR_DTYPE_TO_TORCH_DTYPE
[
dtype
])
vllm_result
=
MULTIMODAL_REGISTRY
.
process_input
(
ImagePixelData
(
image
),
ImagePixelData
(
asset
.
pil_
image
),
model_config
=
model_config
,
vlm_config
=
vlm_config
,
)
...
...
@@ -107,7 +107,7 @@ def test_llava_next_image_processor(hf_images, dtype):
@
pytest
.
mark
.
xfail
(
reason
=
"Example image pixels were not processed using HuggingFace"
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
def
test_image_pixel_types
(
hf_
image
s
,
vllm_image_tensor
s
,
dtype
):
def
test_image_pixel_types
(
image
_asset
s
,
dtype
):
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
IMAGE_HEIGHT
=
IMAGE_WIDTH
=
560
...
...
@@ -129,14 +129,14 @@ def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
image_processor_revision
=
None
,
)
for
image
,
tensor
in
zip
(
hf_images
,
vllm_image_tensors
)
:
for
asset
in
image_assets
:
image_result
=
MULTIMODAL_REGISTRY
.
process_input
(
ImagePixelData
(
image
),
ImagePixelData
(
asset
.
pil_
image
),
model_config
=
model_config
,
vlm_config
=
vlm_config
,
)
tensor_result
=
MULTIMODAL_REGISTRY
.
process_input
(
ImagePixelData
(
tensor
),
ImagePixelData
(
asset
.
pixel_values
),
model_config
=
model_config
,
vlm_config
=
vlm_config
,
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment