Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
874f7c29
Unverified
Commit
874f7c29
authored
Jan 16, 2025
by
Roger Wang
Committed by
GitHub
Jan 16, 2025
Browse files
[Bugfix] Fix max image feature size for Llava-one-vision (#12104)
Signed-off-by:
Roger Wang
<
ywang@roblox.com
>
parent
92e793d9
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
129 additions
and
2 deletions
+129
-2
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_next.py
+61
-0
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_llava_onevision.py
+62
-0
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/llava_onevision.py
+6
-2
No files found.
tests/models/multimodal/processing/test_llava_next.py
View file @
874f7c29
...
...
@@ -13,6 +13,67 @@ from vllm.multimodal.utils import cached_get_tokenizer
from
...utils
import
build_model_context
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Record a failure if ``image_size`` needs more than ``max_tokens``.

    The number of image tokens is obtained from
    ``processor.info.get_num_image_tokens`` for the given width and height.
    Nothing is raised: a violation is appended to ``failed_size_excs`` so
    that the caller can report every failing size at once.

    Args:
        processor: Processor whose ``info`` computes the image token count.
        max_tokens: Upper bound that the token count must not exceed.
        failed_size_excs: Output list; receives ``(image_size, exception)``
            for each size that exceeds ``max_tokens``.
        image_size: The image size (``width`` / ``height``) to check.
    """
    feature_size = processor.info.get_num_image_tokens(
        image_width=image_size.width,
        image_height=image_size.height,
    )

    # Compare explicitly instead of via `assert`: assert statements are
    # removed when Python runs with -O, which would silently hide failures.
    if feature_size > max_tokens:
        failed_size_excs.append(
            (image_size, AssertionError(f"{feature_size} <= {max_tokens}")))
@pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
    """Check that no sampled image size exceeds the reported max image tokens.

    Builds a processor for ``model_id``, collects one image size per distinct
    aspect ratio in ``[1, 2]`` (width and height each in ``range(32, 4096)``),
    and validates each size against ``info.get_max_image_tokens()``.

    Raises:
        AssertionError: If any size produces more tokens than the maximum;
            the message lists every failing size.
    """
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
    )
    info = processor.info

    seen_aspect_ratios = set[float]()
    image_sizes = list[ImageSize]()

    # The aspect ratio of the grid layout is between 1 and 2
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for w, h in itertools.product(range(32, 4096), repeat=2):
        aspect_ratio = w / h
        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
            # Only the first size seen for each aspect ratio is kept.
            image_sizes.append(ImageSize(w, h))
            seen_aspect_ratios.add(aspect_ratio)

    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_max_tokens_one,
        processor,
        info.get_max_image_tokens(),  # type: ignore
        failed_size_excs,
    )
    # NOTE(review): the workers append to the shared `failed_size_excs` list;
    # this presumably relies on the thread-based pqdm -- confirm against the
    # file's imports.
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        details = "\n========\n".join(f"[{size}]\n{exc}"
                                      for size, exc in failed_size_excs)
        # Put the header on its own line so that the first failing entry is
        # formatted like all the following ones.
        msg = "Found failing image sizes:\n" + details
        raise AssertionError(msg)
def
_validate_image_prompt_replacements_one
(
processor
:
BaseMultiModalProcessor
,
num_imgs
:
int
,
...
...
tests/models/multimodal/processing/test_llava_onevision.py
View file @
874f7c29
...
...
@@ -13,6 +13,68 @@ from vllm.multimodal.utils import cached_get_tokenizer
from
...utils
import
build_model_context
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Record a failure if ``image_size`` needs more than ``max_tokens``.

    The number of image tokens is obtained from
    ``processor.info.get_num_image_tokens`` for the given width and height.
    Nothing is raised: a violation is appended to ``failed_size_excs`` so
    that the caller can report every failing size at once.

    Args:
        processor: Processor whose ``info`` computes the image token count.
        max_tokens: Upper bound that the token count must not exceed.
        failed_size_excs: Output list; receives ``(image_size, exception)``
            for each size that exceeds ``max_tokens``.
        image_size: The image size (``width`` / ``height``) to check.
    """
    feature_size = processor.info.get_num_image_tokens(
        image_width=image_size.width,
        image_height=image_size.height,
    )

    # Compare explicitly instead of via `assert`: assert statements are
    # removed when Python runs with -O, which would silently hide failures.
    if feature_size > max_tokens:
        failed_size_excs.append(
            (image_size, AssertionError(f"{feature_size} <= {max_tokens}")))
@pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
    """Check that no sampled image size exceeds the reported max image tokens.

    Builds a processor for ``model_id``, collects one image size per distinct
    aspect ratio in ``[1, 6]`` (width and height each in ``range(32, 4096)``),
    and validates each size against ``info.get_max_image_tokens()``.

    Raises:
        AssertionError: If any size produces more tokens than the maximum;
            the message lists every failing size.
    """
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
    )
    info = processor.info

    seen_aspect_ratios = set[float]()
    image_sizes = list[ImageSize]()

    # The aspect ratio of the grid layout is between 1 and 6
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for w, h in itertools.product(range(32, 4096), repeat=2):
        aspect_ratio = w / h
        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
            # Only the first size seen for each aspect ratio is kept.
            image_sizes.append(ImageSize(w, h))
            seen_aspect_ratios.add(aspect_ratio)

    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_max_tokens_one,
        processor,
        info.get_max_image_tokens(),  # type: ignore
        failed_size_excs,
    )
    # NOTE(review): the workers append to the shared `failed_size_excs` list;
    # this presumably relies on the thread-based pqdm -- confirm against the
    # file's imports.
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        details = "\n========\n".join(f"[{size}]\n{exc}"
                                      for size, exc in failed_size_excs)
        # Put the header on its own line so that the first failing entry is
        # formatted like all the following ones.
        msg = "Found failing image sizes:\n" + details
        raise AssertionError(msg)
def
_validate_image_prompt_replacements_one
(
processor
:
BaseMultiModalProcessor
,
num_imgs
:
int
,
...
...
vllm/model_executor/models/llava_onevision.py
View file @
874f7c29
...
...
@@ -19,8 +19,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldConfig
,
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.parse
import
(
MultiModalDataItems
,
VideoEmbeddingItems
,
VideoProcessorItems
)
from
vllm.multimodal.parse
import
(
ImageSize
,
MultiModalDataItems
,
VideoEmbeddingItems
,
VideoProcessorItems
)
from
vllm.multimodal.processing
import
PromptReplacement
from
vllm.multimodal.profiling
import
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
...
...
@@ -145,6 +145,10 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
return
(
unpadded_features
,
newline_features
)
def get_image_size_with_most_features(self) -> ImageSize:
    """Return the image size treated as producing the most image features.

    The dimensions are fixed constants rather than being computed here.
    """
    # NOTE: This hardcoded value is found via processor tests
    most_features_width, most_features_height = 1153, 944
    return ImageSize(width=most_features_width, height=most_features_height)
def
_get_num_frame_tokens
(
self
,
*
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment