Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
377d10bd
Unverified
Commit
377d10bd
authored
Feb 19, 2025
by
Cyrus Leung
Committed by
GitHub
Feb 19, 2025
Browse files
[VLM][Bugfix] Pass processor kwargs properly on init (#13516)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
52ce14d3
Changes
44
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
383 additions
and
288 deletions
+383
-288
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+1
-0
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+2
-5
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+131
-94
tests/models/multimodal/processing/test_idefics3.py
tests/models/multimodal/processing/test_idefics3.py
+16
-8
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+107
-35
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_next.py
+4
-13
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_llava_onevision.py
+4
-13
tests/models/multimodal/processing/test_phi3v.py
tests/models/multimodal/processing/test_phi3v.py
+7
-6
tests/models/multimodal/processing/test_qwen2_vl.py
tests/models/multimodal/processing/test_qwen2_vl.py
+8
-8
tests/models/utils.py
tests/models/utils.py
+11
-7
tests/multimodal/test_processing.py
tests/multimodal/test_processing.py
+5
-5
vllm/inputs/registry.py
vllm/inputs/registry.py
+30
-47
vllm/model_executor/models/aria.py
vllm/model_executor/models/aria.py
+2
-2
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+2
-2
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/deepseek_vl2.py
+5
-10
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+2
-2
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+6
-9
vllm/model_executor/models/gritlm.py
vllm/model_executor/models/gritlm.py
+2
-7
vllm/model_executor/models/h2ovl.py
vllm/model_executor/models/h2ovl.py
+31
-10
vllm/model_executor/models/idefics3.py
vllm/model_executor/models/idefics3.py
+7
-5
No files found.
examples/offline_inference/vision_language_multi_image.py
View file @
377d10bd
...
...
@@ -85,6 +85,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
trust_remote_code
=
True
,
max_model_len
=
8192
,
limit_mm_per_prompt
=
{
"image"
:
len
(
image_urls
)},
mm_processor_kwargs
=
{
"max_dynamic_patch"
:
4
},
)
placeholders
=
"
\n
"
.
join
(
f
"Image-
{
i
}
: <image>
\n
"
...
...
tests/models/multimodal/processing/test_common.py
View file @
377d10bd
...
...
@@ -10,7 +10,7 @@ from vllm.config import ModelConfig
from
vllm.inputs
import
InputProcessingContext
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.processing
import
ProcessingCache
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
....multimodal.utils
import
random_audio
,
random_image
,
random_video
from
...registry
import
HF_EXAMPLE_MODELS
...
...
@@ -42,10 +42,7 @@ def _test_processing_correctness(
factories
=
MULTIMODAL_REGISTRY
.
_processor_factories
[
model_cls
]
ctx
=
InputProcessingContext
(
model_config
,
tokenizer
=
cached_get_tokenizer
(
model_config
.
tokenizer
,
trust_remote_code
=
model_info
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
model_config
),
)
# Ensure that it can fit all of the data
cache
=
ProcessingCache
(
capacity
=
1
<<
30
)
...
...
tests/models/multimodal/processing/test_h2ovl.py
View file @
377d10bd
# SPDX-License-Identifier: Apache-2.0
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from
typing
import
Optional
from
typing
import
Mapping
,
Optional
import
pytest
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
def
_get_expected_num_patches
(
config
:
PretrainedConfig
,
image
:
Image
.
Image
,
num_imgs
:
int
,
min_num
:
int
,
max_num
:
int
,
):
from
vllm.model_executor.models.h2ovl
import
(
calculate_h2ovl_targets
,
get_h2ovl_target_ratios
)
width
,
height
=
image
.
size
# Calculate the expected number of blocks
if
num_imgs
==
1
and
config
.
use_msac
:
# First pass
blocks1
,
_
,
_
,
aspect_ratio
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
=
1
,
max_num
=
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
# Thumbnail is handled separately
)
# Second pass
blocks2
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
=
3
,
max_num
=
max_num
,
prior_aspect_ratio
=
aspect_ratio
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if
config
.
use_thumbnail
:
blocks1
+=
1
if
blocks1
>
1
else
0
blocks2
+=
1
if
blocks2
>
1
else
0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks
=
blocks1
+
blocks2
-
1
return
total_blocks
blocks
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
expected_num_patches
=
blocks
if
config
.
use_thumbnail
and
expected_num_patches
>
1
:
expected_num_patches
+=
1
return
expected_num_patches
def
_run_check
(
processor
:
BaseMultiModalProcessor
,
images
:
list
[
Image
.
Image
],
min_num
:
int
,
max_num
:
int
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
):
tokenizer
=
processor
.
info
.
get_tokenizer
()
config
=
processor
.
info
.
get_hf_config
()
mm_data
=
{
"image"
:
images
}
total_expected_num_patches
=
sum
(
_get_expected_num_patches
(
config
,
image
,
len
(
images
),
min_num
,
max_num
)
for
image
in
images
)
processed_inputs
=
processor
.
apply
(
"<image>"
*
len
(
images
),
mm_data
,
mm_processor_kwargs
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"h2oai/h2ovl-mississippi-800m"
,
"h2oai/h2ovl-mississippi-2b"
,
...
...
@@ -25,118 +126,54 @@ from ...utils import build_model_context
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
[
4.0
,
2.0
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
(
"min_dynamic_patch"
,
"max_dynamic_patch"
),
[(
1
,
1
),
(
1
,
2
),
(
1
,
4
),
(
1
,
8
),
(
2
,
4
),
(
4
,
8
)],
)
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"
num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"
kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
model_id
:
str
,
image_assets
:
_ImageAssets
,
size_factors
:
list
[
int
],
min_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
Optional
[
bool
],
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
from
vllm.model_executor.models.h2ovl
import
(
calculate_h2ovl_targets
,
get_h2ovl_target_ratios
)
mm_processor_kwargs
=
{
"min_dynamic_patch"
:
min_dynamic_patch
,
"max_dynamic_patch"
:
max_dynamic_patch
,
"dynamic_image_size"
:
dynamic_image_size
,
}
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
len
(
size_factors
)},
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
config
=
processor
.
info
.
get_hf_config
()
use_msac
=
config
.
use_msac
mm_processor_kwargs
=
{
"max_dynamic_patch"
:
max_dynamic_patch
,
}
if
dynamic_image_size
is
not
None
:
mm_processor_kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
min_num
=
config
.
min_dynamic_patch
min_num
=
min_dynamic_patch
if
dynamic_image_size
else
1
max_num
=
max_dynamic_patch
if
dynamic_image_size
else
1
# Build the image str / prompt based on the number of images we pass
prompt
=
"<image>"
*
num_imgs
for
asset
in
image_assets
:
for
factor
in
size_factors
:
image
=
rescale_image_size
(
asset
.
pil_image
,
factor
)
mm_data
=
{
"image"
:
[
image
]
*
num_imgs
}
width
,
height
=
image
.
size
# Calculate the expected number of blocks
if
num_imgs
==
1
and
use_msac
:
# First pass
blocks1
,
_
,
_
,
aspect_ratio
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
# Thumbnail is handled separately
)
# Second pass
blocks2
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
aspect_ratio
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if
config
.
use_thumbnail
:
blocks1
+=
1
if
blocks1
>
1
else
0
blocks2
+=
1
if
blocks2
>
1
else
0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks
=
blocks1
+
blocks2
-
1
expected_num_patches
=
total_blocks
else
:
blocks
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
expected_num_patches
=
blocks
if
config
.
use_thumbnail
and
expected_num_patches
!=
1
:
expected_num_patches
+=
1
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
pixel_shape
=
(
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
)
assert
pixel_shape
[
0
]
==
expected_num_patches
*
num_imgs
_run_check
(
processor
,
[
rescale_image_size
(
image_assets
[
0
].
pil_image
,
f
)
for
f
in
size_factors
],
min_num
,
max_num
,
hf_processor_mm_kwargs
,
)
tests/models/multimodal/processing/test_idefics3.py
View file @
377d10bd
...
...
@@ -4,7 +4,7 @@ import pytest
from
transformers
import
Idefics3Config
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
...
...
@@ -22,9 +22,15 @@ models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
])
# yapf: enable
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_override
(
image_assets
:
_ImageAssets
,
model
:
str
,
mm_processor_kwargs
:
dict
[
str
,
object
],
expected_toks_per_img
:
int
,
num_imgs
:
int
):
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
image_assets
:
_ImageAssets
,
model
:
str
,
mm_processor_kwargs
:
dict
[
str
,
object
],
expected_toks_per_img
:
int
,
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
...
...
@@ -33,15 +39,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_
get_
tokenizer
(
ctx
.
model_config
.
tokenizer
)
tokenizer
=
cached_tokenizer
_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
mm_processor_kwargs
)
hf_processor
_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders
=
"<image>"
if
num_imgs
==
1
else
"
\n
"
.
join
(
...
...
@@ -54,8 +60,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
dummy_image
=
image_assets
[
0
].
pil_image
.
resize
(
dummy_image_size
)
mm_data
=
{
"image"
:
[
dummy_image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
# Ensure the placeholders format are correct
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
hf_processed_inputs
=
hf_processor
(
text
=
prompt
,
images
=
mm_data
[
"image"
])
assert
processed_inputs
[
"prompt_token_ids"
]
==
hf_processed_inputs
[
"input_ids"
][
0
]
...
...
tests/models/multimodal/processing/test_internvl.py
View file @
377d10bd
# SPDX-License-Identifier: Apache-2.0
"""Tests for InternVL's multimodal preprocessing kwargs."""
from
typing
import
Optional
from
typing
import
Mapping
,
Optional
import
pytest
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
def
_get_expected_num_patches
(
config
:
PretrainedConfig
,
image
:
Image
.
Image
,
num_imgs
:
int
,
min_num
:
int
,
max_num
:
int
,
):
from
vllm.model_executor.models.internvl
import
(
calculate_internvl_targets
,
get_internvl_target_ratios
)
width
,
height
=
image
.
size
blocks
,
_
,
_
=
calculate_internvl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_internvl_target_ratios
(
min_num
,
max_num
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
expected_num_patches
=
blocks
if
config
.
use_thumbnail
and
expected_num_patches
>
1
:
expected_num_patches
+=
1
return
expected_num_patches
def
_run_check
(
processor
:
BaseMultiModalProcessor
,
images
:
list
[
Image
.
Image
],
min_num
:
int
,
max_num
:
int
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
):
tokenizer
=
processor
.
info
.
get_tokenizer
()
config
=
processor
.
info
.
get_hf_config
()
mm_data
=
{
"image"
:
images
}
total_expected_num_patches
=
sum
(
_get_expected_num_patches
(
config
,
image
,
len
(
images
),
min_num
,
max_num
)
for
image
in
images
)
processed_inputs
=
processor
.
apply
(
"<image>"
*
len
(
images
),
mm_data
,
mm_processor_kwargs
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"OpenGVLab/InternVL2-2B"
])
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
# Single-scale
[
1.0
],
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
[
4.0
,
2.0
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
(
"min_dynamic_patch"
,
"max_dynamic_patch"
),
[(
1
,
1
),
(
1
,
2
),
(
1
,
4
),
(
1
,
8
),
(
2
,
4
),
(
4
,
8
)],
)
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
model_id
:
str
,
image_assets
:
_ImageAssets
,
size_factors
:
list
[
int
],
min_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
Optional
[
bool
],
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
mm_processor_kwargs
=
{
"min_dynamic_patch"
:
min_dynamic_patch
,
"max_dynamic_patch"
:
max_dynamic_patch
,
"dynamic_image_size"
:
dynamic_image_size
,
}
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
len
(
size_factors
)},
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
mm_processor_kwargs
=
{
"max_dynamic_patch"
:
max_dynamic_patch
,
}
if
dynamic_image_size
is
not
None
:
mm_processor_kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
min_num
=
min_dynamic_patch
if
dynamic_image_size
else
1
max_num
=
max_dynamic_patch
if
dynamic_image_size
else
1
# Build the image str / prompt based on the number of images we pass
prompt
=
"<image>"
*
num_imgs
image
=
image_assets
[
0
].
pil_image
.
resize
((
448
*
2
,
448
*
2
))
mm_data
=
{
"image"
:
[
image
]
*
num_imgs
}
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
if
dynamic_image_size
is
False
:
expected_num_patches
=
1
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
expected_num_patches
*
num_imgs
assert
pixel_shape
[
0
]
==
expected_num_patches
*
num_imgs
_run_check
(
processor
,
[
rescale_image_size
(
image_assets
[
0
].
pil_image
,
f
)
for
f
in
size_factors
],
min_num
,
max_num
,
hf_processor_mm_kwargs
,
)
tests/models/multimodal/processing/test_llava_next.py
View file @
377d10bd
...
...
@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
...utils
import
build_model_context
...
...
@@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
info
=
processor
.
info
...
...
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
...
...
@@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
seen_aspect_ratios
=
set
[
float
]()
...
...
tests/models/multimodal/processing/test_llava_onevision.py
View file @
377d10bd
...
...
@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
...utils
import
build_model_context
...
...
@@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
info
=
processor
.
info
...
...
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
...
...
@@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
seen_aspect_ratios
=
set
[
float
]()
...
...
tests/models/multimodal/processing/test_phi3v.py
View file @
377d10bd
...
...
@@ -3,7 +3,7 @@
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
...
...
@@ -21,12 +21,14 @@ from ...utils import build_model_context
])
# yapf: enable
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
image_assets
:
_ImageAssets
,
model_id
:
str
,
mm_processor_kwargs
:
dict
[
str
,
int
],
expected_toks_per_img
:
int
,
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
"""Ensure input_processor_for_phi3v handles num_crops properly."""
# Avoid initializing CUDA early
...
...
@@ -36,23 +38,22 @@ def test_processor_override(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
img_str
=
""
.
join
([
f
"<|image_
{
idx
}
|>
\n
"
for
idx
in
range
(
1
,
num_imgs
+
1
)])
prompt
=
f
"<|user|>
\n
{
img_str
}
<|end|>
\n
<|assistant|>
\n
"
mm_data
=
{
"image"
:
[
image_assets
[
0
].
pil_image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm
_processor_kwargs
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf
_processor_
mm_
kwargs
)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
_IMAGE_TOKEN_ID
)
...
...
tests/models/multimodal/processing/test_qwen2_vl.py
View file @
377d10bd
...
...
@@ -3,7 +3,7 @@
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
...
...
@@ -18,6 +18,7 @@ from ...utils import build_model_context
])
# yapf: enable
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
image_assets
:
_ImageAssets
,
model_id
:
str
,
...
...
@@ -25,31 +26,30 @@ def test_processor_override(
expected_toks_per_img
:
int
,
expected_pixels_shape
:
tuple
[
int
,
int
],
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
mm_processor_kwargs
=
None
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
prompt
=
"<|vision_start|><|image_pad|><|vision_end|>"
*
num_imgs
mm_data
=
{
"image"
:
[
image_assets
[
0
].
pil_image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm
_processor_kwargs
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf
_processor_
mm_
kwargs
)
# Ensure we have the right number of placeholders per num_crops size
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
mm
_processor_kwargs
)
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
hf
_processor_
mm_
kwargs
)
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
hf_processor
.
image_token
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values"
].
shape
...
...
tests/models/utils.py
View file @
377d10bd
...
...
@@ -248,13 +248,16 @@ def check_logprobs_close(
warnings
.
warn
(
fail_msg
,
stacklevel
=
2
)
def
build_model_context
(
model_name
:
str
,
task
:
TaskOption
=
"auto"
,
tokenizer_name
:
Optional
[
str
]
=
None
,
trust_remote_code
:
bool
=
False
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
None
,
mm_processor_kwargs
:
Optional
[
Dict
]
=
None
,
limit_mm_per_prompt
:
Optional
[
Dict
]
=
None
):
def
build_model_context
(
model_name
:
str
,
task
:
TaskOption
=
"auto"
,
tokenizer_name
:
Optional
[
str
]
=
None
,
trust_remote_code
:
bool
=
False
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
None
,
mm_processor_kwargs
:
Optional
[
Dict
]
=
None
,
limit_mm_per_prompt
:
Optional
[
Dict
]
=
None
,
disable_mm_preprocessor_cache
:
bool
=
True
,
):
"""Creates an InputContext for a given model.
Args:
...
...
@@ -283,5 +286,6 @@ def build_model_context(model_name: str,
seed
=
0
,
mm_processor_kwargs
=
mm_processor_kwargs
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
disable_mm_preprocessor_cache
=
disable_mm_preprocessor_cache
,
)
return
InputContext
(
model_config
)
tests/multimodal/test_processing.py
View file @
377d10bd
...
...
@@ -22,8 +22,8 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
replace_token_matches
)
# yapf: enable
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.
multimodal.utils
import
cached_get_t
okenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.
transformers_utils.tokenizer
import
(
AnyT
okenizer
,
cached_tokenizer_from_config
)
from
vllm.utils
import
full_groupby
from
.utils
import
random_image
...
...
@@ -577,7 +577,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
,
tokenizer
=
cached_
get_
tokenizer
(
model_config
.
tokenizer
),
tokenizer
=
cached_tokenizer
_from_config
(
model_config
),
)
profiler
=
MultiModalProfiler
(
processor
)
...
...
@@ -617,7 +617,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
,
tokenizer
=
cached_
get_
tokenizer
(
model_config
.
tokenizer
),
tokenizer
=
cached_tokenizer
_from_config
(
model_config
),
)
rng
=
np
.
random
.
RandomState
(
0
)
...
...
@@ -689,7 +689,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
,
tokenizer
=
cached_
get_
tokenizer
(
model_config
.
tokenizer
),
tokenizer
=
cached_tokenizer
_from_config
(
model_config
),
)
orig_get_hf_processor
=
processor
.
info
.
get_hf_processor
...
...
vllm/inputs/registry.py
View file @
377d10bd
...
...
@@ -11,8 +11,9 @@ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
from
typing_extensions
import
TypeVar
,
assert_never
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.processor
import
cached_get_processor
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.processor
import
cached_processor_from_config
from
vllm.transformers_utils.tokenizer
import
(
AnyTokenizer
,
cached_tokenizer_from_config
)
from
vllm.utils
import
(
ClassRegistry
,
get_allowed_kwarg_only_overrides
,
resolve_mm_processor_kwargs
)
...
...
@@ -27,19 +28,9 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
C
=
TypeVar
(
"C"
,
bound
=
PretrainedConfig
,
default
=
PretrainedConfig
)
P
=
TypeVar
(
"P"
,
bound
=
ProcessorMixin
,
default
=
ProcessorMixin
)
class
HashableDict
(
dict
):
"""
A dictionary that can be hashed by lru_cache.
"""
# NOTE: pythonic dict is not hashable,
# we override on it directly for simplicity
def
__hash__
(
self
)
->
int
:
# type: ignore[override]
return
hash
(
frozenset
(
self
.
items
()))
_T
=
TypeVar
(
"_T"
)
_C
=
TypeVar
(
"_C"
,
bound
=
PretrainedConfig
,
default
=
PretrainedConfig
)
_P
=
TypeVar
(
"_P"
,
bound
=
ProcessorMixin
,
default
=
ProcessorMixin
)
@
dataclass
(
frozen
=
True
)
...
...
@@ -54,9 +45,9 @@ class InputContext:
def
get_hf_config
(
self
,
typ
:
Union
[
type
[
C
],
tuple
[
type
[
C
],
...]]
=
PretrainedConfig
,
typ
:
Union
[
type
[
_
C
],
tuple
[
type
[
_
C
],
...]]
=
PretrainedConfig
,
/
,
)
->
C
:
)
->
_
C
:
"""
Get the HuggingFace configuration
(:class:`transformers.PretrainedConfig`) of the model,
...
...
@@ -94,10 +85,10 @@ class InputContext:
def
get_hf_processor
(
self
,
typ
:
Union
[
type
[
P
],
tuple
[
type
[
P
],
...]]
=
ProcessorMixin
,
typ
:
Union
[
type
[
_
P
],
tuple
[
type
[
_
P
],
...]]
=
ProcessorMixin
,
/
,
**
kwargs
:
object
,
)
->
P
:
)
->
_
P
:
"""
Get the HuggingFace processor
(:class:`transformers.ProcessorMixin`) of the model,
...
...
@@ -106,33 +97,29 @@ class InputContext:
Raises:
TypeError: If the processor is not of the specified type.
"""
return
cached_processor_from_config
(
self
.
model_config
,
processor_cls
=
typ
,
**
kwargs
,
)
def
init_processor
(
self
,
typ
:
type
[
_T
],
/
,
**
kwargs
:
object
,
)
->
_T
:
"""
Initialize a HuggingFace-like processor class, merging the
keyword arguments with those in the model's configuration.
"""
base_kwargs
=
self
.
model_config
.
mm_processor_kwargs
if
base_kwargs
is
None
:
base_kwargs
=
{}
merged_kwargs
=
{
**
base_kwargs
,
**
kwargs
}
if
isinstance
(
typ
,
type
):
merged_kwargs
[
"processor_cls"
]
=
typ
# NOTE: Pythonic dict is not hashable and will raise unhashable type
# error when calling `cached_get_processor`, therefore we need to
# wrap it to a hashable dict.
for
key
,
value
in
merged_kwargs
.
items
():
if
isinstance
(
value
,
dict
):
merged_kwargs
[
key
]
=
HashableDict
(
value
)
hf_processor
=
cached_get_processor
(
self
.
model_config
.
model
,
trust_remote_code
=
self
.
model_config
.
trust_remote_code
,
**
merged_kwargs
,
)
if
not
isinstance
(
hf_processor
,
typ
):
raise
TypeError
(
"Invalid type of HuggingFace processor. "
f
"Expected type:
{
typ
}
, but "
f
"found type:
{
type
(
hf_processor
)
}
"
)
return
hf_processor
return
typ
(
**
merged_kwargs
)
@
dataclass
(
frozen
=
True
)
...
...
@@ -142,10 +129,10 @@ class InputProcessingContext(InputContext):
def
get_hf_processor
(
self
,
typ
:
Union
[
type
[
P
],
tuple
[
type
[
P
],
...]]
=
ProcessorMixin
,
typ
:
Union
[
type
[
_
P
],
tuple
[
type
[
_
P
],
...]]
=
ProcessorMixin
,
/
,
**
kwargs
:
object
,
)
->
P
:
)
->
_
P
:
return
super
().
get_hf_processor
(
typ
,
tokenizer
=
self
.
tokenizer
,
...
...
@@ -341,13 +328,9 @@ class InputRegistry:
from
vllm.model_executor.model_loader
import
get_model_architecture
from
vllm.multimodal
import
MultiModalKwargs
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.multimodal.utils
import
cached_get_tokenizer
if
mm_registry
.
has_processor
(
model_config
):
tokenizer
=
cached_get_tokenizer
(
model_config
.
tokenizer
,
trust_remote_code
=
model_config
.
trust_remote_code
,
)
tokenizer
=
cached_tokenizer_from_config
(
model_config
)
processor
=
mm_registry
.
create_processor
(
model_config
,
tokenizer
)
profiler
=
MultiModalProfiler
(
processor
)
dummy_data
=
profiler
.
get_dummy_data
(
...
...
vllm/model_executor/models/aria.py
View file @
377d10bd
...
...
@@ -400,8 +400,8 @@ class AriaProcessingInfo(BaseProcessingInfo):
def
get_vision_config
(
self
):
return
self
.
get_hf_config
().
vision_config
def
get_hf_processor
(
self
):
return
self
.
ctx
.
get_hf_processor
(
AriaProcessor
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
return
self
.
ctx
.
get_hf_processor
(
AriaProcessor
,
**
kwargs
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
}
...
...
vllm/model_executor/models/chameleon.py
View file @
377d10bd
...
...
@@ -58,8 +58,8 @@ class ChameleonProcessingInfo(BaseProcessingInfo):
def
get_hf_config
(
self
):
return
self
.
ctx
.
get_hf_config
(
ChameleonConfig
)
def
get_hf_processor
(
self
):
return
self
.
ctx
.
get_hf_processor
(
ChameleonProcessor
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
return
self
.
ctx
.
get_hf_processor
(
ChameleonProcessor
,
**
kwargs
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
1
}
...
...
vllm/model_executor/models/deepseek_vl2.py
View file @
377d10bd
...
...
@@ -28,13 +28,13 @@ from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
PromptReplacement
)
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs.deepseek_vl2
import
(
DeepseekVLV2Config
,
MlpProjectorConfig
,
VisionEncoderConfig
)
from
vllm.transformers_utils.processors.deepseek_vl2
import
(
DeepseekVLV2Processor
)
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
vllm.utils
import
is_list_of
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
...
...
@@ -133,8 +133,8 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
def
get_hf_config
(
self
):
return
self
.
ctx
.
get_hf_config
(
DeepseekVLV2Config
)
def
get_hf_processor
(
self
)
->
DeepseekVLV2Processor
:
return
self
.
ctx
.
get_hf_processor
(
DeepseekVLV2Processor
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
:
return
self
.
ctx
.
get_hf_processor
(
DeepseekVLV2Processor
,
**
kwargs
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
}
...
...
@@ -308,13 +308,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self
.
text_config
=
config
.
text_config
model_config
=
vllm_config
.
model_config
tokenizer
=
cached_get_tokenizer
(
model_config
.
tokenizer
,
tokenizer_mode
=
model_config
.
tokenizer_mode
,
tokenizer_revision
=
model_config
.
tokenizer_revision
,
trust_remote_code
=
model_config
.
trust_remote_code
,
)
self
.
image_token_id
=
tokenizer
.
vocab
.
get
(
_IMAGE_TOKEN
)
tokenizer
=
cached_tokenizer_from_config
(
model_config
)
self
.
image_token_id
=
tokenizer
.
vocab
[
_IMAGE_TOKEN
]
self
.
vision
=
self
.
_init_vision_module
(
self
.
vision_config
,
quant_config
,
...
...
vllm/model_executor/models/fuyu.py
View file @
377d10bd
...
...
@@ -71,8 +71,8 @@ class FuyuProcessingInfo(BaseProcessingInfo):
def
get_hf_config
(
self
):
return
self
.
ctx
.
get_hf_config
(
FuyuConfig
)
def
get_hf_processor
(
self
):
return
self
.
ctx
.
get_hf_processor
(
FuyuProcessor
)
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
return
self
.
ctx
.
get_hf_processor
(
FuyuProcessor
,
**
kwargs
)
def
get_image_processor
(
self
)
->
FuyuImageProcessor
:
return
self
.
get_hf_processor
().
image_processor
...
...
vllm/model_executor/models/glm4v.py
View file @
377d10bd
...
...
@@ -416,18 +416,15 @@ class GLM4VProcessor:
class
GLM4VProcessingInfo
(
BaseProcessingInfo
):
def
get_tokenizer
(
self
):
tokenizer
=
self
.
ctx
.
tokenizer
assert
isinstance
(
tokenizer
,
PreTrainedTokenizer
)
return
tokenizer
def
get_hf_config
(
self
):
return
self
.
ctx
.
get_hf_config
(
ChatGLMConfig
)
def
get_hf_processor
(
self
)
->
GLM4VProcessor
:
return
GLM4VProcessor
(
self
.
get_hf_config
(),
self
.
get_tokenizer
(),
def
get_hf_processor
(
self
,
**
kwargs
:
object
)
->
GLM4VProcessor
:
return
self
.
ctx
.
init_processor
(
GLM4VProcessor
,
config
=
self
.
get_hf_config
(),
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
...
...
vllm/model_executor/models/gritlm.py
View file @
377d10bd
...
...
@@ -15,9 +15,9 @@ from vllm.model_executor.layers.pooler import PoolerHead
from
vllm.model_executor.models.llama
import
LlamaForCausalLM
from
vllm.model_executor.pooling_metadata
import
(
PoolingMetadata
,
PoolingTensors
)
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
(
IntermediateTensors
,
PoolerOutput
,
PoolingSequenceGroupOutput
)
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
logger
=
init_logger
(
__name__
)
...
...
@@ -29,12 +29,7 @@ class GritLMPooler(nn.Module):
self
.
model_config
=
model_config
tokenizer
=
cached_get_tokenizer
(
self
.
model_config
.
tokenizer
,
tokenizer_mode
=
self
.
model_config
.
tokenizer_mode
,
tokenizer_revision
=
self
.
model_config
.
tokenizer_revision
,
trust_remote_code
=
self
.
model_config
.
trust_remote_code
,
)
tokenizer
=
cached_tokenizer_from_config
(
self
.
model_config
)
# Collect the tokens needed for pattern matching.
# "▁<" is different from "_<". The former uses "▁" to indicate that
...
...
vllm/model_executor/models/h2ovl.py
View file @
377d10bd
...
...
@@ -41,6 +41,7 @@ def resolve_h2ovl_min_max_num(
dynamic_image_size
:
bool
,
use_thumbnail
:
bool
,
)
->
tuple
[
int
,
int
]:
min_dynamic_patch
=
min_dynamic_patch
if
dynamic_image_size
else
1
max_dynamic_patch
=
max_dynamic_patch
if
dynamic_image_size
else
1
if
use_thumbnail
and
max_dynamic_patch
!=
1
:
...
...
@@ -190,7 +191,7 @@ def image_to_pixel_values_h2ovl(
pixel_values1
,
aspect_ratio1
=
_preprocess_image
(
image
,
input_size
=
input_size
,
min_num
=
min_num
,
min_num
=
1
,
max_num
=
max_num
,
use_thumbnail
=
True
,
prior_aspect_ratio
=
None
,
...
...
@@ -199,7 +200,7 @@ def image_to_pixel_values_h2ovl(
pixel_values2
,
_
=
_preprocess_image
(
image
,
input_size
=
input_size
,
min_num
=
3
,
# Hardcoded value
min_num
=
3
,
max_num
=
max_num
,
use_thumbnail
=
True
,
prior_aspect_ratio
=
aspect_ratio1
,
...
...
@@ -228,6 +229,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
config
:
PretrainedConfig
,
tokenizer
:
AnyTokenizer
,
*
,
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
use_msac
:
Optional
[
bool
]
=
None
,
...
...
@@ -235,6 +237,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
super
().
__init__
(
config
,
tokenizer
,
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
...
...
@@ -267,11 +270,13 @@ class H2OVLProcessor(BaseInternVLProcessor):
def
resolve_min_max_num
(
self
,
*
,
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
use_thumbnail
:
Optional
[
bool
]
=
None
,
)
->
tuple
[
int
,
int
]:
min_dynamic_patch
=
self
.
min_dynamic_patch
min_dynamic_patch
=
(
self
.
min_dynamic_patch
if
min_dynamic_patch
is
None
else
min_dynamic_patch
)
max_dynamic_patch
=
(
self
.
max_dynamic_patch
if
max_dynamic_patch
is
None
else
max_dynamic_patch
)
dynamic_image_size
=
(
self
.
dynamic_image_size
if
dynamic_image_size
...
...
@@ -289,18 +294,21 @@ class H2OVLProcessor(BaseInternVLProcessor):
def
resolve_target_ratios
(
self
,
*
,
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
use_thumbnail
:
Optional
[
bool
]
=
None
,
prior_aspect_ratio
:
Optional
[
tuple
[
int
,
int
]]
=
None
,
override_min_num
:
Optional
[
int
]
=
None
,
)
->
list
[
tuple
[
int
,
int
]]:
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
use_thumbnail
,
)
if
prior_aspect_ratio
:
# hardcoded value for second pass of use_msac
min_num
=
3
if
override_min_num
is
not
None
:
min_num
=
override_min_num
return
get_h2ovl_target_ratios
(
min_num
,
...
...
@@ -322,6 +330,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
if
use_msac
:
target_ratios_1
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
override_min_num
=
1
,
)
num_patches_1
,
_
,
_
,
aspect_ratio_1
=
calculate_h2ovl_targets
(
orig_width
=
image_width
,
...
...
@@ -334,6 +343,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
target_ratios_2
=
self
.
resolve_target_ratios
(
use_thumbnail
=
False
,
# Applied in calculate_targets
prior_aspect_ratio
=
aspect_ratio_1
,
override_min_num
=
3
,
)
num_patches_2
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
image_width
,
...
...
@@ -361,12 +371,14 @@ class H2OVLProcessor(BaseInternVLProcessor):
def
_images_to_pixel_values_lst
(
self
,
images
:
list
[
Image
.
Image
],
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
)
->
list
[
torch
.
Tensor
]:
use_msac
=
self
.
use_msac
if
len
(
images
)
==
1
else
False
min_num
,
max_num
=
self
.
resolve_min_max_num
(
min_dynamic_patch
=
min_dynamic_patch
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
use_thumbnail
=
False
,
# Applied in image_to_pixel_values
...
...
@@ -389,14 +401,23 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
def
get_hf_processor
(
self
,
*
,
min_dynamic_patch
:
Optional
[
int
]
=
None
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
**
kwargs
:
object
,
)
->
H2OVLProcessor
:
return
H2OVLProcessor
(
self
.
get_hf_config
(),
self
.
get_tokenizer
(),
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
if
min_dynamic_patch
is
not
None
:
kwargs
[
"min_dynamic_patch"
]
=
min_dynamic_patch
if
max_dynamic_patch
is
not
None
:
kwargs
[
"max_dynamic_patch"
]
=
max_dynamic_patch
if
dynamic_image_size
is
not
None
:
kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
return
self
.
ctx
.
init_processor
(
H2OVLProcessor
,
config
=
self
.
get_hf_config
(),
tokenizer
=
self
.
get_tokenizer
(),
**
kwargs
,
)
def
get_mm_max_tokens_per_item
(
...
...
vllm/model_executor/models/idefics3.py
View file @
377d10bd
...
...
@@ -83,13 +83,15 @@ ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs]
class
Idefics3ProcessingInfo
(
BaseProcessingInfo
):
def
get_hf_processor
(
self
,
*
,
size
:
Optional
[
Dict
[
str
,
int
]]
=
None
)
->
Idefics3Processor
:
self
,
*
,
size
:
Optional
[
Dict
[
str
,
int
]]
=
None
,
**
kwargs
:
object
,
)
->
Idefics3Processor
:
if
size
is
not
None
:
return
self
.
ctx
.
get_hf_processor
(
Idefics3Processor
,
size
=
size
)
kwargs
[
"size"
]
=
size
return
self
.
ctx
.
get_hf_processor
(
Idefics3Processor
)
return
self
.
ctx
.
get_hf_processor
(
Idefics3Processor
,
**
kwargs
)
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]]:
return
{
"image"
:
None
}
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment