Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66b809cc
Commit
66b809cc
authored
Feb 08, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.2' into v0.7.2-dev
parents
37b63c24
0408efc6
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
270 additions
and
331 deletions
+270
-331
tests/models/decoder_only/vision_language/vlm_utils/types.py
tests/models/decoder_only/vision_language/vlm_utils/types.py
+1
-0
tests/models/embedding/language/test_cls_models.py
tests/models/embedding/language/test_cls_models.py
+1
-0
tests/models/embedding/language/test_embedding.py
tests/models/embedding/language/test_embedding.py
+1
-0
tests/models/embedding/language/test_gritlm.py
tests/models/embedding/language/test_gritlm.py
+2
-0
tests/models/embedding/language/test_scoring.py
tests/models/embedding/language/test_scoring.py
+1
-0
tests/models/embedding/utils.py
tests/models/embedding/utils.py
+2
-0
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
+2
-0
tests/models/embedding/vision_language/test_llava_next.py
tests/models/embedding/vision_language/test_llava_next.py
+6
-3
tests/models/embedding/vision_language/test_phi3v.py
tests/models/embedding/vision_language/test_phi3v.py
+2
-0
tests/models/encoder_decoder/audio_language/test_whisper.py
tests/models/encoder_decoder/audio_language/test_whisper.py
+1
-0
tests/models/encoder_decoder/language/test_bart.py
tests/models/encoder_decoder/language/test_bart.py
+1
-0
tests/models/encoder_decoder/vision_language/test_broadcast.py
.../models/encoder_decoder/vision_language/test_broadcast.py
+2
-0
tests/models/encoder_decoder/vision_language/test_florence2.py
.../models/encoder_decoder/vision_language/test_florence2.py
+2
-0
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+2
-0
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+7
-1
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+142
-0
tests/models/multimodal/processing/test_idefics3.py
tests/models/multimodal/processing/test_idefics3.py
+34
-146
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+33
-175
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_next.py
+14
-3
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_llava_onevision.py
+14
-3
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
tests/models/decoder_only/vision_language/vlm_utils/types.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Types for writing multimodal model tests."""
from
enum
import
Enum
from
pathlib
import
PosixPath
...
...
tests/models/embedding/language/test_cls_models.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the classification outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_cls_models.py`.
...
...
tests/models/embedding/language/test_embedding.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the embedding outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_embedding.py`.
...
...
tests/models/embedding/language/test_gritlm.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
importlib.util
import
math
from
array
import
array
...
...
tests/models/embedding/language/test_scoring.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the scoring outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_scoring.py`.
...
...
tests/models/embedding/utils.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Sequence
import
torch
...
...
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
functools
import
partial
from
typing
import
Callable
,
Dict
,
List
,
Type
...
...
tests/models/embedding/vision_language/test_llava_next.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Type
import
os
import
pytest
import
torch.nn.functional
as
F
import
transformers
from
transformers
import
AutoModelForVision2Seq
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
...
...
@@ -56,6 +57,10 @@ def _run_test(
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelForVision2Seq
)
as
hf_model
:
# Patch the issue where generation_config.json is missing
hf_model
.
processor
.
patch_size
=
\
hf_model
.
model
.
config
.
vision_config
.
patch_size
# Patch the issue where image_token_id
# exceeds the maximum allowed vocab size
hf_model
.
model
.
resize_token_embeddings
(
...
...
@@ -87,8 +92,6 @@ def _run_test(
)
@
pytest
.
mark
.
skipif
(
transformers
.
__version__
>=
"4.46"
,
reason
=
"Model broken with changes in transformers 4.46"
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/models/embedding/vision_language/test_phi3v.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Type
import
os
...
...
tests/models/encoder_decoder/audio_language/test_whisper.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
Run `pytest tests/models/encoder_decoder/audio_language/test_whisper.py`.
...
...
tests/models/encoder_decoder/language/test_bart.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
...
...
tests/models/encoder_decoder/vision_language/test_broadcast.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
os
...
...
tests/models/encoder_decoder/vision_language/test_florence2.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
functools
import
partial
from
typing
import
List
,
Optional
,
Tuple
,
Type
...
...
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
overload
import
os
...
...
tests/models/multimodal/processing/test_common.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
functools
import
partial
import
numpy
as
np
...
...
@@ -139,13 +141,15 @@ def _test_processing_correctness(
# yapf: disable
# True if the model supports multiple data items of the modality per request
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"rhymes-ai/Aria"
,
"Salesforce/blip2-opt-2.7b"
,
"facebook/chameleon-7b"
,
"deepseek-ai/deepseek-vl2-tiny"
,
"adept/fuyu-8b"
,
"h2oai/h2ovl-mississippi-800m"
,
"OpenGVLab/InternVL2-1B"
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
"llava-hf/llava-1.5-7b-hf"
,
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
...
...
@@ -154,8 +158,10 @@ def _test_processing_correctness(
"mistral-community/pixtral-12b"
,
"openbmb/MiniCPM-o-2_6"
,
"openbmb/MiniCPM-V-2_6"
,
"nvidia/NVLM-D-72B"
,
"Qwen/Qwen-VL-Chat"
,
"Qwen/Qwen2-VL-2B-Instruct"
,
"Qwen/Qwen2.5-VL-3B-Instruct"
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
"fixie-ai/ultravox-v0_3"
,
])
...
...
tests/models/multimodal/processing/test_h2ovl.py
0 → 100644
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from
typing
import
Optional
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
@pytest.mark.parametrize("model_id", [
    "h2oai/h2ovl-mississippi-800m",
    "h2oai/h2ovl-mississippi-2b",
])
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(
    model_id: str,
    image_assets: _ImageAssets,
    size_factors: list[float],
    max_dynamic_patch: int,
    dynamic_image_size: Optional[bool],
    num_imgs: int,
):
    """Check the patch count the H2OVL processor emits under kwarg overrides.

    For every image asset and every rescale factor, the expected number of
    patches per image is recomputed with ``calculate_h2ovl_targets`` /
    ``get_h2ovl_target_ratios`` and compared with the first dimension of
    ``pixel_values_flat`` returned by ``processor.apply``.

    Args:
        model_id: Model name, also used as the tokenizer name.
        image_assets: Fixture yielding assets that expose ``pil_image``.
        size_factors: Rescale factors applied to each asset image.
        max_dynamic_patch: Value passed as the ``max_dynamic_patch``
            processor kwarg; also the upper patch bound when
            ``dynamic_image_size`` is truthy.
        dynamic_image_size: Passed as a processor kwarg when not ``None``;
            when falsy the upper patch bound is forced to 1.
        num_imgs: Number of copies of the image placed in one prompt.
    """
    # Imported lazily inside the test rather than at module import time.
    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
                                                  get_h2ovl_target_ratios)

    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        trust_remote_code=True,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    tokenizer = cached_get_tokenizer(
        ctx.model_config.tokenizer,
        trust_remote_code=ctx.model_config.trust_remote_code,
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )

    config = processor.info.get_hf_config()
    use_msac = config.use_msac

    mm_processor_kwargs = {
        "max_dynamic_patch": max_dynamic_patch,
    }
    if dynamic_image_size is not None:
        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size

    min_num = config.min_dynamic_patch
    max_num = max_dynamic_patch if dynamic_image_size else 1

    def _count_blocks(width: int, height: int, prior_aspect_ratio):
        """Return ``(blocks, aspect_ratio)`` for one image, without thumbnail.

        The thumbnail is accounted for separately by the caller, hence
        ``use_thumbnail=False`` here.
        """
        blocks, _, _, aspect_ratio = calculate_h2ovl_targets(
            orig_width=width,
            orig_height=height,
            target_ratios=get_h2ovl_target_ratios(
                min_num,
                max_num,
                prior_aspect_ratio=prior_aspect_ratio,
            ),
            image_size=config.vision_config.image_size,
            use_thumbnail=False,
        )
        return blocks, aspect_ratio

    # Build the image str / prompt based on the number of images we pass
    prompt = "<image>" * num_imgs

    for asset in image_assets:
        for factor in size_factors:
            image = rescale_image_size(asset.pil_image, factor)
            mm_data = {"image": [image] * num_imgs}

            width, height = image.size

            # Calculate the expected number of blocks
            if num_imgs == 1 and use_msac:
                # First pass: no prior aspect ratio
                blocks1, aspect_ratio = _count_blocks(width, height, None)

                # Second pass: feed the first pass's aspect ratio back in
                blocks2, _ = _count_blocks(width, height, aspect_ratio)

                # Add thumbnail if use_thumbnail is True and total_blocks > 1
                if config.use_thumbnail:
                    blocks1 += 1 if blocks1 > 1 else 0
                    blocks2 += 1 if blocks2 > 1 else 0

                # Total blocks is the sum of blocks from both passes minus
                # overlapping
                total_blocks = blocks1 + blocks2 - 1

                expected_num_patches = total_blocks
            else:
                blocks, _ = _count_blocks(width, height, None)
                expected_num_patches = blocks

                if config.use_thumbnail and expected_num_patches != 1:
                    expected_num_patches += 1

            processed_inputs = processor.apply(prompt, mm_data,
                                               mm_processor_kwargs)
            pixel_shape = (
                processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)

            assert pixel_shape[0] == expected_num_patches * num_imgs
tests/models/multimodal/processing/test_idefics3.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Tests for Idefics3's multimodal preprocessing kwargs."""
from
typing
import
Optional
import
os
import
pytest
import
torch
from
transformers
import
AutoImageProcessor
,
AutoTokenizer
from
transformers
import
Idefics3Config
from
vllm.
inputs
import
InputContext
,
token_inputs
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.
multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
.utils
import
cached_get_tokenizer
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
...
...
@@ -16,163 +14,53 @@ from ....utils import models_path_prefix
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
)]
# Wrap lazy imports to avoid initializing CUDA during test collection
@
pytest
.
fixture
()
def
input_processor_for_idefics3
():
from
vllm.model_executor.models.idefics3
import
(
input_processor_for_idefics3
)
return
input_processor_for_idefics3
@
pytest
.
fixture
()
def
dummy_data_for_idefics3
():
from
vllm.model_executor.models.idefics3
import
dummy_data_for_idefics3
return
dummy_data_for_idefics3
@
pytest
.
fixture
()
def
get_max_idefics3_image_tokens
():
from
vllm.model_executor.models.idefics3
import
(
get_max_idefics3_image_tokens
)
return
get_max_idefics3_image_tokens
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"longest_edge"
,
[
None
,
168
,
336
,
400
,
2
*
336
])
def
test_input_mapper_override
(
model
:
str
,
image_assets
:
_ImageAssets
,
longest_edge
:
Optional
[
int
]):
"""Ensure that the [default] input mapper handles size properly."""
mm_processor_kwargs
=
{
"size"
:
{
"longest_edge"
:
longest_edge
}
}
if
longest_edge
is
not
None
else
{}
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
hf_processor
=
AutoImageProcessor
.
from_pretrained
(
model
,
trust_remote_code
=
True
,
**
mm_processor_kwargs
)
mm_registry
=
MultiModalRegistry
()
mm_registry
.
init_mm_limits_per_prompt
(
ctx
.
model_config
)
image
=
image_assets
[
0
].
pil_image
hf_result
=
hf_processor
.
preprocess
(
image
,
return_tensors
=
"pt"
,
)
vllm_result
=
mm_registry
.
map_input
(
ctx
.
model_config
,
{
"image"
:
image
},
)
assert
torch
.
all
(
hf_result
[
"pixel_values"
]
==
vllm_result
[
"pixel_values"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"longest_edge, expected_max_tokens"
,
[
(
None
,
2873
),
(
168
,
169
),
(
336
,
169
),
(
400
,
338
),
(
672
,
338
),
])
def
test_max_tokens_override
(
get_max_idefics3_image_tokens
,
model
:
str
,
longest_edge
:
Optional
[
int
],
expected_max_tokens
:
int
):
"""Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs."""
size
=
{
"longest_edge"
:
longest_edge
}
if
longest_edge
is
not
None
else
None
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
)
actual_max_tokens
=
get_max_idefics3_image_tokens
(
ctx
=
InputContext
(
ctx
.
model_config
),
size
=
size
,
)
assert
expected_max_tokens
==
actual_max_tokens
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"longest_edge, toks_per_img, num_imgs"
,
[
(
168
,
169
,
1
),
(
168
,
169
,
2
),
(
400
,
338
,
1
),
(
400
,
338
,
2
),
])
def
test_dummy_data_override
(
dummy_data_for_idefics3
,
model
:
str
,
longest_edge
:
int
,
toks_per_img
:
int
,
num_imgs
:
int
):
"""Ensure dummy_data_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the dummy data func.
size
=
{
"longest_edge"
:
longest_edge
}
if
longest_edge
is
not
None
else
None
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
)
dummy_data
=
dummy_data_for_idefics3
(
ctx
=
ctx
,
seq_len
=
8192
,
# Should be bigger than num_imgs * toks_per_img
mm_counts
=
{
"image"
:
num_imgs
},
size
=
size
)
sequence_data
=
dummy_data
.
seq_data
# Ensure we have the right number of placeholders per size
image_token_id
=
ctx
.
get_hf_config
().
image_token_id
img_tok_count
=
sequence_data
.
get_token_ids
().
count
(
image_token_id
)
assert
img_tok_count
==
toks_per_img
*
num_imgs
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"longest_edge,expected_toks_per_img,num_imgs"
,
[
(
336
,
169
*
(
1
**
2
+
1
),
1
),
(
336
,
169
*
(
1
**
2
+
1
),
2
),
(
400
,
169
*
(
2
**
2
+
1
),
1
),
(
400
,
169
*
(
2
**
2
+
1
),
2
),
])
def
test_input_processor_override
(
input_processor_for_idefics3
,
image_assets
:
_ImageAssets
,
model
:
str
,
longest_edge
:
int
,
expected_toks_per_img
:
int
,
num_imgs
:
int
):
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"mm_processor_kwargs"
,
"expected_toks_per_img"
),
[
({
"size"
:
{
"longest_edge"
:
364
}},
169
),
({
"size"
:
{
"longest_edge"
:
728
}},
169
*
(
2
**
2
+
1
)),
])
# yapf: enable
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_override
(
image_assets
:
_ImageAssets
,
model
:
str
,
mm_processor_kwargs
:
dict
[
str
,
object
],
expected_toks_per_img
:
int
,
num_imgs
:
int
):
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
size
=
{
"longest_edge"
:
longest_edge
}
if
longest_edge
is
not
None
else
None
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
mm_processor_kwargs
)
# Build the image str / prompt based on the number of images we pass
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
)
placeholders
=
"<image>"
if
num_imgs
==
1
else
"
\n
"
.
join
(
f
"Image-
{
i
}
: <image>
\n
"
for
i
in
range
(
1
,
num_imgs
+
1
))
prompt
=
f
"<|begin_of_text|>User:
{
placeholders
}
\n
<end_of_utterance>
\n
Assistant:"
# noqa: E501
images
=
[
image_assets
[
0
].
pil_image
.
resize
((
336
*
4
,
336
*
4
))]
*
num_imgs
inputs
=
token_inputs
(
prompt_token_ids
=
tokenizer
.
encode
(
prompt
),
prompt
=
prompt
,
multi_modal_data
=
{
"image"
:
images
})
processed_inputs
=
input_processor_for_idefics3
(
ctx
,
inputs
,
size
=
size
)
# Build mm_data
image_size
=
ctx
.
get_hf_config
(
Idefics3Config
).
vision_config
.
image_size
dummy_image_size
=
(
image_size
*
4
,
image_size
*
4
)
dummy_image
=
image_assets
[
0
].
pil_image
.
resize
(
dummy_image_size
)
mm_data
=
{
"image"
:
[
dummy_image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
# Ensure the placeholders format are correct
hf_processed_inputs
=
hf_processor
(
text
=
prompt
,
images
=
mm_data
[
"image"
])
assert
processed_inputs
[
"prompt_token_ids"
]
==
hf_processed_inputs
[
"input_ids"
][
0
]
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
ctx
.
get_hf_config
().
image_token_id
...
...
tests/models/multimodal/processing/test_internvl.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Tests for InternVL's multimodal preprocessing kwargs."""
from
typing
import
Callable
,
Optional
from
typing
import
Optional
import
os
import
pytest
from
transformers
import
AutoTokenizer
from
vllm.
inputs
import
InputContext
,
token_inputs
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.
multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
.utils
import
cached_get_tokenizer
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
from
....utils
import
models_path_prefix
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-2B"
)]
# Wrap lazy imports to avoid initializing CUDA during test collection
@
pytest
.
fixture
()
def
input_processor_for_internvl
():
from
vllm.model_executor.models.internvl
import
InternVLInputPipeline
pipeline
=
InternVLInputPipeline
(
'<img>'
,
'</img>'
,
'<IMG_CONTEXT>'
)
return
pipeline
.
input_processor
@
pytest
.
fixture
()
def
dummy_data_for_internvl
():
from
vllm.model_executor.models.internvl
import
InternVLInputPipeline
pipeline
=
InternVLInputPipeline
(
'<img>'
,
'</img>'
,
'<IMG_CONTEXT>'
)
return
pipeline
.
dummy_data
@
pytest
.
fixture
()
def
get_max_internvl_image_tokens
():
from
vllm.model_executor.models.internvl
import
(
get_max_internvl_image_tokens
)
return
get_max_internvl_image_tokens
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-2B"
)])
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
def
test_input_mapper_override
(
model
:
str
,
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_override
(
model_id
:
str
,
image_assets
:
_ImageAssets
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
Optional
[
bool
],
):
mm_processor_kwargs
=
{
"max_dynamic_patch"
:
max_dynamic_patch
,
}
if
dynamic_image_size
is
not
None
:
mm_processor_kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
if
dynamic_image_size
is
False
:
expected_num_patches
=
1
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
mm_registry
=
MultiModalRegistry
()
mm_registry
.
init_mm_limits_per_prompt
(
ctx
.
model_config
)
image
=
image_assets
[
0
].
pil_image
.
resize
((
448
*
2
,
448
*
2
))
vllm_result
=
mm_registry
.
map_input
(
ctx
.
model_config
,
{
"image"
:
image
},
)
assert
vllm_result
[
"pixel_values"
].
size
(
1
)
==
expected_num_patches
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
,
None
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
def
test_max_tokens_override
(
get_max_internvl_image_tokens
:
Callable
,
model
:
str
,
max_dynamic_patch
:
Optional
[
int
],
dynamic_image_size
:
Optional
[
bool
],
):
"""Ensure get_max_internvl_image_tokens handles mm_processor_kwargs."""
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
)
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
ctx
.
get_hf_config
().
max_dynamic_patch
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
if
dynamic_image_size
is
False
:
expected_num_patches
=
1
expected_max_tokens
=
256
*
expected_num_patches
actual_max_tokens
=
get_max_internvl_image_tokens
(
ctx
=
InputContext
(
ctx
.
model_config
),
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
assert
expected_max_tokens
==
actual_max_tokens
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
,
None
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
def
test_dummy_data_override
(
dummy_data_for_internvl
:
Callable
,
model
:
str
,
num_imgs
:
int
,
max_dynamic_patch
:
Optional
[
int
],
dynamic_image_size
:
Optional
[
bool
],
):
"""Ensure dummy_data_for_internvl handles kwargs properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the dummy data func.
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
model_name
=
model
_id
,
tokenizer_name
=
model
_id
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
ctx
.
get_hf_config
().
max_dynamic_patch
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
if
dynamic_image_size
is
False
:
expected_num_patches
=
1
expected_max_tokens
=
256
*
expected_num_patches
dummy_data
=
dummy_data_for_internvl
(
ctx
=
ctx
,
seq_len
=
8192
,
# Should be bigger than num_imgs * toks_per_img
mm_counts
=
{
"image"
:
num_imgs
},
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
sequence_data
=
dummy_data
.
seq_data
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
,
trust_remote_code
=
True
)
image_token_id
=
tokenizer
.
encode
(
'<IMG_CONTEXT>'
,
add_special_tokens
=
False
)[
0
]
# Ensure we have the right number of placeholders per size
img_tok_count
=
sequence_data
.
get_token_ids
().
count
(
image_token_id
)
assert
img_tok_count
==
expected_max_tokens
*
num_imgs
mm_processor_kwargs
=
{
"max_dynamic_patch"
:
max_dynamic_patch
,
}
if
dynamic_image_size
is
not
None
:
mm_processor_kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
# Build the image str / prompt based on the number of images we pass
prompt
=
"<image>"
*
num_imgs
image
=
image_assets
[
0
].
pil_image
.
resize
((
448
*
2
,
448
*
2
))
mm_data
=
{
"image"
:
[
image
]
*
num_imgs
}
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_input_processor_override
(
input_processor_for_internvl
:
Callable
,
image_assets
:
_ImageAssets
,
model
:
str
,
num_imgs
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
Optional
[
bool
],
):
"""Ensure input_processor_for_internvl handles kwargs properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
if
dynamic_image_size
is
False
:
expected_num_patches
=
1
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
)
expected_toks_per_img
=
256
*
expected_num_patches
# Build the image str / prompt based on the number of images we pass
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
,
trust_remote_code
=
True
)
placeholders
=
"<image>"
if
num_imgs
==
1
else
"
\n
"
.
join
(
f
"Image-
{
i
}
: <image>
\n
"
for
i
in
range
(
1
,
num_imgs
+
1
))
prompt
=
placeholders
images
=
[
image_assets
[
0
].
pil_image
.
resize
((
448
*
2
,
448
*
2
))]
*
num_imgs
inputs
=
token_inputs
(
prompt_token_ids
=
tokenizer
.
encode
(
prompt
),
prompt
=
prompt
,
multi_modal_data
=
{
"image"
:
images
})
processed_inputs
=
input_processor_for_internvl
(
ctx
,
inputs
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
encode
(
'<IMG_CONTEXT>'
,
add_special_tokens
=
False
)[
0
]
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
assert
img_tok_count
==
expected_toks_per_img
*
num_imgs
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
expected_num_patches
*
num_imgs
assert
pixel_shape
[
0
]
==
expected_num_patches
*
num_imgs
tests/models/multimodal/processing/test_llava_next.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
itertools
from
functools
import
partial
...
...
@@ -41,7 +43,10 @@ def test_processor_max_tokens(model_id):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
info
=
processor
.
info
...
...
@@ -141,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
...
...
@@ -171,7 +179,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
seen_aspect_ratios
=
set
[
float
]()
...
...
tests/models/multimodal/processing/test_llava_onevision.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
itertools
from
functools
import
partial
...
...
@@ -42,7 +44,10 @@ def test_processor_max_tokens(model_id):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
info
=
processor
.
info
...
...
@@ -141,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
...
...
@@ -172,7 +180,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
seen_aspect_ratios
=
set
[
float
]()
...
...
Prev
1
…
14
15
16
17
18
19
20
21
22
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment