Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66b809cc
Commit
66b809cc
authored
Feb 08, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.2' into v0.7.2-dev
parents
37b63c24
0408efc6
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
270 additions
and
331 deletions
+270
-331
tests/models/decoder_only/vision_language/vlm_utils/types.py
tests/models/decoder_only/vision_language/vlm_utils/types.py
+1
-0
tests/models/embedding/language/test_cls_models.py
tests/models/embedding/language/test_cls_models.py
+1
-0
tests/models/embedding/language/test_embedding.py
tests/models/embedding/language/test_embedding.py
+1
-0
tests/models/embedding/language/test_gritlm.py
tests/models/embedding/language/test_gritlm.py
+2
-0
tests/models/embedding/language/test_scoring.py
tests/models/embedding/language/test_scoring.py
+1
-0
tests/models/embedding/utils.py
tests/models/embedding/utils.py
+2
-0
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
+2
-0
tests/models/embedding/vision_language/test_llava_next.py
tests/models/embedding/vision_language/test_llava_next.py
+6
-3
tests/models/embedding/vision_language/test_phi3v.py
tests/models/embedding/vision_language/test_phi3v.py
+2
-0
tests/models/encoder_decoder/audio_language/test_whisper.py
tests/models/encoder_decoder/audio_language/test_whisper.py
+1
-0
tests/models/encoder_decoder/language/test_bart.py
tests/models/encoder_decoder/language/test_bart.py
+1
-0
tests/models/encoder_decoder/vision_language/test_broadcast.py
.../models/encoder_decoder/vision_language/test_broadcast.py
+2
-0
tests/models/encoder_decoder/vision_language/test_florence2.py
.../models/encoder_decoder/vision_language/test_florence2.py
+2
-0
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+2
-0
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+7
-1
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+142
-0
tests/models/multimodal/processing/test_idefics3.py
tests/models/multimodal/processing/test_idefics3.py
+34
-146
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+33
-175
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_next.py
+14
-3
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_llava_onevision.py
+14
-3
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
tests/models/decoder_only/vision_language/vlm_utils/types.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Types for writing multimodal model tests."""
"""Types for writing multimodal model tests."""
from
enum
import
Enum
from
enum
import
Enum
from
pathlib
import
PosixPath
from
pathlib
import
PosixPath
...
...
tests/models/embedding/language/test_cls_models.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the classification outputs of HF and vLLM models.
"""Compare the classification outputs of HF and vLLM models.
Run `pytest tests/models/test_cls_models.py`.
Run `pytest tests/models/test_cls_models.py`.
...
...
tests/models/embedding/language/test_embedding.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the embedding outputs of HF and vLLM models.
"""Compare the embedding outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_embedding.py`.
Run `pytest tests/models/embedding/language/test_embedding.py`.
...
...
tests/models/embedding/language/test_gritlm.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
importlib.util
import
importlib.util
import
math
import
math
from
array
import
array
from
array
import
array
...
...
tests/models/embedding/language/test_scoring.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the scoring outputs of HF and vLLM models.
"""Compare the scoring outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_scoring.py`.
Run `pytest tests/models/embedding/language/test_scoring.py`.
...
...
tests/models/embedding/utils.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Sequence
from
typing
import
List
,
Sequence
import
torch
import
torch
...
...
tests/models/embedding/vision_language/test_dse_qwen2_vl.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
functools
import
partial
from
functools
import
partial
from
typing
import
Callable
,
Dict
,
List
,
Type
from
typing
import
Callable
,
Dict
,
List
,
Type
...
...
tests/models/embedding/vision_language/test_llava_next.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Type
from
typing
import
List
,
Type
import
os
import
os
import
pytest
import
pytest
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
import
transformers
from
transformers
import
AutoModelForVision2Seq
from
transformers
import
AutoModelForVision2Seq
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
...
@@ -56,6 +57,10 @@ def _run_test(
...
@@ -56,6 +57,10 @@ def _run_test(
with
hf_runner
(
model
,
dtype
=
dtype
,
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelForVision2Seq
)
as
hf_model
:
auto_cls
=
AutoModelForVision2Seq
)
as
hf_model
:
# Patch the issue where generation_config.json is missing
hf_model
.
processor
.
patch_size
=
\
hf_model
.
model
.
config
.
vision_config
.
patch_size
# Patch the issue where image_token_id
# Patch the issue where image_token_id
# exceeds the maximum allowed vocab size
# exceeds the maximum allowed vocab size
hf_model
.
model
.
resize_token_embeddings
(
hf_model
.
model
.
resize_token_embeddings
(
...
@@ -87,8 +92,6 @@ def _run_test(
...
@@ -87,8 +92,6 @@ def _run_test(
)
)
@
pytest
.
mark
.
skipif
(
transformers
.
__version__
>=
"4.46"
,
reason
=
"Model broken with changes in transformers 4.46"
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/models/embedding/vision_language/test_phi3v.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Type
from
typing
import
List
,
Type
import
os
import
os
...
...
tests/models/encoder_decoder/audio_language/test_whisper.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
...
...
tests/models/encoder_decoder/language/test_bart.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
...
...
tests/models/encoder_decoder/vision_language/test_broadcast.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
pytest
import
os
import
os
...
...
tests/models/encoder_decoder/vision_language/test_florence2.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
functools
import
partial
from
functools
import
partial
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
typing
import
List
,
Optional
,
Tuple
,
Type
...
...
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
overload
from
typing
import
List
,
Optional
,
Tuple
,
Type
,
overload
import
os
import
os
...
...
tests/models/multimodal/processing/test_common.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
functools
import
partial
from
functools
import
partial
import
numpy
as
np
import
numpy
as
np
...
@@ -139,13 +141,15 @@ def _test_processing_correctness(
...
@@ -139,13 +141,15 @@ def _test_processing_correctness(
# yapf: disable
# yapf: disable
# True if the model supports multiple data items of the modality per request
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"rhymes-ai/Aria"
,
"rhymes-ai/Aria"
,
"Salesforce/blip2-opt-2.7b"
,
"Salesforce/blip2-opt-2.7b"
,
"facebook/chameleon-7b"
,
"facebook/chameleon-7b"
,
"deepseek-ai/deepseek-vl2-tiny"
,
"deepseek-ai/deepseek-vl2-tiny"
,
"adept/fuyu-8b"
,
"adept/fuyu-8b"
,
"h2oai/h2ovl-mississippi-800m"
,
"OpenGVLab/InternVL2-1B"
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
"llava-hf/llava-1.5-7b-hf"
,
"llava-hf/llava-1.5-7b-hf"
,
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
...
@@ -154,8 +158,10 @@ def _test_processing_correctness(
...
@@ -154,8 +158,10 @@ def _test_processing_correctness(
"mistral-community/pixtral-12b"
,
"mistral-community/pixtral-12b"
,
"openbmb/MiniCPM-o-2_6"
,
"openbmb/MiniCPM-o-2_6"
,
"openbmb/MiniCPM-V-2_6"
,
"openbmb/MiniCPM-V-2_6"
,
"nvidia/NVLM-D-72B"
,
"Qwen/Qwen-VL-Chat"
,
"Qwen/Qwen-VL-Chat"
,
"Qwen/Qwen2-VL-2B-Instruct"
,
"Qwen/Qwen2-VL-2B-Instruct"
,
"Qwen/Qwen2.5-VL-3B-Instruct"
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
"fixie-ai/ultravox-v0_3"
,
"fixie-ai/ultravox-v0_3"
,
])
])
...
...
tests/models/multimodal/processing/test_h2ovl.py
0 → 100644
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from
typing
import
Optional
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
@pytest.mark.parametrize("model_id", [
    "h2oai/h2ovl-mississippi-800m",
    "h2oai/h2ovl-mississippi-2b",
])
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(
    model_id: str,
    image_assets: _ImageAssets,
    size_factors: list[float],
    max_dynamic_patch: int,
    dynamic_image_size: Optional[bool],
    num_imgs: int,
):
    """Check the patch count H2OVL's processor emits under kwarg overrides.

    For every image asset and every rescale factor, the image is repeated
    ``num_imgs`` times and run through the multimodal processor with
    ``max_dynamic_patch`` / ``dynamic_image_size`` passed as
    ``mm_processor_kwargs``. The first dimension of the resulting
    ``pixel_values_flat`` must equal the independently computed expected
    number of patches per image multiplied by ``num_imgs``.

    Args:
        model_id: Model name, also used as the tokenizer name.
        image_assets: Fixture providing the test images (``pil_image``).
        size_factors: Scale factors applied to each asset image.
        max_dynamic_patch: Upper bound on dynamic patches passed to the
            processor; only used for the expectation when
            ``dynamic_image_size`` is truthy.
        dynamic_image_size: Forwarded to the processor when not ``None``;
            when falsy the expectation uses a maximum of one patch.
        num_imgs: Number of copies of the image placed in the prompt.
    """
    # Imported lazily inside the test rather than at module level.
    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
                                                  get_h2ovl_target_ratios)

    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        trust_remote_code=True,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    tokenizer = cached_get_tokenizer(
        ctx.model_config.tokenizer,
        trust_remote_code=ctx.model_config.trust_remote_code,
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )

    config = processor.info.get_hf_config()
    use_msac = config.use_msac
    # Loop-invariant: read once instead of on every target computation.
    image_size = config.vision_config.image_size

    mm_processor_kwargs = {
        "max_dynamic_patch": max_dynamic_patch,
    }
    if dynamic_image_size is not None:
        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size

    min_num = config.min_dynamic_patch
    max_num = max_dynamic_patch if dynamic_image_size else 1

    def _count_blocks(width, height, prior_aspect_ratio=None):
        """Return ``(blocks, aspect_ratio)`` for one pass, thumbnail excluded.

        The first element returned by ``calculate_h2ovl_targets`` is used as
        the block count and the fourth as the aspect ratio.
        """
        blocks, _, _, aspect_ratio = calculate_h2ovl_targets(
            orig_width=width,
            orig_height=height,
            target_ratios=get_h2ovl_target_ratios(
                min_num,
                max_num,
                prior_aspect_ratio=prior_aspect_ratio,
            ),
            image_size=image_size,
            use_thumbnail=False,  # Thumbnail is handled separately
        )
        return blocks, aspect_ratio

    # Build the image str / prompt based on the number of images we pass
    prompt = "<image>" * num_imgs

    for asset in image_assets:
        for factor in size_factors:
            image = rescale_image_size(asset.pil_image, factor)
            mm_data = {"image": [image] * num_imgs}

            width, height = image.size

            # Calculate the expected number of blocks
            if num_imgs == 1 and use_msac:
                # First pass: no prior aspect ratio.
                blocks1, aspect_ratio = _count_blocks(width, height)

                # Second pass: constrained by the first pass' aspect ratio.
                blocks2, _ = _count_blocks(
                    width,
                    height,
                    prior_aspect_ratio=aspect_ratio,
                )

                # Add thumbnail if use_thumbnail is True and total_blocks > 1
                if config.use_thumbnail:
                    blocks1 += 1 if blocks1 > 1 else 0
                    blocks2 += 1 if blocks2 > 1 else 0

                # Total blocks is the sum of blocks from both passes minus
                # overlapping
                expected_num_patches = blocks1 + blocks2 - 1
            else:
                expected_num_patches, _ = _count_blocks(width, height)

                # Single-pass path: add the thumbnail patch here unless the
                # image maps to exactly one block.
                if config.use_thumbnail and expected_num_patches != 1:
                    expected_num_patches += 1

            processed_inputs = processor.apply(prompt, mm_data,
                                               mm_processor_kwargs)
            pixel_shape = (
                processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)

            assert pixel_shape[0] == expected_num_patches * num_imgs
tests/models/multimodal/processing/test_idefics3.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Tests for Idefics3's multimodal preprocessing kwargs."""
"""Tests for Idefics3's multimodal preprocessing kwargs."""
from
typing
import
Optional
import
os
import
os
import
pytest
import
pytest
import
torch
from
transformers
import
Idefics3Config
from
transformers
import
AutoImageProcessor
,
AutoTokenizer
from
vllm.
inputs
import
InputContext
,
token_inputs
from
vllm.
multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.multimodal
.utils
import
cached_get_tokenizer
from
....conftest
import
_ImageAssets
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -16,163 +14,53 @@ from ....utils import models_path_prefix
...
@@ -16,163 +14,53 @@ from ....utils import models_path_prefix
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
)]
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
)]
# Wrap lazy imports to avoid initializing CUDA during test collection
@
pytest
.
fixture
()
def
input_processor_for_idefics3
():
from
vllm.model_executor.models.idefics3
import
(
input_processor_for_idefics3
)
return
input_processor_for_idefics3
@
pytest
.
fixture
()
def
dummy_data_for_idefics3
():
from
vllm.model_executor.models.idefics3
import
dummy_data_for_idefics3
return
dummy_data_for_idefics3
@
pytest
.
fixture
()
def
get_max_idefics3_image_tokens
():
from
vllm.model_executor.models.idefics3
import
(
get_max_idefics3_image_tokens
)
return
get_max_idefics3_image_tokens
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"longest_edge"
,
[
None
,
168
,
336
,
400
,
2
*
336
])
def
test_input_mapper_override
(
model
:
str
,
image_assets
:
_ImageAssets
,
longest_edge
:
Optional
[
int
]):
"""Ensure that the [default] input mapper handles size properly."""
mm_processor_kwargs
=
{
"size"
:
{
"longest_edge"
:
longest_edge
}
}
if
longest_edge
is
not
None
else
{}
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
hf_processor
=
AutoImageProcessor
.
from_pretrained
(
model
,
trust_remote_code
=
True
,
**
mm_processor_kwargs
)
mm_registry
=
MultiModalRegistry
()
mm_registry
.
init_mm_limits_per_prompt
(
ctx
.
model_config
)
image
=
image_assets
[
0
].
pil_image
hf_result
=
hf_processor
.
preprocess
(
image
,
return_tensors
=
"pt"
,
)
vllm_result
=
mm_registry
.
map_input
(
ctx
.
model_config
,
{
"image"
:
image
},
)
assert
torch
.
all
(
hf_result
[
"pixel_values"
]
==
vllm_result
[
"pixel_values"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"longest_edge, expected_max_tokens"
,
[
(
None
,
2873
),
(
168
,
169
),
(
336
,
169
),
(
400
,
338
),
(
672
,
338
),
])
def
test_max_tokens_override
(
get_max_idefics3_image_tokens
,
model
:
str
,
longest_edge
:
Optional
[
int
],
expected_max_tokens
:
int
):
"""Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs."""
size
=
{
"longest_edge"
:
longest_edge
}
if
longest_edge
is
not
None
else
None
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
)
actual_max_tokens
=
get_max_idefics3_image_tokens
(
ctx
=
InputContext
(
ctx
.
model_config
),
size
=
size
,
)
assert
expected_max_tokens
==
actual_max_tokens
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"longest_edge, toks_per_img, num_imgs"
,
[
(
168
,
169
,
1
),
(
168
,
169
,
2
),
(
400
,
338
,
1
),
(
400
,
338
,
2
),
])
def
test_dummy_data_override
(
dummy_data_for_idefics3
,
model
:
str
,
longest_edge
:
int
,
toks_per_img
:
int
,
num_imgs
:
int
):
"""Ensure dummy_data_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the dummy data func.
size
=
{
"longest_edge"
:
longest_edge
}
if
longest_edge
is
not
None
else
None
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
)
dummy_data
=
dummy_data_for_idefics3
(
ctx
=
ctx
,
seq_len
=
8192
,
# Should be bigger than num_imgs * toks_per_img
mm_counts
=
{
"image"
:
num_imgs
},
size
=
size
)
sequence_data
=
dummy_data
.
seq_data
# Ensure we have the right number of placeholders per size
image_token_id
=
ctx
.
get_hf_config
().
image_token_id
img_tok_count
=
sequence_data
.
get_token_ids
().
count
(
image_token_id
)
assert
img_tok_count
==
toks_per_img
*
num_imgs
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"longest_edge,expected_toks_per_img,num_imgs"
,
[
# yapf: disable
(
336
,
169
*
(
1
**
2
+
1
),
1
),
@
pytest
.
mark
.
parametrize
(
(
336
,
169
*
(
1
**
2
+
1
),
2
),
(
"mm_processor_kwargs"
,
"expected_toks_per_img"
),
(
400
,
169
*
(
2
**
2
+
1
),
1
),
[
(
400
,
169
*
(
2
**
2
+
1
),
2
),
({
"size"
:
{
"longest_edge"
:
364
}},
169
),
])
({
"size"
:
{
"longest_edge"
:
728
}},
169
*
(
2
**
2
+
1
)),
def
test_input_processor_override
(
input_processor_for_idefics3
,
])
image_assets
:
_ImageAssets
,
model
:
str
,
# yapf: enable
longest_edge
:
int
,
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
expected_toks_per_img
:
int
,
num_imgs
:
int
):
def
test_processor_override
(
image_assets
:
_ImageAssets
,
model
:
str
,
mm_processor_kwargs
:
dict
[
str
,
object
],
expected_toks_per_img
:
int
,
num_imgs
:
int
):
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
# the partial when calling the custom input processor.
size
=
{
"longest_edge"
:
longest_edge
}
if
longest_edge
is
not
None
else
None
ctx
=
build_model_context
(
ctx
=
build_model_context
(
model_name
=
model
,
model_name
=
model
,
tokenizer_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
mm_processor_kwargs
)
# Build the image str / prompt based on the number of images we pass
# Build the image str / prompt based on the number of images we pass
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
)
placeholders
=
"<image>"
if
num_imgs
==
1
else
"
\n
"
.
join
(
placeholders
=
"<image>"
if
num_imgs
==
1
else
"
\n
"
.
join
(
f
"Image-
{
i
}
: <image>
\n
"
for
i
in
range
(
1
,
num_imgs
+
1
))
f
"Image-
{
i
}
: <image>
\n
"
for
i
in
range
(
1
,
num_imgs
+
1
))
prompt
=
f
"<|begin_of_text|>User:
{
placeholders
}
\n
<end_of_utterance>
\n
Assistant:"
# noqa: E501
prompt
=
f
"<|begin_of_text|>User:
{
placeholders
}
\n
<end_of_utterance>
\n
Assistant:"
# noqa: E501
images
=
[
image_assets
[
0
].
pil_image
.
resize
((
336
*
4
,
336
*
4
))]
*
num_imgs
inputs
=
token_inputs
(
prompt_token_ids
=
tokenizer
.
encode
(
prompt
),
prompt
=
prompt
,
multi_modal_data
=
{
"image"
:
images
})
processed_inputs
=
input_processor_for_idefics3
(
ctx
,
inputs
,
size
=
size
)
# Build mm_data
image_size
=
ctx
.
get_hf_config
(
Idefics3Config
).
vision_config
.
image_size
dummy_image_size
=
(
image_size
*
4
,
image_size
*
4
)
dummy_image
=
image_assets
[
0
].
pil_image
.
resize
(
dummy_image_size
)
mm_data
=
{
"image"
:
[
dummy_image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
# Ensure the placeholders format are correct
hf_processed_inputs
=
hf_processor
(
text
=
prompt
,
images
=
mm_data
[
"image"
])
assert
processed_inputs
[
"prompt_token_ids"
]
==
hf_processed_inputs
[
"input_ids"
][
0
]
# Ensure we have the right number of placeholders per num_crops size
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
ctx
.
get_hf_config
().
image_token_id
image_token_id
=
ctx
.
get_hf_config
().
image_token_id
...
...
tests/models/multimodal/processing/test_internvl.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Tests for InternVL's multimodal preprocessing kwargs."""
"""Tests for InternVL's multimodal preprocessing kwargs."""
from
typing
import
Callable
,
Optional
from
typing
import
Optional
import
os
import
os
import
pytest
import
pytest
from
transformers
import
AutoTokenizer
from
vllm.
inputs
import
InputContext
,
token_inputs
from
vllm.
multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.multimodal
.utils
import
cached_get_tokenizer
from
....conftest
import
_ImageAssets
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-2B"
)]
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-2B"
)])
# Wrap lazy imports to avoid initializing CUDA during test collection
@
pytest
.
fixture
()
def
input_processor_for_internvl
():
from
vllm.model_executor.models.internvl
import
InternVLInputPipeline
pipeline
=
InternVLInputPipeline
(
'<img>'
,
'</img>'
,
'<IMG_CONTEXT>'
)
return
pipeline
.
input_processor
@
pytest
.
fixture
()
def
dummy_data_for_internvl
():
from
vllm.model_executor.models.internvl
import
InternVLInputPipeline
pipeline
=
InternVLInputPipeline
(
'<img>'
,
'</img>'
,
'<IMG_CONTEXT>'
)
return
pipeline
.
dummy_data
@
pytest
.
fixture
()
def
get_max_internvl_image_tokens
():
from
vllm.model_executor.models.internvl
import
(
get_max_internvl_image_tokens
)
return
get_max_internvl_image_tokens
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
def
test_input_mapper_override
(
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
model
:
str
,
def
test_processor_override
(
model_id
:
str
,
image_assets
:
_ImageAssets
,
image_assets
:
_ImageAssets
,
max_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
Optional
[
bool
],
dynamic_image_size
:
Optional
[
bool
],
):
mm_processor_kwargs
=
{
"max_dynamic_patch"
:
max_dynamic_patch
,
}
if
dynamic_image_size
is
not
None
:
mm_processor_kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
if
dynamic_image_size
is
False
:
expected_num_patches
=
1
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
mm_registry
=
MultiModalRegistry
()
mm_registry
.
init_mm_limits_per_prompt
(
ctx
.
model_config
)
image
=
image_assets
[
0
].
pil_image
.
resize
((
448
*
2
,
448
*
2
))
vllm_result
=
mm_registry
.
map_input
(
ctx
.
model_config
,
{
"image"
:
image
},
)
assert
vllm_result
[
"pixel_values"
].
size
(
1
)
==
expected_num_patches
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
,
None
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
def
test_max_tokens_override
(
get_max_internvl_image_tokens
:
Callable
,
model
:
str
,
max_dynamic_patch
:
Optional
[
int
],
dynamic_image_size
:
Optional
[
bool
],
):
"""Ensure get_max_internvl_image_tokens handles mm_processor_kwargs."""
ctx
=
build_model_context
(
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
)
if
max_dynamic_patch
is
None
:
max_dynamic_patch
=
ctx
.
get_hf_config
().
max_dynamic_patch
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
if
dynamic_image_size
is
False
:
expected_num_patches
=
1
expected_max_tokens
=
256
*
expected_num_patches
actual_max_tokens
=
get_max_internvl_image_tokens
(
ctx
=
InputContext
(
ctx
.
model_config
),
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
assert
expected_max_tokens
==
actual_max_tokens
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
,
None
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
def
test_dummy_data_override
(
dummy_data_for_internvl
:
Callable
,
model
:
str
,
num_imgs
:
int
,
num_imgs
:
int
,
max_dynamic_patch
:
Optional
[
int
],
dynamic_image_size
:
Optional
[
bool
],
):
):
"""Ensure dummy_data_for_internvl handles kwargs properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the dummy data func.
ctx
=
build_model_context
(
ctx
=
build_model_context
(
model_name
=
model
,
model_name
=
model
_id
,
tokenizer_name
=
model
,
tokenizer_name
=
model
_id
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
)
tokenizer
=
cached_get_tokenizer
(
if
max_dynamic_patch
is
None
:
ctx
.
model_config
.
tokenizer
,
max_dynamic_patch
=
ctx
.
get_hf_config
().
max_dynamic_patch
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
)
if
dynamic_image_size
is
False
:
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
expected_num_patches
=
1
ctx
.
model_config
,
expected_max_tokens
=
256
*
expected_num_patches
tokenizer
=
tokenizer
,
dummy_data
=
dummy_data_for_internvl
(
ctx
=
ctx
,
seq_len
=
8192
,
# Should be bigger than num_imgs * toks_per_img
mm_counts
=
{
"image"
:
num_imgs
},
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
)
sequence_data
=
dummy_data
.
seq_data
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
,
trust_remote_code
=
True
)
image_token_id
=
tokenizer
.
encode
(
'<IMG_CONTEXT>'
,
add_special_tokens
=
False
)[
0
]
# Ensure we have the right number of placeholders per size
mm_processor_kwargs
=
{
img_tok_count
=
sequence_data
.
get_token_ids
().
count
(
image_token_id
)
"max_dynamic_patch"
:
max_dynamic_patch
,
assert
img_tok_count
==
expected_max_tokens
*
num_imgs
}
if
dynamic_image_size
is
not
None
:
mm_processor_kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
# Build the image str / prompt based on the number of images we pass
prompt
=
"<image>"
*
num_imgs
image
=
image_assets
[
0
].
pil_image
.
resize
((
448
*
2
,
448
*
2
))
mm_data
=
{
"image"
:
[
image
]
*
num_imgs
}
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_input_processor_override
(
input_processor_for_internvl
:
Callable
,
image_assets
:
_ImageAssets
,
model
:
str
,
num_imgs
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
Optional
[
bool
],
):
"""Ensure input_processor_for_internvl handles kwargs properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
if
dynamic_image_size
is
False
:
if
dynamic_image_size
is
False
:
expected_num_patches
=
1
expected_num_patches
=
1
ctx
=
build_model_context
(
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
)
expected_toks_per_img
=
256
*
expected_num_patches
# Build the image str / prompt based on the number of images we pass
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
,
trust_remote_code
=
True
)
placeholders
=
"<image>"
if
num_imgs
==
1
else
"
\n
"
.
join
(
f
"Image-
{
i
}
: <image>
\n
"
for
i
in
range
(
1
,
num_imgs
+
1
))
prompt
=
placeholders
images
=
[
image_assets
[
0
].
pil_image
.
resize
((
448
*
2
,
448
*
2
))]
*
num_imgs
inputs
=
token_inputs
(
prompt_token_ids
=
tokenizer
.
encode
(
prompt
),
prompt
=
prompt
,
multi_modal_data
=
{
"image"
:
images
})
processed_inputs
=
input_processor_for_internvl
(
ctx
,
inputs
,
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
# Ensure we have the right number of placeholders per num_crops size
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
encode
(
'<IMG_CONTEXT>'
,
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
add_special_tokens
=
False
)[
0
]
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
assert
img_tok_count
==
expected_toks_per_img
*
num_imgs
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
expected_num_patches
*
num_imgs
assert
pixel_shape
[
0
]
==
expected_num_patches
*
num_imgs
tests/models/multimodal/processing/test_llava_next.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
itertools
import
itertools
from
functools
import
partial
from
functools
import
partial
...
@@ -41,7 +43,10 @@ def test_processor_max_tokens(model_id):
...
@@ -41,7 +43,10 @@ def test_processor_max_tokens(model_id):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
info
=
processor
.
info
info
=
processor
.
info
...
@@ -141,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
...
@@ -141,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
...
@@ -171,7 +179,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
...
@@ -171,7 +179,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
seen_aspect_ratios
=
set
[
float
]()
seen_aspect_ratios
=
set
[
float
]()
...
...
tests/models/multimodal/processing/test_llava_onevision.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
itertools
import
itertools
from
functools
import
partial
from
functools
import
partial
...
@@ -42,7 +44,10 @@ def test_processor_max_tokens(model_id):
...
@@ -42,7 +44,10 @@ def test_processor_max_tokens(model_id):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
info
=
processor
.
info
info
=
processor
.
info
...
@@ -141,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
...
@@ -141,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
...
@@ -172,7 +180,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
...
@@ -172,7 +180,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
),
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
seen_aspect_ratios
=
set
[
float
]()
seen_aspect_ratios
=
set
[
float
]()
...
...
Prev
1
…
14
15
16
17
18
19
20
21
22
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment