Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ec5e299c
Commit
ec5e299c
authored
Feb 21, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.3' into v0.7.3-dev
parents
47bd229c
ed6e9075
Changes
521
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
590 additions
and
328 deletions
+590
-328
tests/mistral_tool_use/conftest.py
tests/mistral_tool_use/conftest.py
+40
-0
tests/mistral_tool_use/test_mistral_tool_calls.py
tests/mistral_tool_use/test_mistral_tool_calls.py
+29
-0
tests/mistral_tool_use/utils.py
tests/mistral_tool_use/utils.py
+33
-0
tests/models/decoder_only/audio_language/test_ultravox.py
tests/models/decoder_only/audio_language/test_ultravox.py
+1
-1
tests/models/decoder_only/language/test_fp8.py
tests/models/decoder_only/language/test_fp8.py
+3
-3
tests/models/decoder_only/language/test_hybrid.py
tests/models/decoder_only/language/test_hybrid.py
+23
-12
tests/models/decoder_only/language/test_mamba.py
tests/models/decoder_only/language/test_mamba.py
+30
-13
tests/models/decoder_only/language/test_models.py
tests/models/decoder_only/language/test_models.py
+10
-0
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+4
-10
tests/models/decoder_only/vision_language/vlm_utils/core.py
tests/models/decoder_only/vision_language/vlm_utils/core.py
+39
-23
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
...els/decoder_only/vision_language/vlm_utils/model_utils.py
+21
-77
tests/models/decoder_only/vision_language/vlm_utils/types.py
tests/models/decoder_only/vision_language/vlm_utils/types.py
+3
-7
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+67
-4
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+18
-9
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+131
-94
tests/models/multimodal/processing/test_idefics3.py
tests/models/multimodal/processing/test_idefics3.py
+16
-8
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+107
-35
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_next.py
+4
-13
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_llava_onevision.py
+4
-13
tests/models/multimodal/processing/test_phi3v.py
tests/models/multimodal/processing/test_phi3v.py
+7
-6
No files found.
Too many changes to show.
To preserve performance only
521 of 521+
files are displayed.
Plain diff
Email patch
tests/mistral_tool_use/conftest.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
pytest_asyncio
from
huggingface_hub
import
snapshot_download
from
tests.utils
import
RemoteOpenAIServer
from
vllm.platforms
import
current_platform
from
.utils
import
ARGS
,
CONFIGS
,
ServerConfig
# for each server config, download the model and return the config
@pytest.fixture(scope="session", params=CONFIGS.keys())
def server_config(request):
    """Yield the server config selected by the current fixture parameter.

    The test is skipped when running on ROCm and the config's
    ``supports_rocm`` entry is falsy (a missing entry counts as supported).
    Otherwise the model repository is fetched with ``snapshot_download``
    from ``huggingface_hub`` before the config is handed to the test.
    """
    selected = CONFIGS[request.param]

    if current_platform.is_rocm():
        if not selected.get("supports_rocm", True):
            pytest.skip(
                "The {} model can't be tested on the ROCm platform".format(
                    selected["model"]))

    # Fetch the model files up front so server start-up does not have to.
    snapshot_download(selected["model"])
    yield selected
# run this for each server config
@pytest.fixture(scope="session")
def server(request, server_config: ServerConfig):
    """Start a ``RemoteOpenAIServer`` for the configured model and yield it.

    The server is launched with the shared ``ARGS`` followed by the
    config-specific ``"arguments"`` and is given up to 480 seconds to come
    up. It is torn down when the ``with`` block exits.
    """
    model_name = server_config["model"]
    cli_args = ARGS + server_config["arguments"]

    with RemoteOpenAIServer(model_name, cli_args,
                            max_wait_seconds=480) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    """Yield an async client obtained from the running test server.

    The client is closed when the ``async with`` block exits.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client
tests/mistral_tool_use/test_mistral_tool_calls.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
import
openai
import
pytest
from
tests.tool_use.utils
import
MESSAGES_ASKING_FOR_TOOLS
,
WEATHER_TOOL
# test: a tool_choice with mistral-tokenizer results in an ID of length 9
@pytest.mark.asyncio
async def test_tool_call_with_tool_choice(client: openai.AsyncOpenAI):
    """Check that a forced tool call carries a 9-character tool-call ID.

    The request names ``WEATHER_TOOL`` as ``tool_choice`` and uses the first
    model the server lists. The response must contain exactly one tool call
    whose ID has length 9.
    """
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=MESSAGES_ASKING_FOR_TOOLS,
        temperature=0,
        max_completion_tokens=100,
        model=model_name,
        tools=[WEATHER_TOOL],
        tool_choice=WEATHER_TOOL,
        logprobs=False)

    choice = chat_completion.choices[0]

    assert choice.finish_reason != "tool_calls"  # "stop" or "length"
    assert choice.message.role == "assistant"

    # The ID check below indexes the first tool call, so a missing tool-call
    # list must fail here as an assertion rather than later as a TypeError.
    tool_calls = choice.message.tool_calls
    assert tool_calls is not None and len(tool_calls) == 1
    assert len(tool_calls[0].id) == 9  # length of 9 for mistral
tests/mistral_tool_use/utils.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Dict
,
List
,
Optional
from
typing_extensions
import
TypedDict
class ServerConfig(TypedDict, total=False):
    """Typed description of one model-server configuration.

    ``total=False`` means no key is required by type checkers.
    """
    # Model identifier for this config.
    model: str
    # Extra command-line arguments specific to this config.
    arguments: List[str]
    # Optional system prompt text for the model.
    system_prompt: Optional[str]
    # NOTE(review): presumably whether parallel tool calls are supported;
    # not read anywhere in the visible code, so confirm against callers.
    supports_parallel: Optional[bool]
    # NOTE(review): presumably whether the model can be tested on ROCm;
    # confirm against the code that reads it.
    supports_rocm: Optional[bool]
# Command-line arguments common to all configs.
ARGS: List[str] = ["--max-model-len", "1024"]

# Server configurations keyed by a short name. Only "mistral" is defined.
CONFIGS: Dict[str, ServerConfig] = {
    "mistral": {
        "model":
        "mistralai/Mistral-7B-Instruct-v0.3",
        "arguments": [
            "--tokenizer-mode", "mistral",
            "--ignore-patterns=\"consolidated.safetensors\""
        ],
        # Adjacent string literals below are concatenated into one prompt.
        "system_prompt":
        "You are a helpful assistant with access to tools. If a tool"
        " that you have would be helpful to answer a user query, "
        "call the tool. Otherwise, answer the user's query directly "
        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
        "to the user's question - just respond to it normally."
    },
}
tests/models/decoder_only/audio_language/test_ultravox.py
View file @
ec5e299c
...
...
@@ -18,7 +18,7 @@ from ....conftest import HfRunner, VllmRunner
from
....utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...utils
import
check_logprobs_close
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
3
"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
5-llama-3_2-1b
"
)
AudioTuple
=
Tuple
[
np
.
ndarray
,
int
]
...
...
tests/models/decoder_only/language/test_fp8.py
View file @
ec5e299c
...
...
@@ -30,9 +30,9 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
(
"fp8_e5m2"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)),
# Test F
P
16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
(
"fp8_e4m3"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-
2-7b-chat-hf
"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-
2-7b-chat-hf
"
))
# Test
B
F16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
(
"fp8_e4m3"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-
3.2-1B-Instruct
"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-
3.2-1B-Instruct
"
))
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
...
...
tests/models/decoder_only/language/test_
jamba
.py
→
tests/models/decoder_only/language/test_
hybrid
.py
View file @
ec5e299c
...
...
@@ -10,7 +10,8 @@ from vllm.sampling_params import SamplingParams
from
...utils
import
check_outputs_equal
from
....utils
import
models_path_prefix
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-random"
)]
# This test is for the hybrid models
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
),
os
.
path
.
join
(
models_path_prefix
,
"ibm-ai-platform/Bamba-9B"
)]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
...
@@ -25,6 +26,10 @@ def test_models(
max_tokens
:
int
,
)
->
None
:
# numeric error produces different generation
if
'Bamba'
in
model
:
example_prompts
.
pop
(
3
)
with
hf_runner
(
model
,
dtype
=
dtype
,
...
...
@@ -110,15 +115,21 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
7
])
def
test_mamba_prefill_chunking
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
# numeric error during prefill chunking produces different generation
# compared to w/o prefill chunking for those examples, removed them for now
example_prompts
.
pop
(
7
)
example_prompts
.
pop
(
2
)
example_prompts
.
pop
(
1
)
if
'Jamba'
in
model
:
example_prompts
.
pop
(
7
)
example_prompts
.
pop
(
2
)
example_prompts
.
pop
(
1
)
elif
'Bamba'
in
model
:
example_prompts
.
pop
(
6
)
example_prompts
.
pop
(
3
)
example_prompts
.
pop
(
2
)
dtype
=
"half"
# use a different dtype for Bamba
with
hf_runner
(
model
,
...
...
@@ -147,7 +158,7 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"
b
float
16
"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
15
])
def
test_parallel_sampling
(
vllm_runner
,
...
...
@@ -251,17 +262,17 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
dtype
:
str
,
example_prompts
,
)
->
None
:
# This test is for verifying that the
Jamba
inner state management doesn't
# This test is for verifying that the
hybrid
inner state management doesn't
# collapse in case where the number of incoming requests and
# finished_requests_ids is larger than the maximum mamba block capacity.
# This could generally happen due to the fact that
Jamba
does support
# This could generally happen due to the fact that
hybrid
does support
# statelessness mechanism where it can cleanup new incoming requests in
# a single step.
try
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
10
)
as
vllm_model
:
vllm_model
.
generate_greedy
([
example_prompts
[
0
]]
*
100
,
10
)
except
ValueError
:
pytest
.
fail
(
"
Jamba
inner state wasn't cleaned up properly between"
pytest
.
fail
(
"
Hybrid
inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily "
)
...
...
@@ -273,14 +284,14 @@ def test_state_cleanup(
dtype
:
str
,
example_prompts
,
)
->
None
:
# This test is for verifying that the
Jamba
state is cleaned up between
# This test is for verifying that the
Hybrid
state is cleaned up between
# steps. If it's not cleaned, an error would be expected.
try
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
for
_
in
range
(
10
):
vllm_model
.
generate_greedy
([
example_prompts
[
0
]]
*
100
,
1
)
except
ValueError
:
pytest
.
fail
(
"
Jamba
inner state wasn't cleaned up between states, "
pytest
.
fail
(
"
Hybrid
inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids"
)
...
...
@@ -326,7 +337,7 @@ def test_multistep_correctness(vllm_runner, model: str, dtype: str,
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
def
test_
jamba
_distributed_produces_identical_generation
(
def
test_
hybrid
_distributed_produces_identical_generation
(
vllm_runner
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
example_prompts
)
->
None
:
...
...
tests/models/decoder_only/language/test_mamba.py
View file @
ec5e299c
...
...
@@ -5,6 +5,7 @@ Run `pytest tests/models/test_mamba.py`.
"""
import
os
import
pytest
import
torch
from
transformers
import
AutoModelForCausalLM
,
AutoTokenizer
from
vllm.engine.arg_utils
import
EngineArgs
...
...
@@ -13,7 +14,14 @@ from vllm.sampling_params import SamplingParams
from
...utils
import
check_outputs_equal
from
....utils
import
models_path_prefix
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"state-spaces/mamba-130m-hf"
),
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-mamba-tiny-dev"
)]
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"state-spaces/mamba-130m-hf"
),
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-mamba-tiny-dev"
),
# TODO: Compare to a Mamba2 model. The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups.
# See https://github.com/huggingface/transformers/pull/35943
# "mistralai/Mamba-Codestral-7B-v0.1",
]
# Use lower-level interfaces to create this greedy generator, as mamba will
...
...
@@ -23,6 +31,10 @@ def generate_greedy(model_name, example_prompts, max_tokens):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
model
=
AutoModelForCausalLM
.
from_pretrained
(
model_name
)
# Set the device (GPU if available, else CPU)
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
else
"cpu"
)
model
.
to
(
device
)
# Generate texts from the prompts
outputs
=
[]
for
prompt
in
example_prompts
:
...
...
@@ -31,7 +43,9 @@ def generate_greedy(model_name, example_prompts, max_tokens):
input_ids
=
inputs
[
"input_ids"
].
to
(
model
.
device
)
# Generate text using the model's generate method directly
generated_ids
=
model
.
generate
(
input_ids
,
max_new_tokens
=
max_tokens
)
generated_ids
=
model
.
generate
(
input_ids
,
max_new_tokens
=
max_tokens
,
do_sample
=
False
)
generated_text
=
tokenizer
.
decode
(
generated_ids
[
0
],
skip_special_tokens
=
True
)
...
...
@@ -52,7 +66,8 @@ def test_models(
)
->
None
:
hf_outputs
=
generate_greedy
(
model
,
example_prompts
,
max_tokens
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
# Set max_num_seqs to keep Codestral from going OOM at fp32
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
# This test is for verifying whether the model's extra_repr
...
...
@@ -83,7 +98,7 @@ def test_batching(
)
->
None
:
# To pass the small model tests, we need full precision.
for_loop_outputs
=
[]
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
for
prompt
in
example_prompts
:
for_loop_outputs
.
append
(
vllm_model
.
generate_greedy
([
prompt
],
max_tokens
)[
0
])
...
...
@@ -167,20 +182,22 @@ def test_parallel_sampling(
max_tokens
:
int
,
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
# Numerical differences produce slightly different output for these
if
'state-spaces'
in
model
:
example_prompts
.
pop
(
0
)
example_prompts
.
pop
(
0
)
example_prompts
.
pop
(
0
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
for_loop_outputs
=
[]
for
_
in
range
(
10
):
for_loop_outputs
.
append
(
# using example_prompts index 1 instead of 0 since with 0 the
# logprobs get really close and the test doesn't pass
vllm_model
.
generate_greedy
([
example_prompts
[
1
]],
max_tokens
)
[
0
])
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)[
0
])
sampling_params
=
SamplingParams
(
n
=
10
,
temperature
=
0.001
,
seed
=
0
,
max_tokens
=
max_tokens
)
n_lt_1_outputs
=
vllm_model
.
generate
([
example_prompts
[
1
]],
sampling_params
)
n_lt_1_outputs
=
vllm_model
.
generate
(
example_prompts
,
sampling_params
)
token_ids
,
texts
=
n_lt_1_outputs
[
0
]
n_lt_1_outputs
=
[(
token_id
,
text
)
for
token_id
,
text
in
zip
(
token_ids
,
texts
)]
...
...
@@ -234,7 +251,7 @@ def test_models_preemption_recompute(
# Tests that outputs are identical with and w/o preemptions (recompute)
assert
dtype
==
"float"
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
ENABLE_ARTIFICIAL_PREEMPT
=
True
preempt_vllm_outputs
=
vllm_model
.
generate_greedy
(
...
...
@@ -285,7 +302,7 @@ def test_state_cleanup(
# This test is for verifying that the Mamba state is cleaned up between
# steps. If it's not cleaned, an error would be expected.
try
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
for
_
in
range
(
10
):
vllm_model
.
generate_greedy
([
example_prompts
[
0
]]
*
100
,
1
)
except
ValueError
:
...
...
tests/models/decoder_only/language/test_models.py
View file @
ec5e299c
...
...
@@ -28,6 +28,9 @@ from ....utils import models_path_prefix
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-1.1-2b-it"
),
# gemma
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"THUDM/chatglm3-6b"
),
# chatglm (text-only)
),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
# llama
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
...
...
@@ -45,6 +48,9 @@ from ....utils import models_path_prefix
os
.
path
.
join
(
models_path_prefix
,
"microsoft/phi-2"
),
# phi
marks
=
[
pytest
.
mark
.
core_model
],
),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-7B"
),
# qwen (text-only)
),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-0.5B-Instruct"
),
# qwen2
marks
=
[
pytest
.
mark
.
core_model
],
...
...
@@ -70,6 +76,10 @@ def test_models(
)
->
None
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
if
model
.
startswith
(
"THUDM/chatglm3"
):
hf_model
.
model
.
get_output_embeddings
=
lambda
:
\
hf_model
.
model
.
transformer
.
output_layer
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
...
...
tests/models/decoder_only/vision_language/test_models.py
View file @
ec5e299c
...
...
@@ -157,10 +157,7 @@ VLM_TEST_SETTINGS = {
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
qwen2_vllm_to_hf_output
,
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
marks
=
[
pytest
.
mark
.
skipif
(
TRANSFORMERS_VERSION
<
"4.49.0"
,
reason
=
"HF model requires transformers>=4.49.0"
,
),
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
#### Extended model tests
"aria"
:
VLMTestInfo
(
...
...
@@ -217,7 +214,6 @@ VLM_TEST_SETTINGS = {
"cherry_blossom"
:
"<image>
\n
Please infer the season with reason in details."
,
# noqa: E501
}),
multi_image_prompt
=
"image_1:<image>
\n
image_2:<image>
\n
Which image can we see the car and the tower?"
,
# noqa: E501
vllm_runner_kwargs
=
{
"hf_overrides"
:
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]}},
# noqa: E501
patch_hf_runner
=
model_utils
.
deepseekvl2_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"images"
),
hf_output_post_proc
=
model_utils
.
deepseekvl2_trunc_hf_output
,
...
...
@@ -353,7 +349,6 @@ VLM_TEST_SETTINGS = {
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
),
vllm_runner_kwargs
=
{
"hf_overrides"
:
{
"architectures"
:
[
"MantisForConditionalGeneration"
]}},
# noqa: E501
get_stop_token_ids
=
lambda
tok
:
[
128009
],
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
mantis_vllm_to_hf_output
,
...
...
@@ -406,11 +401,10 @@ VLM_TEST_SETTINGS = {
"molmo"
:
VLMTestInfo
(
models
=
[
"allenai/Molmo-7B-D-0924"
],
test_type
=
(
VLMTestType
.
IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
"User: "
+
img_prompt
+
" Assistant:"
,
# noqa: E501
prompt_formatter
=
identity
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
image_size_factors
=
[(),(
1.0
,
1.0
,
1.0
)],
patch_hf_runner
=
model_utils
.
mlomo_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
molmo_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
molmo_post_processor
,
),
# Tests for phi3v currently live in another file because of a bug in
...
...
@@ -440,7 +434,7 @@ VLM_TEST_SETTINGS = {
auto_cls
=
AutoModelForVision2Seq
,
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
),
"qwen"
:
VLMTestInfo
(
"qwen
_vl
"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL"
)],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
identity
,
...
...
tests/models/decoder_only/vision_language/vlm_utils/core.py
View file @
ec5e299c
...
...
@@ -4,12 +4,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
import
torch
from
PIL.Image
import
Image
from
transformers
import
AutoTokenizer
,
BatchEncoding
,
PreTrainedTokenizerBase
from
transformers
import
BatchEncoding
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
.....conftest
import
HfRunner
,
VllmRunner
from
....registry
import
HF_EXAMPLE_MODELS
from
.types
import
RunnerOutput
...
...
@@ -31,10 +33,8 @@ def run_test(
use_tokenizer_eos
:
bool
,
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
],
comparator
:
Callable
[...,
None
],
get_stop_token_ids
:
Optional
[
Callable
[[
PreTrainedTokenizerBase
],
List
[
int
]]],
get_stop_token_ids
:
Optional
[
Callable
[[
AnyTokenizer
],
list
[
int
]]],
stop_str
:
Optional
[
List
[
str
]],
tokenizer_mode
:
str
,
limit_mm_per_prompt
:
Dict
[
str
,
int
],
vllm_runner_kwargs
:
Optional
[
Dict
[
str
,
Any
]],
hf_model_kwargs
:
Optional
[
Dict
[
str
,
Any
]],
...
...
@@ -48,7 +48,10 @@ def run_test(
"""Modality agnostic test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs
=
vllm_embeddings
if
vllm_embeddings
is
not
None
else
inputs
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
,
trust_remote_code
=
True
)
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
vllm_outputs_per_mm
=
[]
hf_outputs_per_mm
=
[]
...
...
@@ -57,17 +60,19 @@ def run_test(
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
vllm_kwargs
:
Dict
[
str
,
Any
]
=
{}
if
get_stop_token_ids
is
not
None
:
vllm_kwargs
[
"stop_token_ids"
]
=
get_stop_token_ids
(
tokenizer
)
if
stop_str
:
vllm_kwargs
[
"stop"
]
=
stop_str
if
vllm_runner_kwargs
is
None
:
vllm_runner_kwargs
=
{}
vllm_runner_kwargs_
:
Dict
[
str
,
Any
]
=
{}
if
model_info
.
tokenizer
:
vllm_runner_kwargs_
[
"tokenizer"
]
=
model_info
.
tokenizer
if
model_info
.
tokenizer_mode
:
vllm_runner_kwargs_
[
"tokenizer_mode"
]
=
model_info
.
tokenizer_mode
if
model_info
.
hf_overrides
:
vllm_runner_kwargs_
[
"hf_overrides"
]
=
model_info
.
hf_overrides
if
vllm_runner_kwargs
:
vllm_runner_kwargs_
.
update
(
vllm_runner_kwargs
)
with
vllm_runner
(
model
,
tokenizer_mode
=
tokenizer_mode
,
max_model_len
=
max_model_len
,
max_num_seqs
=
max_num_seqs
,
dtype
=
dtype
,
...
...
@@ -76,7 +81,15 @@ def run_test(
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
enforce_eager
,
task
=
task
,
**
vllm_runner_kwargs
)
as
vllm_model
:
**
vllm_runner_kwargs_
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
vllm_kwargs
:
Dict
[
str
,
Any
]
=
{}
if
get_stop_token_ids
is
not
None
:
vllm_kwargs
[
"stop_token_ids"
]
=
get_stop_token_ids
(
tokenizer
)
if
stop_str
:
vllm_kwargs
[
"stop"
]
=
stop_str
for
prompts
,
media
in
vllm_inputs
:
vllm_kwargs
[
runner_mm_key
]
=
media
vllm_output
=
vllm_model
.
generate_greedy_logprobs
(
...
...
@@ -93,16 +106,19 @@ def run_test(
if
patch_hf_runner
is
not
None
:
hf_model
=
patch_hf_runner
(
hf_model
)
# Some models need to explicitly pass the eos_token_id off the tokenizer or
# processor for a good comparison; currently assume processor/tokenizer
# agree on the EOS, and pull it off the tokenizer if requested.
hf_kwargs
=
{}
if
use_tokenizer_eos
:
hf_kwargs
[
"eos_token_id"
]
=
tokenizer
.
eos_token_id
if
stop_str
:
hf_kwargs
[
"stop_strings"
]
=
stop_str
with
hf_model
,
torch
.
no_grad
():
tokenizer
=
hf_model
.
tokenizer
# Some models need to explicitly pass the eos_token_id off the tokenizer
# or processor for a good comparison;
# currently assume processor/tokenizer agree on the EOS, and pull it off
# the tokenizer if requested.
hf_kwargs
=
{}
if
use_tokenizer_eos
:
hf_kwargs
[
"eos_token_id"
]
=
tokenizer
.
eos_token_id
if
stop_str
:
hf_kwargs
[
"stop_strings"
]
=
stop_str
for
prompts
,
media
in
inputs
:
hf_kwargs
[
runner_mm_key
]
=
media
hf_output
=
hf_model
.
generate_greedy_logprobs_limit
(
...
...
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
View file @
ec5e299c
...
...
@@ -6,7 +6,7 @@ typically specific to a small subset of models.
import
re
import
types
from
pathlib
import
PosixPath
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Union
from
typing
import
Callable
,
List
,
Optional
,
Tuple
,
Union
import
torch
from
PIL.Image
import
Image
...
...
@@ -17,9 +17,7 @@ from vllm.sequence import SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
.....conftest
import
(
HfRunner
,
ImageAsset
,
PromptAudioInput
,
PromptImageInput
,
PromptVideoInput
,
_ImageAssets
)
from
....utils
import
TokensTextLogprobs
from
.....conftest
import
HfRunner
,
ImageAsset
,
_ImageAssets
from
.types
import
RunnerOutput
...
...
@@ -522,74 +520,7 @@ def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return
hf_model
def
_generate_greedy_logprobs_limit
(
self
,
prompts
:
List
[
str
],
max_tokens
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
**
kwargs
:
Any
,
)
->
List
[
TokensTextLogprobs
]:
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
# Process in batches for inference.
if
len
(
all_inputs
):
input_ids_lst
=
[]
images_lst
=
[]
images_input_idx_lst
=
[]
imges_masks_lst
=
[]
for
inputs
in
all_inputs
:
input_ids_lst
.
append
(
inputs
[
"input_ids"
])
images_lst
.
append
(
inputs
[
"images"
])
images_input_idx_lst
.
append
(
inputs
[
"image_input_idx"
])
imges_masks_lst
.
append
(
inputs
[
"image_masks"
])
batch_inputs
=
{}
batch_inputs
[
'input_ids'
]
=
torch
.
cat
(
input_ids_lst
,
dim
=
0
)
batch_inputs
[
'images'
]
=
torch
.
cat
(
images_lst
,
dim
=
0
)
batch_inputs
[
'image_input_idx'
]
=
torch
.
cat
(
images_input_idx_lst
,
dim
=
0
)
batch_inputs
[
'image_masks'
]
=
torch
.
cat
(
imges_masks_lst
,
dim
=
0
)
outputs
=
self
.
model
.
generate_from_batch
(
batch
=
self
.
wrap_device
(
batch_inputs
,
device
=
self
.
model
.
device
.
type
),
generation_config
=
GenerationConfig
(
max_new_tokens
=
max_tokens
,
stop_strings
=
"<|endoftext|>"
,
do_sample
=
False
,
),
tokenizer
=
self
.
tokenizer
,
output_hidden_states
=
True
,
return_dict_in_generate
=
True
,
)
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
all_output_ids
:
List
[
List
[
int
]]
=
[]
all_output_strs
:
List
[
str
]
=
[]
for
index
in
range
(
len
(
all_inputs
)):
(
seq_logprobs_lst
,
output_len
,
)
=
self
.
_hidden_states_to_logprobs
(
outputs
.
hidden_states
,
num_logprobs
)
all_logprobs
.
append
(
seq_logprobs_lst
)
seq_ids
=
outputs
.
sequences
[
index
]
output_ids
=
seq_ids
[
-
output_len
:]
all_output_ids
.
append
(
output_ids
.
tolist
())
all_output_strs
.
append
(
self
.
tokenizer
.
decode
(
output_ids
))
outputs
=
zip
(
all_output_ids
,
all_output_strs
,
all_logprobs
)
return
[(
output_ids
,
output_str
,
output_logprobs
)
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
####### Molmo-specific HuggingFace runner patchers
def
mlomo_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
def
molmo_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for Molmo."""
hf_processor
=
hf_model
.
processor
...
...
@@ -598,10 +529,23 @@ def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
hf_model
.
processor
=
_processor
setattr
(
# noqa: B010
hf_model
,
"generate_greedy_logprobs_limit"
,
types
.
MethodType
(
_generate_greedy_logprobs_limit
,
hf_model
),
)
def
_generate
(
self
,
max_new_tokens
=
None
,
do_sample
=
None
,
**
kwargs
):
batch
=
{
k
:
kwargs
.
pop
(
k
)
for
k
in
(
"input_ids"
,
"images"
,
"image_input_idx"
,
"image_masks"
)
if
k
in
kwargs
}
return
self
.
generate_from_batch
(
batch
,
generation_config
=
GenerationConfig
(
max_new_tokens
=
max_new_tokens
,
stop_strings
=
"<|endoftext|>"
,
do_sample
=
do_sample
,
),
**
kwargs
,
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
return
hf_model
tests/models/decoder_only/vision_language/vlm_utils/types.py
View file @
ec5e299c
...
...
@@ -8,12 +8,12 @@ from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
import
torch
from
PIL.Image
import
Image
from
pytest
import
MarkDecorator
from
transformers
import
(
AutoModelForCausalLM
,
BatchEncoding
,
PreTrainedTokenizerBase
)
from
transformers
import
AutoModelForCausalLM
,
BatchEncoding
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
identity
from
.....conftest
import
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
_ImageAssets
...
...
@@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple):
vllm_runner_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
# Optional callable which gets a list of token IDs from the model tokenizer
get_stop_token_ids
:
Optional
[
Callable
[[
PreTrainedTokenizerBase
],
List
[
int
]]]
=
None
get_stop_token_ids
:
Optional
[
Callable
[[
AnyTokenizer
],
list
[
int
]]]
=
None
# Optional list of strings to stop generation, useful when stop tokens are
# not special tokens in the tokenizer
stop_str
:
Optional
[
List
[
str
]]
=
None
...
...
@@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple):
marks
:
Optional
[
List
[
MarkDecorator
]]
=
None
tokenizer_mode
:
str
=
"auto"
def
get_non_parametrized_runner_kwargs
(
self
):
"""Returns a dictionary of expandable kwargs for items that are used
in all test types, which are NOT used when creating the parametrized
...
...
@@ -180,7 +177,6 @@ class VLMTestInfo(NamedTuple):
"hf_model_kwargs"
:
self
.
hf_model_kwargs
,
"stop_str"
:
self
.
stop_str
,
"patch_hf_runner"
:
self
.
patch_hf_runner
,
"tokenizer_mode"
:
self
.
tokenizer_mode
}
...
...
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
ec5e299c
...
...
@@ -8,11 +8,11 @@ import torch
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
BatchEncoding
)
from
vllm
import
LLM
,
SamplingParams
from
vllm.attention.backends.flash_attn
import
FlashAttentionMetadata
from
vllm.attention.selector
import
(
_Backend
,
_cached_get_attn_backend
,
global_force_attn_backend_context_manager
)
from
vllm.model_executor.models.mllama
import
(
MLLAMA_IMAGE_TOKEN_ID
,
MllamaForConditionalGeneration
)
from
vllm.model_executor.models.mllama
import
MllamaForConditionalGeneration
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
...
...
@@ -23,6 +23,7 @@ from ...utils import check_logprobs_close
from
....utils
import
models_path_prefix
_LIMIT_IMAGE_PER_PROMPT
=
3
MLLAMA_IMAGE_TOKEN_ID
=
128256
LIST_ENC_DEC_SUPPORTED_BACKENDS
=
[
_Backend
.
XFORMERS
,
_Backend
.
FLASH_ATTN
]
...
...
@@ -398,6 +399,64 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
def
test_explicit_implicit_prompt
(
image_assets
:
_ImageAssets
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
):
stop_sign
=
image_assets
[
0
].
pil_image
# yapf: disable
prompts
=
[
# explicit prompt
{
"encoder_prompt"
:
{
"prompt"
:
"<|image|>"
,
"multi_modal_data"
:
{
"image"
:
stop_sign
},
},
"decoder_prompt"
:
{
"prompt_token_ids"
:
[
128000
,
791
,
2262
,
315
,
279
,
2217
,
220
,
128256
,
374
],
# noqa: E501
}
},
{
"encoder_prompt"
:
"Not <|image|>"
,
"decoder_prompt"
:
"The color of the sky is blue but sometimes it can also be"
,
# noqa: E501
},
# implicit prompt
{
"prompt"
:
"<|begin_of_text|>The content of the image <|image|> is"
,
# noqa: E501
"multi_modal_data"
:
{
"image"
:
stop_sign
},
},
{
"prompt"
:
"The color of the sky is blue but sometimes it can also be"
,
# noqa: E501
},
]
# yapf: enable
llm
=
LLM
(
model
=
model
,
dtype
=
dtype
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
tensor_parallel_size
=
1
,
enforce_eager
=
True
,
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
max_tokens
,
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
n_prompts
=
len
(
prompts
)
explicit_outputs
=
outputs
[:
n_prompts
//
2
]
implicit_outputs
=
outputs
[
n_prompts
//
2
:]
for
exp_output
,
imp_output
in
zip
(
explicit_outputs
,
implicit_outputs
):
assert
exp_output
.
outputs
[
0
].
text
==
imp_output
.
outputs
[
0
].
text
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
...
...
@@ -460,6 +519,10 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
images
=
images
)
class
DummyModel
:
image_token_id
=
MLLAMA_IMAGE_TOKEN_ID
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"input_indices_and_output"
,
...
...
@@ -501,7 +564,7 @@ def test_get_cross_attention_mask(input_indices_and_output) -> None:
use_cuda_graph
=
False
,
)
dummy
:
dict
[
str
,
str
]
=
{}
dummy
=
DummyModel
()
cross_attention_mask
,
kv_range_for_decode
=
MllamaForConditionalGeneration
\
.
get_cross_attention_mask
(
dummy
,
...
...
@@ -558,7 +621,7 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None:
use_cuda_graph
=
False
,
)
dummy
:
dict
[
str
,
str
]
=
{}
dummy
=
DummyModel
()
full_text_row_masked_out_mask
=
MllamaForConditionalGeneration
\
.
get_full_text_row_masked_out_mask
(
dummy
,
...
...
tests/models/multimodal/processing/test_common.py
View file @
ec5e299c
...
...
@@ -10,7 +10,7 @@ from vllm.config import ModelConfig
from
vllm.inputs
import
InputProcessingContext
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.processing
import
ProcessingCache
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
....multimodal.utils
import
random_audio
,
random_image
,
random_video
from
...registry
import
HF_EXAMPLE_MODELS
...
...
@@ -42,10 +42,7 @@ def _test_processing_correctness(
factories
=
MULTIMODAL_REGISTRY
.
_processor_factories
[
model_cls
]
ctx
=
InputProcessingContext
(
model_config
,
tokenizer
=
cached_get_tokenizer
(
model_config
.
tokenizer
,
trust_remote_code
=
model_info
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
model_config
),
)
# Ensure that it can fit all of the data
cache
=
ProcessingCache
(
capacity
=
1
<<
30
)
...
...
@@ -85,11 +82,19 @@ def _test_processing_correctness(
partial
(
random_audio
,
rng
,
min_len
=
512
,
max_len
=
1024
,
sr
=
16000
),
}
tokenizer_encode_kwargs
=
{}
if
model_config
.
hf_config
.
model_type
==
"mllama"
:
# For Mllama, tokenizer will always add bos_token at the beginning of
# prompt by default, causing hf_processor outputs incorrect token ids.
# So we need use `add_special_tokens=False` here to leave bos_token
# to be added by the processor.
tokenizer_encode_kwargs
=
{
"add_special_tokens"
:
False
}
for
batch_idx
in
range
(
num_batches
):
mm_data
=
{
k
:
[(
input_to_hit
[
k
]
if
rng
.
rand
()
<
hit_rate
else
input_factory
[
k
]())
for
_
in
range
(
rng
.
randint
(
limit
))]
for
_
in
range
(
rng
.
randint
(
limit
+
1
))]
for
k
,
limit
in
limit_mm_per_prompt
.
items
()
}
...
...
@@ -122,7 +127,7 @@ def _test_processing_correctness(
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
)
baseline_tokenized_result
=
baseline_processor
.
apply
(
tokenizer
.
encode
(
prompt
),
tokenizer
.
encode
(
prompt
,
**
tokenizer_encode_kwargs
),
mm_data
=
mm_data
,
hf_processor_mm_kwargs
=
{},
)
...
...
@@ -131,7 +136,7 @@ def _test_processing_correctness(
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
)
cached_tokenized_result
=
cached_processor
.
apply
(
tokenizer
.
encode
(
prompt
),
tokenizer
.
encode
(
prompt
,
**
tokenizer_encode_kwargs
),
mm_data
=
mm_data
,
hf_processor_mm_kwargs
=
{},
)
...
...
@@ -147,6 +152,7 @@ def _test_processing_correctness(
"facebook/chameleon-7b"
,
"deepseek-ai/deepseek-vl2-tiny"
,
"adept/fuyu-8b"
,
"THUDM/glm-4v-9b"
,
"h2oai/h2ovl-mississippi-800m"
,
"OpenGVLab/InternVL2-1B"
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
...
...
@@ -154,16 +160,19 @@ def _test_processing_correctness(
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
,
"TIGER-Lab/Mantis-8B-siglip-llama3"
,
"mistral-community/pixtral-12b"
,
"openbmb/MiniCPM-o-2_6"
,
"openbmb/MiniCPM-V-2_6"
,
"allenai/Molmo-7B-D-0924"
,
"allenai/Molmo-7B-O-0924"
,
"nvidia/NVLM-D-72B"
,
"Qwen/Qwen-VL-Chat"
,
"Qwen/Qwen2-VL-2B-Instruct"
,
"Qwen/Qwen2.5-VL-3B-Instruct"
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
"fixie-ai/ultravox-v0_
3
"
,
"fixie-ai/ultravox-v0_
5-llama-3_2-1b
"
,
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
...
...
tests/models/multimodal/processing/test_h2ovl.py
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from
typing
import
Optional
from
typing
import
Mapping
,
Optional
import
pytest
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
def
_get_expected_num_patches
(
config
:
PretrainedConfig
,
image
:
Image
.
Image
,
num_imgs
:
int
,
min_num
:
int
,
max_num
:
int
,
):
from
vllm.model_executor.models.h2ovl
import
(
calculate_h2ovl_targets
,
get_h2ovl_target_ratios
)
width
,
height
=
image
.
size
# Calculate the expected number of blocks
if
num_imgs
==
1
and
config
.
use_msac
:
# First pass
blocks1
,
_
,
_
,
aspect_ratio
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
=
1
,
max_num
=
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
# Thumbnail is handled separately
)
# Second pass
blocks2
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
=
3
,
max_num
=
max_num
,
prior_aspect_ratio
=
aspect_ratio
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if
config
.
use_thumbnail
:
blocks1
+=
1
if
blocks1
>
1
else
0
blocks2
+=
1
if
blocks2
>
1
else
0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks
=
blocks1
+
blocks2
-
1
return
total_blocks
blocks
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
expected_num_patches
=
blocks
if
config
.
use_thumbnail
and
expected_num_patches
>
1
:
expected_num_patches
+=
1
return
expected_num_patches
def
_run_check
(
processor
:
BaseMultiModalProcessor
,
images
:
list
[
Image
.
Image
],
min_num
:
int
,
max_num
:
int
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
):
tokenizer
=
processor
.
info
.
get_tokenizer
()
config
=
processor
.
info
.
get_hf_config
()
mm_data
=
{
"image"
:
images
}
total_expected_num_patches
=
sum
(
_get_expected_num_patches
(
config
,
image
,
len
(
images
),
min_num
,
max_num
)
for
image
in
images
)
processed_inputs
=
processor
.
apply
(
"<image>"
*
len
(
images
),
mm_data
,
mm_processor_kwargs
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"h2oai/h2ovl-mississippi-800m"
,
"h2oai/h2ovl-mississippi-2b"
,
...
...
@@ -25,118 +126,54 @@ from ...utils import build_model_context
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
[
4.0
,
2.0
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
(
"min_dynamic_patch"
,
"max_dynamic_patch"
),
[(
1
,
1
),
(
1
,
2
),
(
1
,
4
),
(
1
,
8
),
(
2
,
4
),
(
4
,
8
)],
)
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"
num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"
kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
model_id
:
str
,
image_assets
:
_ImageAssets
,
size_factors
:
list
[
int
],
min_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
Optional
[
bool
],
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
from
vllm.model_executor.models.h2ovl
import
(
calculate_h2ovl_targets
,
get_h2ovl_target_ratios
)
mm_processor_kwargs
=
{
"min_dynamic_patch"
:
min_dynamic_patch
,
"max_dynamic_patch"
:
max_dynamic_patch
,
"dynamic_image_size"
:
dynamic_image_size
,
}
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
len
(
size_factors
)},
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
config
=
processor
.
info
.
get_hf_config
()
use_msac
=
config
.
use_msac
mm_processor_kwargs
=
{
"max_dynamic_patch"
:
max_dynamic_patch
,
}
if
dynamic_image_size
is
not
None
:
mm_processor_kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
min_num
=
config
.
min_dynamic_patch
min_num
=
min_dynamic_patch
if
dynamic_image_size
else
1
max_num
=
max_dynamic_patch
if
dynamic_image_size
else
1
# Build the image str / prompt based on the number of images we pass
prompt
=
"<image>"
*
num_imgs
for
asset
in
image_assets
:
for
factor
in
size_factors
:
image
=
rescale_image_size
(
asset
.
pil_image
,
factor
)
mm_data
=
{
"image"
:
[
image
]
*
num_imgs
}
width
,
height
=
image
.
size
# Calculate the expected number of blocks
if
num_imgs
==
1
and
use_msac
:
# First pass
blocks1
,
_
,
_
,
aspect_ratio
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
# Thumbnail is handled separately
)
# Second pass
blocks2
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
aspect_ratio
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if
config
.
use_thumbnail
:
blocks1
+=
1
if
blocks1
>
1
else
0
blocks2
+=
1
if
blocks2
>
1
else
0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks
=
blocks1
+
blocks2
-
1
expected_num_patches
=
total_blocks
else
:
blocks
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
expected_num_patches
=
blocks
if
config
.
use_thumbnail
and
expected_num_patches
!=
1
:
expected_num_patches
+=
1
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
pixel_shape
=
(
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
)
assert
pixel_shape
[
0
]
==
expected_num_patches
*
num_imgs
_run_check
(
processor
,
[
rescale_image_size
(
image_assets
[
0
].
pil_image
,
f
)
for
f
in
size_factors
],
min_num
,
max_num
,
hf_processor_mm_kwargs
,
)
tests/models/multimodal/processing/test_idefics3.py
View file @
ec5e299c
...
...
@@ -5,7 +5,7 @@ import pytest
from
transformers
import
Idefics3Config
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
...
...
@@ -24,9 +24,15 @@ models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")]
])
# yapf: enable
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_override
(
image_assets
:
_ImageAssets
,
model
:
str
,
mm_processor_kwargs
:
dict
[
str
,
object
],
expected_toks_per_img
:
int
,
num_imgs
:
int
):
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
image_assets
:
_ImageAssets
,
model
:
str
,
mm_processor_kwargs
:
dict
[
str
,
object
],
expected_toks_per_img
:
int
,
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
...
...
@@ -35,15 +41,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
model_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_
get_
tokenizer
(
ctx
.
model_config
.
tokenizer
)
tokenizer
=
cached_tokenizer
_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
mm_processor_kwargs
)
hf_processor
_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders
=
"<image>"
if
num_imgs
==
1
else
"
\n
"
.
join
(
...
...
@@ -56,8 +62,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
dummy_image
=
image_assets
[
0
].
pil_image
.
resize
(
dummy_image_size
)
mm_data
=
{
"image"
:
[
dummy_image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
# Ensure the placeholders format are correct
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
hf_processed_inputs
=
hf_processor
(
text
=
prompt
,
images
=
mm_data
[
"image"
])
assert
processed_inputs
[
"prompt_token_ids"
]
==
hf_processed_inputs
[
"input_ids"
][
0
]
...
...
tests/models/multimodal/processing/test_internvl.py
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
"""Tests for InternVL's multimodal preprocessing kwargs."""
from
typing
import
Optional
from
typing
import
Mapping
,
Optional
import
os
import
pytest
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
from
....utils
import
models_path_prefix
def
_get_expected_num_patches
(
config
:
PretrainedConfig
,
image
:
Image
.
Image
,
num_imgs
:
int
,
min_num
:
int
,
max_num
:
int
,
):
from
vllm.model_executor.models.internvl
import
(
calculate_internvl_targets
,
get_internvl_target_ratios
)
width
,
height
=
image
.
size
blocks
,
_
,
_
=
calculate_internvl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_internvl_target_ratios
(
min_num
,
max_num
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
expected_num_patches
=
blocks
if
config
.
use_thumbnail
and
expected_num_patches
>
1
:
expected_num_patches
+=
1
return
expected_num_patches
def
_run_check
(
processor
:
BaseMultiModalProcessor
,
images
:
list
[
Image
.
Image
],
min_num
:
int
,
max_num
:
int
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
):
tokenizer
=
processor
.
info
.
get_tokenizer
()
config
=
processor
.
info
.
get_hf_config
()
mm_data
=
{
"image"
:
images
}
total_expected_num_patches
=
sum
(
_get_expected_num_patches
(
config
,
image
,
len
(
images
),
min_num
,
max_num
)
for
image
in
images
)
processed_inputs
=
processor
.
apply
(
"<image>"
*
len
(
images
),
mm_data
,
mm_processor_kwargs
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-2B"
)])
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"size_factors"
,
[
# Single-scale
[
1.0
],
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
[
4.0
,
2.0
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
(
"min_dynamic_patch"
,
"max_dynamic_patch"
),
[(
1
,
1
),
(
1
,
2
),
(
1
,
4
),
(
1
,
8
),
(
2
,
4
),
(
4
,
8
)],
)
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
model_id
:
str
,
image_assets
:
_ImageAssets
,
size_factors
:
list
[
int
],
min_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
Optional
[
bool
],
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
mm_processor_kwargs
=
{
"min_dynamic_patch"
:
min_dynamic_patch
,
"max_dynamic_patch"
:
max_dynamic_patch
,
"dynamic_image_size"
:
dynamic_image_size
,
}
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
len
(
size_factors
)},
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
mm_processor_kwargs
=
{
"max_dynamic_patch"
:
max_dynamic_patch
,
}
if
dynamic_image_size
is
not
None
:
mm_processor_kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
min_num
=
min_dynamic_patch
if
dynamic_image_size
else
1
max_num
=
max_dynamic_patch
if
dynamic_image_size
else
1
# Build the image str / prompt based on the number of images we pass
prompt
=
"<image>"
*
num_imgs
image
=
image_assets
[
0
].
pil_image
.
resize
((
448
*
2
,
448
*
2
))
mm_data
=
{
"image"
:
[
image
]
*
num_imgs
}
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
if
dynamic_image_size
is
False
:
expected_num_patches
=
1
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
expected_num_patches
*
num_imgs
assert
pixel_shape
[
0
]
==
expected_num_patches
*
num_imgs
_run_check
(
processor
,
[
rescale_image_size
(
image_assets
[
0
].
pil_image
,
f
)
for
f
in
size_factors
],
min_num
,
max_num
,
hf_processor_mm_kwargs
,
)
tests/models/multimodal/processing/test_llava_next.py
View file @
ec5e299c
...
...
@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
...utils
import
build_model_context
...
...
@@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
info
=
processor
.
info
...
...
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
...
...
@@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
seen_aspect_ratios
=
set
[
float
]()
...
...
tests/models/multimodal/processing/test_llava_onevision.py
View file @
ec5e299c
...
...
@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
...utils
import
build_model_context
...
...
@@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
info
=
processor
.
info
...
...
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
...
...
@@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
)
seen_aspect_ratios
=
set
[
float
]()
...
...
tests/models/multimodal/processing/test_phi3v.py
View file @
ec5e299c
...
...
@@ -3,7 +3,7 @@
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
...
...
@@ -21,12 +21,14 @@ from ...utils import build_model_context
])
# yapf: enable
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
image_assets
:
_ImageAssets
,
model_id
:
str
,
mm_processor_kwargs
:
dict
[
str
,
int
],
expected_toks_per_img
:
int
,
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
"""Ensure input_processor_for_phi3v handles num_crops properly."""
# Avoid initializing CUDA early
...
...
@@ -36,23 +38,22 @@ def test_processor_override(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
tokenizer
=
tokenizer
,
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
img_str
=
""
.
join
([
f
"<|image_
{
idx
}
|>
\n
"
for
idx
in
range
(
1
,
num_imgs
+
1
)])
prompt
=
f
"<|user|>
\n
{
img_str
}
<|end|>
\n
<|assistant|>
\n
"
mm_data
=
{
"image"
:
[
image_assets
[
0
].
pil_image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm
_processor_kwargs
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf
_processor_
mm_
kwargs
)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
_IMAGE_TOKEN_ID
)
...
...
Prev
1
…
7
8
9
10
11
12
13
14
15
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment