Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ec5e299c
Commit
ec5e299c
authored
Feb 21, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.3' into v0.7.3-dev
parents
47bd229c
ed6e9075
Changes
521
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
590 additions
and
328 deletions
+590
-328
tests/mistral_tool_use/conftest.py
tests/mistral_tool_use/conftest.py
+40
-0
tests/mistral_tool_use/test_mistral_tool_calls.py
tests/mistral_tool_use/test_mistral_tool_calls.py
+29
-0
tests/mistral_tool_use/utils.py
tests/mistral_tool_use/utils.py
+33
-0
tests/models/decoder_only/audio_language/test_ultravox.py
tests/models/decoder_only/audio_language/test_ultravox.py
+1
-1
tests/models/decoder_only/language/test_fp8.py
tests/models/decoder_only/language/test_fp8.py
+3
-3
tests/models/decoder_only/language/test_hybrid.py
tests/models/decoder_only/language/test_hybrid.py
+23
-12
tests/models/decoder_only/language/test_mamba.py
tests/models/decoder_only/language/test_mamba.py
+30
-13
tests/models/decoder_only/language/test_models.py
tests/models/decoder_only/language/test_models.py
+10
-0
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+4
-10
tests/models/decoder_only/vision_language/vlm_utils/core.py
tests/models/decoder_only/vision_language/vlm_utils/core.py
+39
-23
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
...els/decoder_only/vision_language/vlm_utils/model_utils.py
+21
-77
tests/models/decoder_only/vision_language/vlm_utils/types.py
tests/models/decoder_only/vision_language/vlm_utils/types.py
+3
-7
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+67
-4
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+18
-9
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+131
-94
tests/models/multimodal/processing/test_idefics3.py
tests/models/multimodal/processing/test_idefics3.py
+16
-8
tests/models/multimodal/processing/test_internvl.py
tests/models/multimodal/processing/test_internvl.py
+107
-35
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_next.py
+4
-13
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_llava_onevision.py
+4
-13
tests/models/multimodal/processing/test_phi3v.py
tests/models/multimodal/processing/test_phi3v.py
+7
-6
No files found.
Too many changes to show.
To preserve performance only
521 of 521+
files are displayed.
Plain diff
Email patch
tests/mistral_tool_use/conftest.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
pytest_asyncio
from
huggingface_hub
import
snapshot_download
from
tests.utils
import
RemoteOpenAIServer
from
vllm.platforms
import
current_platform
from
.utils
import
ARGS
,
CONFIGS
,
ServerConfig
# Parametrized over every key of CONFIGS: fetch the model for the selected
# entry, then hand that entry to the tests.
@pytest.fixture(scope="session", params=CONFIGS.keys())
def server_config(request):
    """Yield the ``CONFIGS`` entry named by ``request.param``.

    The requesting test is skipped when the current platform is ROCm and
    the entry sets ``supports_rocm`` to a falsy value; a missing key is
    treated as supported. Otherwise the model snapshot is downloaded with
    ``huggingface_hub.snapshot_download`` before the entry is yielded.
    """
    selected = CONFIGS[request.param]

    # Only consult the config flag when we are actually running on ROCm.
    rocm_unsupported = (current_platform.is_rocm()
                        and not selected.get("supports_rocm", True))
    if rocm_unsupported:
        pytest.skip("The {} model can't be tested on the ROCm platform".format(
            selected["model"]))

    # Download the model repository snapshot up front.
    snapshot_download(selected["model"])
    yield selected
# Runs once per server config (session scope, driven by ``server_config``).
@pytest.fixture(scope="session")
def server(request, server_config: ServerConfig):
    """Start a ``RemoteOpenAIServer`` for ``server_config`` and yield it.

    The server is created with the config's ``model`` and with the shared
    ``ARGS`` followed by the config's own ``arguments``;
    ``max_wait_seconds=480`` is passed through to ``RemoteOpenAIServer``.
    The context manager is left once the fixture is torn down.

    ``request`` is not used in the body; it is kept so the fixture
    signature stays unchanged.
    """
    model_name = server_config["model"]
    launch_args = ARGS + server_config["arguments"]

    with RemoteOpenAIServer(model_name, launch_args,
                            max_wait_seconds=480) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    """Yield the async client obtained from ``server.get_async_client()``.

    The client is used as an async context manager, so it is exited when
    the fixture is torn down.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client
tests/mistral_tool_use/test_mistral_tool_calls.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
import
openai
import
pytest
from
tests.tool_use.utils
import
MESSAGES_ASKING_FOR_TOOLS
,
WEATHER_TOOL
# test: a tool_choice with mistral-tokenizer results in an ID of length 9
@pytest.mark.asyncio
async def test_tool_call_with_tool_choice(client: openai.AsyncOpenAI):
    """Check that a forced tool choice yields one tool call with a 9-char ID.

    The first model listed by the server is asked to answer
    ``MESSAGES_ASKING_FOR_TOOLS`` with ``tool_choice`` set to
    ``WEATHER_TOOL``. The test then checks the finish reason, the message
    role, that exactly one tool call came back, and that its ``id`` has
    length 9.
    """
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=MESSAGES_ASKING_FOR_TOOLS,
        temperature=0,
        max_completion_tokens=100,
        model=model_name,
        tools=[WEATHER_TOOL],
        tool_choice=WEATHER_TOOL,
        logprobs=False)

    choice = chat_completion.choices[0]

    assert choice.finish_reason != "tool_calls"  # "stop" or "length"
    assert choice.message.role == "assistant"

    # The ID check below indexes the first tool call, so a missing
    # tool_calls list must fail here as an assertion rather than surface
    # later as a TypeError from subscripting None.
    tool_calls = choice.message.tool_calls
    assert tool_calls is not None and len(tool_calls) == 1, (
        "expected exactly one tool call, got {!r}".format(tool_calls))
    assert len(tool_calls[0].id) == 9  # length of 9 for mistral
tests/mistral_tool_use/utils.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Dict
,
List
,
Optional
from
typing_extensions
import
TypedDict
class ServerConfig(TypedDict, total=False):
    """Settings describing one model server for the Mistral tool-use tests.

    Declared with ``total=False``, so every key may be omitted.
    """

    # Model identifier the server is started with.
    model: str
    # Extra command-line arguments for this model's server.
    arguments: List[str]
    # System prompt text associated with the model.
    system_prompt: Optional[str]
    # NOTE(review): presumably whether the model supports parallel tool
    # calls; no reader of this key is visible here - confirm.
    supports_parallel: Optional[bool]
    # Whether the model can be tested on the ROCm platform.
    supports_rocm: Optional[bool]
# Command-line arguments shared by every server config.
ARGS: List[str] = ["--max-model-len", "1024"]

# Server configs keyed by a short name; each key becomes one parametrized
# run of the fixtures that iterate over CONFIGS.
CONFIGS: Dict[str, ServerConfig] = {
    "mistral": {
        "model":
        "mistralai/Mistral-7B-Instruct-v0.3",
        "arguments": [
            "--tokenizer-mode",
            "mistral",
            "--ignore-patterns=\"consolidated.safetensors\"",
        ],
        "system_prompt":
        "You are a helpful assistant with access to tools. If a tool"
        " that you have would be helpful to answer a user query, "
        "call the tool. Otherwise, answer the user's query directly "
        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
        "to the user's question - just respond to it normally."
    },
}
tests/models/decoder_only/audio_language/test_ultravox.py
View file @
ec5e299c
...
@@ -18,7 +18,7 @@ from ....conftest import HfRunner, VllmRunner
...
@@ -18,7 +18,7 @@ from ....conftest import HfRunner, VllmRunner
from
....utils
import
RemoteOpenAIServer
,
models_path_prefix
from
....utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
3
"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"fixie-ai/ultravox-v0_
5-llama-3_2-1b
"
)
AudioTuple
=
Tuple
[
np
.
ndarray
,
int
]
AudioTuple
=
Tuple
[
np
.
ndarray
,
int
]
...
...
tests/models/decoder_only/language/test_fp8.py
View file @
ec5e299c
...
@@ -30,9 +30,9 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
...
@@ -30,9 +30,9 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
(
"fp8_e5m2"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
(
"fp8_e5m2"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)),
# Test F
P
16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
# Test
B
F16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
(
"fp8_e4m3"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-
2-7b-chat-hf
"
),
(
"fp8_e4m3"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-
3.2-1B-Instruct
"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-
2-7b-chat-hf
"
))
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-
3.2-1B-Instruct
"
))
])
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
...
...
tests/models/decoder_only/language/test_
jamba
.py
→
tests/models/decoder_only/language/test_
hybrid
.py
View file @
ec5e299c
...
@@ -10,7 +10,8 @@ from vllm.sampling_params import SamplingParams
...
@@ -10,7 +10,8 @@ from vllm.sampling_params import SamplingParams
from
...utils
import
check_outputs_equal
from
...utils
import
check_outputs_equal
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-random"
)]
# This test is for the hybrid models
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
),
os
.
path
.
join
(
models_path_prefix
,
"ibm-ai-platform/Bamba-9B"
)]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
@@ -25,6 +26,10 @@ def test_models(
...
@@ -25,6 +26,10 @@ def test_models(
max_tokens
:
int
,
max_tokens
:
int
,
)
->
None
:
)
->
None
:
# numeric error produces different generation
if
'Bamba'
in
model
:
example_prompts
.
pop
(
3
)
with
hf_runner
(
with
hf_runner
(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
...
@@ -110,15 +115,21 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
...
@@ -110,15 +115,21 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
7
])
def
test_mamba_prefill_chunking
(
hf_runner
,
vllm_runner
,
example_prompts
,
def
test_mamba_prefill_chunking
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
)
->
None
:
max_tokens
:
int
)
->
None
:
# numeric error during prefill chucking produces different generation
# numeric error during prefill chucking produces different generation
# compared to w/o prefill chunking for those examples, removed them for now
# compared to w/o prefill chunking for those examples, removed them for now
example_prompts
.
pop
(
7
)
if
'Jamba'
in
model
:
example_prompts
.
pop
(
2
)
example_prompts
.
pop
(
7
)
example_prompts
.
pop
(
1
)
example_prompts
.
pop
(
2
)
example_prompts
.
pop
(
1
)
elif
'Bamba'
in
model
:
example_prompts
.
pop
(
6
)
example_prompts
.
pop
(
3
)
example_prompts
.
pop
(
2
)
dtype
=
"half"
# use a different dtype for Bamba
with
hf_runner
(
with
hf_runner
(
model
,
model
,
...
@@ -147,7 +158,7 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
...
@@ -147,7 +158,7 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"
b
float
16
"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
15
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
15
])
def
test_parallel_sampling
(
def
test_parallel_sampling
(
vllm_runner
,
vllm_runner
,
...
@@ -251,17 +262,17 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
...
@@ -251,17 +262,17 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
dtype
:
str
,
dtype
:
str
,
example_prompts
,
example_prompts
,
)
->
None
:
)
->
None
:
# This test is for verifying that the
Jamba
inner state management doesn't
# This test is for verifying that the
hybrid
inner state management doesn't
# collapse in case where the number of incoming requests and
# collapse in case where the number of incoming requests and
# finished_requests_ids is larger than the maximum mamba block capacity.
# finished_requests_ids is larger than the maximum mamba block capacity.
# This could generally happen due to the fact that
Jamba
does support
# This could generally happen due to the fact that
hybrid
does support
# statelessness mechanism where it can cleanup new incoming requests in
# statelessness mechanism where it can cleanup new incoming requests in
# a single step.
# a single step.
try
:
try
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
10
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
10
)
as
vllm_model
:
vllm_model
.
generate_greedy
([
example_prompts
[
0
]]
*
100
,
10
)
vllm_model
.
generate_greedy
([
example_prompts
[
0
]]
*
100
,
10
)
except
ValueError
:
except
ValueError
:
pytest
.
fail
(
"
Jamba
inner state wasn't cleaned up properly between"
pytest
.
fail
(
"
Hybrid
inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily "
)
"steps finished requests registered unnecessarily "
)
...
@@ -273,14 +284,14 @@ def test_state_cleanup(
...
@@ -273,14 +284,14 @@ def test_state_cleanup(
dtype
:
str
,
dtype
:
str
,
example_prompts
,
example_prompts
,
)
->
None
:
)
->
None
:
# This test is for verifying that the
Jamba
state is cleaned up between
# This test is for verifying that the
Hybrid
state is cleaned up between
# steps, If its not cleaned, an error would be expected.
# steps, If its not cleaned, an error would be expected.
try
:
try
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
for
_
in
range
(
10
):
for
_
in
range
(
10
):
vllm_model
.
generate_greedy
([
example_prompts
[
0
]]
*
100
,
1
)
vllm_model
.
generate_greedy
([
example_prompts
[
0
]]
*
100
,
1
)
except
ValueError
:
except
ValueError
:
pytest
.
fail
(
"
Jamba
inner state wasn't cleaned up between states, "
pytest
.
fail
(
"
Hybrid
inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids"
)
"could be related to finished_requests_ids"
)
...
@@ -326,7 +337,7 @@ def test_multistep_correctness(vllm_runner, model: str, dtype: str,
...
@@ -326,7 +337,7 @@ def test_multistep_correctness(vllm_runner, model: str, dtype: str,
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
def
test_
jamba
_distributed_produces_identical_generation
(
def
test_
hybrid
_distributed_produces_identical_generation
(
vllm_runner
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
vllm_runner
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
example_prompts
)
->
None
:
example_prompts
)
->
None
:
...
...
tests/models/decoder_only/language/test_mamba.py
View file @
ec5e299c
...
@@ -5,6 +5,7 @@ Run `pytest tests/models/test_mamba.py`.
...
@@ -5,6 +5,7 @@ Run `pytest tests/models/test_mamba.py`.
"""
"""
import
os
import
os
import
pytest
import
pytest
import
torch
from
transformers
import
AutoModelForCausalLM
,
AutoTokenizer
from
transformers
import
AutoModelForCausalLM
,
AutoTokenizer
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
...
@@ -13,7 +14,14 @@ from vllm.sampling_params import SamplingParams
...
@@ -13,7 +14,14 @@ from vllm.sampling_params import SamplingParams
from
...utils
import
check_outputs_equal
from
...utils
import
check_outputs_equal
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"state-spaces/mamba-130m-hf"
),
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-mamba-tiny-dev"
)]
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"state-spaces/mamba-130m-hf"
),
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-mamba-tiny-dev"
),
# TODO: Compare to a Mamba2 model. The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups.
# See https://github.com/huggingface/transformers/pull/35943
# "mistralai/Mamba-Codestral-7B-v0.1",
]
# Use lower-level interfaces to create this greedy generator, as mamba will
# Use lower-level interfaces to create this greedy generator, as mamba will
...
@@ -23,6 +31,10 @@ def generate_greedy(model_name, example_prompts, max_tokens):
...
@@ -23,6 +31,10 @@ def generate_greedy(model_name, example_prompts, max_tokens):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
model
=
AutoModelForCausalLM
.
from_pretrained
(
model_name
)
model
=
AutoModelForCausalLM
.
from_pretrained
(
model_name
)
# Set the device (GPU if available, else CPU)
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
else
"cpu"
)
model
.
to
(
device
)
# Generate texts from the prompts
# Generate texts from the prompts
outputs
=
[]
outputs
=
[]
for
prompt
in
example_prompts
:
for
prompt
in
example_prompts
:
...
@@ -31,7 +43,9 @@ def generate_greedy(model_name, example_prompts, max_tokens):
...
@@ -31,7 +43,9 @@ def generate_greedy(model_name, example_prompts, max_tokens):
input_ids
=
inputs
[
"input_ids"
].
to
(
model
.
device
)
input_ids
=
inputs
[
"input_ids"
].
to
(
model
.
device
)
# Generate text using the model's generate method directly
# Generate text using the model's generate method directly
generated_ids
=
model
.
generate
(
input_ids
,
max_new_tokens
=
max_tokens
)
generated_ids
=
model
.
generate
(
input_ids
,
max_new_tokens
=
max_tokens
,
do_sample
=
False
)
generated_text
=
tokenizer
.
decode
(
generated_ids
[
0
],
generated_text
=
tokenizer
.
decode
(
generated_ids
[
0
],
skip_special_tokens
=
True
)
skip_special_tokens
=
True
)
...
@@ -52,7 +66,8 @@ def test_models(
...
@@ -52,7 +66,8 @@ def test_models(
)
->
None
:
)
->
None
:
hf_outputs
=
generate_greedy
(
model
,
example_prompts
,
max_tokens
)
hf_outputs
=
generate_greedy
(
model
,
example_prompts
,
max_tokens
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
# Set max_num_seqs to keep Codestral from going OOM at fp32
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
# This test is for verifying whether the model's extra_repr
# This test is for verifying whether the model's extra_repr
...
@@ -83,7 +98,7 @@ def test_batching(
...
@@ -83,7 +98,7 @@ def test_batching(
)
->
None
:
)
->
None
:
# To pass the small model tests, we need full precision.
# To pass the small model tests, we need full precision.
for_loop_outputs
=
[]
for_loop_outputs
=
[]
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
for
prompt
in
example_prompts
:
for
prompt
in
example_prompts
:
for_loop_outputs
.
append
(
for_loop_outputs
.
append
(
vllm_model
.
generate_greedy
([
prompt
],
max_tokens
)[
0
])
vllm_model
.
generate_greedy
([
prompt
],
max_tokens
)[
0
])
...
@@ -167,20 +182,22 @@ def test_parallel_sampling(
...
@@ -167,20 +182,22 @@ def test_parallel_sampling(
max_tokens
:
int
,
max_tokens
:
int
,
)
->
None
:
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
# Numerical differences produce slightly different output for these
if
'state-spaces'
in
model
:
example_prompts
.
pop
(
0
)
example_prompts
.
pop
(
0
)
example_prompts
.
pop
(
0
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
for_loop_outputs
=
[]
for_loop_outputs
=
[]
for
_
in
range
(
10
):
for
_
in
range
(
10
):
for_loop_outputs
.
append
(
for_loop_outputs
.
append
(
# using example_prompts index 1 instead of 0 since with 0 the
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)[
0
])
# logprobs get really close and the test doesn't pass
vllm_model
.
generate_greedy
([
example_prompts
[
1
]],
max_tokens
)
[
0
])
sampling_params
=
SamplingParams
(
n
=
10
,
sampling_params
=
SamplingParams
(
n
=
10
,
temperature
=
0.001
,
temperature
=
0.001
,
seed
=
0
,
seed
=
0
,
max_tokens
=
max_tokens
)
max_tokens
=
max_tokens
)
n_lt_1_outputs
=
vllm_model
.
generate
([
example_prompts
[
1
]],
n_lt_1_outputs
=
vllm_model
.
generate
(
example_prompts
,
sampling_params
)
sampling_params
)
token_ids
,
texts
=
n_lt_1_outputs
[
0
]
token_ids
,
texts
=
n_lt_1_outputs
[
0
]
n_lt_1_outputs
=
[(
token_id
,
text
)
n_lt_1_outputs
=
[(
token_id
,
text
)
for
token_id
,
text
in
zip
(
token_ids
,
texts
)]
for
token_id
,
text
in
zip
(
token_ids
,
texts
)]
...
@@ -234,7 +251,7 @@ def test_models_preemption_recompute(
...
@@ -234,7 +251,7 @@ def test_models_preemption_recompute(
# Tests that outputs are identical with and w/o preemtions (recompute)
# Tests that outputs are identical with and w/o preemtions (recompute)
assert
dtype
==
"float"
assert
dtype
==
"float"
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
vllm_model
.
model
.
llm_engine
.
scheduler
[
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
ENABLE_ARTIFICIAL_PREEMPT
=
True
0
].
ENABLE_ARTIFICIAL_PREEMPT
=
True
preempt_vllm_outputs
=
vllm_model
.
generate_greedy
(
preempt_vllm_outputs
=
vllm_model
.
generate_greedy
(
...
@@ -285,7 +302,7 @@ def test_state_cleanup(
...
@@ -285,7 +302,7 @@ def test_state_cleanup(
# This test is for verifying that the Mamba state is cleaned up between
# This test is for verifying that the Mamba state is cleaned up between
# steps, If its not cleaned, an error would be expected.
# steps, If its not cleaned, an error would be expected.
try
:
try
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
)
as
vllm_model
:
for
_
in
range
(
10
):
for
_
in
range
(
10
):
vllm_model
.
generate_greedy
([
example_prompts
[
0
]]
*
100
,
1
)
vllm_model
.
generate_greedy
([
example_prompts
[
0
]]
*
100
,
1
)
except
ValueError
:
except
ValueError
:
...
...
tests/models/decoder_only/language/test_models.py
View file @
ec5e299c
...
@@ -28,6 +28,9 @@ from ....utils import models_path_prefix
...
@@ -28,6 +28,9 @@ from ....utils import models_path_prefix
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-1.1-2b-it"
),
# gemma
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-1.1-2b-it"
),
# gemma
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"THUDM/chatglm3-6b"
),
# chatglm (text-only)
),
pytest
.
param
(
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
# llama
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
# llama
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
...
@@ -45,6 +48,9 @@ from ....utils import models_path_prefix
...
@@ -45,6 +48,9 @@ from ....utils import models_path_prefix
os
.
path
.
join
(
models_path_prefix
,
"microsoft/phi-2"
),
# phi
os
.
path
.
join
(
models_path_prefix
,
"microsoft/phi-2"
),
# phi
marks
=
[
pytest
.
mark
.
core_model
],
marks
=
[
pytest
.
mark
.
core_model
],
),
),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-7B"
),
# qwen (text-only)
),
pytest
.
param
(
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-0.5B-Instruct"
),
# qwen2
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-0.5B-Instruct"
),
# qwen2
marks
=
[
pytest
.
mark
.
core_model
],
marks
=
[
pytest
.
mark
.
core_model
],
...
@@ -70,6 +76,10 @@ def test_models(
...
@@ -70,6 +76,10 @@ def test_models(
)
->
None
:
)
->
None
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
if
model
.
startswith
(
"THUDM/chatglm3"
):
hf_model
.
model
.
get_output_embeddings
=
lambda
:
\
hf_model
.
model
.
transformer
.
output_layer
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
example_prompts
,
max_tokens
,
num_logprobs
)
...
...
tests/models/decoder_only/vision_language/test_models.py
View file @
ec5e299c
...
@@ -157,10 +157,7 @@ VLM_TEST_SETTINGS = {
...
@@ -157,10 +157,7 @@ VLM_TEST_SETTINGS = {
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
qwen2_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
qwen2_vllm_to_hf_output
,
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
marks
=
[
pytest
.
mark
.
skipif
(
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
TRANSFORMERS_VERSION
<
"4.49.0"
,
reason
=
"HF model requires transformers>=4.49.0"
,
),
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
),
#### Extended model tests
#### Extended model tests
"aria"
:
VLMTestInfo
(
"aria"
:
VLMTestInfo
(
...
@@ -217,7 +214,6 @@ VLM_TEST_SETTINGS = {
...
@@ -217,7 +214,6 @@ VLM_TEST_SETTINGS = {
"cherry_blossom"
:
"<image>
\n
Please infer the season with reason in details."
,
# noqa: E501
"cherry_blossom"
:
"<image>
\n
Please infer the season with reason in details."
,
# noqa: E501
}),
}),
multi_image_prompt
=
"image_1:<image>
\n
image_2:<image>
\n
Which image can we see the car and the tower?"
,
# noqa: E501
multi_image_prompt
=
"image_1:<image>
\n
image_2:<image>
\n
Which image can we see the car and the tower?"
,
# noqa: E501
vllm_runner_kwargs
=
{
"hf_overrides"
:
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]}},
# noqa: E501
patch_hf_runner
=
model_utils
.
deepseekvl2_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
deepseekvl2_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"images"
),
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"images"
),
hf_output_post_proc
=
model_utils
.
deepseekvl2_trunc_hf_output
,
hf_output_post_proc
=
model_utils
.
deepseekvl2_trunc_hf_output
,
...
@@ -353,7 +349,6 @@ VLM_TEST_SETTINGS = {
...
@@ -353,7 +349,6 @@ VLM_TEST_SETTINGS = {
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
postprocess_inputs
=
model_utils
.
cast_dtype_post_processor
(
"pixel_values"
"pixel_values"
),
),
vllm_runner_kwargs
=
{
"hf_overrides"
:
{
"architectures"
:
[
"MantisForConditionalGeneration"
]}},
# noqa: E501
get_stop_token_ids
=
lambda
tok
:
[
128009
],
get_stop_token_ids
=
lambda
tok
:
[
128009
],
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
mantis_vllm_to_hf_output
,
vllm_output_post_proc
=
model_utils
.
mantis_vllm_to_hf_output
,
...
@@ -406,11 +401,10 @@ VLM_TEST_SETTINGS = {
...
@@ -406,11 +401,10 @@ VLM_TEST_SETTINGS = {
"molmo"
:
VLMTestInfo
(
"molmo"
:
VLMTestInfo
(
models
=
[
"allenai/Molmo-7B-D-0924"
],
models
=
[
"allenai/Molmo-7B-D-0924"
],
test_type
=
(
VLMTestType
.
IMAGE
),
test_type
=
(
VLMTestType
.
IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
"User: "
+
img_prompt
+
" Assistant:"
,
# noqa: E501
prompt_formatter
=
identity
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
image_size_factors
=
[(),(
1.0
,
1.0
,
1.0
)],
patch_hf_runner
=
model_utils
.
molmo_patch_hf_runner
,
patch_hf_runner
=
model_utils
.
mlomo_patch_hf_runner
,
postprocess_inputs
=
model_utils
.
molmo_post_processor
,
postprocess_inputs
=
model_utils
.
molmo_post_processor
,
),
),
# Tests for phi3v currently live in another file because of a bug in
# Tests for phi3v currently live in another file because of a bug in
...
@@ -440,7 +434,7 @@ VLM_TEST_SETTINGS = {
...
@@ -440,7 +434,7 @@ VLM_TEST_SETTINGS = {
auto_cls
=
AutoModelForVision2Seq
,
auto_cls
=
AutoModelForVision2Seq
,
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
marks
=
[
large_gpu_mark
(
min_gb
=
48
)],
),
),
"qwen"
:
VLMTestInfo
(
"qwen
_vl
"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL"
)],
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL"
)],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
identity
,
prompt_formatter
=
identity
,
...
...
tests/models/decoder_only/vision_language/vlm_utils/core.py
View file @
ec5e299c
...
@@ -4,12 +4,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
...
@@ -4,12 +4,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
import
torch
import
torch
from
PIL.Image
import
Image
from
PIL.Image
import
Image
from
transformers
import
AutoTokenizer
,
BatchEncoding
,
PreTrainedTokenizerBase
from
transformers
import
BatchEncoding
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
from
vllm.config
import
TaskOption
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
.....conftest
import
HfRunner
,
VllmRunner
from
.....conftest
import
HfRunner
,
VllmRunner
from
....registry
import
HF_EXAMPLE_MODELS
from
.types
import
RunnerOutput
from
.types
import
RunnerOutput
...
@@ -31,10 +33,8 @@ def run_test(
...
@@ -31,10 +33,8 @@ def run_test(
use_tokenizer_eos
:
bool
,
use_tokenizer_eos
:
bool
,
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
],
postprocess_inputs
:
Callable
[[
BatchEncoding
],
BatchEncoding
],
comparator
:
Callable
[...,
None
],
comparator
:
Callable
[...,
None
],
get_stop_token_ids
:
Optional
[
Callable
[[
PreTrainedTokenizerBase
],
get_stop_token_ids
:
Optional
[
Callable
[[
AnyTokenizer
],
list
[
int
]]],
List
[
int
]]],
stop_str
:
Optional
[
List
[
str
]],
stop_str
:
Optional
[
List
[
str
]],
tokenizer_mode
:
str
,
limit_mm_per_prompt
:
Dict
[
str
,
int
],
limit_mm_per_prompt
:
Dict
[
str
,
int
],
vllm_runner_kwargs
:
Optional
[
Dict
[
str
,
Any
]],
vllm_runner_kwargs
:
Optional
[
Dict
[
str
,
Any
]],
hf_model_kwargs
:
Optional
[
Dict
[
str
,
Any
]],
hf_model_kwargs
:
Optional
[
Dict
[
str
,
Any
]],
...
@@ -48,7 +48,10 @@ def run_test(
...
@@ -48,7 +48,10 @@ def run_test(
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs
=
vllm_embeddings
if
vllm_embeddings
is
not
None
else
inputs
vllm_inputs
=
vllm_embeddings
if
vllm_embeddings
is
not
None
else
inputs
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
,
trust_remote_code
=
True
)
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
vllm_outputs_per_mm
=
[]
vllm_outputs_per_mm
=
[]
hf_outputs_per_mm
=
[]
hf_outputs_per_mm
=
[]
...
@@ -57,17 +60,19 @@ def run_test(
...
@@ -57,17 +60,19 @@ def run_test(
# vLLM needs a fresh new process without cuda initialization.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# will hurt multiprocessing backend with fork method (the default method).
vllm_kwargs
:
Dict
[
str
,
Any
]
=
{}
if
get_stop_token_ids
is
not
None
:
vllm_kwargs
[
"stop_token_ids"
]
=
get_stop_token_ids
(
tokenizer
)
if
stop_str
:
vllm_kwargs
[
"stop"
]
=
stop_str
if
vllm_runner_kwargs
is
None
:
vllm_runner_kwargs_
:
Dict
[
str
,
Any
]
=
{}
vllm_runner_kwargs
=
{}
if
model_info
.
tokenizer
:
vllm_runner_kwargs_
[
"tokenizer"
]
=
model_info
.
tokenizer
if
model_info
.
tokenizer_mode
:
vllm_runner_kwargs_
[
"tokenizer_mode"
]
=
model_info
.
tokenizer_mode
if
model_info
.
hf_overrides
:
vllm_runner_kwargs_
[
"hf_overrides"
]
=
model_info
.
hf_overrides
if
vllm_runner_kwargs
:
vllm_runner_kwargs_
.
update
(
vllm_runner_kwargs
)
with
vllm_runner
(
model
,
with
vllm_runner
(
model
,
tokenizer_mode
=
tokenizer_mode
,
max_model_len
=
max_model_len
,
max_model_len
=
max_model_len
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
dtype
=
dtype
,
dtype
=
dtype
,
...
@@ -76,7 +81,15 @@ def run_test(
...
@@ -76,7 +81,15 @@ def run_test(
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
task
=
task
,
task
=
task
,
**
vllm_runner_kwargs
)
as
vllm_model
:
**
vllm_runner_kwargs_
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
vllm_kwargs
:
Dict
[
str
,
Any
]
=
{}
if
get_stop_token_ids
is
not
None
:
vllm_kwargs
[
"stop_token_ids"
]
=
get_stop_token_ids
(
tokenizer
)
if
stop_str
:
vllm_kwargs
[
"stop"
]
=
stop_str
for
prompts
,
media
in
vllm_inputs
:
for
prompts
,
media
in
vllm_inputs
:
vllm_kwargs
[
runner_mm_key
]
=
media
vllm_kwargs
[
runner_mm_key
]
=
media
vllm_output
=
vllm_model
.
generate_greedy_logprobs
(
vllm_output
=
vllm_model
.
generate_greedy_logprobs
(
...
@@ -93,16 +106,19 @@ def run_test(
...
@@ -93,16 +106,19 @@ def run_test(
if
patch_hf_runner
is
not
None
:
if
patch_hf_runner
is
not
None
:
hf_model
=
patch_hf_runner
(
hf_model
)
hf_model
=
patch_hf_runner
(
hf_model
)
# Some models need to explicitly pass the eos_token_id off the tokenizer or
# processor for a good comparison; currently assume processor/tokenizer
# agree on the EOS, and pull it off the tokenizer if requested.
hf_kwargs
=
{}
if
use_tokenizer_eos
:
hf_kwargs
[
"eos_token_id"
]
=
tokenizer
.
eos_token_id
if
stop_str
:
hf_kwargs
[
"stop_strings"
]
=
stop_str
with
hf_model
,
torch
.
no_grad
():
with
hf_model
,
torch
.
no_grad
():
tokenizer
=
hf_model
.
tokenizer
# Some models need to explicitly pass the eos_token_id off the tokenizer
# or processor for a good comparison;
# currently assume processor/tokenizer agree on the EOS, and pull it off
# the tokenizer if requested.
hf_kwargs
=
{}
if
use_tokenizer_eos
:
hf_kwargs
[
"eos_token_id"
]
=
tokenizer
.
eos_token_id
if
stop_str
:
hf_kwargs
[
"stop_strings"
]
=
stop_str
for
prompts
,
media
in
inputs
:
for
prompts
,
media
in
inputs
:
hf_kwargs
[
runner_mm_key
]
=
media
hf_kwargs
[
runner_mm_key
]
=
media
hf_output
=
hf_model
.
generate_greedy_logprobs_limit
(
hf_output
=
hf_model
.
generate_greedy_logprobs_limit
(
...
...
tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
View file @
ec5e299c
...
@@ -6,7 +6,7 @@ typically specific to a small subset of models.
...
@@ -6,7 +6,7 @@ typically specific to a small subset of models.
import
re
import
re
import
types
import
types
from
pathlib
import
PosixPath
from
pathlib
import
PosixPath
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Union
from
typing
import
Callable
,
List
,
Optional
,
Tuple
,
Union
import
torch
import
torch
from
PIL.Image
import
Image
from
PIL.Image
import
Image
...
@@ -17,9 +17,7 @@ from vllm.sequence import SampleLogprobs
...
@@ -17,9 +17,7 @@ from vllm.sequence import SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
vllm.transformers_utils.tokenizer
import
patch_padding_side
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
.....conftest
import
(
HfRunner
,
ImageAsset
,
PromptAudioInput
,
from
.....conftest
import
HfRunner
,
ImageAsset
,
_ImageAssets
PromptImageInput
,
PromptVideoInput
,
_ImageAssets
)
from
....utils
import
TokensTextLogprobs
from
.types
import
RunnerOutput
from
.types
import
RunnerOutput
...
@@ -522,74 +520,7 @@ def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -522,74 +520,7 @@ def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return
hf_model
return
hf_model
def
_generate_greedy_logprobs_limit
(
def
molmo_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
self
,
prompts
:
List
[
str
],
max_tokens
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
**
kwargs
:
Any
,
)
->
List
[
TokensTextLogprobs
]:
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
# Process in batches for inference.
if
len
(
all_inputs
):
input_ids_lst
=
[]
images_lst
=
[]
images_input_idx_lst
=
[]
imges_masks_lst
=
[]
for
inputs
in
all_inputs
:
input_ids_lst
.
append
(
inputs
[
"input_ids"
])
images_lst
.
append
(
inputs
[
"images"
])
images_input_idx_lst
.
append
(
inputs
[
"image_input_idx"
])
imges_masks_lst
.
append
(
inputs
[
"image_masks"
])
batch_inputs
=
{}
batch_inputs
[
'input_ids'
]
=
torch
.
cat
(
input_ids_lst
,
dim
=
0
)
batch_inputs
[
'images'
]
=
torch
.
cat
(
images_lst
,
dim
=
0
)
batch_inputs
[
'image_input_idx'
]
=
torch
.
cat
(
images_input_idx_lst
,
dim
=
0
)
batch_inputs
[
'image_masks'
]
=
torch
.
cat
(
imges_masks_lst
,
dim
=
0
)
outputs
=
self
.
model
.
generate_from_batch
(
batch
=
self
.
wrap_device
(
batch_inputs
,
device
=
self
.
model
.
device
.
type
),
generation_config
=
GenerationConfig
(
max_new_tokens
=
max_tokens
,
stop_strings
=
"<|endoftext|>"
,
do_sample
=
False
,
),
tokenizer
=
self
.
tokenizer
,
output_hidden_states
=
True
,
return_dict_in_generate
=
True
,
)
all_logprobs
:
List
[
List
[
Dict
[
int
,
float
]]]
=
[]
all_output_ids
:
List
[
List
[
int
]]
=
[]
all_output_strs
:
List
[
str
]
=
[]
for
index
in
range
(
len
(
all_inputs
)):
(
seq_logprobs_lst
,
output_len
,
)
=
self
.
_hidden_states_to_logprobs
(
outputs
.
hidden_states
,
num_logprobs
)
all_logprobs
.
append
(
seq_logprobs_lst
)
seq_ids
=
outputs
.
sequences
[
index
]
output_ids
=
seq_ids
[
-
output_len
:]
all_output_ids
.
append
(
output_ids
.
tolist
())
all_output_strs
.
append
(
self
.
tokenizer
.
decode
(
output_ids
))
outputs
=
zip
(
all_output_ids
,
all_output_strs
,
all_logprobs
)
return
[(
output_ids
,
output_str
,
output_logprobs
)
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
####### Molmo-specific HuggingFace runner patchers
def
mlomo_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for Molmo."""
"""Patches and returns an instance of the HfRunner to use for Molmo."""
hf_processor
=
hf_model
.
processor
hf_processor
=
hf_model
.
processor
...
@@ -598,10 +529,23 @@ def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -598,10 +529,23 @@ def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
hf_model
.
processor
=
_processor
hf_model
.
processor
=
_processor
setattr
(
# noqa: B010
def
_generate
(
self
,
max_new_tokens
=
None
,
do_sample
=
None
,
**
kwargs
):
hf_model
,
batch
=
{
"generate_greedy_logprobs_limit"
,
k
:
kwargs
.
pop
(
k
)
types
.
MethodType
(
_generate_greedy_logprobs_limit
,
hf_model
),
for
k
in
(
"input_ids"
,
"images"
,
"image_input_idx"
,
"image_masks"
)
)
if
k
in
kwargs
}
return
self
.
generate_from_batch
(
batch
,
generation_config
=
GenerationConfig
(
max_new_tokens
=
max_new_tokens
,
stop_strings
=
"<|endoftext|>"
,
do_sample
=
do_sample
,
),
**
kwargs
,
)
hf_model
.
model
.
generate
=
types
.
MethodType
(
_generate
,
hf_model
.
model
)
return
hf_model
return
hf_model
tests/models/decoder_only/vision_language/vlm_utils/types.py
View file @
ec5e299c
...
@@ -8,12 +8,12 @@ from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
...
@@ -8,12 +8,12 @@ from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
import
torch
import
torch
from
PIL.Image
import
Image
from
PIL.Image
import
Image
from
pytest
import
MarkDecorator
from
pytest
import
MarkDecorator
from
transformers
import
(
AutoModelForCausalLM
,
BatchEncoding
,
from
transformers
import
AutoModelForCausalLM
,
BatchEncoding
PreTrainedTokenizerBase
)
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
transformers.models.auto.auto_factory
import
_BaseAutoModelClass
from
vllm.config
import
TaskOption
from
vllm.config
import
TaskOption
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
identity
from
vllm.utils
import
identity
from
.....conftest
import
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
_ImageAssets
from
.....conftest
import
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
_ImageAssets
...
@@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple):
...
@@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple):
vllm_runner_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
vllm_runner_kwargs
:
Optional
[
Dict
[
str
,
Any
]]
=
None
# Optional callable which gets a list of token IDs from the model tokenizer
# Optional callable which gets a list of token IDs from the model tokenizer
get_stop_token_ids
:
Optional
[
Callable
[[
PreTrainedTokenizerBase
],
get_stop_token_ids
:
Optional
[
Callable
[[
AnyTokenizer
],
list
[
int
]]]
=
None
List
[
int
]]]
=
None
# Optional list of strings to stop generation, useful when stop tokens are
# Optional list of strings to stop generation, useful when stop tokens are
# not special tokens in the tokenizer
# not special tokens in the tokenizer
stop_str
:
Optional
[
List
[
str
]]
=
None
stop_str
:
Optional
[
List
[
str
]]
=
None
...
@@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple):
...
@@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple):
marks
:
Optional
[
List
[
MarkDecorator
]]
=
None
marks
:
Optional
[
List
[
MarkDecorator
]]
=
None
tokenizer_mode
:
str
=
"auto"
def
get_non_parametrized_runner_kwargs
(
self
):
def
get_non_parametrized_runner_kwargs
(
self
):
"""Returns a dictionary of expandable kwargs for items that are used
"""Returns a dictionary of expandable kwargs for items that are used
in all test types, which are NOT used when creating the parametrized
in all test types, which are NOT used when creating the parametrized
...
@@ -180,7 +177,6 @@ class VLMTestInfo(NamedTuple):
...
@@ -180,7 +177,6 @@ class VLMTestInfo(NamedTuple):
"hf_model_kwargs"
:
self
.
hf_model_kwargs
,
"hf_model_kwargs"
:
self
.
hf_model_kwargs
,
"stop_str"
:
self
.
stop_str
,
"stop_str"
:
self
.
stop_str
,
"patch_hf_runner"
:
self
.
patch_hf_runner
,
"patch_hf_runner"
:
self
.
patch_hf_runner
,
"tokenizer_mode"
:
self
.
tokenizer_mode
}
}
...
...
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
ec5e299c
...
@@ -8,11 +8,11 @@ import torch
...
@@ -8,11 +8,11 @@ import torch
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
from
transformers
import
(
AutoConfig
,
AutoModelForVision2Seq
,
AutoTokenizer
,
BatchEncoding
)
BatchEncoding
)
from
vllm
import
LLM
,
SamplingParams
from
vllm.attention.backends.flash_attn
import
FlashAttentionMetadata
from
vllm.attention.backends.flash_attn
import
FlashAttentionMetadata
from
vllm.attention.selector
import
(
_Backend
,
_cached_get_attn_backend
,
from
vllm.attention.selector
import
(
_Backend
,
_cached_get_attn_backend
,
global_force_attn_backend_context_manager
)
global_force_attn_backend_context_manager
)
from
vllm.model_executor.models.mllama
import
(
MLLAMA_IMAGE_TOKEN_ID
,
from
vllm.model_executor.models.mllama
import
MllamaForConditionalGeneration
MllamaForConditionalGeneration
)
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
...
@@ -23,6 +23,7 @@ from ...utils import check_logprobs_close
...
@@ -23,6 +23,7 @@ from ...utils import check_logprobs_close
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
_LIMIT_IMAGE_PER_PROMPT
=
3
_LIMIT_IMAGE_PER_PROMPT
=
3
MLLAMA_IMAGE_TOKEN_ID
=
128256
LIST_ENC_DEC_SUPPORTED_BACKENDS
=
[
_Backend
.
XFORMERS
,
_Backend
.
FLASH_ATTN
]
LIST_ENC_DEC_SUPPORTED_BACKENDS
=
[
_Backend
.
XFORMERS
,
_Backend
.
FLASH_ATTN
]
...
@@ -398,6 +399,64 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
...
@@ -398,6 +399,64 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
)
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
def
test_explicit_implicit_prompt
(
image_assets
:
_ImageAssets
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
):
stop_sign
=
image_assets
[
0
].
pil_image
# yapf: disable
prompts
=
[
# explicit prompt
{
"encoder_prompt"
:
{
"prompt"
:
"<|image|>"
,
"multi_modal_data"
:
{
"image"
:
stop_sign
},
},
"decoder_prompt"
:
{
"prompt_token_ids"
:
[
128000
,
791
,
2262
,
315
,
279
,
2217
,
220
,
128256
,
374
],
# noqa: E501
}
},
{
"encoder_prompt"
:
"Not <|image|>"
,
"decoder_prompt"
:
"The color of the sky is blue but sometimes it can also be"
,
# noqa: E501
},
# implicit prompt
{
"prompt"
:
"<|begin_of_text|>The content of the image <|image|> is"
,
# noqa: E501
"multi_modal_data"
:
{
"image"
:
stop_sign
},
},
{
"prompt"
:
"The color of the sky is blue but sometimes it can also be"
,
# noqa: E501
},
]
# yapf: enable
llm
=
LLM
(
model
=
model
,
dtype
=
dtype
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
tensor_parallel_size
=
1
,
enforce_eager
=
True
,
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
max_tokens
,
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
n_prompts
=
len
(
prompts
)
explicit_outputs
=
outputs
[:
n_prompts
//
2
]
implicit_outputs
=
outputs
[
n_prompts
//
2
:]
for
exp_output
,
imp_output
in
zip
(
explicit_outputs
,
implicit_outputs
):
assert
exp_output
.
outputs
[
0
].
text
==
imp_output
.
outputs
[
0
].
text
@
large_gpu_test
(
min_gb
=
48
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
...
@@ -460,6 +519,10 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
...
@@ -460,6 +519,10 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
images
=
images
)
images
=
images
)
class
DummyModel
:
image_token_id
=
MLLAMA_IMAGE_TOKEN_ID
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"input_indices_and_output"
,
"input_indices_and_output"
,
...
@@ -501,7 +564,7 @@ def test_get_cross_attention_mask(input_indices_and_output) -> None:
...
@@ -501,7 +564,7 @@ def test_get_cross_attention_mask(input_indices_and_output) -> None:
use_cuda_graph
=
False
,
use_cuda_graph
=
False
,
)
)
dummy
:
dict
[
str
,
str
]
=
{}
dummy
=
DummyModel
()
cross_attention_mask
,
kv_range_for_decode
=
MllamaForConditionalGeneration
\
cross_attention_mask
,
kv_range_for_decode
=
MllamaForConditionalGeneration
\
.
get_cross_attention_mask
(
dummy
,
.
get_cross_attention_mask
(
dummy
,
...
@@ -558,7 +621,7 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None:
...
@@ -558,7 +621,7 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None:
use_cuda_graph
=
False
,
use_cuda_graph
=
False
,
)
)
dummy
:
dict
[
str
,
str
]
=
{}
dummy
=
DummyModel
()
full_text_row_masked_out_mask
=
MllamaForConditionalGeneration
\
full_text_row_masked_out_mask
=
MllamaForConditionalGeneration
\
.
get_full_text_row_masked_out_mask
(
dummy
,
.
get_full_text_row_masked_out_mask
(
dummy
,
...
...
tests/models/multimodal/processing/test_common.py
View file @
ec5e299c
...
@@ -10,7 +10,7 @@ from vllm.config import ModelConfig
...
@@ -10,7 +10,7 @@ from vllm.config import ModelConfig
from
vllm.inputs
import
InputProcessingContext
from
vllm.inputs
import
InputProcessingContext
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.processing
import
ProcessingCache
from
vllm.multimodal.processing
import
ProcessingCache
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
....multimodal.utils
import
random_audio
,
random_image
,
random_video
from
....multimodal.utils
import
random_audio
,
random_image
,
random_video
from
...registry
import
HF_EXAMPLE_MODELS
from
...registry
import
HF_EXAMPLE_MODELS
...
@@ -42,10 +42,7 @@ def _test_processing_correctness(
...
@@ -42,10 +42,7 @@ def _test_processing_correctness(
factories
=
MULTIMODAL_REGISTRY
.
_processor_factories
[
model_cls
]
factories
=
MULTIMODAL_REGISTRY
.
_processor_factories
[
model_cls
]
ctx
=
InputProcessingContext
(
ctx
=
InputProcessingContext
(
model_config
,
model_config
,
tokenizer
=
cached_get_tokenizer
(
tokenizer
=
cached_tokenizer_from_config
(
model_config
),
model_config
.
tokenizer
,
trust_remote_code
=
model_info
.
trust_remote_code
,
),
)
)
# Ensure that it can fit all of the data
# Ensure that it can fit all of the data
cache
=
ProcessingCache
(
capacity
=
1
<<
30
)
cache
=
ProcessingCache
(
capacity
=
1
<<
30
)
...
@@ -85,11 +82,19 @@ def _test_processing_correctness(
...
@@ -85,11 +82,19 @@ def _test_processing_correctness(
partial
(
random_audio
,
rng
,
min_len
=
512
,
max_len
=
1024
,
sr
=
16000
),
partial
(
random_audio
,
rng
,
min_len
=
512
,
max_len
=
1024
,
sr
=
16000
),
}
}
tokenizer_encode_kwargs
=
{}
if
model_config
.
hf_config
.
model_type
==
"mllama"
:
# For Mllama, tokenizer will always add bos_token at the beginning of
# prompt by default, causing hf_processor outputs incorrect token ids.
# So we need use `add_special_tokens=False` here to leave bos_token
# to be added by the processor.
tokenizer_encode_kwargs
=
{
"add_special_tokens"
:
False
}
for
batch_idx
in
range
(
num_batches
):
for
batch_idx
in
range
(
num_batches
):
mm_data
=
{
mm_data
=
{
k
:
k
:
[(
input_to_hit
[
k
]
if
rng
.
rand
()
<
hit_rate
else
input_factory
[
k
]())
[(
input_to_hit
[
k
]
if
rng
.
rand
()
<
hit_rate
else
input_factory
[
k
]())
for
_
in
range
(
rng
.
randint
(
limit
))]
for
_
in
range
(
rng
.
randint
(
limit
+
1
))]
for
k
,
limit
in
limit_mm_per_prompt
.
items
()
for
k
,
limit
in
limit_mm_per_prompt
.
items
()
}
}
...
@@ -122,7 +127,7 @@ def _test_processing_correctness(
...
@@ -122,7 +127,7 @@ def _test_processing_correctness(
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
)
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
)
baseline_tokenized_result
=
baseline_processor
.
apply
(
baseline_tokenized_result
=
baseline_processor
.
apply
(
tokenizer
.
encode
(
prompt
),
tokenizer
.
encode
(
prompt
,
**
tokenizer_encode_kwargs
),
mm_data
=
mm_data
,
mm_data
=
mm_data
,
hf_processor_mm_kwargs
=
{},
hf_processor_mm_kwargs
=
{},
)
)
...
@@ -131,7 +136,7 @@ def _test_processing_correctness(
...
@@ -131,7 +136,7 @@ def _test_processing_correctness(
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
)
f
"Failed (
{
batch_idx
=
}
,
{
prompt
=
}
,
{
mm_data
=
}
)"
)
cached_tokenized_result
=
cached_processor
.
apply
(
cached_tokenized_result
=
cached_processor
.
apply
(
tokenizer
.
encode
(
prompt
),
tokenizer
.
encode
(
prompt
,
**
tokenizer_encode_kwargs
),
mm_data
=
mm_data
,
mm_data
=
mm_data
,
hf_processor_mm_kwargs
=
{},
hf_processor_mm_kwargs
=
{},
)
)
...
@@ -147,6 +152,7 @@ def _test_processing_correctness(
...
@@ -147,6 +152,7 @@ def _test_processing_correctness(
"facebook/chameleon-7b"
,
"facebook/chameleon-7b"
,
"deepseek-ai/deepseek-vl2-tiny"
,
"deepseek-ai/deepseek-vl2-tiny"
,
"adept/fuyu-8b"
,
"adept/fuyu-8b"
,
"THUDM/glm-4v-9b"
,
"h2oai/h2ovl-mississippi-800m"
,
"h2oai/h2ovl-mississippi-800m"
,
"OpenGVLab/InternVL2-1B"
,
"OpenGVLab/InternVL2-1B"
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
"HuggingFaceM4/Idefics3-8B-Llama3"
,
...
@@ -154,16 +160,19 @@ def _test_processing_correctness(
...
@@ -154,16 +160,19 @@ def _test_processing_correctness(
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
"llava-hf/LLaVA-NeXT-Video-7B-hf"
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
,
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
,
"TIGER-Lab/Mantis-8B-siglip-llama3"
,
"TIGER-Lab/Mantis-8B-siglip-llama3"
,
"mistral-community/pixtral-12b"
,
"mistral-community/pixtral-12b"
,
"openbmb/MiniCPM-o-2_6"
,
"openbmb/MiniCPM-o-2_6"
,
"openbmb/MiniCPM-V-2_6"
,
"openbmb/MiniCPM-V-2_6"
,
"allenai/Molmo-7B-D-0924"
,
"allenai/Molmo-7B-O-0924"
,
"nvidia/NVLM-D-72B"
,
"nvidia/NVLM-D-72B"
,
"Qwen/Qwen-VL-Chat"
,
"Qwen/Qwen-VL-Chat"
,
"Qwen/Qwen2-VL-2B-Instruct"
,
"Qwen/Qwen2-VL-2B-Instruct"
,
"Qwen/Qwen2.5-VL-3B-Instruct"
,
"Qwen/Qwen2.5-VL-3B-Instruct"
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
"Qwen/Qwen2-Audio-7B-Instruct"
,
"fixie-ai/ultravox-v0_
3
"
,
"fixie-ai/ultravox-v0_
5-llama-3_2-1b
"
,
])
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"hit_rate"
,
[
0.3
,
0.5
,
1.0
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_batches"
,
[
32
])
...
...
tests/models/multimodal/processing/test_h2ovl.py
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
"""Tests for H2OVL's multimodal preprocessing kwargs."""
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from
typing
import
Optional
from
typing
import
Mapping
,
Optional
import
pytest
import
pytest
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
....conftest
import
_ImageAssets
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
def
_get_expected_num_patches
(
config
:
PretrainedConfig
,
image
:
Image
.
Image
,
num_imgs
:
int
,
min_num
:
int
,
max_num
:
int
,
):
from
vllm.model_executor.models.h2ovl
import
(
calculate_h2ovl_targets
,
get_h2ovl_target_ratios
)
width
,
height
=
image
.
size
# Calculate the expected number of blocks
if
num_imgs
==
1
and
config
.
use_msac
:
# First pass
blocks1
,
_
,
_
,
aspect_ratio
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
=
1
,
max_num
=
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
# Thumbnail is handled separately
)
# Second pass
blocks2
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
=
3
,
max_num
=
max_num
,
prior_aspect_ratio
=
aspect_ratio
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if
config
.
use_thumbnail
:
blocks1
+=
1
if
blocks1
>
1
else
0
blocks2
+=
1
if
blocks2
>
1
else
0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks
=
blocks1
+
blocks2
-
1
return
total_blocks
blocks
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
expected_num_patches
=
blocks
if
config
.
use_thumbnail
and
expected_num_patches
>
1
:
expected_num_patches
+=
1
return
expected_num_patches
def
_run_check
(
processor
:
BaseMultiModalProcessor
,
images
:
list
[
Image
.
Image
],
min_num
:
int
,
max_num
:
int
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
):
tokenizer
=
processor
.
info
.
get_tokenizer
()
config
=
processor
.
info
.
get_hf_config
()
mm_data
=
{
"image"
:
images
}
total_expected_num_patches
=
sum
(
_get_expected_num_patches
(
config
,
image
,
len
(
images
),
min_num
,
max_num
)
for
image
in
images
)
processed_inputs
=
processor
.
apply
(
"<image>"
*
len
(
images
),
mm_data
,
mm_processor_kwargs
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"h2oai/h2ovl-mississippi-800m"
,
"h2oai/h2ovl-mississippi-800m"
,
"h2oai/h2ovl-mississippi-2b"
,
"h2oai/h2ovl-mississippi-2b"
,
...
@@ -25,118 +126,54 @@ from ...utils import build_model_context
...
@@ -25,118 +126,54 @@ from ...utils import build_model_context
[
1.0
,
1.0
,
1.0
],
[
1.0
,
1.0
,
1.0
],
# Multi-scale
# Multi-scale
[
0.25
,
0.5
,
1.0
],
[
0.25
,
0.5
,
1.0
],
[
4.0
,
2.0
,
1.0
],
],
],
)
)
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
(
"min_dynamic_patch"
,
"max_dynamic_patch"
),
[(
1
,
1
),
(
1
,
2
),
(
1
,
4
),
(
1
,
8
),
(
2
,
4
),
(
4
,
8
)],
)
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"
num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"
kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
def
test_processor_override
(
model_id
:
str
,
model_id
:
str
,
image_assets
:
_ImageAssets
,
image_assets
:
_ImageAssets
,
size_factors
:
list
[
int
],
size_factors
:
list
[
int
],
min_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
Optional
[
bool
],
dynamic_image_size
:
Optional
[
bool
],
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
):
from
vllm.model_executor.models.h2ovl
import
(
calculate_h2ovl_targets
,
mm_processor_kwargs
=
{
get_h2ovl_target_ratios
)
"min_dynamic_patch"
:
min_dynamic_patch
,
"max_dynamic_patch"
:
max_dynamic_patch
,
"dynamic_image_size"
:
dynamic_image_size
,
}
ctx
=
build_model_context
(
ctx
=
build_model_context
(
model_name
=
model_id
,
model_name
=
model_id
,
tokenizer_name
=
model_id
,
tokenizer_name
=
model_id
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
limit_mm_per_prompt
=
{
"image"
:
len
(
size_factors
)},
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
)
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
)
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
config
=
processor
.
info
.
get_hf_config
()
min_num
=
min_dynamic_patch
if
dynamic_image_size
else
1
use_msac
=
config
.
use_msac
mm_processor_kwargs
=
{
"max_dynamic_patch"
:
max_dynamic_patch
,
}
if
dynamic_image_size
is
not
None
:
mm_processor_kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
min_num
=
config
.
min_dynamic_patch
max_num
=
max_dynamic_patch
if
dynamic_image_size
else
1
max_num
=
max_dynamic_patch
if
dynamic_image_size
else
1
# Build the image str / prompt based on the number of images we pass
_run_check
(
prompt
=
"<image>"
*
num_imgs
processor
,
[
for
asset
in
image_assets
:
rescale_image_size
(
image_assets
[
0
].
pil_image
,
f
)
for
factor
in
size_factors
:
for
f
in
size_factors
image
=
rescale_image_size
(
asset
.
pil_image
,
factor
)
],
mm_data
=
{
"image"
:
[
image
]
*
num_imgs
}
min_num
,
max_num
,
width
,
height
=
image
.
size
hf_processor_mm_kwargs
,
)
# Calculate the expected number of blocks
if
num_imgs
==
1
and
use_msac
:
# First pass
blocks1
,
_
,
_
,
aspect_ratio
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
# Thumbnail is handled separately
)
# Second pass
blocks2
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
aspect_ratio
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if
config
.
use_thumbnail
:
blocks1
+=
1
if
blocks1
>
1
else
0
blocks2
+=
1
if
blocks2
>
1
else
0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks
=
blocks1
+
blocks2
-
1
expected_num_patches
=
total_blocks
else
:
blocks
,
_
,
_
,
_
=
calculate_h2ovl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_h2ovl_target_ratios
(
min_num
,
max_num
,
prior_aspect_ratio
=
None
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
expected_num_patches
=
blocks
if
config
.
use_thumbnail
and
expected_num_patches
!=
1
:
expected_num_patches
+=
1
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
pixel_shape
=
(
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
)
assert
pixel_shape
[
0
]
==
expected_num_patches
*
num_imgs
tests/models/multimodal/processing/test_idefics3.py
View file @
ec5e299c
...
@@ -5,7 +5,7 @@ import pytest
...
@@ -5,7 +5,7 @@ import pytest
from
transformers
import
Idefics3Config
from
transformers
import
Idefics3Config
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
....conftest
import
_ImageAssets
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -24,9 +24,15 @@ models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")]
...
@@ -24,9 +24,15 @@ models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")]
])
])
# yapf: enable
# yapf: enable
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_override
(
image_assets
:
_ImageAssets
,
model
:
str
,
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
mm_processor_kwargs
:
dict
[
str
,
object
],
def
test_processor_override
(
expected_toks_per_img
:
int
,
num_imgs
:
int
):
image_assets
:
_ImageAssets
,
model
:
str
,
mm_processor_kwargs
:
dict
[
str
,
object
],
expected_toks_per_img
:
int
,
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# in this test and assume that the kwargs will be correctly expanded by
...
@@ -35,15 +41,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
...
@@ -35,15 +41,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
model_name
=
model
,
model_name
=
model
,
tokenizer_name
=
model
,
tokenizer_name
=
model
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
)
tokenizer
=
cached_
get_
tokenizer
(
ctx
.
model_config
.
tokenizer
)
tokenizer
=
cached_tokenizer
_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
)
)
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
mm_processor_kwargs
)
hf_processor
_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
# Build the image str / prompt based on the number of images we pass
placeholders
=
"<image>"
if
num_imgs
==
1
else
"
\n
"
.
join
(
placeholders
=
"<image>"
if
num_imgs
==
1
else
"
\n
"
.
join
(
...
@@ -56,8 +62,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
...
@@ -56,8 +62,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
dummy_image
=
image_assets
[
0
].
pil_image
.
resize
(
dummy_image_size
)
dummy_image
=
image_assets
[
0
].
pil_image
.
resize
(
dummy_image_size
)
mm_data
=
{
"image"
:
[
dummy_image
]
*
num_imgs
}
mm_data
=
{
"image"
:
[
dummy_image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
# Ensure the placeholders format are correct
# Ensure the placeholders format are correct
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
hf_processed_inputs
=
hf_processor
(
text
=
prompt
,
images
=
mm_data
[
"image"
])
hf_processed_inputs
=
hf_processor
(
text
=
prompt
,
images
=
mm_data
[
"image"
])
assert
processed_inputs
[
"prompt_token_ids"
]
==
hf_processed_inputs
[
assert
processed_inputs
[
"prompt_token_ids"
]
==
hf_processed_inputs
[
"input_ids"
][
0
]
"input_ids"
][
0
]
...
...
tests/models/multimodal/processing/test_internvl.py
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
"""Tests for InternVL's multimodal preprocessing kwargs."""
"""Tests for InternVL's multimodal preprocessing kwargs."""
from
typing
import
Optional
from
typing
import
Mapping
,
Optional
import
os
import
os
import
pytest
import
pytest
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
from
....conftest
import
_ImageAssets
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
from
....utils
import
models_path_prefix
from
....utils
import
models_path_prefix
def
_get_expected_num_patches
(
config
:
PretrainedConfig
,
image
:
Image
.
Image
,
num_imgs
:
int
,
min_num
:
int
,
max_num
:
int
,
):
from
vllm.model_executor.models.internvl
import
(
calculate_internvl_targets
,
get_internvl_target_ratios
)
width
,
height
=
image
.
size
blocks
,
_
,
_
=
calculate_internvl_targets
(
orig_width
=
width
,
orig_height
=
height
,
target_ratios
=
get_internvl_target_ratios
(
min_num
,
max_num
,
),
image_size
=
config
.
vision_config
.
image_size
,
use_thumbnail
=
False
,
)
expected_num_patches
=
blocks
if
config
.
use_thumbnail
and
expected_num_patches
>
1
:
expected_num_patches
+=
1
return
expected_num_patches
def
_run_check
(
processor
:
BaseMultiModalProcessor
,
images
:
list
[
Image
.
Image
],
min_num
:
int
,
max_num
:
int
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
):
tokenizer
=
processor
.
info
.
get_tokenizer
()
config
=
processor
.
info
.
get_hf_config
()
mm_data
=
{
"image"
:
images
}
total_expected_num_patches
=
sum
(
_get_expected_num_patches
(
config
,
image
,
len
(
images
),
min_num
,
max_num
)
for
image
in
images
)
processed_inputs
=
processor
.
apply
(
"<image>"
*
len
(
images
),
mm_data
,
mm_processor_kwargs
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
total_expected_num_patches
assert
pixel_shape
[
0
]
==
total_expected_num_patches
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-2B"
)])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-2B"
)])
@
pytest
.
mark
.
parametrize
(
"max_dynamic_patch"
,
[
1
,
4
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
,
None
])
"size_factors"
,
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
[
# Single-scale
[
1.0
],
# Single-scale, batched
[
1.0
,
1.0
,
1.0
],
# Multi-scale
[
0.25
,
0.5
,
1.0
],
[
4.0
,
2.0
,
1.0
],
],
)
@
pytest
.
mark
.
parametrize
(
(
"min_dynamic_patch"
,
"max_dynamic_patch"
),
[(
1
,
1
),
(
1
,
2
),
(
1
,
4
),
(
1
,
8
),
(
2
,
4
),
(
4
,
8
)],
)
@
pytest
.
mark
.
parametrize
(
"dynamic_image_size"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
def
test_processor_override
(
model_id
:
str
,
model_id
:
str
,
image_assets
:
_ImageAssets
,
image_assets
:
_ImageAssets
,
size_factors
:
list
[
int
],
min_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
max_dynamic_patch
:
int
,
dynamic_image_size
:
Optional
[
bool
],
dynamic_image_size
:
Optional
[
bool
],
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
):
mm_processor_kwargs
=
{
"min_dynamic_patch"
:
min_dynamic_patch
,
"max_dynamic_patch"
:
max_dynamic_patch
,
"dynamic_image_size"
:
dynamic_image_size
,
}
ctx
=
build_model_context
(
ctx
=
build_model_context
(
model_name
=
model_id
,
model_name
=
model_id
,
tokenizer_name
=
model_id
,
tokenizer_name
=
model_id
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
None
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
limit_mm_per_prompt
=
{
"image"
:
len
(
size_factors
)},
)
tokenizer
=
cached_get_tokenizer
(
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
)
)
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
)
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
mm_processor_kwargs
=
{
min_num
=
min_dynamic_patch
if
dynamic_image_size
else
1
"max_dynamic_patch"
:
max_dynamic_patch
,
max_num
=
max_dynamic_patch
if
dynamic_image_size
else
1
}
if
dynamic_image_size
is
not
None
:
mm_processor_kwargs
[
"dynamic_image_size"
]
=
dynamic_image_size
# Build the image str / prompt based on the number of images we pass
_run_check
(
prompt
=
"<image>"
*
num_imgs
processor
,
image
=
image_assets
[
0
].
pil_image
.
resize
((
448
*
2
,
448
*
2
))
[
mm_data
=
{
"image"
:
[
image
]
*
num_imgs
}
rescale_image_size
(
image_assets
[
0
].
pil_image
,
f
)
for
f
in
size_factors
expected_num_patches
=
max_dynamic_patch
+
1
if
max_dynamic_patch
>
1
else
1
],
if
dynamic_image_size
is
False
:
min_num
,
expected_num_patches
=
1
max_num
,
hf_processor_mm_kwargs
,
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
)
# Ensure we have the right number of placeholders per num_crops size
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
"<IMG_CONTEXT>"
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values_flat"
].
shape
assert
img_tok_count
==
256
*
expected_num_patches
*
num_imgs
assert
pixel_shape
[
0
]
==
expected_num_patches
*
num_imgs
tests/models/multimodal/processing/test_llava_next.py
View file @
ec5e299c
...
@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
...
@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id):
...
@@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
info
=
processor
.
info
info
=
processor
.
info
...
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
...
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
...
@@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
...
@@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
seen_aspect_ratios
=
set
[
float
]()
seen_aspect_ratios
=
set
[
float
]()
...
...
tests/models/multimodal/processing/test_llava_onevision.py
View file @
ec5e299c
...
@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
...
@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.parse
import
ImageSize
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id):
...
@@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
info
=
processor
.
info
info
=
processor
.
info
...
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
...
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
image_ratios
=
[(
171
,
152
),
(
184
,
161
),
(
198
,
176
),
(
333
,
296
),
(
369
,
328
),
...
@@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
...
@@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
cached_get_tokenizer
(
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
),
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
),
)
)
seen_aspect_ratios
=
set
[
float
]()
seen_aspect_ratios
=
set
[
float
]()
...
...
tests/models/multimodal/processing/test_phi3v.py
View file @
ec5e299c
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
import
pytest
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
....conftest
import
_ImageAssets
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -21,12 +21,14 @@ from ...utils import build_model_context
...
@@ -21,12 +21,14 @@ from ...utils import build_model_context
])
])
# yapf: enable
# yapf: enable
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
def
test_processor_override
(
image_assets
:
_ImageAssets
,
image_assets
:
_ImageAssets
,
model_id
:
str
,
model_id
:
str
,
mm_processor_kwargs
:
dict
[
str
,
int
],
mm_processor_kwargs
:
dict
[
str
,
int
],
expected_toks_per_img
:
int
,
expected_toks_per_img
:
int
,
num_imgs
:
int
,
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
):
"""Ensure input_processor_for_phi3v handles num_crops properly."""
"""Ensure input_processor_for_phi3v handles num_crops properly."""
# Avoid initializing CUDA early
# Avoid initializing CUDA early
...
@@ -36,23 +38,22 @@ def test_processor_override(
...
@@ -36,23 +38,22 @@ def test_processor_override(
model_name
=
model_id
,
model_name
=
model_id
,
tokenizer_name
=
model_id
,
tokenizer_name
=
model_id
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
)
tokenizer
=
cached_get_tokenizer
(
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
)
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
# Build the image str / prompt based on the number of images we pass
img_str
=
""
.
join
([
f
"<|image_
{
idx
}
|>
\n
"
for
idx
in
range
(
1
,
num_imgs
+
1
)])
img_str
=
""
.
join
([
f
"<|image_
{
idx
}
|>
\n
"
for
idx
in
range
(
1
,
num_imgs
+
1
)])
prompt
=
f
"<|user|>
\n
{
img_str
}
<|end|>
\n
<|assistant|>
\n
"
prompt
=
f
"<|user|>
\n
{
img_str
}
<|end|>
\n
<|assistant|>
\n
"
mm_data
=
{
"image"
:
[
image_assets
[
0
].
pil_image
]
*
num_imgs
}
mm_data
=
{
"image"
:
[
image_assets
[
0
].
pil_image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm
_processor_kwargs
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf
_processor_
mm_
kwargs
)
# Ensure we have the right number of placeholders per num_crops size
# Ensure we have the right number of placeholders per num_crops size
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
_IMAGE_TOKEN_ID
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
_IMAGE_TOKEN_ID
)
...
...
Prev
1
…
7
8
9
10
11
12
13
14
15
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment