Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
469e903b
Commit
469e903b
authored
Mar 28, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.2' into v0.8.2-dev
parents
389ebcf7
25f560a6
Changes
535
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1085 additions
and
543 deletions
+1085
-543
tests/models/multimodal/processing/test_llava_next.py
tests/models/multimodal/processing/test_llava_next.py
+3
-6
tests/models/multimodal/processing/test_llava_onevision.py
tests/models/multimodal/processing/test_llava_onevision.py
+3
-6
tests/models/multimodal/processing/test_phi3v.py
tests/models/multimodal/processing/test_phi3v.py
+1
-3
tests/models/multimodal/processing/test_qwen2_vl.py
tests/models/multimodal/processing/test_qwen2_vl.py
+1
-2
tests/models/registry.py
tests/models/registry.py
+27
-10
tests/models/test_initialization.py
tests/models/test_initialization.py
+12
-8
tests/models/test_oot_registration.py
tests/models/test_oot_registration.py
+88
-72
tests/models/test_registry.py
tests/models/test_registry.py
+3
-3
tests/models/test_transformers.py
tests/models/test_transformers.py
+7
-8
tests/models/utils.py
tests/models/utils.py
+24
-25
tests/mq_llm_engine/conftest.py
tests/mq_llm_engine/conftest.py
+11
-0
tests/mq_llm_engine/test_abort.py
tests/mq_llm_engine/test_abort.py
+1
-1
tests/mq_llm_engine/test_error_handling.py
tests/mq_llm_engine/test_error_handling.py
+96
-17
tests/mq_llm_engine/test_load.py
tests/mq_llm_engine/test_load.py
+1
-3
tests/mq_llm_engine/utils.py
tests/mq_llm_engine/utils.py
+2
-2
tests/multi_step/test_correctness_async_llm.py
tests/multi_step/test_correctness_async_llm.py
+121
-117
tests/multi_step/test_correctness_llm.py
tests/multi_step/test_correctness_llm.py
+164
-161
tests/multimodal/test_processing.py
tests/multimodal/test_processing.py
+474
-95
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+4
-4
tests/neuron/1_core/test_activation.py
tests/neuron/1_core/test_activation.py
+42
-0
No files found.
Too many changes to show.
To preserve performance only
535 of 535+
files are displayed.
Plain diff
Email patch
tests/models/multimodal/processing/test_llava_next.py
View file @
469e903b
...
...
@@ -36,8 +36,7 @@ def _validate_image_max_tokens_one(
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
def
test_processor_max_tokens
(
model_id
):
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
model_id
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
1
},
)
...
...
@@ -136,8 +135,7 @@ def _test_image_prompt_replacements(
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_prompt_replacements_regression
(
model_id
,
num_imgs
):
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
model_id
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
...
...
@@ -166,8 +164,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
])
def
test_processor_prompt_replacements_all
(
model_id
,
num_imgs
):
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
model_id
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
...
...
tests/models/multimodal/processing/test_llava_onevision.py
View file @
469e903b
...
...
@@ -37,8 +37,7 @@ def _validate_image_max_tokens_one(
[
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
])
def
test_processor_max_tokens
(
model_id
):
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
model_id
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
1
},
)
...
...
@@ -136,8 +135,7 @@ def _test_image_prompt_replacements(
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
def
test_processor_prompt_replacements_regression
(
model_id
,
num_imgs
):
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
model_id
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
...
...
@@ -167,8 +165,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
])
def
test_processor_prompt_replacements_all
(
model_id
,
num_imgs
):
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
model_id
,
mm_processor_kwargs
=
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
...
...
tests/models/multimodal/processing/test_phi3v.py
View file @
469e903b
...
...
@@ -35,9 +35,7 @@ def test_processor_override(
from
vllm.model_executor.models.phi3v
import
_IMAGE_TOKEN_ID
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
trust_remote_code
=
True
,
model_id
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
...
...
tests/models/multimodal/processing/test_qwen2_vl.py
View file @
469e903b
...
...
@@ -30,8 +30,7 @@ def test_processor_override(
):
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
ctx
=
build_model_context
(
model_name
=
model_id
,
tokenizer_name
=
model_id
,
model_id
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
...
...
tests/models/registry.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
from
collections.abc
import
Mapping
,
Set
from
dataclasses
import
dataclass
,
field
from
typing
import
AbstractSet
,
Any
,
Literal
,
Mapping
,
Optional
from
typing
import
Any
,
Literal
,
Optional
import
pytest
from
packaging.version
import
Version
...
...
@@ -123,6 +124,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"FalconForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-7b"
),
"GemmaForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-2b"
),
"Gemma2ForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-2-9b"
),
"Gemma3ForCausalLM"
:
_HfExamplesInfo
(
"google/gemma-3-1b-it"
,
min_transformers_version
=
"4.50"
),
"GlmForCausalLM"
:
_HfExamplesInfo
(
"THUDM/glm-4-9b-chat-hf"
),
"GPT2LMHeadModel"
:
_HfExamplesInfo
(
"gpt2"
),
"GPTBigCodeForCausalLM"
:
_HfExamplesInfo
(
"bigcode/starcoder"
),
...
...
@@ -130,6 +133,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"GPTNeoXForCausalLM"
:
_HfExamplesInfo
(
"EleutherAI/pythia-160m"
),
"GraniteForCausalLM"
:
_HfExamplesInfo
(
"ibm/PowerLM-3b"
),
"GraniteMoeForCausalLM"
:
_HfExamplesInfo
(
"ibm/PowerMoE-3b"
),
"GraniteMoeSharedForCausalLM"
:
_HfExamplesInfo
(
"ibm-research/moe-7b-1b-active-shared-experts"
,
# noqa: E501
min_transformers_version
=
"4.49"
),
# noqa: E501
"Grok1ModelForCausalLM"
:
_HfExamplesInfo
(
"hpcai-tech/grok-1"
,
trust_remote_code
=
True
),
"InternLMForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm-chat-7b"
,
trust_remote_code
=
True
),
"InternLM2ForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm2-chat-7b"
,
...
...
@@ -185,17 +192,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"SolarForCausalLM"
:
_HfExamplesInfo
(
"upstage/solar-pro-preview-instruct"
),
"TeleChat2ForCausalLM"
:
_HfExamplesInfo
(
"Tele-AI/TeleChat2-3B"
,
trust_remote_code
=
True
),
"TeleFLMForCausalLM"
:
_HfExamplesInfo
(
"CofeAI/FLM-2-52B-Instruct-2407"
,
trust_remote_code
=
True
),
"XverseForCausalLM"
:
_HfExamplesInfo
(
"xverse/XVERSE-7B-Chat"
,
is_available_online
=
False
,
trust_remote_code
=
True
),
"Zamba2ForCausalLM"
:
_HfExamplesInfo
(
"Zyphra/Zamba2-7B-instruct"
,
min_transformers_version
=
"4.49"
),
# [Encoder-decoder]
"BartModel"
:
_HfExamplesInfo
(
"facebook/bart-base"
),
"BartForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/bart-large-cnn"
),
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
"Florence2ForConditionalGeneration"
:
_HfExamplesInfo
(
"microsoft/Florence-2-base"
,
# noqa: E501
tokenizer
=
"facebook/bart-base"
,
trust_remote_code
=
True
),
# noqa: E501
}
_EMBEDDING_EXAMPLE_MODELS
=
{
...
...
@@ -214,7 +220,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
"Qwen2ForSequenceClassification"
:
_HfExamplesInfo
(
"jason9693/Qwen2.5-1.5B-apeach"
),
# noqa: E501
"RobertaModel"
:
_HfExamplesInfo
(
"sentence-transformers/stsb-roberta-base-v2"
),
# noqa: E501
"RobertaForMaskedLM"
:
_HfExamplesInfo
(
"sentence-transformers/all-roberta-large-v1"
),
# noqa: E501
"XLMRobertaModel"
:
_HfExamplesInfo
(
"intfloat/multilingual-e5-
large
"
),
"XLMRobertaModel"
:
_HfExamplesInfo
(
"intfloat/multilingual-e5-
small
"
),
# [Multimodal]
"LlavaNextForConditionalGeneration"
:
_HfExamplesInfo
(
"royokong/e5-v"
),
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"TIGER-Lab/VLM2Vec-Full"
,
...
...
@@ -241,6 +247,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"DeepseekVLV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-vl2-tiny"
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]}),
# noqa: E501
"FuyuForCausalLM"
:
_HfExamplesInfo
(
"adept/fuyu-8b"
),
"Gemma3ForConditionalGeneration"
:
_HfExamplesInfo
(
"google/gemma-3-4b-it"
,
min_transformers_version
=
"4.50"
),
"GLM4VForCausalLM"
:
_HfExamplesInfo
(
"THUDM/glm-4v-9b"
,
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]}),
# noqa: E501
...
...
@@ -252,7 +260,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
}),
# noqa: E501
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-1.5-7b-hf"
,
extras
=
{
"mistral"
:
"mistral-community/pixtral-12b"
}),
# noqa: E501
extras
=
{
"mistral"
:
"mistral-community/pixtral-12b"
,
# noqa: E501
"mistral-fp8"
:
"nm-testing/pixtral-12b-FP8-dynamic"
}),
# noqa: E501
"LlavaNextForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-v1.6-mistral-7b-hf"
),
# noqa: E501
"LlavaNextVideoForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/LLaVA-NeXT-Video-7B-hf"
),
# noqa: E501
"LlavaOnevisionForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
),
# noqa: E501
...
...
@@ -271,6 +280,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"PaliGemmaForConditionalGeneration"
:
_HfExamplesInfo
(
"google/paligemma-3b-mix-224"
,
# noqa: E501
extras
=
{
"v2"
:
"google/paligemma2-3b-ft-docci-448"
}),
# noqa: E501
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-vision-128k-instruct"
,
trust_remote_code
=
True
,
extras
=
{
"phi3.5"
:
"microsoft/Phi-3.5-vision-instruct"
}),
# noqa: E501),
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-4-multimodal-instruct"
,
trust_remote_code
=
True
),
"PixtralForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Pixtral-12B-2409"
,
# noqa: E501
tokenizer_mode
=
"mistral"
),
...
...
@@ -282,9 +294,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-VL-2B-Instruct"
),
# noqa: E501
"Qwen2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-VL-3B-Instruct"
,
# noqa: E501
min_transformers_version
=
"4.49"
),
# noqa: E501
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_5-llama-3_2-1b"
,
# noqa: E501
trust_remote_code
=
True
),
# [Encoder-decoder]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
"Florence2ForConditionalGeneration"
:
_HfExamplesInfo
(
"microsoft/Florence-2-base"
,
# noqa: E501
tokenizer
=
"facebook/bart-base"
,
trust_remote_code
=
True
),
# noqa: E501
"MllamaForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-11B-Vision-Instruct"
),
# noqa: E501
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
"openai/whisper-large-v3"
),
# noqa: E501
}
...
...
@@ -321,7 +338,7 @@ class HfExampleModels:
self
.
hf_models
=
hf_models
def
get_supported_archs
(
self
)
->
Abstract
Set
[
str
]:
def
get_supported_archs
(
self
)
->
Set
[
str
]:
return
self
.
hf_models
.
keys
()
def
get_hf_info
(
self
,
model_arch
:
str
)
->
_HfExamplesInfo
:
...
...
tests/models/test_initialization.py
View file @
469e903b
...
...
@@ -6,8 +6,9 @@ import pytest
from
transformers
import
PretrainedConfig
from
vllm
import
LLM
from
vllm.engine.llm_engine
import
LLMEngine
as
V0LLMEngine
from
vllm.v1.engine.core
import
EngineCore
as
V1EngineCore
from
..conftest
import
MODELS_ON_S3
from
.registry
import
HF_EXAMPLE_MODELS
...
...
@@ -37,17 +38,20 @@ def test_can_initialize(model_arch):
return
hf_config
# Avoid calling model.forward()
def
_initialize_kv_caches
(
self
)
->
None
:
def
_initialize_kv_caches
_v0
(
self
)
->
None
:
self
.
cache_config
.
num_gpu_blocks
=
0
self
.
cache_config
.
num_cpu_blocks
=
0
with
patch
.
object
(
LLM
.
get_engine_class
(),
"_initialize_kv_caches"
,
_initialize_kv_caches
):
model_name
=
model_info
.
default
if
model_name
in
MODELS_ON_S3
:
model_name
=
f
"s3://vllm-ci-model-weights/
{
model_name
.
split
(
'/'
)[
-
1
]
}
"
def
_initalize_kv_caches_v1
(
self
,
vllm_config
):
# gpu_blocks (> 0), cpu_blocks
return
1
,
0
with
(
patch
.
object
(
V0LLMEngine
,
"_initialize_kv_caches"
,
_initialize_kv_caches_v0
),
patch
.
object
(
V1EngineCore
,
"_initialize_kv_caches"
,
_initalize_kv_caches_v1
)):
LLM
(
model_
name
,
model_
info
.
default
,
tokenizer
=
model_info
.
tokenizer
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
speculative_model
=
model_info
.
speculative_model
,
...
...
tests/models/test_oot_registration.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
..utils
import
fork_new_process_for_each_test
@
fork_new_process_for_each_test
def
test_plugin
(
dummy_opt_path
):
os
.
environ
[
"VLLM_PLUGINS"
]
=
""
with
pytest
.
raises
(
Exception
)
as
excinfo
:
LLM
(
model
=
dummy_opt_path
,
load_format
=
"dummy"
)
error_msg
=
"has no vLLM implementation and "
\
"the Transformers implementation is not compatible with vLLM."
assert
(
error_msg
in
str
(
excinfo
.
value
))
@
fork_new_process_for_each_test
def
test_oot_registration_text_generation
(
dummy_opt_path
):
os
.
environ
[
"VLLM_PLUGINS"
]
=
"register_dummy_model"
prompts
=
[
"Hello, my name is"
,
"The text does not matter"
]
sampling_params
=
SamplingParams
(
temperature
=
0
)
llm
=
LLM
(
model
=
dummy_opt_path
,
load_format
=
"dummy"
)
first_token
=
llm
.
get_tokenizer
().
decode
(
0
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
in
outputs
:
generated_text
=
output
.
outputs
[
0
].
text
# make sure only the first token is generated
rest
=
generated_text
.
replace
(
first_token
,
""
)
assert
rest
==
""
@
fork_new_process_for_each_test
def
test_oot_registration_embedding
(
dummy_gemma2_embedding_path
):
os
.
environ
[
"VLLM_PLUGINS"
]
=
"register_dummy_model"
prompts
=
[
"Hello, my name is"
,
"The text does not matter"
]
llm
=
LLM
(
model
=
dummy_gemma2_embedding_path
,
load_format
=
"dummy"
)
outputs
=
llm
.
embed
(
prompts
)
for
output
in
outputs
:
assert
all
(
v
==
0
for
v
in
output
.
outputs
.
embedding
)
from
..utils
import
create_new_process_for_each_test
@create_new_process_for_each_test()
def test_plugin(
    monkeypatch: pytest.MonkeyPatch,
    dummy_opt_path: str,
):
    """Check that loading the dummy OPT model fails when VLLM_PLUGINS is "".

    The test pins VLLM_USE_V1=0 and sets VLLM_PLUGINS to an empty string,
    then expects constructing ``LLM`` to raise with a message stating that
    no compatible implementation exists.
    """
    expected_fragment = ("has no vLLM implementation and the Transformers "
                         "implementation is not compatible with vLLM")

    # V1 shuts down rather than raising an error here, so V0 is forced.
    with monkeypatch.context() as env:
        env.setenv("VLLM_USE_V1", "0")
        env.setenv("VLLM_PLUGINS", "")

        with pytest.raises(Exception) as raised:
            LLM(model=dummy_opt_path, load_format="dummy")

        assert expected_fragment in str(raised.value)
@create_new_process_for_each_test()
def test_oot_registration_text_generation(
    monkeypatch: pytest.MonkeyPatch,
    dummy_opt_path: str,
):
    """Generate text with the dummy OPT model under VLLM_PLUGINS.

    VLLM_PLUGINS is set to "register_dummy_model" for the duration of the
    test. Every completion must consist solely of repetitions of the token
    decoded from id 0 (or be empty).
    """
    with monkeypatch.context() as env:
        env.setenv("VLLM_PLUGINS", "register_dummy_model")

        prompts = ["Hello, my name is", "The text does not matter"]
        greedy = SamplingParams(temperature=0)

        llm = LLM(model=dummy_opt_path, load_format="dummy")
        first_token = llm.get_tokenizer().decode(0)

        for request_output in llm.generate(prompts, greedy):
            text = request_output.outputs[0].text
            # make sure only the first token is generated: stripping every
            # occurrence of it must leave nothing behind
            assert text.replace(first_token, "") == ""
@create_new_process_for_each_test()
def test_oot_registration_embedding(
    monkeypatch: pytest.MonkeyPatch,
    dummy_gemma2_embedding_path: str,
):
    """Embed prompts with the dummy Gemma2 embedding model under VLLM_PLUGINS.

    VLLM_PLUGINS is set to "register_dummy_model" for the duration of the
    test. Every component of every returned embedding must equal zero.
    """
    with monkeypatch.context() as env:
        env.setenv("VLLM_PLUGINS", "register_dummy_model")

        prompts = ["Hello, my name is", "The text does not matter"]
        llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")

        for embedding_output in llm.embed(prompts):
            vector = embedding_output.outputs.embedding
            assert all(component == 0 for component in vector)
# Module-level test image used by the multimodal prompts in this file; it is
# built once at import time from the "cherry_blossom" asset via convert("RGB").
image
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
@
fork_new_process_for_each_test
def
test_oot_registration_multimodal
(
dummy_llava_path
):
os
.
environ
[
"VLLM_PLUGINS"
]
=
"register_dummy_model"
prompts
=
[{
"prompt"
:
"What's in the image?<image>"
,
"multi_modal_data"
:
{
"image"
:
image
},
},
{
"prompt"
:
"Describe the image<image>"
,
"multi_modal_data"
:
{
"image"
:
image
},
}]
sampling_params
=
SamplingParams
(
temperature
=
0
)
llm
=
LLM
(
model
=
dummy_llava_path
,
load_format
=
"dummy"
,
max_num_seqs
=
1
,
trust_remote_code
=
True
,
gpu_memory_utilization
=
0.98
,
max_model_len
=
4096
,
enforce_eager
=
True
,
limit_mm_per_prompt
=
{
"image"
:
1
})
first_token
=
llm
.
get_tokenizer
().
decode
(
0
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
in
outputs
:
generated_text
=
output
.
outputs
[
0
].
text
# make sure only the first token is generated
rest
=
generated_text
.
replace
(
first_token
,
""
)
assert
rest
==
""
@create_new_process_for_each_test()
def test_oot_registration_multimodal(
    monkeypatch: pytest.MonkeyPatch,
    dummy_llava_path: str,
):
    """Generate from image+text prompts with the dummy LLaVA model.

    VLLM_PLUGINS is set to "register_dummy_model" for the duration of the
    test. Both prompts attach the module-level ``image``. Every completion
    must consist solely of repetitions of the token decoded from id 0 (or be
    empty).
    """
    with monkeypatch.context() as env:
        env.setenv("VLLM_PLUGINS", "register_dummy_model")

        prompt_texts = (
            "What's in the image?<image>",
            "Describe the image<image>",
        )
        prompts = [{
            "prompt": text,
            "multi_modal_data": {
                "image": image
            },
        } for text in prompt_texts]

        greedy = SamplingParams(temperature=0)

        llm = LLM(
            model=dummy_llava_path,
            load_format="dummy",
            max_num_seqs=1,
            trust_remote_code=True,
            gpu_memory_utilization=0.98,
            max_model_len=4096,
            enforce_eager=True,
            limit_mm_per_prompt={"image": 1},
        )
        first_token = llm.get_tokenizer().decode(0)

        for request_output in llm.generate(prompts, greedy):
            text = request_output.outputs[0].text
            # make sure only the first token is generated: stripping every
            # occurrence of it must leave nothing behind
            assert text.replace(first_token, "") == ""
tests/models/test_registry.py
View file @
469e903b
...
...
@@ -17,7 +17,7 @@ from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
ModelRegistry
)
from
vllm.platforms
import
current_platform
from
..utils
import
fork
_new_process_for_each_test
from
..utils
import
create
_new_process_for_each_test
from
.registry
import
HF_EXAMPLE_MODELS
...
...
@@ -45,7 +45,7 @@ def test_registry_imports(model_arch):
assert
supports_multimodal
(
model_cls
)
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
"model_arch,is_mm,init_cuda,is_ce"
,
[
(
"LlamaForCausalLM"
,
False
,
False
,
False
),
(
"MllamaForConditionalGeneration"
,
True
,
False
,
False
),
...
...
@@ -70,7 +70,7 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
stacklevel
=
2
)
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
"model_arch,is_pp,init_cuda"
,
[
(
"MLPSpeculatorPreTrainedModel"
,
False
,
False
),
(
"DeepseekV2ForCausalLM"
,
True
,
False
),
...
...
tests/models/test_transformers.py
View file @
469e903b
...
...
@@ -4,7 +4,6 @@
Run `pytest tests/models/test_transformers.py`.
"""
from
contextlib
import
nullcontext
from
typing
import
Type
import
pytest
...
...
@@ -14,8 +13,8 @@ from .utils import check_logprobs_close
def
check_implementation
(
hf_runner
:
T
ype
[
HfRunner
],
vllm_runner
:
T
ype
[
VllmRunner
],
hf_runner
:
t
ype
[
HfRunner
],
vllm_runner
:
t
ype
[
VllmRunner
],
example_prompts
:
list
[
str
],
model
:
str
,
**
kwargs
,
...
...
@@ -47,8 +46,8 @@ def check_implementation(
(
"ArthurZ/Ilama-3.2-1B"
,
"auto"
),
# CUSTOM CODE
])
# trust_remote_code=True by default
def
test_models
(
hf_runner
:
T
ype
[
HfRunner
],
vllm_runner
:
T
ype
[
VllmRunner
],
hf_runner
:
t
ype
[
HfRunner
],
vllm_runner
:
t
ype
[
VllmRunner
],
example_prompts
:
list
[
str
],
model
:
str
,
model_impl
:
str
,
...
...
@@ -71,8 +70,8 @@ def test_models(
@
multi_gpu_test
(
num_gpus
=
2
)
def
test_distributed
(
hf_runner
:
T
ype
[
HfRunner
],
vllm_runner
:
T
ype
[
VllmRunner
],
hf_runner
:
t
ype
[
HfRunner
],
vllm_runner
:
t
ype
[
VllmRunner
],
example_prompts
,
):
kwargs
=
{
"model_impl"
:
"transformers"
,
"tensor_parallel_size"
:
2
}
...
...
@@ -92,7 +91,7 @@ def test_distributed(
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_quantization
(
vllm_runner
:
T
ype
[
VllmRunner
],
vllm_runner
:
t
ype
[
VllmRunner
],
example_prompts
:
list
[
str
],
model
:
str
,
quantization_kwargs
:
dict
[
str
,
str
],
...
...
tests/models/utils.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
warnings
from
typing
import
Dict
,
List
,
Optional
,
Sequence
,
Tuple
,
Union
from
collections.abc
import
Sequence
from
typing
import
Any
,
Optional
,
Union
import
torch
...
...
@@ -9,7 +10,9 @@ from vllm.config import ModelConfig, TaskOption
from
vllm.inputs
import
InputContext
from
vllm.sequence
import
Logprob
,
PromptLogprobs
,
SampleLogprobs
TokensText
=
Tuple
[
List
[
int
],
str
]
from
.registry
import
HF_EXAMPLE_MODELS
TokensText
=
tuple
[
list
[
int
],
str
]
def
check_outputs_equal
(
...
...
@@ -46,7 +49,7 @@ def check_outputs_equal(
# * List of top sample logprobs for each sampled token
#
# Assumes prompt logprobs were not requested.
TokensTextLogprobs
=
T
uple
[
L
ist
[
int
],
str
,
Optional
[
Union
[
L
ist
[
D
ict
[
int
,
TokensTextLogprobs
=
t
uple
[
l
ist
[
int
],
str
,
Optional
[
Union
[
l
ist
[
d
ict
[
int
,
float
]],
SampleLogprobs
]]]
...
...
@@ -57,8 +60,8 @@ TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
# * Optional list of top sample logprobs for each sampled token
#
# Assumes prompt logprobs were not requested.
TextTextLogprobs
=
T
uple
[
L
ist
[
str
],
str
,
Optional
[
Union
[
L
ist
[
D
ict
[
str
,
float
]],
L
ist
[
D
ict
[
str
,
TextTextLogprobs
=
t
uple
[
l
ist
[
str
],
str
,
Optional
[
Union
[
l
ist
[
d
ict
[
str
,
float
]],
l
ist
[
d
ict
[
str
,
Logprob
]]]]]
# Representation of generated sequence as a tuple of
...
...
@@ -68,9 +71,9 @@ TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]],
# * Optional list of top prompt logprobs for each prompt token
#
# Allows prompt logprobs to be requested.
TokensTextLogprobsPromptLogprobs
=
T
uple
[
L
ist
[
int
],
str
,
Optional
[
Union
[
L
ist
[
D
ict
[
int
,
float
]],
SampleLogprobs
]],
Optional
[
Union
[
L
ist
[
Optional
[
D
ict
[
int
,
float
]]],
PromptLogprobs
]]]
TokensTextLogprobsPromptLogprobs
=
t
uple
[
l
ist
[
int
],
str
,
Optional
[
Union
[
l
ist
[
d
ict
[
int
,
float
]],
SampleLogprobs
]],
Optional
[
Union
[
l
ist
[
Optional
[
d
ict
[
int
,
float
]]],
PromptLogprobs
]]]
def
check_logprobs_close
(
...
...
@@ -249,21 +252,17 @@ def check_logprobs_close(
def
build_model_context
(
model_
name
:
str
,
model_
id
:
str
,
task
:
TaskOption
=
"auto"
,
tokenizer_name
:
Optional
[
str
]
=
None
,
trust_remote_code
:
bool
=
False
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
None
,
mm_processor_kwargs
:
Optional
[
Dict
]
=
None
,
limit_mm_per_prompt
:
Optional
[
Dict
]
=
None
,
dtype
:
Union
[
str
,
torch
.
dtype
]
=
"auto"
,
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
limit_mm_per_prompt
:
Optional
[
dict
[
str
,
int
]]
=
None
,
disable_mm_preprocessor_cache
:
bool
=
True
,
):
"""Creates an InputContext for a given model.
Args:
model_name: Name of the model being considered.
tokenizer_name: Name of the tokenizer being considered.
trust_remote_code: Whether or not to allow loading remote code.
model_id: ID of the model being considered.
mm_processor_kwargs: optional processor kwargs to be leveraged
in the input processor, mapper, dummy data creation, etc.
limit_mm_per_prompt: Multimodal limits.
...
...
@@ -271,21 +270,21 @@ def build_model_context(
Returns:
InputContext for the model being considered.
"""
if
tokenizer_name
is
None
:
tokenizer_name
=
model_name
if
dtype
is
None
:
dtype
=
"half"
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
model_config
=
ModelConfig
(
model_
name
,
model_
id
,
task
=
task
,
tokenizer
=
tokenizer_name
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
trust_remote_code
,
tokenizer
=
model_info
.
tokenizer
or
model_id
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
trust_remote_code
=
model_info
.
trust_remote_code
,
dtype
=
dtype
,
seed
=
0
,
mm_processor_kwargs
=
mm_processor_kwargs
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
disable_mm_preprocessor_cache
=
disable_mm_preprocessor_cache
,
hf_overrides
=
model_info
.
hf_overrides
,
)
return
InputContext
(
model_config
)
tests/mq_llm_engine/conftest.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """Set VLLM_USE_V1=0 for each test, since this module is V0 only.

    The fixture is function-scoped and autouse, so it runs before every
    test it applies to without being requested explicitly.
    """
    monkeypatch.setenv(name="VLLM_USE_V1", value="0")
tests/mq_llm_engine/test_abort.py
View file @
469e903b
...
...
@@ -13,7 +13,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from
..utils
import
models_path_prefix
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"gemma-1.1-2b-it"
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
load_format
=
"runai_streamer"
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
)
RAISED_ERROR
=
KeyError
RAISED_VALUE
=
"foo"
EXPECTED_TOKENS
=
250
...
...
tests/mq_llm_engine/test_error_handling.py
View file @
469e903b
...
...
@@ -19,14 +19,13 @@ from vllm.engine.multiprocessing.engine import MQLLMEngine
from
vllm.entrypoints.openai.api_server
import
build_async_engine_client
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
SequenceGroupMetadata
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
FlexibleArgumentParser
from
..utils
import
models_path_prefix
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"gemma-1.1-2b-it"
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
load_format
=
"runai_streamer"
,
enforce_eager
=
True
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
enforce_eager
=
True
)
RAISED_ERROR
=
KeyError
RAISED_VALUE
=
"foo"
...
...
@@ -238,25 +237,28 @@ async def test_bad_request(tmp_socket):
@
pytest
.
mark
.
asyncio
async
def
test_mp_crash_detection
(
monkeypatch
):
async
def
test_mp_crash_detection
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
parser
=
FlexibleArgumentParser
(
description
=
"vLLM's remote OpenAI server."
)
parser
=
make_arg_parser
(
parser
)
args
=
parser
.
parse_args
([])
parser
=
FlexibleArgumentParser
(
description
=
"vLLM's remote OpenAI server."
)
parser
=
make_arg_parser
(
parser
)
args
=
parser
.
parse_args
([])
# When LLMEngine is loaded, it will crash.
def
mock_init
():
raise
ValueError
# When LLMEngine is loaded, it will crash.
def
mock_init
():
raise
ValueError
m
onkeypatch
.
setattr
(
LLMEngine
,
"__init__"
,
mock_init
)
m
.
setattr
(
LLMEngine
,
"__init__"
,
mock_init
)
start
=
time
.
perf_counter
()
async
with
build_async_engine_client
(
args
):
pass
end
=
time
.
perf_counter
()
start
=
time
.
perf_counter
()
async
with
build_async_engine_client
(
args
):
pass
end
=
time
.
perf_counter
()
assert
end
-
start
<
60
,
(
"Expected vLLM to gracefully shutdown in <60s "
"if there is an error in the startup."
)
assert
end
-
start
<
60
,
(
"Expected vLLM to gracefully shutdown in <60s "
"if there is an error in the startup."
)
@
pytest
.
mark
.
asyncio
...
...
@@ -296,3 +298,80 @@ async def test_engine_process_death(tmp_socket):
await
client
.
check_health
()
client
.
close
()
def run_with_evil_input_processing(engine_args: AsyncEngineArgs,
                                   ipc_path: str):
    """Start an MQLLMEngine whose input preparation fails for "evil" requests.

    Simulates an exception while preparing inputs for the model. In the wild,
    this could be something like a multimodal input processor failing on
    invalid image data.

    Args:
        engine_args: Arguments used to build the engine.
        ipc_path: IPC path handed to ``MQLLMEngine.from_engine_args``.
    """

    # Raise error in the model runner when adding a sequence group.
    # See class ModelInputForGPUBuilder
    def fail_on_evil_request(_, seq_group_metadata: SequenceGroupMetadata):
        request_id = seq_group_metadata.request_id
        if not request_id.startswith("evil"):
            return
        raise RAISED_ERROR(RAISED_VALUE)

    # Make engine.
    mq_engine = MQLLMEngine.from_engine_args(
        engine_args=engine_args,
        usage_context=UsageContext.UNKNOWN_CONTEXT,
        ipc_path=ipc_path,
    )

    # Hook the failing callback into the per-sequence-group compute functions.
    worker = mq_engine.engine.model_executor.driver_worker.worker
    worker.model_runner.builder.per_seq_group_compute_fns.append(
        fail_on_evil_request)

    # Run engine.
    mq_engine.start()
@pytest.mark.asyncio
async def test_failed_inputs(tmp_socket):
    """Requests that fail during input processing must not break the engine.

    Ten normal requests and ten requests whose id starts with "evil" are run
    concurrently against an engine started with
    ``run_with_evil_input_processing``. The "evil" ones must raise
    ``RAISED_ERROR``, the normal ones must complete, and the client must stay
    healthy afterwards.
    """
    num_requests_per_kind = 10

    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
                           ipc_path=tmp_socket,
                           run_fn=run_with_evil_input_processing) as engine:

        client = await engine.make_client()
        assert client.is_running

        # Engine should be healthy
        await client.check_health()

        async def drain_request(id_prefix: str):
            # Consume the whole output stream; only completion/failure matters.
            stream = client.generate(
                prompt="Hello my name is",
                sampling_params=SamplingParams(max_tokens=10),
                request_id=id_prefix + str(uuid.uuid4()))
            async for _ in stream:
                pass

        # Passing tasks are scheduled before failing ones.
        passing_tasks = [
            asyncio.create_task(drain_request(""))
            for _ in range(num_requests_per_kind)
        ]
        failing_tasks = [
            asyncio.create_task(drain_request("evil"))
            for _ in range(num_requests_per_kind)
        ]

        # Failures are collected rather than propagated here so that every
        # task is awaited; they are re-checked one by one below.
        await asyncio.gather(*failing_tasks, return_exceptions=True)
        await asyncio.gather(*passing_tasks)

        # All the bad inputs should have raised
        for bad_task in failing_tasks:
            with pytest.raises(RAISED_ERROR):
                bad_task.result()

        # But all good inputs should have still succeeded
        for good_task in passing_tasks:
            good_task.result()

        # And the engine should remain healthy
        assert not client.errored
        await client.check_health()

        client.close()
tests/mq_llm_engine/test_load.py
View file @
469e903b
...
...
@@ -17,9 +17,7 @@ NUM_EXPECTED_TOKENS = 10
NUM_REQUESTS
=
10000
# Scenarios to test for num generated token.
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
load_format
=
"runai_streamer"
,
disable_log_requests
=
True
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
disable_log_requests
=
True
)
@
pytest
.
fixture
(
scope
=
"function"
)
...
...
tests/mq_llm_engine/utils.py
View file @
469e903b
...
...
@@ -2,7 +2,7 @@
import
asyncio
import
multiprocessing
from
typing
import
Callable
,
Tuple
,
Union
from
typing
import
Callable
,
Union
from
vllm
import
SamplingParams
from
vllm.engine.arg_utils
import
AsyncEngineArgs
...
...
@@ -16,7 +16,7 @@ async def generate(
client
:
MQLLMEngineClient
,
request_id
:
str
,
num_tokens
:
int
,
return_output
:
bool
=
False
)
->
Union
[
RequestOutput
,
T
uple
[
int
,
str
]]:
return_output
:
bool
=
False
)
->
Union
[
RequestOutput
,
t
uple
[
int
,
str
]]:
final_output
=
None
count
=
0
...
...
tests/multi_step/test_correctness_async_llm.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# Test the AsyncLLMEngine with multi-step-decoding
from
typing
import
List
,
Optional
from
typing
import
Optional
import
pytest
import
os
from
tests.kernels
.utils
import
override_backend_env_variable
from
vllm
.utils
import
STR_BACKEND_ENV_VAR
from
..models.utils
import
check_logprobs_close
from
..utils
import
(
completions_with_server_args
,
get_client_text_generations
,
...
...
@@ -18,7 +18,7 @@ MODELS = [
NUM_SCHEDULER_STEPS
=
[
8
]
# Multi-step decoding steps
NUM_PROMPTS
=
[
10
]
DEFAULT_SERVER_ARGS
:
L
ist
[
str
]
=
[
DEFAULT_SERVER_ARGS
:
l
ist
[
str
]
=
[
"--distributed-executor-backend"
,
"ray"
,
"--gpu-memory-utilization"
,
...
...
@@ -54,7 +54,7 @@ async def test_multi_step(
num_logprobs
:
Optional
[
int
],
attention_backend
:
str
,
enable_chunked_prefill
:
bool
,
monkeypatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol
client/server environment.
...
...
@@ -84,67 +84,70 @@ async def test_multi_step(
pytest
.
skip
(
"Multi-step with Chunked-Prefill only supports"
"PP=1 and FLASH_ATTN backend"
)
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
prompts
=
example_prompts
if
len
(
prompts
)
<
num_prompts
:
prompts
=
prompts
*
((
num_prompts
//
len
(
prompts
))
+
1
)
prompts
=
prompts
[:
num_prompts
]
assert
len
(
prompts
)
==
num_prompts
server_args
=
DEFAULT_SERVER_ARGS
+
[
"--enforce-eager"
]
ms_server_args
=
DEFAULT_SERVER_ARGS
+
\
[
"--num-scheduler-steps"
,
f
"
{
num_scheduler_steps
}
"
]
if
not
is_async
:
ms_server_args
+=
[
"--disable-async-output-proc"
]
if
eager_mode
:
ms_server_args
.
append
(
"--enforce-eager"
)
if
enable_chunked_prefill
:
ms_server_args
.
append
(
"--enable-chunked-prefill"
)
distributed_args
=
[
"--tensor-parallel-size"
,
str
(
tp_size
),
"--pipeline-parallel-size"
,
str
(
pp_size
),
]
# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# was raised 5x to 1200 *just for this test* due to
# observed timeouts in GHA CI
ref_completions
=
await
completions_with_server_args
(
prompts
,
model
,
server_args
+
distributed_args
,
num_logprobs
,
max_wait_seconds
=
5
*
240
)
test_completions
=
await
completions_with_server_args
(
prompts
,
model
,
ms_server_args
+
distributed_args
,
num_logprobs
,
max_wait_seconds
=
5
*
240
)
# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations
=
get_client_text_generations
(
ref_completions
)
test_generations
=
get_client_text_generations
(
test_completions
)
assert
ref_generations
==
test_generations
# Assert multi-step scheduling produces nearly-identical logprobs
# to single-step scheduling.
ref_text_logprobs
=
get_client_text_logprob_generations
(
ref_completions
)
test_text_logprobs
=
get_client_text_logprob_generations
(
test_completions
)
check_logprobs_close
(
outputs_0_lst
=
ref_text_logprobs
,
outputs_1_lst
=
test_text_logprobs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
prompts
=
example_prompts
if
len
(
prompts
)
<
num_prompts
:
prompts
=
prompts
*
((
num_prompts
//
len
(
prompts
))
+
1
)
prompts
=
prompts
[:
num_prompts
]
assert
len
(
prompts
)
==
num_prompts
server_args
=
DEFAULT_SERVER_ARGS
+
[
"--enforce-eager"
]
ms_server_args
=
DEFAULT_SERVER_ARGS
+
\
[
"--num-scheduler-steps"
,
f
"
{
num_scheduler_steps
}
"
]
if
not
is_async
:
ms_server_args
+=
[
"--disable-async-output-proc"
]
if
eager_mode
:
ms_server_args
.
append
(
"--enforce-eager"
)
if
enable_chunked_prefill
:
ms_server_args
.
append
(
"--enable-chunked-prefill"
)
distributed_args
=
[
"--tensor-parallel-size"
,
str
(
tp_size
),
"--pipeline-parallel-size"
,
str
(
pp_size
),
]
# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# was raised 5x to 1200 *just for this test* due to
# observed timeouts in GHA CI
ref_completions
=
await
completions_with_server_args
(
prompts
,
model
,
server_args
+
distributed_args
,
num_logprobs
,
max_wait_seconds
=
5
*
240
)
test_completions
=
await
completions_with_server_args
(
prompts
,
model
,
ms_server_args
+
distributed_args
,
num_logprobs
,
max_wait_seconds
=
5
*
240
)
# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations
=
get_client_text_generations
(
ref_completions
)
test_generations
=
get_client_text_generations
(
test_completions
)
assert
ref_generations
==
test_generations
# Assert multi-step scheduling produces nearly-identical logprobs
# to single-step scheduling.
ref_text_logprobs
=
get_client_text_logprob_generations
(
ref_completions
)
test_text_logprobs
=
get_client_text_logprob_generations
(
test_completions
)
check_logprobs_close
(
outputs_0_lst
=
ref_text_logprobs
,
outputs_1_lst
=
test_text_logprobs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
parametrize
((
"tp_size, pp_size"
),
[
...
...
@@ -154,7 +157,7 @@ async def test_multi_step(
async
def
test_multi_step_pp_smoke
(
tp_size
:
int
,
pp_size
:
int
,
monkeypatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
"""
Smoke test for the vLLM engine with multi-step scheduling in an
...
...
@@ -176,54 +179,55 @@ async def test_multi_step_pp_smoke(
attention_backend
=
"FLASH_ATTN"
max_num_seqs
=
3
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
# Prompt from the ShareGPT dataset
prompts
=
[
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
]
# Use varying max_tokens to introduce scheduling randomness.
max_tokens
=
[
10
*
i
for
i
in
range
(
1
,
len
(
prompts
)
+
1
)]
assert
len
(
prompts
)
==
len
(
max_tokens
)
test_args
=
[
"--tensor-parallel-size"
,
str
(
tp_size
),
"--pipeline-parallel-size"
,
str
(
pp_size
),
"--max-num-seqs"
,
str
(
max_num_seqs
)
]
server_args
=
DEFAULT_SERVER_ARGS
+
test_args
ms_server_args
=
DEFAULT_SERVER_ARGS
+
\
[
"--num-scheduler-steps"
,
f
"
{
num_scheduler_steps
}
"
]
+
\
test_args
# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# was raised 3x to 720 *just for this test* due to
# observed timeouts in GHA CI
ref_completions
=
await
completions_with_server_args
(
prompts
=
prompts
,
model_name
=
model
,
server_cli_args
=
server_args
,
num_logprobs
=
None
,
max_wait_seconds
=
5
*
240
,
max_tokens
=
max_tokens
)
test_completions
=
await
completions_with_server_args
(
prompts
=
prompts
,
model_name
=
model
,
server_cli_args
=
ms_server_args
,
num_logprobs
=
None
,
max_wait_seconds
=
5
*
240
,
max_tokens
=
max_tokens
)
# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations
=
get_client_text_generations
(
ref_completions
)
test_generations
=
get_client_text_generations
(
test_completions
)
assert
ref_generations
==
test_generations
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
# Prompt from the ShareGPT dataset
prompts
=
[
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
]
# Use varying max_tokens to introduce scheduling randomness.
max_tokens
=
[
10
*
i
for
i
in
range
(
1
,
len
(
prompts
)
+
1
)]
assert
len
(
prompts
)
==
len
(
max_tokens
)
test_args
=
[
"--tensor-parallel-size"
,
str
(
tp_size
),
"--pipeline-parallel-size"
,
str
(
pp_size
),
"--max-num-seqs"
,
str
(
max_num_seqs
)
]
server_args
=
DEFAULT_SERVER_ARGS
+
test_args
ms_server_args
=
DEFAULT_SERVER_ARGS
+
\
[
"--num-scheduler-steps"
,
f
"
{
num_scheduler_steps
}
"
]
+
\
test_args
# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# was raised 3x to 720 *just for this test* due to
# observed timeouts in GHA CI
ref_completions
=
await
completions_with_server_args
(
prompts
=
prompts
,
model_name
=
model
,
server_cli_args
=
server_args
,
num_logprobs
=
None
,
max_wait_seconds
=
5
*
240
,
max_tokens
=
max_tokens
)
test_completions
=
await
completions_with_server_args
(
prompts
=
prompts
,
model_name
=
model
,
server_cli_args
=
ms_server_args
,
num_logprobs
=
None
,
max_wait_seconds
=
5
*
240
,
max_tokens
=
max_tokens
)
# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations
=
get_client_text_generations
(
ref_completions
)
test_generations
=
get_client_text_generations
(
test_completions
)
assert
ref_generations
==
test_generations
tests/multi_step/test_correctness_llm.py
View file @
469e903b
...
...
@@ -8,7 +8,7 @@ from typing import Optional
import
pytest
import
os
from
tests.kernels
.utils
import
override_backend_env_variable
from
vllm
.utils
import
STR_BACKEND_ENV_VAR
from
..models.utils
import
check_logprobs_close
,
check_outputs_equal
from
..utils
import
models_path_prefix
...
...
@@ -44,7 +44,7 @@ def test_multi_step_llm(
num_prompts
:
int
,
num_logprobs
:
Optional
[
int
],
attention_backend
:
str
,
monkeypatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
"""Test vLLM engine with multi-step scheduling via sync LLM Engine.
...
...
@@ -72,48 +72,49 @@ def test_multi_step_llm(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned.
"""
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
prompts
=
example_prompts
if
len
(
prompts
)
<
num_prompts
:
prompts
=
prompts
*
((
num_prompts
//
len
(
prompts
))
+
1
)
prompts
=
prompts
[:
num_prompts
]
assert
len
(
prompts
)
==
num_prompts
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
enable_chunked_prefill
=
enable_chunked_prefill
,
num_scheduler_steps
=
num_scheduler_steps
,
)
as
vllm_model
:
vllm_outputs
=
(
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
if
num_logprobs
is
None
else
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
))
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
(
hf_model
.
generate_greedy
(
prompts
,
max_tokens
)
if
num_logprobs
is
None
else
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
num_logprobs
))
if
num_logprobs
is
None
:
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
else
:
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
prompts
=
example_prompts
if
len
(
prompts
)
<
num_prompts
:
prompts
=
prompts
*
((
num_prompts
//
len
(
prompts
))
+
1
)
prompts
=
prompts
[:
num_prompts
]
assert
len
(
prompts
)
==
num_prompts
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
enable_chunked_prefill
=
enable_chunked_prefill
,
num_scheduler_steps
=
num_scheduler_steps
,
)
as
vllm_model
:
vllm_outputs
=
(
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
if
num_logprobs
is
None
else
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
))
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
(
hf_model
.
generate_greedy
(
prompts
,
max_tokens
)
if
num_logprobs
is
None
else
hf_model
.
generate_greedy_logprobs_limit
(
prompts
,
max_tokens
,
num_logprobs
))
if
num_logprobs
is
None
:
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
else
:
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
...
@@ -138,7 +139,7 @@ def test_multi_step_llm_w_prompt_logprobs(
num_logprobs
:
Optional
[
int
],
num_prompt_logprobs
:
Optional
[
int
],
attention_backend
:
str
,
monkeypatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
"""Test prompt logprobs with multi-step scheduling via sync LLM Engine.
...
...
@@ -168,47 +169,48 @@ def test_multi_step_llm_w_prompt_logprobs(
note that this argument is not supported by the
OpenAI completions endpoint.
"""
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
prompts
=
example_prompts
if
len
(
prompts
)
<
num_prompts
:
prompts
=
prompts
*
((
num_prompts
//
len
(
prompts
))
+
1
)
prompts
=
prompts
[:
num_prompts
]
assert
len
(
prompts
)
==
num_prompts
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
num_scheduler_steps
=
num_scheduler_steps
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
,
num_prompt_logprobs
=
num_prompt_logprobs
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
)
as
vllm_model
:
single_step_vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
,
num_prompt_logprobs
=
num_prompt_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
single_step_vllm_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
prompts
=
example_prompts
if
len
(
prompts
)
<
num_prompts
:
prompts
=
prompts
*
((
num_prompts
//
len
(
prompts
))
+
1
)
prompts
=
prompts
[:
num_prompts
]
assert
len
(
prompts
)
==
num_prompts
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
num_scheduler_steps
=
num_scheduler_steps
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
,
num_prompt_logprobs
=
num_prompt_logprobs
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
)
as
vllm_model
:
single_step_vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
,
num_prompt_logprobs
=
num_prompt_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
single_step_vllm_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
...
@@ -232,7 +234,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
num_prompts
:
int
,
num_logprobs
:
Optional
[
int
],
attention_backend
:
str
,
monkeypatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
...
...
@@ -295,77 +297,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
#
# The Incorrect scheduling behavior - if it occurs - will cause an exception
# in the model runner resulting from `do_sample=False`.
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
assert
len
(
example_prompts
)
>=
2
challenge_prompts
=
copy
.
deepcopy
(
example_prompts
)
challenge_prompts
[
0
]
=
(
'vLLM is a high-throughput and memory-efficient '
'inference and serving engine for LLMs.
\n
'
)
# 24 tok
challenge_prompts
[
1
]
=
(
'Briefly describe the major milestones in the '
'development of artificial intelligence from 1950 to 2020.
\n
'
)
# 30 tok
# If necessary, adjust the length of `challenge_prompts` to match
# `num_prompts`
if
len
(
challenge_prompts
)
<
num_prompts
:
challenge_prompts
=
(
challenge_prompts
*
((
num_prompts
//
len
(
challenge_prompts
))
+
1
))
challenge_prompts
=
challenge_prompts
[:
num_prompts
]
assert
len
(
challenge_prompts
)
==
num_prompts
# Single-step scheduler baseline
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
num_scheduler_steps
=
num_scheduler_steps
,
max_model_len
=
48
,
max_num_batched_tokens
=
48
,
max_num_seqs
=
4
,
block_size
=
16
,
)
as
vllm_model
:
outputs_baseline
=
(
vllm_model
.
generate_greedy
(
challenge_prompts
,
max_tokens
)
if
num_logprobs
is
None
else
vllm_model
.
generate_greedy_logprobs
(
challenge_prompts
,
max_tokens
,
num_logprobs
))
# multi-step+"single-step chunked prefill"+APC
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
enable_chunked_prefill
=
True
,
enable_prefix_caching
=
True
,
num_scheduler_steps
=
num_scheduler_steps
,
max_model_len
=
48
,
max_num_batched_tokens
=
48
,
max_num_seqs
=
4
,
block_size
=
16
,
)
as
vllm_model
:
outputs_w_features
=
(
vllm_model
.
generate_greedy
(
challenge_prompts
,
max_tokens
)
if
num_logprobs
is
None
else
vllm_model
.
generate_greedy_logprobs
(
challenge_prompts
,
max_tokens
,
num_logprobs
))
if
num_logprobs
is
None
:
# No-logprobs test
check_outputs_equal
(
outputs_0_lst
=
outputs_baseline
,
outputs_1_lst
=
outputs_w_features
,
name_0
=
"multi-step"
,
name_1
=
"multi-step+features"
,
)
else
:
# Yes-logprobs test
check_logprobs_close
(
outputs_0_lst
=
outputs_baseline
,
outputs_1_lst
=
outputs_w_features
,
name_0
=
"multi-step"
,
name_1
=
"multi-step+features"
,
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
attention_backend
)
assert
len
(
example_prompts
)
>=
2
challenge_prompts
=
copy
.
deepcopy
(
example_prompts
)
challenge_prompts
[
0
]
=
(
'vLLM is a high-throughput and memory-efficient '
'inference and serving engine for LLMs.
\n
'
)
# 24 tok
challenge_prompts
[
1
]
=
(
'Briefly describe the major milestones in the '
'development of artificial intelligence from 1950 to 2020.
\n
'
)
# 30 tok
# If necessary, adjust the length of `challenge_prompts` to match
# `num_prompts`
if
len
(
challenge_prompts
)
<
num_prompts
:
challenge_prompts
=
(
challenge_prompts
*
((
num_prompts
//
len
(
challenge_prompts
))
+
1
))
challenge_prompts
=
challenge_prompts
[:
num_prompts
]
assert
len
(
challenge_prompts
)
==
num_prompts
# Single-step scheduler baseline
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
num_scheduler_steps
=
num_scheduler_steps
,
max_model_len
=
48
,
max_num_batched_tokens
=
48
,
max_num_seqs
=
4
,
block_size
=
16
,
)
as
vllm_model
:
outputs_baseline
=
(
vllm_model
.
generate_greedy
(
challenge_prompts
,
max_tokens
)
if
num_logprobs
is
None
else
vllm_model
.
generate_greedy_logprobs
(
challenge_prompts
,
max_tokens
,
num_logprobs
))
# multi-step+"single-step chunked prefill"+APC
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
enable_chunked_prefill
=
True
,
enable_prefix_caching
=
True
,
num_scheduler_steps
=
num_scheduler_steps
,
max_model_len
=
48
,
max_num_batched_tokens
=
48
,
max_num_seqs
=
4
,
block_size
=
16
,
)
as
vllm_model
:
outputs_w_features
=
(
vllm_model
.
generate_greedy
(
challenge_prompts
,
max_tokens
)
if
num_logprobs
is
None
else
vllm_model
.
generate_greedy_logprobs
(
challenge_prompts
,
max_tokens
,
num_logprobs
))
if
num_logprobs
is
None
:
# No-logprobs test
check_outputs_equal
(
outputs_0_lst
=
outputs_baseline
,
outputs_1_lst
=
outputs_w_features
,
name_0
=
"multi-step"
,
name_1
=
"multi-step+features"
,
)
else
:
# Yes-logprobs test
check_logprobs_close
(
outputs_0_lst
=
outputs_baseline
,
outputs_1_lst
=
outputs_w_features
,
name_0
=
"multi-step"
,
name_1
=
"multi-step+features"
,
)
tests/multimodal/test_processing.py
View file @
469e903b
...
...
@@ -7,18 +7,24 @@ from unittest.mock import MagicMock
import
numpy
as
np
import
pytest
import
torch
from
transformers
import
ProcessorMixin
from
vllm.config
import
ModelConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
(
MultiModalFieldElem
,
MultiModalKwargs
,
MultiModalKwargsItem
,
MultiModalSharedField
)
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.multimodal.processing
import
(
PlaceholderFeaturesInfo
,
PromptReplacement
,
ProcessingCache
,
PromptIndexTargets
,
PromptInsertion
,
PromptReplacement
,
apply_text_matches
,
apply_token_matches
,
find_mm_placeholders
,
find_text_matches
,
find_token_matches
,
iter_token_matches
,
replace_text_matches
,
replace_token_matches
)
# yapf: enable
from
vllm.multimodal.profiling
import
MultiModalProfiler
...
...
@@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
assert
all
(
match_len
==
len
(
match_ids
)
for
match_len
in
match_lens
)
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"token_ids"
,
"match_ids"
,
"new_ids"
,
"expected"
),
[
([],
[],
[
-
1
],
[]),
([],
[
32000
],
[
-
1
],
[]),
(
[
32000
,
32000
,
32000
],
[
32000
],
[
-
1
],
[
-
1
,
-
1
,
-
1
],
),
(
[
32000
,
32000
,
32000
],
[
32000
,
32000
],
[
-
1
],
[
-
1
,
32000
],
),
(
[
32000
,
32000
,
32000
],
[
32000
,
32000
,
32000
],
[
-
1
],
[
-
1
],
),
(
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[
28747
,
32000
],
[
-
1
],
[
9833
,
-
1
,
32000
,
32000
,
9833
,
-
1
,
32000
,
918
],
),
(
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[
28747
,
32000
,
32000
,
32000
],
[
-
1
],
[
9833
,
-
1
,
9833
,
28747
,
32000
,
32000
,
918
],
),
(
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[
28747
,
0
,
32000
],
[
-
1
],
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
),
],
)
# yapf: enable
def
test_replace_token_matches
(
token_ids
,
match_ids
,
new_ids
,
expected
):
result
=
replace_token_matches
(
token_ids
,
match_ids
,
new_ids
)
# Manually constructed results
assert
result
==
expected
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"prompt"
,
"target_by_key"
,
"expected_by_key"
),
...
...
@@ -98,11 +156,21 @@ def test_iter_token_matches(token_ids, match_ids, expected):
{
"pattern_1"
:
[],
"pattern_2"
:
[
32000
],
"pattern_3"
:
PromptIndexTargets
.
start
(),
"pattern_4"
:
PromptIndexTargets
.
prefix
([
32000
]),
"pattern_5"
:
PromptIndexTargets
.
end
(),
},
{
"pattern_1"
:
[],
"pattern_2"
:
[],
}
"pattern_3"
:
[
{
"start_idx"
:
0
,
"end_idx"
:
0
},
],
"pattern_4"
:
[],
"pattern_5"
:
[
{
"start_idx"
:
0
,
"end_idx"
:
0
},
],
},
),
(
[
32000
,
32000
,
32000
,
32000
],
...
...
@@ -110,6 +178,9 @@ def test_iter_token_matches(token_ids, match_ids, expected):
"pattern_1"
:
[
32000
],
"pattern_2"
:
[
32000
,
32000
],
"pattern_3"
:
[
32000
,
32000
,
32000
],
"pattern_4"
:
PromptIndexTargets
.
start
(),
"pattern_5"
:
PromptIndexTargets
.
prefix
([
32000
]),
"pattern_6"
:
PromptIndexTargets
.
end
(),
},
{
"pattern_1"
:
[
...
...
@@ -125,6 +196,15 @@ def test_iter_token_matches(token_ids, match_ids, expected):
"pattern_3"
:
[
{
"start_idx"
:
0
,
"end_idx"
:
3
},
],
"pattern_4"
:
[
{
"start_idx"
:
0
,
"end_idx"
:
0
},
],
"pattern_5"
:
[
{
"start_idx"
:
1
,
"end_idx"
:
1
},
],
"pattern_6"
:
[
{
"start_idx"
:
4
,
"end_idx"
:
4
},
],
},
),
(
...
...
@@ -133,6 +213,9 @@ def test_iter_token_matches(token_ids, match_ids, expected):
"pattern_1"
:
[
28747
,
32000
],
"pattern_2"
:
[
28747
,
32000
,
32000
,
32000
],
"pattern_3"
:
[
28747
,
0
,
32000
],
"pattern_4"
:
PromptIndexTargets
.
start
(),
"pattern_5"
:
PromptIndexTargets
.
prefix
([
28747
,
32000
]),
"pattern_6"
:
PromptIndexTargets
.
end
(),
},
{
"pattern_1"
:
[
...
...
@@ -143,20 +226,33 @@ def test_iter_token_matches(token_ids, match_ids, expected):
{
"start_idx"
:
1
,
"end_idx"
:
5
},
],
"pattern_3"
:
[],
"pattern_4"
:
[
{
"start_idx"
:
0
,
"end_idx"
:
0
},
],
"pattern_5"
:
[],
"pattern_6"
:
[
{
"start_idx"
:
10
,
"end_idx"
:
10
},
],
},
),
],
)
@
pytest
.
mark
.
parametrize
(
"update_type"
,
[
PromptInsertion
,
PromptReplacement
])
# yapf: enable
def
test_find_token_matches
(
prompt
,
target_by_key
,
expected_by_key
):
def
test_find_token_matches
(
prompt
,
target_by_key
,
expected_by_key
,
update_type
,
):
# Should not be used since there is nothing to convert to token IDs
mock_tokenizer
=
cast
(
AnyTokenizer
,
object
())
prompt_
repl
s
=
[
PromptReplacement
(
key
,
target
,
[]).
bind
(
mock_tokenizer
)
prompt_
update
s
=
[
update_type
(
key
,
target
,
[]).
bind
(
mock_tokenizer
)
for
key
,
target
in
target_by_key
.
items
()
]
result
=
find_token_matches
(
prompt
,
prompt_
repl
s
)
result
=
find_token_matches
(
prompt
,
prompt_
update
s
)
# Only displayed on error
print
(
"result:"
,
result
)
...
...
@@ -183,10 +279,20 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
{
"pattern_1"
:
""
,
"pattern_2"
:
"<image>"
,
"pattern_3"
:
PromptIndexTargets
.
start
(),
"pattern_4"
:
PromptIndexTargets
.
prefix
(
"<image>"
),
"pattern_5"
:
PromptIndexTargets
.
end
(),
},
{
"pattern_1"
:
[{
"start_idx"
:
0
,
"end_idx"
:
0
}],
"pattern_2"
:
[],
"pattern_3"
:
[
{
"start_idx"
:
0
,
"end_idx"
:
0
},
],
"pattern_4"
:
[],
"pattern_5"
:
[
{
"start_idx"
:
0
,
"end_idx"
:
0
},
],
}
),
(
...
...
@@ -195,6 +301,9 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
"pattern_1"
:
"<image>"
,
"pattern_2"
:
"<image><image>"
,
"pattern_3"
:
"<image><image><image>"
,
"pattern_4"
:
PromptIndexTargets
.
start
(),
"pattern_5"
:
PromptIndexTargets
.
prefix
(
"<image>"
),
"pattern_6"
:
PromptIndexTargets
.
end
(),
},
{
"pattern_1"
:
[
...
...
@@ -210,6 +319,15 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
"pattern_3"
:
[
{
"start_idx"
:
0
,
"end_idx"
:
21
},
],
"pattern_4"
:
[
{
"start_idx"
:
0
,
"end_idx"
:
0
},
],
"pattern_5"
:
[
{
"start_idx"
:
7
,
"end_idx"
:
7
},
],
"pattern_6"
:
[
{
"start_idx"
:
28
,
"end_idx"
:
28
},
],
},
),
(
...
...
@@ -218,6 +336,9 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
"pattern_1"
:
"Image:<image>"
,
"pattern_2"
:
"Image:<image><image><image>"
,
"pattern_3"
:
"Image:<unk><image>"
,
"pattern_4"
:
PromptIndexTargets
.
start
(),
"pattern_5"
:
PromptIndexTargets
.
prefix
(
"Image:<image>"
),
"pattern_6"
:
PromptIndexTargets
.
end
(),
},
{
"pattern_1"
:
[
...
...
@@ -228,6 +349,15 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
{
"start_idx"
:
0
,
"end_idx"
:
27
},
],
"pattern_3"
:
[],
"pattern_4"
:
[
{
"start_idx"
:
0
,
"end_idx"
:
0
},
],
"pattern_5"
:
[
{
"start_idx"
:
13
,
"end_idx"
:
13
},
],
"pattern_6"
:
[
{
"start_idx"
:
48
,
"end_idx"
:
48
},
],
},
),
# Test regex escape
...
...
@@ -254,16 +384,22 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
),
],
)
@
pytest
.
mark
.
parametrize
(
"update_type"
,
[
PromptInsertion
,
PromptReplacement
])
# yapf: enable
def
test_find_text_matches
(
prompt
,
target_by_key
,
expected_by_key
):
def
test_find_text_matches
(
prompt
,
target_by_key
,
expected_by_key
,
update_type
,
):
# Should not be used since there is nothing to convert to text
mock_tokenizer
=
cast
(
AnyTokenizer
,
object
())
prompt_
repl
s
=
[
PromptReplacement
(
key
,
target
,
[]).
bind
(
mock_tokenizer
)
prompt_
update
s
=
[
update_type
(
key
,
target
,
[]).
bind
(
mock_tokenizer
)
for
key
,
target
in
target_by_key
.
items
()
]
result
=
find_text_matches
(
prompt
,
prompt_
repl
s
)
result
=
find_text_matches
(
prompt
,
prompt_
update
s
)
# Only displayed on error
print
(
"result:"
,
result
)
...
...
@@ -281,7 +417,7 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key):
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"prompt"
,
"target_by_key"
,
"repl_by_key"
)
,
(
"prompt"
,
"target_by_key"
,
"repl_by_key"
,
"expected_by_update_type_mm_count"
),
# noqa: E501
[
(
"Image:<image>Image:<image><image>!"
,
...
...
@@ -300,58 +436,160 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key):
# Test dynamic replacement (beyond the form of `unit * count`)
"pattern_3"
:
"?!?"
,
},
{
PromptInsertion
:
{
0
:
"Image:<image>Image:<image><image>!"
,
1
:
"Image:<image><image><image>Image:<image><image>!?!?"
,
2
:
"Image:<image><image><image><image><image>Image:<image><image>!?!??!?"
,
# noqa: E501
},
PromptReplacement
:
{
0
:
"Image:<image>Image:<image><image>!"
,
1
:
"<image><image>Image:<image><image>?!?"
,
2
:
"<image><image><image><image><image>?!?"
,
},
},
),
# Test index targets
(
""
,
{
"pattern_1"
:
PromptIndexTargets
.
start
(),
"pattern_2"
:
PromptIndexTargets
.
prefix
(
"<image>"
),
"pattern_3"
:
PromptIndexTargets
.
end
(),
},
{
"pattern_1"
:
"1"
,
"pattern_2"
:
"2"
,
"pattern_3"
:
"3"
,
},
{
PromptInsertion
:
{
0
:
""
,
1
:
"13"
,
2
:
"1133"
,
},
PromptReplacement
:
{
0
:
""
,
1
:
"13"
,
2
:
"1133"
,
},
},
),
(
"<image>"
,
{
"pattern_1"
:
PromptIndexTargets
.
start
(),
"pattern_2"
:
PromptIndexTargets
.
prefix
(
"<image>"
),
"pattern_3"
:
PromptIndexTargets
.
end
(),
},
{
"pattern_1"
:
"1"
,
"pattern_2"
:
"2"
,
"pattern_3"
:
"3"
,
},
{
PromptInsertion
:
{
0
:
"<image>"
,
1
:
"1<image>23"
,
2
:
"11<image>2233"
,
},
PromptReplacement
:
{
0
:
"<image>"
,
1
:
"1<image>23"
,
2
:
"11<image>2233"
,
},
},
),
# Test different replacement per item
(
"<image><image><image>"
,
{
"pattern_1"
:
"<image>"
,
},
{
"pattern_1"
:
lambda
idx
:
str
(
idx
+
1
),
},
{
PromptInsertion
:
{
0
:
"<image><image><image>"
,
1
:
"<image>1<image><image>"
,
2
:
"<image>12<image><image>"
,
},
PromptReplacement
:
{
0
:
"<image><image><image>"
,
1
:
"1<image><image>"
,
2
:
"12<image>"
,
},
},
),
(
"<image><image><image>"
,
{
"pattern_1"
:
PromptIndexTargets
.
prefix
(
"<image>"
),
},
{
"pattern_1"
:
lambda
idx
:
str
(
idx
+
1
),
},
{
PromptInsertion
:
{
0
:
"<image><image><image>"
,
1
:
"<image>1<image><image>"
,
2
:
"<image>12<image><image>"
,
},
PromptReplacement
:
{
0
:
"<image><image><image>"
,
1
:
"<image>1<image><image>"
,
2
:
"<image>12<image><image>"
,
},
},
),
]
)
@
pytest
.
mark
.
parametrize
(
(
"mm_count"
,
"expected"
),
[
(
0
,
"Image:<image>Image:<image><image>!"
),
(
1
,
"<image><image>Image:<image><image>?!?"
),
(
2
,
"<image><image><image><image><image>?!?"
),
]
)
# yapf: enable
def
test_find_
replac
e_text
(
def
test_find_
updat
e_text
(
prompt
,
target_by_key
,
repl_by_key
,
mm_count
,
expected
,
expected_by_update_type_mm_count
,
):
# Should not be used since there is nothing to convert to text
mock_tokenizer
=
cast
(
AnyTokenizer
,
object
())
mm_prompt_repls
=
{
key
:
[
PromptReplacement
(
key
,
target
,
repl_by_key
[
key
]).
bind
(
mock_tokenizer
)
]
for
key
,
target
in
target_by_key
.
items
()
}
mm_matches
=
{
key
:
find_text_matches
(
prompt
,
prompt_repls
)
for
key
,
prompt_repls
in
mm_prompt_repls
.
items
()
}
result
=
replace_text_matches
(
prompt
,
mm_matches
,
{
key
:
mm_count
for
key
in
repl_by_key
},
)
# Only displayed on error
print
(
"mm_matches:"
,
mm_matches
)
print
(
"result:"
,
result
)
# Manually constructed results
assert
result
==
expected
for
(
update_type
,
expected_by_mm_count
,
)
in
expected_by_update_type_mm_count
.
items
():
mm_prompt_updates
=
{
key
:
[
update_type
(
key
,
target
,
repl_by_key
[
key
]).
bind
(
mock_tokenizer
)]
for
key
,
target
in
target_by_key
.
items
()
}
mm_matches
=
{
key
:
find_text_matches
(
prompt
,
updates
)
for
key
,
updates
in
mm_prompt_updates
.
items
()
}
for
mm_count
,
expected
in
expected_by_mm_count
.
items
():
result
=
apply_text_matches
(
prompt
,
mm_matches
,
{
key
:
mm_count
for
key
in
repl_by_key
},
)
# Only displayed on error
print
(
"update_type:"
,
update_type
)
print
(
"mm_count:"
,
mm_count
)
print
(
"mm_matches:"
,
mm_matches
)
print
(
"result:"
,
result
)
# Manually constructed results
assert
result
==
expected
# yapf: disable
@
pytest
.
mark
.
parametrize
(
(
"prompt"
,
"target_by_key"
,
"repl_by_key"
)
,
(
"prompt"
,
"target_by_key"
,
"repl_by_key"
,
"expected_by_update_type_mm_count"
),
# noqa: E501
[
# Tokenized test cases of `test_find_replace_text`
# using the vocab of llava-hf/llava-v1.6-mistral-7b-hf
...
...
@@ -372,53 +610,155 @@ def test_find_replace_text(
# Test dynamic replacement (beyond the form of `unit * count`)
"pattern_3"
:
[
1550
,
918
,
1550
],
},
{
PromptInsertion
:
{
0
:
[
1
,
9833
,
28747
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
1
:
[
1
,
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
,
1550
,
918
,
1550
],
# noqa: E501
2
:
[
1
,
9833
,
28747
,
32000
,
32000
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
,
1550
,
918
,
1550
,
1550
,
918
,
1550
],
# noqa: E501
},
PromptReplacement
:
{
0
:
[
1
,
9833
,
28747
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
1
:
[
1
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
1550
,
918
,
1550
],
# noqa: E501
2
:
[
1
,
32000
,
32000
,
32000
,
32000
,
32000
,
1550
,
918
,
1550
],
},
},
),
# Test index targets
(
[],
{
"pattern_1"
:
PromptIndexTargets
.
start
(),
"pattern_2"
:
PromptIndexTargets
.
prefix
([
32000
]),
"pattern_3"
:
PromptIndexTargets
.
end
(),
},
{
"pattern_1"
:
[
-
1
],
"pattern_2"
:
[
-
2
],
"pattern_3"
:
[
-
3
],
},
{
PromptInsertion
:
{
0
:
[],
1
:
[
-
1
,
-
3
],
2
:
[
-
1
,
-
1
,
-
3
,
-
3
],
},
PromptReplacement
:
{
0
:
[],
1
:
[
-
1
,
-
3
],
2
:
[
-
1
,
-
1
,
-
3
,
-
3
],
},
},
),
(
[
32000
],
{
"pattern_1"
:
PromptIndexTargets
.
start
(),
"pattern_2"
:
PromptIndexTargets
.
prefix
([
32000
]),
"pattern_3"
:
PromptIndexTargets
.
end
(),
},
{
"pattern_1"
:
[
-
1
],
"pattern_2"
:
[
-
2
],
"pattern_3"
:
[
-
3
],
},
{
PromptInsertion
:
{
0
:
[
32000
],
1
:
[
-
1
,
32000
,
-
2
,
-
3
],
2
:
[
-
1
,
-
1
,
32000
,
-
2
,
-
2
,
-
3
,
-
3
],
},
PromptReplacement
:
{
0
:
[
32000
],
1
:
[
-
1
,
32000
,
-
2
,
-
3
],
2
:
[
-
1
,
-
1
,
32000
,
-
2
,
-
2
,
-
3
,
-
3
],
},
},
),
# Test different replacement per item
(
[
32000
,
32000
,
32000
],
{
"pattern_1"
:
[
32000
],
},
{
"pattern_1"
:
lambda
idx
:
[
-
(
idx
+
1
)],
},
{
PromptInsertion
:
{
0
:
[
32000
,
32000
,
32000
],
1
:
[
32000
,
-
1
,
32000
,
32000
],
2
:
[
32000
,
-
1
,
-
2
,
32000
,
32000
],
},
PromptReplacement
:
{
0
:
[
32000
,
32000
,
32000
],
1
:
[
-
1
,
32000
,
32000
],
2
:
[
-
1
,
-
2
,
32000
],
},
},
),
(
[
32000
,
32000
,
32000
],
{
"pattern_1"
:
PromptIndexTargets
.
prefix
([
32000
]),
},
{
"pattern_1"
:
lambda
idx
:
[
-
(
idx
+
1
)],
},
{
PromptInsertion
:
{
0
:
[
32000
,
32000
,
32000
],
1
:
[
32000
,
-
1
,
32000
,
32000
],
2
:
[
32000
,
-
1
,
-
2
,
32000
,
32000
],
},
PromptReplacement
:
{
0
:
[
32000
,
32000
,
32000
],
1
:
[
32000
,
-
1
,
32000
,
32000
],
2
:
[
32000
,
-
1
,
-
2
,
32000
,
32000
],
},
},
),
]
)
@
pytest
.
mark
.
parametrize
(
(
"mm_count"
,
"expected"
),
[
(
0
,
[
1
,
9833
,
28747
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
]),
(
1
,
[
1
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
1550
,
918
,
1550
]),
(
2
,
[
1
,
32000
,
32000
,
32000
,
32000
,
32000
,
1550
,
918
,
1550
]),
]
)
# yapf: enable
def
test_find_
replac
e_tokens
(
def
test_find_
updat
e_tokens
(
prompt
,
target_by_key
,
repl_by_key
,
mm_count
,
expected
,
expected_by_update_type_mm_count
,
):
# Should not be used since there is nothing to convert to tokens
mock_tokenizer
=
cast
(
AnyTokenizer
,
object
())
mm_prompt_repls
=
{
key
:
[
PromptReplacement
(
key
,
target
,
repl_by_key
[
key
]).
bind
(
mock_tokenizer
)
]
for
key
,
target
in
target_by_key
.
items
()
}
mm_matches
=
{
key
:
find_token_matches
(
prompt
,
prompt_repls
)
for
key
,
prompt_repls
in
mm_prompt_repls
.
items
()
}
result
=
replace_token_matches
(
prompt
,
mm_matches
,
{
key
:
mm_count
for
key
in
repl_by_key
},
)
# Only displayed on error
print
(
"mm_matches:"
,
mm_matches
)
print
(
"result:"
,
result
)
# Manually constructed results
assert
result
==
expected
for
(
update_type
,
expected_by_mm_count
,
)
in
expected_by_update_type_mm_count
.
items
():
mm_prompt_updates
=
{
key
:
[
update_type
(
key
,
target
,
repl_by_key
[
key
]).
bind
(
mock_tokenizer
)]
for
key
,
target
in
target_by_key
.
items
()
}
mm_matches
=
{
key
:
find_token_matches
(
prompt
,
updates
)
for
key
,
updates
in
mm_prompt_updates
.
items
()
}
for
mm_count
,
expected
in
expected_by_mm_count
.
items
():
result
=
apply_token_matches
(
prompt
,
mm_matches
,
{
key
:
mm_count
for
key
in
repl_by_key
},
)
# Only displayed on error
print
(
"update_type:"
,
update_type
)
print
(
"mm_count:"
,
mm_count
)
print
(
"mm_matches:"
,
mm_matches
)
print
(
"result:"
,
result
)
# Manually constructed results
assert
result
==
expected
# yapf: disable
...
...
@@ -524,22 +864,24 @@ def test_find_replace_tokens(
),
]
)
@
pytest
.
mark
.
parametrize
(
"update_type"
,
[
PromptInsertion
,
PromptReplacement
])
# yapf: enable
def
test_find_mm_placeholders
(
repl_by_key
,
prompt
,
expected
,
update_type
,
):
# Should not be used since there is nothing to convert to tokens
mock_tokenizer
=
cast
(
AnyTokenizer
,
object
())
mm_prompt_
repl
s
=
{
key
:
[
PromptReplacement
(
key
,
[],
repl
).
bind
(
mock_tokenizer
)]
mm_prompt_
update
s
=
{
key
:
[
update_type
(
key
,
[],
repl
).
bind
(
mock_tokenizer
)]
for
key
,
repl
in
repl_by_key
.
items
()
}
result
=
find_mm_placeholders
(
mm_prompt_
repl
s
,
mm_prompt_
update
s
,
prompt
,
# Effectively match all occurrences in the prompt
{
key
:
3
...
...
@@ -553,8 +895,46 @@ def test_find_mm_placeholders(
assert
result
==
expected
def _dummy_elem(modality: str, key: str, size: int):
    """Build a placeholder `MultiModalFieldElem` for cache-size tests.

    The element carries an uninitialized 1-D ``int8`` tensor with ``size``
    entries and is tagged with the given ``modality`` and ``key``.
    """
    # Contents are irrelevant; only the tensor's length/dtype are chosen here.
    placeholder = torch.empty((size, ), dtype=torch.int8)
    shared_field = MultiModalSharedField(1)

    return MultiModalFieldElem(
        modality=modality,
        key=key,
        data=placeholder,
        field=shared_field,
    )
def _dummy_item(modality: str, size_by_key: dict[str, int]):
    """Build a `MultiModalKwargsItem` with one dummy element per key.

    Each ``(key, size)`` pair in ``size_by_key`` becomes one element created
    by `_dummy_elem` for the given ``modality``.
    """
    elems = []
    for elem_key, elem_size in size_by_key.items():
        elems.append(_dummy_elem(modality, elem_key, elem_size))

    return MultiModalKwargsItem.from_elems(elems)
def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
    """Build a `MultiModalKwargs` with one dummy item per modality.

    ``size_by_key_modality`` maps a modality name to the ``{key: size}``
    mapping that is forwarded to `_dummy_item` for that modality.
    """
    items = []
    for mod_name, sizes in size_by_key_modality.items():
        items.append(_dummy_item(mod_name, sizes))

    return MultiModalKwargs.from_items(items)
# yapf: disable
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"
])
(
"item"
,
"expected_size"
),
[
(
_dummy_item
(
"a"
,
{
"a1"
:
100
}),
100
),
(
_dummy_item
(
"a"
,
{
"a1"
:
100
,
"a2"
:
110
}),
210
),
(
_dummy_kw
({
"a"
:
{
"a1"
:
100
,
"a2"
:
110
},
"b"
:
{
"b1"
:
120
,
"b2"
:
130
}}),
460
),
# noqa: E501
],
)
# yapf: enable
def test_cache_item_size(item, expected_size):
    """Check the size the processing cache records for a single entry.

    Stores ``item`` under an empty-string key in a fresh LRU cache obtained
    from ``ProcessingCache.get_lru_cache`` and asserts that the cache's
    ``currsize`` equals ``expected_size``.
    """
    item_type = type(item)
    lru = ProcessingCache.get_lru_cache(2048, item_type)

    # The key is arbitrary; only the stored value matters for the check.
    lru[""] = item

    assert lru.currsize == expected_size
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
(
"limit"
,
"num_supported"
,
"is_valid"
),
[(
0
,
0
,
True
),
(
0
,
1
,
True
),
(
1
,
0
,
False
),
(
1
,
1
,
True
),
(
1
,
2
,
True
),
...
...
@@ -570,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
dtype
=
"
half
"
,
dtype
=
"
auto
"
,
revision
=
None
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
)
...
...
@@ -590,11 +970,10 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
exc_ctx
=
pytest
.
raises
(
ValueError
,
match
=
"this model only supports"
)
with
exc_ctx
:
profiler
.
get_dummy_data
(
model_config
.
max_model_len
)
profiler
.
get_
decoder_
dummy_data
(
model_config
.
max_model_len
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
(
"num_images"
,
"limit"
,
"is_valid"
),
[(
0
,
0
,
True
),
(
0
,
1
,
True
),
(
1
,
0
,
False
),
(
1
,
1
,
True
),
(
1
,
2
,
True
),
...
...
@@ -610,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
dtype
=
"
half
"
,
dtype
=
"
auto
"
,
revision
=
None
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
)
...
...
@@ -683,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
dtype
=
"
half
"
,
dtype
=
"
auto
"
,
revision
=
None
,
)
...
...
tests/multimodal/test_utils.py
View file @
469e903b
...
...
@@ -4,7 +4,7 @@ import base64
import
mimetypes
import
os
from
tempfile
import
NamedTemporaryFile
,
TemporaryDirectory
from
typing
import
TYPE_CHECKING
,
Dict
,
NamedTuple
,
Optional
,
Tuple
from
typing
import
TYPE_CHECKING
,
NamedTuple
,
Optional
import
numpy
as
np
import
pytest
...
...
@@ -33,7 +33,7 @@ TEST_IMAGE_URLS = [
@
pytest
.
fixture
(
scope
=
"module"
)
def
url_images
()
->
D
ict
[
str
,
Image
.
Image
]:
def
url_images
()
->
d
ict
[
str
,
Image
.
Image
]:
connector
=
MediaConnector
()
return
{
...
...
@@ -42,7 +42,7 @@ def url_images() -> Dict[str, Image.Image]:
}
def
get_supported_suffixes
()
->
T
uple
[
str
,
...]:
def
get_supported_suffixes
()
->
t
uple
[
str
,
...]:
# We should at least test the file types mentioned in GPT-4 with Vision
OPENAI_SUPPORTED_SUFFIXES
=
(
'.png'
,
'.jpeg'
,
'.jpg'
,
'.webp'
,
'.gif'
)
...
...
@@ -69,7 +69,7 @@ async def test_fetch_image_http(image_url: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
@
pytest
.
mark
.
parametrize
(
"suffix"
,
get_supported_suffixes
())
async
def
test_fetch_image_base64
(
url_images
:
D
ict
[
str
,
Image
.
Image
],
async
def
test_fetch_image_base64
(
url_images
:
d
ict
[
str
,
Image
.
Image
],
image_url
:
str
,
suffix
:
str
):
connector
=
MediaConnector
()
url_image
=
url_images
[
image_url
]
...
...
tests/neuron/1_core/test_activation.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
import
torch.nn.functional
as
F
from
vllm.model_executor.layers.activation
import
FastGELU
,
SiluAndMul
from
vllm.platforms
import
current_platform
@pytest.mark.parametrize("activation", ["silu_and_mul", "gelu_fast"])
@pytest.mark.parametrize(
    "num_tokens,d,dtype",
    [
        (7, 512, torch.half),
        (7, 512, torch.float),
        (83, 512, torch.half),
    ],
)
@torch.inference_mode()
def test_act_and_mul(
    activation: str,
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
) -> None:
    """Compare a layer's ``forward_neuron`` output with a CPU reference.

    A random ``(num_tokens, 2 * d)`` tensor is moved to the XLA device and
    passed through the selected activation layer's ``forward_neuron``. The
    result is compared on CPU against ``SiluAndMul.forward_native`` (for
    ``"silu_and_mul"``) or ``torch.nn.functional.gelu`` (for ``"gelu_fast"``),
    using an absolute tolerance of 0.01 and no relative tolerance.

    Raises:
        NotImplementedError: if ``activation`` is not one of the two
            supported names.
    """
    # Imported lazily, inside the test body rather than at module level.
    import torch_xla.core.xla_model as xm

    xla_dev = xm.xla_device()
    current_platform.seed_everything(0)
    torch.set_default_device("cpu")

    # Create the input with CPU as default device, then move it to XLA.
    cpu_inputs = torch.randn(num_tokens, 2 * d, dtype=dtype)
    inputs = cpu_inputs.to(device=xla_dev)

    if activation not in ("silu_and_mul", "gelu_fast"):
        raise NotImplementedError(
            f"activation {activation} is not implemented.")

    if activation == "silu_and_mul":
        layer = SiluAndMul()
        reference_fn = layer.forward_native
    else:
        layer = FastGELU()
        reference_fn = F.gelu

    assert inputs.is_xla, (
        "input tensor under testing is expected to be XLA tensor.")

    actual = layer.to(device=xla_dev).forward_neuron(inputs)
    expected = reference_fn(inputs.cpu())

    torch.testing.assert_close(actual.cpu(), expected, atol=0.01, rtol=0.0)
Prev
1
…
21
22
23
24
25
26
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment