Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6d2051cc
Commit
6d2051cc
authored
Oct 21, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev
parents
2c7f740a
a2c71c54
Changes
457
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1040 additions
and
392 deletions
+1040
-392
tests/models/decoder_only/vision_language/test_qwen.py
tests/models/decoder_only/vision_language/test_qwen.py
+7
-7
tests/models/embedding/language/test_embedding.py
tests/models/embedding/language/test_embedding.py
+22
-19
tests/models/embedding/utils.py
tests/models/embedding/utils.py
+29
-0
tests/models/embedding/vision_language/__init__.py
tests/models/embedding/vision_language/__init__.py
+0
-0
tests/models/embedding/vision_language/test_phi3v.py
tests/models/embedding/vision_language/test_phi3v.py
+62
-0
tests/models/encoder_decoder/language/test_bart.py
tests/models/encoder_decoder/language/test_bart.py
+211
-217
tests/models/encoder_decoder/vision_language/test_broadcast.py
.../models/encoder_decoder/vision_language/test_broadcast.py
+35
-0
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+119
-64
tests/models/test_oot_registration.py
tests/models/test_oot_registration.py
+52
-2
tests/models/test_registry.py
tests/models/test_registry.py
+69
-9
tests/models/utils.py
tests/models/utils.py
+8
-1
tests/mq_llm_engine/test_error_handling.py
tests/mq_llm_engine/test_error_handling.py
+6
-6
tests/mq_llm_engine/utils.py
tests/mq_llm_engine/utils.py
+1
-1
tests/multi_step/test_correctness_async_llm.py
tests/multi_step/test_correctness_async_llm.py
+91
-1
tests/multi_step/test_correctness_llm.py
tests/multi_step/test_correctness_llm.py
+160
-2
tests/multimodal/test_processor_kwargs.py
tests/multimodal/test_processor_kwargs.py
+73
-43
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
...ins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+14
-20
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
...dd_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+34
-0
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
...ins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+28
-0
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
...ugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+19
-0
No files found.
Too many changes to show.
To preserve performance only
457 of 457+
files are displayed.
Plain diff
Email patch
tests/models/decoder_only/vision_language/test_qwen.py
View file @
6d2051cc
...
...
@@ -5,7 +5,7 @@ import pytest
import
torch
from
PIL.Image
import
Image
from
vllm.inputs
import
InputContext
,
LLMI
nputs
from
vllm.inputs
import
InputContext
,
token_i
nputs
from
vllm.multimodal.base
import
MultiModalInputs
from
vllm.multimodal.utils
import
cached_get_tokenizer
,
rescale_image_size
...
...
@@ -71,12 +71,12 @@ def test_input_processor_valid_mm_data(input_processor_for_qwen,
"""Happy cases for image inputs to Qwen's multimodal input processor."""
prompt
=
""
.
join
(
[
f
"Picture
{
num
}
: <img></img>
\n
"
for
num
in
range
(
1
,
num_images
+
1
)])
inputs
=
LLMI
nputs
(
inputs
=
token_i
nputs
(
prompt
=
prompt
,
# When processing multimodal data for a multimodal model, the qwen
# input processor will overwrite the provided prompt_token_ids with
# the image prompts
prompt_token_ids
=
None
,
prompt_token_ids
=
[]
,
multi_modal_data
=
{
"image"
:
torch
.
rand
(
num_images
,
TOKS_PER_IMG
,
4096
)},
)
proc_inputs
=
input_processor_for_qwen
(
qwen_vl_context
,
inputs
)
...
...
@@ -134,9 +134,9 @@ def test_input_processor_invalid_mm_data(input_processor_for_qwen,
trust_remote_code
=
True
)
prompt
=
"Picture 1: <img></img>
\n
"
prompt_token_ids
=
tokenizer
.
encode
(
prompt
)
inputs
=
LLMI
nputs
(
prompt
=
prompt
,
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
mm_data
)
inputs
=
token_i
nputs
(
prompt
=
prompt
,
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
mm_data
)
# Should fail since we have too many or too few dimensions for embeddings
with
pytest
.
raises
(
ValueError
):
input_processor_for_qwen
(
qwen_vl_context
,
inputs
)
...
...
@@ -221,7 +221,7 @@ def run_test(
# Qwen encodes each image into a fixed content size of 256
with
vllm_runner
(
model
,
max_model_len
=
1024
,
max_num_seqs
=
1
,
max_num_seqs
=
2
,
dtype
=
dtype
,
limit_mm_per_prompt
=
{
"image"
:
mm_limit
},
tensor_parallel_size
=
tensor_parallel_size
,
...
...
tests/models/embedding/language/test_embedding.py
View file @
6d2051cc
"""Compare the outputs of HF and vLLM
for Mistral models using greedy sampling
.
"""Compare the
embedding
outputs of HF and vLLM
models
.
Run `pytest tests/models/
test_llama
_embedding.py`.
Run `pytest tests/models/
embedding/language/test
_embedding.py`.
"""
import
pytest
import
torch
import
torch.nn.functional
as
F
from
..utils
import
check_embeddings_close
MODELS
=
[
"intfloat/e5-mistral-7b-instruct"
,
"BAAI/bge-multilingual-gemma2"
,
]
def compare_embeddings(embeddings1, embeddings2):
    """Return the pairwise cosine similarity of two lists of embeddings.

    Args:
        embeddings1: Iterable of embedding vectors (anything accepted by
            ``torch.tensor``).
        embeddings2: Iterable of embedding vectors, paired by position with
            ``embeddings1``.

    Returns:
        A list of 0-dim tensors; element ``i`` is the cosine similarity
        between ``embeddings1[i]`` and ``embeddings2[i]``.

    Raises:
        ValueError: If the two inputs do not contain the same number of
            embeddings. A plain ``zip`` would silently drop the unmatched
            tail and hide missing outputs.
    """
    from itertools import zip_longest

    # Unique sentinel so that a shorter input is detected even when the
    # inputs are one-shot iterables without ``len``.
    missing = object()

    similarities = []
    pairs = zip_longest(embeddings1, embeddings2, fillvalue=missing)
    for idx, (e1, e2) in enumerate(pairs):
        if e1 is missing or e2 is missing:
            raise ValueError(
                "embeddings1 and embeddings2 must have the same length "
                f"(ran out of pairs at index {idx})")
        similarities.append(
            F.cosine_similarity(torch.tensor(e1), torch.tensor(e2), dim=0))
    return similarities
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_models
(
...
...
@@ -28,15 +21,25 @@ def test_models(
model
:
str
,
dtype
:
str
,
)
->
None
:
with
hf_runner
(
model
,
dtype
=
dtype
,
is_embedding_model
=
True
)
as
hf_model
:
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see:
# https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
# This makes the input_ids different between hf_model and vllm_model.
# So we need to strip the input texts to avoid test failing.
example_prompts
=
[
str
(
s
).
strip
()
for
s
in
example_prompts
]
with
hf_runner
(
model
,
dtype
=
dtype
,
is_sentence_transformer
=
True
)
as
hf_model
:
hf_outputs
=
hf_model
.
encode
(
example_prompts
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
similarities
=
compare_embeddings
(
hf_outputs
,
vllm_outputs
)
all_similarities
=
torch
.
stack
(
similarities
)
tolerance
=
1e-2
assert
torch
.
all
((
all_similarities
<=
1.0
+
tolerance
)
&
(
all_similarities
>=
1.0
-
tolerance
)
),
f
"Not all values are within
{
tolerance
}
of 1.0"
check_embeddings_close
(
embeddings_0_lst
=
hf_outputs
,
embeddings_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
tol
=
1e-2
,
)
tests/models/embedding/utils.py
0 → 100644
View file @
6d2051cc
from
typing
import
List
,
Sequence
import
torch
import
torch.nn.functional
as
F
def check_embeddings_close(
    *,
    embeddings_0_lst: Sequence[List[float]],
    embeddings_1_lst: Sequence[List[float]],
    name_0: str,
    name_1: str,
    tol: float = 1e-3,
) -> None:
    """Assert that two lists of embeddings agree pairwise.

    Each pair of embeddings is compared with cosine similarity, which must
    be at least ``1 - tol``.

    Args:
        embeddings_0_lst: Embeddings produced by the first implementation.
        embeddings_1_lst: Embeddings produced by the second implementation,
            paired by position with ``embeddings_0_lst``.
        name_0: Label for the first implementation in failure messages.
        name_1: Label for the second implementation in failure messages.
        tol: Maximum allowed shortfall of the cosine similarity from 1.

    Raises:
        AssertionError: If the lists (or a pair of embeddings) differ in
            length, or if any pair is not similar enough.
    """
    assert len(embeddings_0_lst) == len(embeddings_1_lst), (
        f"Number of embeddings differs: {name_0} has "
        f"{len(embeddings_0_lst)}, {name_1} has {len(embeddings_1_lst)}")

    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
            zip(embeddings_0_lst, embeddings_1_lst)):
        assert len(embeddings_0) == len(embeddings_1), (
            f"Test{prompt_idx}: embedding size differs: {name_0} has "
            f"{len(embeddings_0)}, {name_1} has {len(embeddings_1)}")

        similarity = F.cosine_similarity(torch.tensor(embeddings_0),
                                         torch.tensor(embeddings_1),
                                         dim=0).item()

        # The message is only evaluated when the assertion fails, so the
        # (potentially very long) reprs are not built for passing prompts.
        assert similarity >= 1 - tol, (
            f"Test{prompt_idx}:"
            f"\nCosine similarity: \t{similarity:.6f} "
            f"(required >= {1 - tol})"
            f"\n{name_0}:\t{embeddings_0!r}"
            f"\n{name_1}:\t{embeddings_1!r}")
tests/models/embedding/vision_language/__init__.py
0 → 100644
View file @
6d2051cc
tests/models/embedding/vision_language/test_phi3v.py
0 → 100644
View file @
6d2051cc
import
pytest
import
torch.nn.functional
as
F
from
....conftest
import
IMAGE_ASSETS
from
..utils
import
check_embeddings_close
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign"
,
# noqa: E501
"cherry_blossom"
:
"<|image_1|> Represent the given image with the following question: What is in the image"
,
# noqa: E501
})
MODELS
=
[
"TIGER-Lab/VLM2Vec-Full"
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Compare vLLM embeddings with embeddings pooled from the HF model.

    The HF reference embedding is the L2-normalized hidden state of the
    final layer at the last attended position of each prompt.
    """
    # NOTE: the order matters: run vLLM first, then HF.
    # vLLM needs a fresh process without cuda initialization; running HF
    # first would initialize cuda and hurt the multiprocessing backend with
    # the fork method (the default method).
    vllm_ctx = vllm_runner(model,
                           max_model_len=4096,
                           max_num_seqs=2,
                           dtype=dtype,
                           enforce_eager=True)
    with vllm_ctx as vllm_model:
        vllm_outputs = vllm_model.encode(example_prompts)

    hf_outputs = []
    with hf_runner(model, dtype=dtype) as hf_model:
        for inputs in hf_model.get_inputs(example_prompts):
            # Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
            device_inputs = hf_model.wrap_device(
                inputs, device=hf_model.model.device.type)
            outputs = hf_model.model(
                **device_inputs,
                return_dict=True,
                output_hidden_states=True,
            )
            final_layer = outputs.hidden_states[-1][0]
            last_position = inputs.attention_mask[0].sum() - 1
            embedding = F.normalize(final_layer[last_position], p=2, dim=-1)
            hf_outputs.append(embedding.tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
tests/models/encoder_decoder/language/test_bart.py
View file @
6d2051cc
...
...
@@ -4,220 +4,214 @@ Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
"""
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
vllm.utils
import
is_cpu
if
not
is_cpu
():
# CPU backend is not currently supported with encoder/decoder models
# skip test definitions entirely to avoid importing GPU kernel libs
# (xFormers, etc.)
import
pytest
from
transformers
import
AutoModelForSeq2SeqLM
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
(
DecoderPromptType
,
ExplicitEncoderDecoderPrompt
,
HfRunner
,
VllmRunner
)
from
....utils
import
multi_gpu_test
from
...utils
import
check_logprobs_close
MODELS
=
[
"facebook/bart-base"
,
"facebook/bart-large-cnn"
]
def vllm_to_hf_output(
    vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
):
    """Rewrite a vLLM output triple so its text is comparable with HF's.

    "</s>" is appended to the generated text, and "<s>" is additionally
    prepended when ``decoder_prompt_type`` is ``DecoderPromptType.NONE``.
    Token ids and logprobs are passed through unchanged.
    """
    ids, generated, logprobs = vllm_output
    hf_text = generated + "</s>"
    no_decoder_prompt = decoder_prompt_type == DecoderPromptType.NONE
    return ids, ("<s>" + hf_text if no_decoder_prompt else hf_text), logprobs
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
    decoder_prompt_type: DecoderPromptType,
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    '''
    Validate vLLM BART against HuggingFace (HF) BART on the given prompts.

    Arguments:

    * hf_runner: HuggingFace (HF) test model runner
    * vllm_runner: vLLM test model runner
    * prompts: the encoder/decoder prompts to run through both models
    * decoder_prompt_type: the prompt scenario the prompts belong to
    * model: the HF ID of the specific BART variant under test
    * dtype: the tensor datatype to employ
    * max_tokens
    * num_logprobs
    * tensor_parallel_size / distributed_executor_backend: forwarded to
      the vLLM runner

    Note on the decoder-prompt-is-None scenario:

    HF's GenerationMixin by default forces the first decoded token to be
    <BOS> when the prompt does not already contain it, so HF feeds
    [<decoder-start-token>] to the model and then forces <BOS> as the first
    *generated* token. vLLM instead tokenizes the None prompt as [<BOS>] and
    feeds [<decoder-start-token><BOS>] as the *prompt*.

    The complete sequences (prompt + decoded tokens) match, but the HF
    decoded tokens carry one extra leading <BOS>. Disabling the forced
    <BOS> (forced_bos_token_id = None) would avoid this, but is not
    "normal" BART usage. Instead, in this scenario only, the first HF
    decoded token is skipped when comparing against vLLM.
    '''

    # NOTE: the order matters: run vLLM first, then HF.
    # vLLM needs a fresh process without cuda initialization; running HF
    # first would initialize cuda and hurt the multiprocessing backend with
    # the fork method (the default).

    # Encoder/decoder models are currently only compatible with
    # enforce_eager=True. vLLM defaults to that for such models when the
    # option is unspecified, but the VllmRunner fixture defaults to
    # enforce_eager=False (which existing decoder-only tests expect), so it
    # must be passed explicitly here.
    vllm_ctx = vllm_runner(
        model,
        dtype=dtype,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
    )
    with vllm_ctx as vllm_model:
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    # Configuration settings for HF baseline
    hf_kwargs = dict(
        top_k=None,
        num_beams=1,
        repetition_penalty=1.0,
        top_p=1.0,
        length_penalty=1.0,
        early_stopping=False,
        no_repeat_ngram_size=None,
        min_length=0,
    )

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        hf_outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            prompts,
            max_tokens,
            num_logprobs,
            **hf_kwargs,
        )

    # Skip HF's forced leading <BOS> only when there is no decoder prompt.
    if decoder_prompt_type == DecoderPromptType.NONE:
        hf_skip_tokens = 1
    else:
        hf_skip_tokens = 0

    converted_vllm_outputs = [
        vllm_to_hf_output(vllm_output, decoder_prompt_type)
        for vllm_output in vllm_outputs
    ]
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=converted_vllm_outputs,
        name_0="hf",
        name_1="vllm",
        num_outputs_0_skip_tokens=hf_skip_tokens,
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts,
                model, dtype, max_tokens, num_logprobs,
                decoder_prompt_type) -> None:
    """Single-GPU BART check for every decoder prompt type."""
    # The fixture maps each decoder prompt type to its list of prompts.
    prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    run_test(
        hf_runner,
        vllm_runner,
        prompts,
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
def test_models_distributed(hf_runner, vllm_runner,
                            example_encoder_decoder_prompts,
                            distributed_executor_backend, model, dtype,
                            max_tokens, num_logprobs,
                            decoder_prompt_type) -> None:
    """BART check with tensor parallel size 2 for each executor backend."""
    # The fixture maps each decoder prompt type to its list of prompts.
    prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    run_test(
        hf_runner,
        vllm_runner,
        prompts,
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
import
pytest
from
transformers
import
AutoModelForSeq2SeqLM
from
vllm.sequence
import
SampleLogprobs
from
....conftest
import
(
DecoderPromptType
,
ExplicitEncoderDecoderPrompt
,
HfRunner
,
VllmRunner
)
from
....utils
import
multi_gpu_test
from
...utils
import
check_logprobs_close
MODELS
=
[
"facebook/bart-base"
,
"facebook/bart-large-cnn"
]
def vllm_to_hf_output(
    vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
    decoder_prompt_type: DecoderPromptType,
):
    """Convert a vLLM output triple into the form compared against HF.

    The generated text gets "</s>" appended; when ``decoder_prompt_type``
    is ``DecoderPromptType.NONE`` it also gets "<s>" prepended. The token
    ids and logprobs are returned as-is.
    """
    token_ids, text, logprobs = vllm_output

    prefix = "<s>" if decoder_prompt_type == DecoderPromptType.NONE else ""
    return token_ids, prefix + text + "</s>", logprobs
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
    decoder_prompt_type: DecoderPromptType,
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    '''
    Test the vLLM BART model on encoder/decoder prompts by validating its
    greedy logprobs against HuggingFace (HF) BART.

    Arguments:

    * hf_runner: HuggingFace (HF) test model runner
    * vllm_runner: vLLM test model runner
    * prompts: the encoder/decoder prompts fed to both models
    * decoder_prompt_type: the prompt scenario under test
    * model: the HF ID of the specific BART variant under test
    * dtype: the tensor datatype to employ
    * max_tokens
    * num_logprobs
    * tensor_parallel_size, distributed_executor_backend: passed through
      to the vLLM runner

    Using HF BART as a baseline when the decoder prompt is None:

    * HF tokenizes the None prompt as an empty token list, prepends
      <decoder-start-token>, runs the model, and then (by default, through
      a logit processor setting) forces <BOS> to be the first generated
      token.
    * vLLM tokenizes the None prompt as [<BOS>], prepends
      <decoder-start-token>, and generates from
      [<decoder-start-token><BOS>].

    Overall the complete sequences (prompt + decoded tokens) match, but the
    HF *decoded* tokens contain one more initial <BOS> than vLLM's, because
    vLLM injects <BOS> into the prompt rather than the output.

    One option would be to set forced_bos_token_id = None for HF, but that
    is not "normal" BART usage. So instead, only in the
    decoder-prompt-is-None scenario, this test discards the first HF
    decoded token before comparing against vLLM.
    '''

    # NOTE: take care of the order. Run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # If HF runs first, cuda gets initialized and that hurts the
    # multiprocessing backend with fork method (the default).

    # Note: encoder/decoder models are currently only compatible with
    # enforce_eager=True. vLLM itself defaults to enforce_eager=True for
    # them when the option is left unspecified, but the VllmRunner test
    # fixture (which wraps the LLM class) defaults to enforce_eager=False,
    # a behavior a number of already-existing decoder-only unit tests
    # expect. Hence it is specified explicitly here.
    with vllm_runner(
            model,
            dtype=dtype,
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    # Configuration settings for HF baseline
    hf_kwargs = dict(
        top_k=None,
        num_beams=1,
        repetition_penalty=1.0,
        top_p=1.0,
        length_penalty=1.0,
        early_stopping=False,
        no_repeat_ngram_size=None,
        min_length=0,
    )

    with hf_runner(model, dtype=dtype,
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        hf_outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            prompts,
            max_tokens,
            num_logprobs,
            **hf_kwargs,
        )

    # HF emits one extra leading <BOS> only when no decoder prompt is given.
    hf_skip_tokens = int(decoder_prompt_type == DecoderPromptType.NONE)

    hf_style_vllm_outputs = [
        vllm_to_hf_output(vllm_output, decoder_prompt_type)
        for vllm_output in vllm_outputs
    ]

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=hf_style_vllm_outputs,
        name_0="hf",
        name_1="vllm",
        num_outputs_0_skip_tokens=hf_skip_tokens,
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts,
                model, dtype, max_tokens, num_logprobs,
                decoder_prompt_type) -> None:
    """Run the BART HF-vs-vLLM comparison on one GPU for each prompt type."""
    # Pick the prompts for the scenario selected by the parametrization.
    scenario_prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    run_test(
        hf_runner,
        vllm_runner,
        scenario_prompts,
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
def test_models_distributed(hf_runner, vllm_runner,
                            example_encoder_decoder_prompts,
                            distributed_executor_backend, model, dtype,
                            max_tokens, num_logprobs,
                            decoder_prompt_type) -> None:
    """Run the BART HF-vs-vLLM comparison with tensor parallel size 2."""
    # Pick the prompts for the scenario selected by the parametrization.
    scenario_prompts = example_encoder_decoder_prompts[decoder_prompt_type]

    run_test(
        hf_runner,
        vllm_runner,
        scenario_prompts,
        decoder_prompt_type,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
tests/models/encoder_decoder/vision_language/test_broadcast.py
0 → 100644
View file @
6d2051cc
import
pytest
from
....utils
import
multi_gpu_test
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", [
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
])
def test_models(hf_runner, vllm_runner, image_assets,
                distributed_executor_backend, model) -> None:
    """Run the mllama comparison test with tensor parallel size 2.

    Raises:
        NotImplementedError: If ``model`` is not a supported model ID.
    """
    if not model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
        raise NotImplementedError(f"Unsupported model: {model}")

    # Imported lazily, and only once the model is known to be supported.
    from .test_mllama import models, run_test

    run_test(
        hf_runner,
        vllm_runner,
        image_assets,
        # The model ID comes from test_mllama's own list rather than from
        # the parametrized value.
        model=models[0],
        size_factors=[0.25, 0.5, 1.0],
        dtype="half",
        max_tokens=5,
        num_logprobs=5,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
6d2051cc
...
...
@@ -9,10 +9,10 @@ from vllm.sequence import SampleLogprobs
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
from
....utils
import
multi
_gpu_test
from
....utils
import
large
_gpu_test
from
...utils
import
check_logprobs_close
_LIMIT_IMAGE_PER_PROMPT
=
1
_LIMIT_IMAGE_PER_PROMPT
=
3
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
...
...
@@ -47,14 +47,46 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
if
token_id
!=
image_token_id
or
output_ids
[
idx
-
1
]
!=
image_token_id
]
assert
output_str
[
0
]
==
" "
hf_output_str
=
output_str
[
1
:]
hf_output_str
=
output_str
if
hf_output_ids
[
-
1
]
==
eos_token_id
:
hf_output_str
=
hf_output_str
+
tokenizer
.
decode
(
eos_token_id
)
return
hf_output_ids
,
hf_output_str
,
out_logprobs
def _get_inputs(
    image_assets: _ImageAssets,
    *,
    size_factors: Optional[List[float]] = None,
    sizes: Optional[List[Tuple[int, int]]] = None,
) -> List[Tuple[List[str], PromptImageInput]]:
    """Build the per-image (prompts, images) inputs for the comparison test.

    Exactly one sizing mode is used; ``size_factors`` takes precedence when
    both are given.

    * ``size_factors``: each image is rescaled by every factor and paired
      with its image prompt.
    * ``sizes``: each image is resized to every size. A ``None`` size yields
      a text-only entry (first text-only prompt, no image). If ``sizes`` is
      empty, one extra entry holding all text-only prompts is appended.

    Raises:
        ValueError: If neither ``size_factors`` nor ``sizes`` is provided.
    """
    images = [asset.pil_image for asset in image_assets]

    if size_factors is not None:
        scaled_inputs = []
        for image, prompt in zip(images, HF_IMAGE_PROMPTS):
            prompts = [prompt for _ in size_factors]
            scaled = [
                rescale_image_size(image, factor) for factor in size_factors
            ]
            scaled_inputs.append((prompts, scaled))
        return scaled_inputs

    if sizes is None:
        raise ValueError("You must provide either `size_factors` or `sizes`")

    resized_inputs = []
    for image, prompt in zip(images, HF_IMAGE_PROMPTS):
        prompts = [
            text_only_prompts[0] if size is None else prompt for size in sizes
        ]
        resized = [
            None if size is None else image.resize(size) for size in sizes
        ]
        resized_inputs.append((prompts, resized))

    # With no sizes at all, fall back to a purely text-only batch.
    if not sizes:
        resized_inputs.append(
            (text_only_prompts, [None] * len(text_only_prompts)))

    return resized_inputs
@
overload
def
run_test
(
hf_runner
:
Type
[
HfRunner
],
...
...
@@ -103,39 +135,17 @@ def run_test(
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
if
size_factors
is
not
None
:
inputs_per_image
=
[(
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
elif
sizes
is
not
None
:
inputs_per_image
=
[(
[
prompt
if
size
is
not
None
else
text_only_prompts
[
0
]
for
size
in
sizes
],
[
image
.
resize
(
size
)
if
size
is
not
None
else
None
for
size
in
sizes
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
if
len
(
sizes
)
==
0
:
inputs_per_image
.
append
(
(
text_only_prompts
,
[
None
]
*
len
(
text_only_prompts
)))
else
:
raise
ValueError
(
"You must provide either `size_factors` or `sizes`"
)
_run_test
(
hf_runner
,
vllm_runner
,
inputs_per_image
,
model
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
)
_run_test
(
hf_runner
,
vllm_runner
,
_get_inputs
(
image_assets
,
size_factors
=
size_factors
,
sizes
=
sizes
),
model
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
)
def
_run_test
(
...
...
@@ -167,8 +177,8 @@ def _run_test(
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_num_seqs
=
16
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
,
...
...
@@ -185,14 +195,9 @@ def _run_test(
def
process
(
hf_inputs
:
BatchEncoding
):
return
hf_inputs
from
transformers
import
AutoConfig
from
transformers.models.mllama
import
MllamaConfig
as
MllamaConfigHf
# use transformer's MllamaConfig for hf_runner
# and vllm's MllamaConfig for vllm_runner
AutoConfig
.
register
(
"mllama"
,
MllamaConfigHf
,
exist_ok
=
True
)
with
hf_runner
(
model
,
dtype
=
dtype
,
model_kwargs
=
{
"device_map"
:
"auto"
},
postprocess_inputs
=
process
,
auto_cls
=
AutoModelForVision2Seq
)
as
hf_model
:
hf_outputs_per_image
=
[
...
...
@@ -203,8 +208,6 @@ def _run_test(
for
prompts
,
images
in
inputs
]
from
vllm.transformers_utils.configs.mllama
import
MllamaConfig
AutoConfig
.
register
(
"mllama"
,
MllamaConfig
,
exist_ok
=
True
)
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_image
,
vllm_outputs_per_image
):
check_logprobs_close
(
...
...
@@ -218,6 +221,7 @@ def _run_test(
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"sizes"
,
...
...
@@ -236,13 +240,13 @@ def _run_test(
(
1024
,
1024
),
(
512
,
1536
),
(
512
,
2028
),
None
],
# mllama has 8 possible aspect ratios, carefully set the sizes
# to cover all of them
],
)
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
sizes
,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
def
test_models_single_leading_image
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
sizes
,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
run_test
(
hf_runner
,
vllm_runner
,
...
...
@@ -256,28 +260,79 @@ def test_models(hf_runner, vllm_runner, image_assets, model, sizes, dtype,
)
@
multi
_gpu_test
(
num_gpus
=
2
)
@
large
_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"sizes"
,
[
[(
512
,
512
),
(
1024
,
512
),
(
1536
,
512
),
(
2048
,
512
),
(
512
,
1024
),
(
1024
,
1024
),
(
512
,
1536
),
(
512
,
2028
),
None
],
],
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models_distributed
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
sizes
,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
run_test
(
def
test_models_multi_leading_images
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
stop_sign
=
image_assets
[
0
].
pil_image
cherry_blossom
=
image_assets
[
1
].
pil_image
inputs
=
[(
[
"<|image|><|image|><|begin_of_text|>Describe 2 images."
,
# noqa: E501
"<|image|><|image|><|begin_of_text|>Describe 2 images."
,
# noqa: E501
"<|image|><|image|><|image|><|begin_of_text|>Describe 3 images."
,
# noqa: E501
],
[
[
stop_sign
,
cherry_blossom
],
# Images with different sizes.
[
stop_sign
.
resize
((
512
,
512
)),
stop_sign
,
],
[
stop_sign
,
stop_sign
.
resize
((
512
,
1536
)),
cherry_blossom
.
resize
((
512
,
1024
)),
],
])]
_run_test
(
hf_runner
,
vllm_runner
,
image_assets
,
inputs
,
model
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models_interleaved_images
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
stop_sign
=
image_assets
[
0
].
pil_image
cherry_blossom
=
image_assets
[
1
].
pil_image
inputs
=
[(
[
"<|begin_of_text|>The content of the image <|image|> is"
,
# noqa: E501
"<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, "
# noqa: E501
"which is a stop sign and which is a cherry blossom?"
,
# noqa: E501
],
[
[
stop_sign
],
[
stop_sign
,
cherry_blossom
],
])]
_run_test
(
hf_runner
,
vllm_runner
,
inputs
,
model
,
sizes
=
sizes
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
2
,
tensor_parallel_size
=
1
,
)
tests/models/test_oot_registration.py
View file @
6d2051cc
...
...
@@ -2,7 +2,8 @@ import os
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
PoolingParams
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
..utils
import
fork_new_process_for_each_test
...
...
@@ -16,7 +17,7 @@ def test_plugin(dummy_opt_path):
@
fork_new_process_for_each_test
def
test_oot_registration
(
dummy_opt_path
):
def
test_oot_registration
_text_generation
(
dummy_opt_path
):
os
.
environ
[
"VLLM_PLUGINS"
]
=
"register_dummy_model"
prompts
=
[
"Hello, my name is"
,
"The text does not matter"
]
sampling_params
=
SamplingParams
(
temperature
=
0
)
...
...
@@ -29,3 +30,52 @@ def test_oot_registration(dummy_opt_path):
# make sure only the first token is generated
rest
=
generated_text
.
replace
(
first_token
,
""
)
assert
rest
==
""
@
fork_new_process_for_each_test
def
test_oot_registration_embedding
(
dummy_gemma2_embedding_path
):
os
.
environ
[
"VLLM_PLUGINS"
]
=
"register_dummy_model"
prompts
=
[
"Hello, my name is"
,
"The text does not matter"
]
sampling_params
=
PoolingParams
()
llm
=
LLM
(
model
=
dummy_gemma2_embedding_path
,
load_format
=
"dummy"
)
outputs
=
llm
.
encode
(
prompts
,
sampling_params
)
for
output
in
outputs
:
assert
all
(
v
==
0
for
v
in
output
.
outputs
.
embedding
)
image
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
@
fork_new_process_for_each_test
def
test_oot_registration_multimodal
(
dummy_llava_path
):
os
.
environ
[
"VLLM_PLUGINS"
]
=
"register_dummy_model"
prompts
=
[{
"prompt"
:
"What's in the image?<image>"
,
"multi_modal_data"
:
{
"image"
:
image
},
},
{
"prompt"
:
"Describe the image<image>"
,
"multi_modal_data"
:
{
"image"
:
image
},
}]
sampling_params
=
SamplingParams
(
temperature
=
0
)
llm
=
LLM
(
model
=
dummy_llava_path
,
load_format
=
"dummy"
,
max_num_seqs
=
1
,
trust_remote_code
=
True
,
gpu_memory_utilization
=
0.98
,
max_model_len
=
4096
,
enforce_eager
=
True
,
limit_mm_per_prompt
=
{
"image"
:
1
})
first_token
=
llm
.
get_tokenizer
().
decode
(
0
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
in
outputs
:
generated_text
=
output
.
outputs
[
0
].
text
# make sure only the first token is generated
rest
=
generated_text
.
replace
(
first_token
,
""
)
assert
rest
==
""
tests/models/test_registry.py
View file @
6d2051cc
import
warnings
import
pytest
import
t
ransformers
import
t
orch.cuda
from
vllm.model_executor.models
import
_MODELS
,
ModelRegistry
from
vllm.model_executor.models
import
(
is_embedding_model
,
is_text_generation_model
,
supports_multimodal
)
from
vllm.model_executor.models.registry
import
(
_EMBEDDING_MODELS
,
_MULTIMODAL_MODELS
,
_SPECULATIVE_DECODING_MODELS
,
_TEXT_GENERATION_MODELS
,
ModelRegistry
)
from
vllm.platforms
import
current_platform
from
..utils
import
fork_new_process_for_each_test
@
pytest
.
mark
.
parametrize
(
"model_cls"
,
_MODELS
)
def
test_registry_imports
(
model_cls
):
if
(
model_cls
in
(
"LlavaOnevisionForConditionalGeneration"
,
"Qwen2VLForConditionalGeneration"
)
and
transformers
.
__version__
<
"4.45"
):
pytest
.
skip
(
"Waiting for next transformers release"
)
@
pytest
.
mark
.
parametrize
(
"model_arch"
,
ModelRegistry
.
get_supported_archs
())
def
test_registry_imports
(
model_arch
):
# Ensure all model classes can be imported successfully
ModelRegistry
.
resolve_model_cls
([
model_cls
])
model_cls
,
_
=
ModelRegistry
.
resolve_model_cls
(
model_arch
)
if
model_arch
in
_SPECULATIVE_DECODING_MODELS
:
pass
# Ignore these models which do not have a unified format
else
:
assert
is_text_generation_model
(
model_cls
)
is
(
model_arch
in
_TEXT_GENERATION_MODELS
or
model_arch
in
_MULTIMODAL_MODELS
)
assert
is_embedding_model
(
model_cls
)
is
(
model_arch
in
_EMBEDDING_MODELS
)
assert
supports_multimodal
(
model_cls
)
is
(
model_arch
in
_MULTIMODAL_MODELS
)
@
fork_new_process_for_each_test
@
pytest
.
mark
.
parametrize
(
"model_arch,is_mm,init_cuda"
,
[
(
"LlamaForCausalLM"
,
False
,
False
),
(
"MllamaForConditionalGeneration"
,
True
,
False
),
(
"LlavaForConditionalGeneration"
,
True
,
True
),
])
def
test_registry_is_multimodal
(
model_arch
,
is_mm
,
init_cuda
):
assert
ModelRegistry
.
is_multimodal_model
(
model_arch
)
is
is_mm
if
init_cuda
and
current_platform
.
is_cuda_alike
():
assert
not
torch
.
cuda
.
is_initialized
()
ModelRegistry
.
resolve_model_cls
(
model_arch
)
if
not
torch
.
cuda
.
is_initialized
():
warnings
.
warn
(
"This model no longer initializes CUDA on import. "
"Please test using a different one."
,
stacklevel
=
2
)
@
fork_new_process_for_each_test
@
pytest
.
mark
.
parametrize
(
"model_arch,is_pp,init_cuda"
,
[
(
"MLPSpeculatorPreTrainedModel"
,
False
,
False
),
(
"DeepseekV2ForCausalLM"
,
True
,
False
),
(
"Qwen2VLForConditionalGeneration"
,
True
,
True
),
])
def
test_registry_is_pp
(
model_arch
,
is_pp
,
init_cuda
):
assert
ModelRegistry
.
is_pp_supported_model
(
model_arch
)
is
is_pp
if
init_cuda
and
current_platform
.
is_cuda_alike
():
assert
not
torch
.
cuda
.
is_initialized
()
ModelRegistry
.
resolve_model_cls
(
model_arch
)
if
not
torch
.
cuda
.
is_initialized
():
warnings
.
warn
(
"This model no longer initializes CUDA on import. "
"Please test using a different one."
,
stacklevel
=
2
)
tests/models/utils.py
View file @
6d2051cc
import
warnings
from
typing
import
Dict
,
List
,
Optional
,
Sequence
,
Tuple
,
Union
import
torch
from
vllm.config
import
ModelConfig
from
vllm.inputs
import
InputContext
from
vllm.sequence
import
Logprob
,
PromptLogprobs
,
SampleLogprobs
from
vllm.utils
import
is_cpu
TokensText
=
Tuple
[
List
[
int
],
str
]
...
...
@@ -247,6 +250,7 @@ def check_logprobs_close(
def
build_model_context
(
model_name
:
str
,
tokenizer_name
:
Optional
[
str
]
=
None
,
trust_remote_code
:
bool
=
False
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
None
,
mm_processor_kwargs
:
Optional
[
Dict
]
=
None
,
limit_mm_per_prompt
:
Optional
[
Dict
]
=
None
):
"""Creates an InputContext for a given model.
...
...
@@ -264,12 +268,15 @@ def build_model_context(model_name: str,
"""
if
tokenizer_name
is
None
:
tokenizer_name
=
model_name
if
dtype
is
None
:
dtype
=
"bfloat16"
if
is_cpu
()
else
"half"
model_config
=
ModelConfig
(
model_name
,
tokenizer_name
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
trust_remote_code
,
dtype
=
"float32"
,
dtype
=
dtype
,
seed
=
0
,
mm_processor_kwargs
=
mm_processor_kwargs
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
...
...
tests/mq_llm_engine/test_error_handling.py
View file @
6d2051cc
...
...
@@ -61,7 +61,7 @@ async def test_evil_forward(tmp_socket):
# Throws an error in first forward pass.
with
pytest
.
raises
(
RAISED_ERROR
):
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
request_id
=
uuid
.
uuid4
()):
pass
...
...
@@ -69,7 +69,7 @@ async def test_evil_forward(tmp_socket):
# Engine is errored, should get ENGINE_DEAD_ERROR.
with
pytest
.
raises
(
MQEngineDeadError
):
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
request_id
=
uuid
.
uuid4
()):
pass
...
...
@@ -118,7 +118,7 @@ async def test_failed_health_check(tmp_socket):
# Generate call should throw ENGINE_DEAD_ERROR
with
pytest
.
raises
(
MQEngineDeadError
):
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
request_id
=
uuid
.
uuid4
()):
pass
...
...
@@ -160,7 +160,7 @@ async def test_failed_abort(tmp_socket):
# with reference to the original KeyError("foo")
with
pytest
.
raises
(
MQEngineDeadError
)
as
execinfo
:
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(
max_tokens
=
10
),
request_id
=
uuid
.
uuid4
()):
pass
...
...
@@ -183,7 +183,7 @@ async def test_bad_request(tmp_socket):
# Invalid request should fail, but not crash the server.
with
pytest
.
raises
(
ValueError
):
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
request_id
=
"abcd-1"
,
lora_request
=
LoRARequest
(
...
...
@@ -192,7 +192,7 @@ async def test_bad_request(tmp_socket):
pass
# This request should be okay.
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
request_id
=
"abcd-2"
):
pass
...
...
tests/mq_llm_engine/utils.py
View file @
6d2051cc
...
...
@@ -20,7 +20,7 @@ async def generate(
count
=
0
async
for
out
in
client
.
generate
(
request_id
=
request_id
,
inputs
=
"Hello my name is Robert and"
,
prompt
=
"Hello my name is Robert and"
,
sampling_params
=
SamplingParams
(
max_tokens
=
num_tokens
,
temperature
=
0
)):
...
...
tests/multi_step/test_correctness_async_llm.py
View file @
6d2051cc
...
...
@@ -17,7 +17,6 @@ NUM_PROMPTS = [10]
DEFAULT_SERVER_ARGS
:
List
[
str
]
=
[
"--disable-log-requests"
,
"--use-v2-block-manager"
,
"--worker-use-ray"
,
"--gpu-memory-utilization"
,
"0.85"
,
...
...
@@ -37,6 +36,7 @@ DEFAULT_SERVER_ARGS: List[str] = [
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"is_async"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"attention_backend"
,
[
"FLASHINFER"
,
"FLASH_ATTN"
])
@
pytest
.
mark
.
parametrize
(
"enable_chunked_prefill"
,
[
True
,
False
])
@
pytest
.
mark
.
asyncio
async
def
test_multi_step
(
example_prompts
,
...
...
@@ -49,6 +49,7 @@ async def test_multi_step(
is_async
:
bool
,
num_logprobs
:
Optional
[
int
],
attention_backend
:
str
,
enable_chunked_prefill
:
bool
,
monkeypatch
,
)
->
None
:
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol
...
...
@@ -74,6 +75,10 @@ async def test_multi_step(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> no logprobs
"""
if
enable_chunked_prefill
and
\
(
pp_size
>
1
or
attention_backend
!=
"FLASH_ATTN"
):
pytest
.
skip
(
"Multi-step with Chunked-Prefill only supports"
"PP=1 and FLASH_ATTN backend"
)
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
...
...
@@ -93,6 +98,9 @@ async def test_multi_step(
if
eager_mode
:
ms_server_args
.
append
(
"--enforce-eager"
)
if
enable_chunked_prefill
:
ms_server_args
.
append
(
"--enable-chunked-prefill"
)
distributed_args
=
[
"--tensor-parallel-size"
,
str
(
tp_size
),
...
...
@@ -133,3 +141,85 @@ async def test_multi_step(
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
parametrize
((
"tp_size, pp_size"
),
[
(
1
,
2
),
])
@
pytest
.
mark
.
asyncio
async
def
test_multi_step_pp_smoke
(
tp_size
:
int
,
pp_size
:
int
,
monkeypatch
,
)
->
None
:
"""
Smoke test for the vLLM engine with multi-step scheduling in an
OpenAI-protocol client/server environment.
This tests compares the outputs between multi-step scheduling and
single-step scheduling. Notably, this test lets the engines generate
more tokens (default is 5) and test for an exact match over all the
tokens.
Args:
tp_size: degree of tensor-parallelism
pp_size: degree of pipeline-parallelism
eager_mode
"""
model
=
"JackFram/llama-160m"
num_scheduler_steps
=
8
attention_backend
=
"FLASH_ATTN"
max_num_seqs
=
3
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
# Prompt from the ShareGPT dataset
prompts
=
[
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
]
# Use varying max_tokens to introduce scheduling randomness.
max_tokens
=
[
10
*
i
for
i
in
range
(
1
,
len
(
prompts
)
+
1
)]
assert
len
(
prompts
)
==
len
(
max_tokens
)
test_args
=
[
"--tensor-parallel-size"
,
str
(
tp_size
),
"--pipeline-parallel-size"
,
str
(
pp_size
),
"--max-num-seqs"
,
str
(
max_num_seqs
)
]
server_args
=
DEFAULT_SERVER_ARGS
+
test_args
ms_server_args
=
DEFAULT_SERVER_ARGS
+
\
[
"--num-scheduler-steps"
,
f
"
{
num_scheduler_steps
}
"
]
+
\
test_args
# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# was raised 3x to 720 *just for this test* due to
# observed timeouts in GHA CI
ref_completions
=
await
completions_with_server_args
(
prompts
=
prompts
,
model_name
=
model
,
server_cli_args
=
server_args
,
num_logprobs
=
None
,
max_wait_seconds
=
5
*
240
,
max_tokens
=
max_tokens
)
test_completions
=
await
completions_with_server_args
(
prompts
=
prompts
,
model_name
=
model
,
server_cli_args
=
ms_server_args
,
num_logprobs
=
None
,
max_wait_seconds
=
5
*
240
,
max_tokens
=
max_tokens
)
# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations
=
get_client_text_generations
(
ref_completions
)
test_generations
=
get_client_text_generations
(
test_completions
)
assert
ref_generations
==
test_generations
tests/multi_step/test_correctness_llm.py
View file @
6d2051cc
# Test the LLMEngine with multi-step-decoding
import
copy
from
typing
import
Optional
import
pytest
...
...
@@ -16,6 +17,7 @@ NUM_PROMPTS = [10]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"enable_chunked_prefill"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"num_scheduler_steps"
,
NUM_SCHEDULER_STEPS
)
...
...
@@ -28,6 +30,7 @@ def test_multi_step_llm(
model
:
str
,
dtype
:
str
,
tp_size
:
int
,
enable_chunked_prefill
:
bool
,
max_tokens
:
int
,
enforce_eager
:
int
,
num_scheduler_steps
:
int
,
...
...
@@ -51,6 +54,7 @@ def test_multi_step_llm(
model: model under test (same for single- and multi-step engines)
dtype: tensor datatype for engine to utilize
tp_size: degree of tensor-parallelism
enable_chunked_prefill: chunked-prefill on/off
max_tokens: the maximum number of tokens to generate
enforce_eager
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
...
...
@@ -72,7 +76,7 @@ def test_multi_step_llm(
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
use_v2_block_manager
=
True
,
enable_chunked_prefill
=
enable_chunked_prefill
,
num_scheduler_steps
=
num_scheduler_steps
,
)
as
vllm_model
:
vllm_outputs
=
(
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
...
...
@@ -164,7 +168,6 @@ def test_multi_step_llm_w_prompt_logprobs(
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
use_v2_block_manager
=
True
,
num_scheduler_steps
=
num_scheduler_steps
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
...
...
@@ -192,3 +195,158 @@ def test_multi_step_llm_w_prompt_logprobs(
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"num_scheduler_steps"
,
NUM_SCHEDULER_STEPS
)
@
pytest
.
mark
.
parametrize
(
"num_prompts"
,
NUM_PROMPTS
)
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
None
,
5
])
def
test_multi_step_llm_chunked_prefill_prefix_cache
(
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
tp_size
:
int
,
max_tokens
:
int
,
enforce_eager
:
int
,
num_scheduler_steps
:
int
,
num_prompts
:
int
,
num_logprobs
:
Optional
[
int
],
)
->
None
:
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
Set up contrived scenario which tests for a possible failure mode of
scheduling with multi-step+"single-step chunked prefill"+APC
"single-step chunked prefill" here refers to the current vLLM multi-step+
chunked-prefill implementation, which requires that a prefill may only
be scheduled in the same step as decodes if the prefill prompt fits in a
single chunk (note that "complete" multi-step+chunked-prefill would allow
a prefill to span multiple chunks & multiple steps but that is not yet
the case.)
"APC" is short for "automatic prefix caching".
This test creates a scenario where the scheduler must decide whether/how
to schedule a prefill with a prompt that exceeds the available token budget.
The correct behavior for multi-step+"single-step chunked prefill"+APC is to
put off scheduling the prefill until a future step.
Validate that:
* Multi-step kernels do not raise an exception due to incorrect scheduler
behavior
* Generated tokens match between
multi-step+"single-step chunked prefill"+APC and
single-step scheduling.
* (If logprobs are enabled) check logprobs are close enough
Args:
vllm_runner: vLLM model runner fixture
example_prompts: test fixture providing example prompts
model: model under test (same for single- and multi-step engines)
dtype: tensor datatype for engine to utilize
tp_size: degree of tensor-parallelism
max_tokens: the maximum number of tokens to generate
enforce_eager
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
GPU -> CPU output transfer
num_prompts: number of example prompts under test
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned.
"""
# Set up contrived test for correct scheduling behavior with
# multi-step+"single-step chunked prefill"+APC.
#
# Assume block_size=16
#
# Assume max_num_batched_tokens=48
# => Per-step token budget=48
#
# 1. Scheduler schedules 0th prompt (24 tokens)
# => Remaining token budget=24
# 2. Scheduler attempts to schedule 1st prompt (30 tokens)
# * 30 tokens exceeds 24 token remaining budget
# * Correct behavior: do not schedule this prompt in this step
# * Incorrect behavior: schedule prompt chunk
# * `do_sample=False` for this prompt in this step
# * Chunk size = (remaining tokens // block size) * block size
#
# The Incorrect scheduling behavior - if it occurs - will cause an exception
# in the model runner resulting from `do_sample=False`.
assert
len
(
example_prompts
)
>=
2
challenge_prompts
=
copy
.
deepcopy
(
example_prompts
)
challenge_prompts
[
0
]
=
(
'vLLM is a high-throughput and memory-efficient '
'inference and serving engine for LLMs.
\n
'
)
# 24 tok
challenge_prompts
[
1
]
=
(
'Briefly describe the major milestones in the '
'development of artificial intelligence from 1950 to 2020.
\n
'
)
# 30 tok
# If necessary, adjust the length of `challenge_prompts` to match
# `num_prompts`
if
len
(
challenge_prompts
)
<
num_prompts
:
challenge_prompts
=
(
challenge_prompts
*
((
num_prompts
//
len
(
challenge_prompts
))
+
1
))
challenge_prompts
=
challenge_prompts
[:
num_prompts
]
assert
len
(
challenge_prompts
)
==
num_prompts
# Single-step scheduler baseline
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
num_scheduler_steps
=
num_scheduler_steps
,
max_model_len
=
48
,
max_num_batched_tokens
=
48
,
max_num_seqs
=
4
,
block_size
=
16
,
)
as
vllm_model
:
outputs_baseline
=
(
vllm_model
.
generate_greedy
(
challenge_prompts
,
max_tokens
)
if
num_logprobs
is
None
else
vllm_model
.
generate_greedy_logprobs
(
challenge_prompts
,
max_tokens
,
num_logprobs
))
# multi-step+"single-step chunked prefill"+APC
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
enable_chunked_prefill
=
True
,
enable_prefix_caching
=
True
,
num_scheduler_steps
=
num_scheduler_steps
,
max_model_len
=
48
,
max_num_batched_tokens
=
48
,
max_num_seqs
=
4
,
block_size
=
16
,
)
as
vllm_model
:
outputs_w_features
=
(
vllm_model
.
generate_greedy
(
challenge_prompts
,
max_tokens
)
if
num_logprobs
is
None
else
vllm_model
.
generate_greedy_logprobs
(
challenge_prompts
,
max_tokens
,
num_logprobs
))
if
num_logprobs
is
None
:
# No-logprobs test
check_outputs_equal
(
outputs_0_lst
=
outputs_baseline
,
outputs_1_lst
=
outputs_w_features
,
name_0
=
"multi-step"
,
name_1
=
"multi-step+features"
,
)
else
:
# Yes-logprobs test
check_logprobs_close
(
outputs_0_lst
=
outputs_baseline
,
outputs_1_lst
=
outputs_w_features
,
name_0
=
"multi-step"
,
name_1
=
"multi-step+features"
,
)
tests/multimodal/test_processor_kwargs.py
View file @
6d2051cc
...
...
@@ -5,7 +5,7 @@ from unittest.mock import patch
import
pytest
import
torch
from
vllm.inputs
import
InputContext
,
LLMI
nputs
from
vllm.inputs
import
DecoderOnlyInputs
,
InputContext
,
token_i
nputs
from
vllm.inputs.registry
import
InputRegistry
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.sequence
import
VLLM_TOKEN_ID_ARRAY_TYPE
,
SequenceData
...
...
@@ -31,7 +31,7 @@ def use_processor_mock():
"""Patches the internal model input processor with an override callable."""
def
custom_processor
(
ctx
:
InputContext
,
llm_
inputs
:
LLM
Inputs
,
inputs
:
DecoderOnly
Inputs
,
*
,
num_crops
=
DEFAULT_NUM_CROPS
):
# For testing purposes, we don't worry about the llm inputs / return
...
...
@@ -74,38 +74,61 @@ def mm_model_cls():
# lambda whose signature matches max token calcs extra & mapper + extra kwargs
get_num_crops
=
lambda
ctx
,
*
,
num_crops
=
DEFAULT_NUM_CROPS
:
num_crops
custom_mapper
=
lambda
ctx
,
data
,
*
,
num_crops
=
DEFAULT_NUM_CROPS
:
{
"
num_
pixels"
:
torch
.
zeros
(
size
=
(
1
,
num_crops
+
1
,
3
,
336
,
336
))
"pixel
_value
s"
:
torch
.
zeros
(
size
=
(
1
,
num_crops
+
1
,
3
,
336
,
336
))
}
### Test for default processor logic & mm_processor_kwargs wrapping
### Test
s
for default processor logic & mm_processor_kwargs wrapping
def
test_default_processor_is_a_noop
():
"""Ensure that by default, there is no processor override."""
dummy_registry
=
InputRegistry
()
ctx
=
build_model_context
(
DUMMY_MODEL_ID
)
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
proc_inputs
=
LLMI
nputs
(
prompt_token_ids
=
[],
prompt
=
""
)
proc_inputs
=
token_i
nputs
(
prompt_token_ids
=
[],
prompt
=
""
)
proc_outputs
=
processor
(
inputs
=
proc_inputs
)
assert
proc_inputs
is
proc_outputs
@
pytest
.
mark
.
parametrize
(
"num_crops"
,
[
None
,
NUM_CROPS_OVERRIDE
])
def
test_processor_default_kwargs
(
use_processor_mock
,
num_crops
):
"""Ensure input processors can use processor kwargs."""
dummy_registry
=
InputRegistry
()
def
_get_num_crops_info
(
init_num_crops
:
int
,
inference_num_crops
:
int
):
"""Get the init / inference kwargs and expected num_crops for this test."""
# If we have a value for num_crops, pass the override value and make
# sure we get that value as a return-value from out mock processor,
# otherwise fall back to the default value
mm_processor
_kwargs
=
None
if
num_crops
is
None
else
{
"num_crops"
:
num_crops
init
_kwargs
=
None
if
init_
num_crops
is
None
else
{
"num_crops"
:
init_
num_crops
}
expected_num_crops
=
DEFAULT_NUM_CROPS
if
num_crops
is
None
else
num_crops
ctx
=
build_model_context
(
DUMMY_MODEL_ID
,
mm_processor_kwargs
=
mm_processor_kwargs
)
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
inference_kwargs
=
None
if
inference_num_crops
is
None
else
{
"num_crops"
:
inference_num_crops
}
if
inference_num_crops
is
not
None
:
expected_seq_count
=
inference_num_crops
elif
init_num_crops
is
not
None
:
expected_seq_count
=
init_num_crops
else
:
expected_seq_count
=
DEFAULT_NUM_CROPS
return
init_kwargs
,
inference_kwargs
,
expected_seq_count
@
pytest
.
mark
.
parametrize
(
"init_num_crops,inference_num_crops"
,
[
(
None
,
None
),
(
NUM_CROPS_OVERRIDE
,
None
),
(
DEFAULT_NUM_CROPS
,
NUM_CROPS_OVERRIDE
),
])
def
test_input_processor_kwargs
(
use_processor_mock
,
init_num_crops
,
inference_num_crops
):
"""Ensure input processors can use processor kwargs."""
dummy_registry
=
InputRegistry
()
init_kwargs
,
inference_kwargs
,
expected_seq_count
=
_get_num_crops_info
(
init_num_crops
,
inference_num_crops
)
num_crops_val
=
processor
(
LLMInputs
(
prompt_token_ids
=
[],
prompt
=
""
))
assert
num_crops_val
==
expected_num_crops
ctx
=
build_model_context
(
DUMMY_MODEL_ID
,
mm_processor_kwargs
=
init_kwargs
)
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
num_crops_val
=
processor
(
token_inputs
(
prompt_token_ids
=
[],
prompt
=
""
,
mm_processor_kwargs
=
inference_kwargs
))
assert
num_crops_val
==
expected_seq_count
@
pytest
.
mark
.
parametrize
(
...
...
@@ -124,11 +147,16 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock,
mm_processor_kwargs
):
"""Ensure that input processors filter out invalid mm_processor_kwargs"""
dummy_registry
=
InputRegistry
()
# Should filter out the init time kwargs
ctx
=
build_model_context
(
DUMMY_MODEL_ID
,
mm_processor_kwargs
=
mm_processor_kwargs
)
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
num_crops_val
=
processor
(
LLMInputs
(
prompt_token_ids
=
[],
prompt
=
""
))
# Should filter out the inference time kwargs
num_crops_val
=
processor
(
token_inputs
(
prompt_token_ids
=
[],
prompt
=
""
,
mm_processor_kwargs
=
mm_processor_kwargs
))
assert
num_crops_val
==
DEFAULT_NUM_CROPS
...
...
@@ -271,32 +299,34 @@ def test_default_mapper_with_processer_kwargs(image_assets, num_crops):
assert
mapped_inputs
[
"pixel_values"
].
shape
[
1
]
==
num_crops
+
1
@
pytest
.
mark
.
parametrize
(
"num_crops"
,
[
None
,
NUM_CROPS_OVERRIDE
])
def
test_custom_mapper_kwarg_overrides
(
image_assets
,
num_crops
):
@
pytest
.
mark
.
parametrize
(
"init_num_crops,inference_num_crops"
,
[
(
None
,
None
),
(
NUM_CROPS_OVERRIDE
,
None
),
(
DEFAULT_NUM_CROPS
,
NUM_CROPS_OVERRIDE
),
])
def
test_custom_mapper_kwarg_overrides
(
image_assets
,
init_num_crops
,
inference_num_crops
):
"""Ensure custom mappers can use processor kwargs."""
mm_processor_kwargs
=
None
if
num_crops
is
None
else
{
"num_crops"
:
num_crops
}
expected_seq_count
=
DEFAULT_NUM_CROPS
if
num_crops
is
None
else
num_crops
init_kwargs
,
inference_kwargs
,
expected_seq_count
=
_get_num_crops_info
(
init_num_crops
,
inference_num_crops
)
ctx
=
build_model_context
(
MULTIMODAL_MODEL_ID
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
mm_processor
_kwargs
,
mm_processor_kwargs
=
init
_kwargs
,
limit_mm_per_prompt
=
{
"image"
:
1
})
mm_registry
=
MultiModalRegistry
()
mm_registry
.
init_mm_limits_per_prompt
(
ctx
.
model_config
)
# Patch the image registry for phi3v with our lambda that is compatible
# with overrides, then ensure that calling the method correctly echos
# our num_crops value back from the mm_processor_kwargs.
image
=
image_assets
[
0
].
pil_image
mm_inputs
=
{
"image"
:
image
}
with
patch
.
object
(
mm_registry
.
_get_plugin
(
"image"
),
"_default_input_mapper"
,
{
mm_model_cls
():
custom_mapper
},
):
mapped_inputs
=
mm_registry
.
map_input
(
ctx
.
model_config
,
mm_inputs
)
# Patch the image registry for phi3v with our lambda that is compatible
# with overrides, then ensure that calling the method correctly echos
# our num_crops value back from the mm_processor_kwargs.
mm_registry
.
_get_plugin
(
"image"
).
register_input_mapper
(
custom_mapper
)(
mm_model_cls
())
mapped_inputs
=
mm_registry
.
map_input
(
ctx
.
model_config
,
mm_inputs
,
inference_kwargs
)
assert
mapped_inputs
[
"pixel_values"
].
shape
[
1
]
==
expected_seq_count
+
1
...
...
@@ -316,6 +346,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, num_crops):
def
test_custom_mapper_with_sad_kwarg_overrides
(
image_assets
,
mm_processor_kwargs
):
"""Ensure that custom mappers filters out invalid mm_processor_kwargs"""
# Should filter out the init time kwargs
ctx
=
build_model_context
(
MULTIMODAL_MODEL_ID
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
mm_processor_kwargs
,
...
...
@@ -323,17 +354,16 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
mm_registry
=
MultiModalRegistry
()
mm_registry
.
init_mm_limits_per_prompt
(
ctx
.
model_config
)
# Patch the image registry for phi3v with our lambda that is compatible
# with overrides, then ensure that calling the method correctly echos
# our num_crops value back from the mm_processor_kwargs.
image
=
image_assets
[
0
].
pil_image
mm_inputs
=
{
"image"
:
image
}
with
patch
.
object
(
mm_registry
.
_get_plugin
(
"image"
),
"_default_input_mapper"
,
{
mm_model_cls
():
custom_mapper
},
):
mapped_inputs
=
mm_registry
.
map_input
(
ctx
.
model_config
,
mm_inputs
)
# Patch the image registry for phi3v with our lambda that is compatible
# with overrides, then ensure that calling the method correctly echos
# our num_crops value back from the mm_processor_kwargs.
mm_registry
.
_get_plugin
(
"image"
).
register_input_mapper
(
custom_mapper
)(
mm_model_cls
())
# Should filter out the inference time kwargs
mapped_inputs
=
mm_registry
.
map_input
(
ctx
.
model_config
,
mm_inputs
,
mm_processor_kwargs
=
mm_processor_kwargs
)
assert
mapped_inputs
[
"pixel_values"
].
shape
[
1
]
==
DEFAULT_NUM_CROPS
+
1
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
View file @
6d2051cc
from
typing
import
Optional
import
torch
from
vllm
import
ModelRegistry
from
vllm.model_executor.models.opt
import
OPTForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
class MyOPTForCausalLM(OPTForCausalLM):
    """Dummy OPT variant whose logits always favour token id 0."""

    def compute_logits(
            self, hidden_states: torch.Tensor,
            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
        """Return the parent's logits overwritten with a fixed pattern.

        The parent's logits tensor is zeroed in place and column 0 is set
        to 1.0, so the first token always has the highest score. If the
        parent returns ``None``, it is passed through unchanged.
        """
        scores = super().compute_logits(hidden_states, sampling_metadata)
        if scores is None:
            return None

        # Wipe the real prediction, then bump only the first vocabulary slot.
        scores.zero_()
        scores[:, 0] += 1.0
        return scores
def register():
    """Register the dummy test models with ``ModelRegistry``.

    Each architecture is registered only if it is not already listed by
    ``ModelRegistry.get_supported_archs()``.
    """
    from .my_opt import MyOPTForCausalLM

    dummy_models = (
        # Test directly passing the model
        ("MyOPTForCausalLM", MyOPTForCausalLM),
        # Test passing lazy model
        ("MyGemma2Embedding",
         "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding"),
        ("MyLlava", "vllm_add_dummy_model.my_llava:MyLlava"),
    )

    for arch_name, model in dummy_models:
        # Re-query the registry on every iteration, as the original
        # sequence of checks did.
        if arch_name in ModelRegistry.get_supported_archs():
            continue
        ModelRegistry.register_model(arch_name, model)
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
0 → 100644
View file @
6d2051cc
from
typing
import
List
,
Optional
,
Union
import
torch
from
vllm.attention
import
AttentionMetadata
from
vllm.model_executor.models.gemma2
import
Gemma2EmbeddingModel
from
vllm.sequence
import
IntermediateTensors
class MyGemma2Embedding(Gemma2EmbeddingModel):
    """Dummy Gemma2 embedding model that emits all-zero embeddings."""

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the parent forward pass and zero out its tensor output.

        An ``IntermediateTensors`` result from the parent is returned
        untouched; any other result is replaced by a zero tensor built with
        ``torch.zeros_like``.
        """
        parent_output = super().forward(
            input_ids,
            positions,
            kv_caches,
            attn_metadata,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )

        if not isinstance(parent_output, IntermediateTensors):
            # Return all-zero embeddings
            return torch.zeros_like(parent_output)

        return parent_output
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
0 → 100644
View file @
6d2051cc
from
typing
import
Optional
import
torch
from
vllm.inputs
import
INPUT_REGISTRY
from
vllm.model_executor.models.llava
import
(
LlavaForConditionalGeneration
,
dummy_data_for_llava
,
get_max_llava_image_tokens
,
input_processor_for_llava
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
@MULTIMODAL_REGISTRY.register_image_input_mapper()
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
class MyLlava(LlavaForConditionalGeneration):
    """Dummy LLaVA variant whose logits always favour token id 0."""

    def compute_logits(
            self, hidden_states: torch.Tensor,
            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
        """Return the parent's logits replaced by a fixed pattern.

        When the parent yields a tensor, it is zeroed in place and column 0
        is raised to 1.0, so the first token always wins. A ``None`` result
        is returned as-is.
        """
        dummy_logits = super().compute_logits(hidden_states,
                                              sampling_metadata)
        if dummy_logits is None:
            return dummy_logits

        # Discard the model's prediction and favour the first token only.
        dummy_logits.zero_()
        dummy_logits[:, 0] += 1.0
        return dummy_logits
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
0 → 100644
View file @
6d2051cc
from
typing
import
Optional
import
torch
from
vllm.model_executor.models.opt
import
OPTForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
class MyOPTForCausalLM(OPTForCausalLM):
    """Dummy OPT model used for registration tests.

    Its logits are overwritten so that the first token always scores highest.
    """

    def compute_logits(
            self, hidden_states: torch.Tensor,
            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
        """Compute the parent's logits, then force the first token to win.

        Returns ``None`` when the parent does. Otherwise returns the same
        tensor, zeroed in place with 1.0 added to column 0.
        """
        out = super().compute_logits(hidden_states, sampling_metadata)
        if out is None:
            return None

        # All entries become 0 except column 0, which becomes 1.0.
        out.zero_()
        out[:, 0] += 1.0
        return out
Prev
1
…
8
9
10
11
12
13
14
15
16
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment