Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6d2051cc
Commit
6d2051cc
authored
Oct 21, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev
parents
2c7f740a
a2c71c54
Changes
457
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1040 additions
and
392 deletions
+1040
-392
tests/models/decoder_only/vision_language/test_qwen.py
tests/models/decoder_only/vision_language/test_qwen.py
+7
-7
tests/models/embedding/language/test_embedding.py
tests/models/embedding/language/test_embedding.py
+22
-19
tests/models/embedding/utils.py
tests/models/embedding/utils.py
+29
-0
tests/models/embedding/vision_language/__init__.py
tests/models/embedding/vision_language/__init__.py
+0
-0
tests/models/embedding/vision_language/test_phi3v.py
tests/models/embedding/vision_language/test_phi3v.py
+62
-0
tests/models/encoder_decoder/language/test_bart.py
tests/models/encoder_decoder/language/test_bart.py
+211
-217
tests/models/encoder_decoder/vision_language/test_broadcast.py
.../models/encoder_decoder/vision_language/test_broadcast.py
+35
-0
tests/models/encoder_decoder/vision_language/test_mllama.py
tests/models/encoder_decoder/vision_language/test_mllama.py
+119
-64
tests/models/test_oot_registration.py
tests/models/test_oot_registration.py
+52
-2
tests/models/test_registry.py
tests/models/test_registry.py
+69
-9
tests/models/utils.py
tests/models/utils.py
+8
-1
tests/mq_llm_engine/test_error_handling.py
tests/mq_llm_engine/test_error_handling.py
+6
-6
tests/mq_llm_engine/utils.py
tests/mq_llm_engine/utils.py
+1
-1
tests/multi_step/test_correctness_async_llm.py
tests/multi_step/test_correctness_async_llm.py
+91
-1
tests/multi_step/test_correctness_llm.py
tests/multi_step/test_correctness_llm.py
+160
-2
tests/multimodal/test_processor_kwargs.py
tests/multimodal/test_processor_kwargs.py
+73
-43
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
...ins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+14
-20
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
...dd_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+34
-0
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
...ins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+28
-0
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
...ugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+19
-0
No files found.
Too many changes to show.
To preserve performance only
457 of 457+
files are displayed.
Plain diff
Email patch
tests/models/decoder_only/vision_language/test_qwen.py
View file @
6d2051cc
...
@@ -5,7 +5,7 @@ import pytest
...
@@ -5,7 +5,7 @@ import pytest
import
torch
import
torch
from
PIL.Image
import
Image
from
PIL.Image
import
Image
from
vllm.inputs
import
InputContext
,
LLMI
nputs
from
vllm.inputs
import
InputContext
,
token_i
nputs
from
vllm.multimodal.base
import
MultiModalInputs
from
vllm.multimodal.base
import
MultiModalInputs
from
vllm.multimodal.utils
import
cached_get_tokenizer
,
rescale_image_size
from
vllm.multimodal.utils
import
cached_get_tokenizer
,
rescale_image_size
...
@@ -71,12 +71,12 @@ def test_input_processor_valid_mm_data(input_processor_for_qwen,
...
@@ -71,12 +71,12 @@ def test_input_processor_valid_mm_data(input_processor_for_qwen,
"""Happy cases for image inputs to Qwen's multimodal input processor."""
"""Happy cases for image inputs to Qwen's multimodal input processor."""
prompt
=
""
.
join
(
prompt
=
""
.
join
(
[
f
"Picture
{
num
}
: <img></img>
\n
"
for
num
in
range
(
1
,
num_images
+
1
)])
[
f
"Picture
{
num
}
: <img></img>
\n
"
for
num
in
range
(
1
,
num_images
+
1
)])
inputs
=
LLMI
nputs
(
inputs
=
token_i
nputs
(
prompt
=
prompt
,
prompt
=
prompt
,
# When processing multimodal data for a multimodal model, the qwen
# When processing multimodal data for a multimodal model, the qwen
# input processor will overwrite the provided prompt_token_ids with
# input processor will overwrite the provided prompt_token_ids with
# the image prompts
# the image prompts
prompt_token_ids
=
None
,
prompt_token_ids
=
[]
,
multi_modal_data
=
{
"image"
:
torch
.
rand
(
num_images
,
TOKS_PER_IMG
,
4096
)},
multi_modal_data
=
{
"image"
:
torch
.
rand
(
num_images
,
TOKS_PER_IMG
,
4096
)},
)
)
proc_inputs
=
input_processor_for_qwen
(
qwen_vl_context
,
inputs
)
proc_inputs
=
input_processor_for_qwen
(
qwen_vl_context
,
inputs
)
...
@@ -134,9 +134,9 @@ def test_input_processor_invalid_mm_data(input_processor_for_qwen,
...
@@ -134,9 +134,9 @@ def test_input_processor_invalid_mm_data(input_processor_for_qwen,
trust_remote_code
=
True
)
trust_remote_code
=
True
)
prompt
=
"Picture 1: <img></img>
\n
"
prompt
=
"Picture 1: <img></img>
\n
"
prompt_token_ids
=
tokenizer
.
encode
(
prompt
)
prompt_token_ids
=
tokenizer
.
encode
(
prompt
)
inputs
=
LLMI
nputs
(
prompt
=
prompt
,
inputs
=
token_i
nputs
(
prompt
=
prompt
,
prompt_token_ids
=
prompt_token_ids
,
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
mm_data
)
multi_modal_data
=
mm_data
)
# Should fail since we have too many or too few dimensions for embeddings
# Should fail since we have too many or too few dimensions for embeddings
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
input_processor_for_qwen
(
qwen_vl_context
,
inputs
)
input_processor_for_qwen
(
qwen_vl_context
,
inputs
)
...
@@ -221,7 +221,7 @@ def run_test(
...
@@ -221,7 +221,7 @@ def run_test(
# Qwen encodes each image into a fixed content size of 256
# Qwen encodes each image into a fixed content size of 256
with
vllm_runner
(
model
,
with
vllm_runner
(
model
,
max_model_len
=
1024
,
max_model_len
=
1024
,
max_num_seqs
=
1
,
max_num_seqs
=
2
,
dtype
=
dtype
,
dtype
=
dtype
,
limit_mm_per_prompt
=
{
"image"
:
mm_limit
},
limit_mm_per_prompt
=
{
"image"
:
mm_limit
},
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
...
...
tests/models/embedding/language/test_embedding.py
View file @
6d2051cc
"""Compare the outputs of HF and vLLM
for Mistral models using greedy sampling
.
"""Compare the
embedding
outputs of HF and vLLM
models
.
Run `pytest tests/models/
test_llama
_embedding.py`.
Run `pytest tests/models/
embedding/language/test
_embedding.py`.
"""
"""
import
pytest
import
pytest
import
torch
import
torch.nn.functional
as
F
from
..utils
import
check_embeddings_close
MODELS
=
[
MODELS
=
[
"intfloat/e5-mistral-7b-instruct"
,
"intfloat/e5-mistral-7b-instruct"
,
"BAAI/bge-multilingual-gemma2"
,
]
]
def
compare_embeddings
(
embeddings1
,
embeddings2
):
similarities
=
[
F
.
cosine_similarity
(
torch
.
tensor
(
e1
),
torch
.
tensor
(
e2
),
dim
=
0
)
for
e1
,
e2
in
zip
(
embeddings1
,
embeddings2
)
]
return
similarities
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_models
(
def
test_models
(
...
@@ -28,15 +21,25 @@ def test_models(
...
@@ -28,15 +21,25 @@ def test_models(
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
with
hf_runner
(
model
,
dtype
=
dtype
,
is_embedding_model
=
True
)
as
hf_model
:
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see:
# https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
# This makes the input_ids different between hf_model and vllm_model.
# So we need to strip the input texts to avoid test failing.
example_prompts
=
[
str
(
s
).
strip
()
for
s
in
example_prompts
]
with
hf_runner
(
model
,
dtype
=
dtype
,
is_sentence_transformer
=
True
)
as
hf_model
:
hf_outputs
=
hf_model
.
encode
(
example_prompts
)
hf_outputs
=
hf_model
.
encode
(
example_prompts
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
similarities
=
compare_embeddings
(
hf_outputs
,
vllm_outputs
)
check_embeddings_close
(
all_similarities
=
torch
.
stack
(
similarities
)
embeddings_0_lst
=
hf_outputs
,
tolerance
=
1e-2
embeddings_1_lst
=
vllm_outputs
,
assert
torch
.
all
((
all_similarities
<=
1.0
+
tolerance
)
name_0
=
"hf"
,
&
(
all_similarities
>=
1.0
-
tolerance
)
name_1
=
"vllm"
,
),
f
"Not all values are within
{
tolerance
}
of 1.0"
tol
=
1e-2
,
)
tests/models/embedding/utils.py
0 → 100644
View file @
6d2051cc
from
typing
import
List
,
Sequence
import
torch
import
torch.nn.functional
as
F
def
check_embeddings_close
(
*
,
embeddings_0_lst
:
Sequence
[
List
[
float
]],
embeddings_1_lst
:
Sequence
[
List
[
float
]],
name_0
:
str
,
name_1
:
str
,
tol
:
float
=
1e-3
,
)
->
None
:
assert
len
(
embeddings_0_lst
)
==
len
(
embeddings_1_lst
)
for
prompt_idx
,
(
embeddings_0
,
embeddings_1
)
in
enumerate
(
zip
(
embeddings_0_lst
,
embeddings_1_lst
)):
assert
len
(
embeddings_0
)
==
len
(
embeddings_1
)
sim
=
F
.
cosine_similarity
(
torch
.
tensor
(
embeddings_0
),
torch
.
tensor
(
embeddings_1
),
dim
=
0
)
fail_msg
=
(
f
"Test
{
prompt_idx
}
:"
f
"
\n
{
name_0
}
:
\t
{
embeddings_0
!
r
}
"
f
"
\n
{
name_1
}
:
\t
{
embeddings_1
!
r
}
"
)
assert
sim
>=
1
-
tol
,
fail_msg
tests/models/embedding/vision_language/__init__.py
0 → 100644
View file @
6d2051cc
tests/models/embedding/vision_language/test_phi3v.py
0 → 100644
View file @
6d2051cc
import
pytest
import
torch.nn.functional
as
F
from
....conftest
import
IMAGE_ASSETS
from
..utils
import
check_embeddings_close
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign"
,
# noqa: E501
"cherry_blossom"
:
"<|image_1|> Represent the given image with the following question: What is in the image"
,
# noqa: E501
})
MODELS
=
[
"TIGER-Lab/VLM2Vec-Full"
]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
)
->
None
:
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with
vllm_runner
(
model
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
dtype
=
dtype
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
encode
(
example_prompts
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
all_inputs
=
hf_model
.
get_inputs
(
example_prompts
)
all_outputs
=
[]
for
inputs
in
all_inputs
:
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
outputs
=
hf_model
.
model
(
**
hf_model
.
wrap_device
(
inputs
,
device
=
hf_model
.
model
.
device
.
type
),
return_dict
=
True
,
output_hidden_states
=
True
,
)
last_hidden_state
=
outputs
.
hidden_states
[
-
1
][
0
]
reps
=
last_hidden_state
[
inputs
.
attention_mask
[
0
].
sum
()
-
1
]
pooled_output
=
F
.
normalize
(
reps
,
p
=
2
,
dim
=-
1
)
all_outputs
.
append
(
pooled_output
.
tolist
())
hf_outputs
=
all_outputs
check_embeddings_close
(
embeddings_0_lst
=
hf_outputs
,
embeddings_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
tests/models/encoder_decoder/language/test_bart.py
View file @
6d2051cc
...
@@ -4,220 +4,214 @@ Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
...
@@ -4,220 +4,214 @@ Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
"""
"""
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
vllm.utils
import
is_cpu
import
pytest
from
transformers
import
AutoModelForSeq2SeqLM
if
not
is_cpu
():
# CPU backend is not currently supported with encoder/decoder models
from
vllm.sequence
import
SampleLogprobs
# skip test definitions entirely to avoid importing GPU kernel libs
# (xFormers, etc.)
from
....conftest
import
(
DecoderPromptType
,
ExplicitEncoderDecoderPrompt
,
HfRunner
,
VllmRunner
)
import
pytest
from
....utils
import
multi_gpu_test
from
transformers
import
AutoModelForSeq2SeqLM
from
...utils
import
check_logprobs_close
from
vllm.sequence
import
SampleLogprobs
MODELS
=
[
"facebook/bart-base"
,
"facebook/bart-large-cnn"
]
from
....conftest
import
(
DecoderPromptType
,
ExplicitEncoderDecoderPrompt
,
HfRunner
,
VllmRunner
)
def
vllm_to_hf_output
(
from
....utils
import
multi_gpu_test
vllm_output
:
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]],
from
...utils
import
check_logprobs_close
decoder_prompt_type
:
DecoderPromptType
,
):
MODELS
=
[
"facebook/bart-base"
,
"facebook/bart-large-cnn"
]
"""Sanitize vllm output to be comparable with hf output."""
output_ids
,
output_str
,
out_logprobs
=
vllm_output
def
vllm_to_hf_output
(
vllm_output
:
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]],
hf_output_str
=
output_str
+
"</s>"
decoder_prompt_type
:
DecoderPromptType
,
if
decoder_prompt_type
==
DecoderPromptType
.
NONE
:
):
hf_output_str
=
"<s>"
+
hf_output_str
"""Sanitize vllm output to be comparable with hf output."""
output_ids
,
output_str
,
out_logprobs
=
vllm_output
return
output_ids
,
hf_output_str
,
out_logprobs
hf_output_str
=
output_str
+
"</s>"
if
decoder_prompt_type
==
DecoderPromptType
.
NONE
:
def
run_test
(
hf_output_str
=
"<s>"
+
hf_output_str
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
return
output_ids
,
hf_output_str
,
out_logprobs
prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
decoder_prompt_type
:
DecoderPromptType
,
def
run_test
(
model
:
str
,
hf_runner
:
Type
[
HfRunner
],
*
,
vllm_runner
:
Type
[
VllmRunner
],
dtype
:
str
,
prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
decoder_prompt_type
:
DecoderPromptType
,
num_logprobs
:
int
,
model
:
str
,
tensor_parallel_size
:
int
,
*
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
dtype
:
str
,
)
->
None
:
max_tokens
:
int
,
'''
num_logprobs
:
int
,
Test the vLLM BART model for a variety of encoder/decoder input prompts,
tensor_parallel_size
:
int
,
by validating it against HuggingFace (HF) BART.
distributed_executor_backend
:
Optional
[
str
]
=
None
,
)
->
None
:
Arguments:
'''
Test the vLLM BART model for a variety of encoder/decoder input prompts,
* hf_runner: HuggingFace (HF) test model runner
by validating it against HuggingFace (HF) BART.
* vllm_runner: vLLM test model runner
* example_encoder_decoder_prompts: test fixture which provides a
Arguments:
dictionary of dummy prompts
* model: the HF ID of the specific BART variant under test
* hf_runner: HuggingFace (HF) test model runner
* dtype: the tensor datatype to employ
* vllm_runner: vLLM test model runner
* max_tokens
* example_encoder_decoder_prompts: test fixture which provides a
* num_logprobs
dictionary of dummy prompts
* decoder_prompt_type: key into the example_encoder_decoder_prompts
* model: the HF ID of the specific BART variant under test
dictionary; selects specific encoder/decoder
* dtype: the tensor datatype to employ
prompt scenarios to test
* max_tokens
* num_logprobs
A note on using HF BART as a baseline for validating vLLM BART,
* decoder_prompt_type: key into the example_encoder_decoder_prompts
specifically when the decoder prompt is None.
dictionary; selects specific encoder/decoder
prompt scenarios to test
The HF GenerationMixin's default behavior is to force the first
decoded token to be <BOS> if the prompt does not already contain
A note on using HF BART as a baseline for validating vLLM BART,
<BOS> (this is accomplished using a logit
specifically when the decoder prompt is None.
processor setting.)
The HF GenerationMixin's default behavior is to force the first
So when we use HF BART as our baseline for comparison, note that
decoded token to be <BOS> if the prompt does not already contain
when the user provides a request with a None decoder prompt
<BOS> (this is accomplished using a logit
(i.e. a singleton encoder prompt, or else an explicit encoder/
processor setting.)
decoder prompt with the decoder sub-prompt set to None), HF and
vLLM handle this in different ways:
So when we use HF BART as our baseline for comparison, note that
when the user provides a request with a None decoder prompt
* HF will (1) tokenize the None prompt as an empty token-list,
(i.e. a singleton encoder prompt, or else an explicit encoder/
(2) append <decoder-start-token> to the beginning, yielding
decoder prompt with the decoder sub-prompt set to None), HF and
[<decoder-start-token>], (3) pass this token list to the model, and
vLLM handle this in different ways:
then (4) after computing logits during prefill, override the model
logits & force <BOS> to be the first generated token.
* HF will (1) tokenize the None prompt as an empty token-list,
(2) append <decoder-start-token> to the beginning, yielding
* vLLM will (1) tokenize the None prompt as [<BOS>], (2) append decoder-
[<decoder-start-token>], (3) pass this token list to the model, and
start-token to the beginning, yielding [<decoder-start-token><BOS>],
then (4) after computing logits during prefill, override the model
(3) pass these tokens to the model & proceed with generation.
logits & force <BOS> to be the first generated token.
The net effect is that compared to vLLM, the list of HF *decoded* tokens
* vLLM will (1) tokenize the None prompt as [<BOS>], (2) append decoder-
will contain one more initial <BOS> than the vLLM generated tokens,
start-token to the beginning, yielding [<decoder-start-token><BOS>],
because vLLM's <BOS> token is injected into the prompt rather than into
(3) pass these tokens to the model & proceed with generation.
the generated output. This is in spite of the fact that overall, the
complete sequences (prompt + decoded tokens) produced by vLLM will match
The net effect is that compared to vLLM, the list of HF *decoded* tokens
HF.
will contain one more initial <BOS> than the vLLM generated tokens,
because vLLM's <BOS> token is injected into the prompt rather than into
So when we use HF decoded token output to validate vLLM's decoded token
the generated output. This is in spite of the fact that overall, the
output, the testing process must account for the difference in decoded
complete sequences (prompt + decoded tokens) produced by vLLM will match
token sequences between vLLM and HF specifically in the
HF.
decoder-prompt-is-None case.
So when we use HF decoded token output to validate vLLM's decoded token
One option is to disable the logit processor feature that forces the
output, the testing process must account for the difference in decoded
<BOS> token to be decoded (forced_bos_token_id = None), eliminating
token sequences between vLLM and HF specifically in the
the problem entirely. However this is not "normal" BART usage.
decoder-prompt-is-None case.
The other option is - only in the decoder-prompt-is-None case - to
One option is to disable the logit processor feature that forces the
discard the first decoded token from the HF output before comparing it
<BOS> token to be decoded (forced_bos_token_id = None), eliminating
to vLLM.
the problem entirely. However this is not "normal" BART usage.
To that end, when testing the scenario where the decoder prompt is None
The other option is - only in the decoder-prompt-is-None case - to
(and only in that one scenario), this test skips the first HF decoded
discard the first decoded token from the HF output before comparing it
token during the process of validating the vLLM decoded output.
to vLLM.
'''
To that end, when testing the scenario where the decoder prompt is None
# NOTE: take care of the order. run vLLM first, and then run HF.
(and only in that one scenario), this test skips the first HF decoded
# vLLM needs a fresh new process without cuda initialization.
token during the process of validating the vLLM decoded output.
# if we run HF first, the cuda initialization will be done and it
'''
# will hurt multiprocessing backend with fork method (the default).
# NOTE: take care of the order. run vLLM first, and then run HF.
# Note: currently encoder/decoder models are only compatible with
# vLLM needs a fresh new process without cuda initialization.
# enforce_eager=True. Normally this is not a problem because
# if we run HF first, the cuda initialization will be done and it
# for encoder/decoder models vLLM will
# will hurt multiprocessing backend with fork method (the default).
# default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# Note: currently encoder/decoder models are only compatible with
# VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=True. Normally this is not a problem because
# enforce_eager=False (a behavior which a number of already-existing
# for encoder/decoder models vLLM will
# decoder-only unit tests expect), so when testing an encoder/decoder
# default to enforce_eager=True if enforce_eager
# model we must explicitly specify enforce_eager=True in the VllmRunner
# is left unspecified. However, the
# constructor.
# VllmRunner test fixture (which wraps around the LLM class) defaults to
with
vllm_runner
(
model
,
# enforce_eager=False (a behavior which a number of already-existing
dtype
=
dtype
,
# decoder-only unit tests expect), so when testing an encoder/decoder
tensor_parallel_size
=
tensor_parallel_size
,
# model we must explicitly specify enforce_eager=True in the VllmRunner
distributed_executor_backend
=
distributed_executor_backend
,
# constructor.
enforce_eager
=
True
)
as
vllm_model
:
with
vllm_runner
(
vllm_outputs
=
vllm_model
.
generate_encoder_decoder_greedy_logprobs
(
model
,
prompts
,
max_tokens
,
num_logprobs
)
dtype
=
dtype
,
tensor_parallel_size
=
tensor_parallel_size
,
# Configuration settings for HF baseline
distributed_executor_backend
=
distributed_executor_backend
,
hf_kwargs
=
{
enforce_eager
=
True
)
as
vllm_model
:
"top_k"
:
None
,
vllm_outputs
=
vllm_model
.
generate_encoder_decoder_greedy_logprobs
(
"num_beams"
:
1
,
prompts
,
max_tokens
,
num_logprobs
)
"repetition_penalty"
:
1.0
,
"top_p"
:
1.0
,
# Configuration settings for HF baseline
"length_penalty"
:
1.0
,
hf_kwargs
=
{
"early_stopping"
:
False
,
"top_k"
:
None
,
"no_repeat_ngram_size"
:
None
,
"num_beams"
:
1
,
"min_length"
:
0
"repetition_penalty"
:
1.0
,
}
"top_p"
:
1.0
,
"length_penalty"
:
1.0
,
with
hf_runner
(
model
,
dtype
=
dtype
,
"early_stopping"
:
False
,
auto_cls
=
AutoModelForSeq2SeqLM
)
as
hf_model
:
"no_repeat_ngram_size"
:
None
,
hf_outputs
=
(
hf_model
.
generate_encoder_decoder_greedy_logprobs_limit
(
"min_length"
:
0
prompts
,
}
max_tokens
,
num_logprobs
,
with
hf_runner
(
model
,
dtype
=
dtype
,
**
hf_kwargs
,
auto_cls
=
AutoModelForSeq2SeqLM
)
as
hf_model
:
))
hf_outputs
=
(
hf_model
.
generate_encoder_decoder_greedy_logprobs_limit
(
hf_skip_tokens
=
(
1
prompts
,
if
decoder_prompt_type
==
DecoderPromptType
.
NONE
else
0
)
max_tokens
,
num_logprobs
,
check_logprobs_close
(
**
hf_kwargs
,
outputs_0_lst
=
hf_outputs
,
))
outputs_1_lst
=
[
vllm_to_hf_output
(
vllm_output
,
decoder_prompt_type
)
hf_skip_tokens
=
(
1
if
decoder_prompt_type
==
DecoderPromptType
.
NONE
for
vllm_output
in
vllm_outputs
else
0
)
],
name_0
=
"hf"
,
check_logprobs_close
(
name_1
=
"vllm"
,
outputs_0_lst
=
hf_outputs
,
num_outputs_0_skip_tokens
=
hf_skip_tokens
,
outputs_1_lst
=
[
)
vllm_to_hf_output
(
vllm_output
,
decoder_prompt_type
)
for
vllm_output
in
vllm_outputs
],
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
name_0
=
"hf"
,
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
,
"bfloat16"
])
name_1
=
"vllm"
,
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
num_outputs_0_skip_tokens
=
hf_skip_tokens
,
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
)
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
list
(
DecoderPromptType
))
def
test_models
(
hf_runner
,
vllm_runner
,
example_encoder_decoder_prompts
,
model
,
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
dtype
,
max_tokens
,
num_logprobs
,
decoder_prompt_type
)
->
None
:
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
,
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
run_test
(
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
hf_runner
,
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
list
(
DecoderPromptType
))
vllm_runner
,
def
test_models
(
hf_runner
,
vllm_runner
,
example_encoder_decoder_prompts
,
example_encoder_decoder_prompts
[
decoder_prompt_type
],
model
,
dtype
,
max_tokens
,
num_logprobs
,
decoder_prompt_type
,
decoder_prompt_type
)
->
None
:
model
,
dtype
=
dtype
,
run_test
(
max_tokens
=
max_tokens
,
hf_runner
,
num_logprobs
=
num_logprobs
,
vllm_runner
,
tensor_parallel_size
=
1
,
example_encoder_decoder_prompts
[
decoder_prompt_type
],
)
decoder_prompt_type
,
model
,
dtype
=
dtype
,
@
multi_gpu_test
(
num_gpus
=
2
)
max_tokens
=
max_tokens
,
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"ray"
,
"mp"
])
num_logprobs
=
num_logprobs
,
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/bart-large-cnn"
])
tensor_parallel_size
=
1
,
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
)
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
[
DecoderPromptType
.
CUSTOM
])
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"ray"
,
"mp"
])
def
test_models_distributed
(
hf_runner
,
vllm_runner
,
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/bart-large-cnn"
])
example_encoder_decoder_prompts
,
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
distributed_executor_backend
,
model
,
dtype
,
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
max_tokens
,
num_logprobs
,
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
decoder_prompt_type
)
->
None
:
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
[
DecoderPromptType
.
CUSTOM
])
run_test
(
def
test_models_distributed
(
hf_runner
,
vllm_runner
,
hf_runner
,
example_encoder_decoder_prompts
,
vllm_runner
,
distributed_executor_backend
,
model
,
dtype
,
example_encoder_decoder_prompts
[
decoder_prompt_type
],
max_tokens
,
num_logprobs
,
decoder_prompt_type
,
decoder_prompt_type
)
->
None
:
model
,
run_test
(
dtype
=
dtype
,
hf_runner
,
max_tokens
=
max_tokens
,
vllm_runner
,
num_logprobs
=
num_logprobs
,
example_encoder_decoder_prompts
[
decoder_prompt_type
],
tensor_parallel_size
=
2
,
decoder_prompt_type
,
distributed_executor_backend
=
distributed_executor_backend
,
model
,
)
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
2
,
distributed_executor_backend
=
distributed_executor_backend
,
)
tests/models/encoder_decoder/vision_language/test_broadcast.py
0 → 100644
View file @
6d2051cc
import
pytest
from
....utils
import
multi_gpu_test
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"ray"
,
"mp"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"meta-llama/Llama-3.2-11B-Vision-Instruct"
,
])
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
distributed_executor_backend
,
model
)
->
None
:
dtype
=
"half"
max_tokens
=
5
num_logprobs
=
5
tensor_parallel_size
=
2
if
model
.
startswith
(
"meta-llama/Llama-3.2-11B-Vision-Instruct"
):
from
.test_mllama
import
models
,
run_test
else
:
raise
NotImplementedError
(
f
"Unsupported model:
{
model
}
"
)
run_test
(
hf_runner
,
vllm_runner
,
image_assets
,
model
=
models
[
0
],
size_factors
=
[
0.25
,
0.5
,
1.0
],
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
)
tests/models/encoder_decoder/vision_language/test_mllama.py
View file @
6d2051cc
...
@@ -9,10 +9,10 @@ from vllm.sequence import SampleLogprobs
...
@@ -9,10 +9,10 @@ from vllm.sequence import SampleLogprobs
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
from
....conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
_ImageAssets
)
from
....utils
import
multi
_gpu_test
from
....utils
import
large
_gpu_test
from
...utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
_LIMIT_IMAGE_PER_PROMPT
=
1
_LIMIT_IMAGE_PER_PROMPT
=
3
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"stop_sign"
:
...
@@ -47,14 +47,46 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
...
@@ -47,14 +47,46 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
if
token_id
!=
image_token_id
or
output_ids
[
idx
-
1
]
!=
image_token_id
if
token_id
!=
image_token_id
or
output_ids
[
idx
-
1
]
!=
image_token_id
]
]
assert
output_str
[
0
]
==
" "
hf_output_str
=
output_str
hf_output_str
=
output_str
[
1
:]
if
hf_output_ids
[
-
1
]
==
eos_token_id
:
if
hf_output_ids
[
-
1
]
==
eos_token_id
:
hf_output_str
=
hf_output_str
+
tokenizer
.
decode
(
eos_token_id
)
hf_output_str
=
hf_output_str
+
tokenizer
.
decode
(
eos_token_id
)
return
hf_output_ids
,
hf_output_str
,
out_logprobs
return
hf_output_ids
,
hf_output_str
,
out_logprobs
def
_get_inputs
(
image_assets
:
_ImageAssets
,
*
,
size_factors
:
Optional
[
List
[
float
]]
=
None
,
sizes
:
Optional
[
List
[
Tuple
[
int
,
int
]]]
=
None
,
)
->
List
[
Tuple
[
List
[
str
],
PromptImageInput
]]:
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
if
size_factors
is
not
None
:
inputs_per_image
=
[(
[
prompt
for
_
in
size_factors
],
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
elif
sizes
is
not
None
:
inputs_per_image
=
[(
[
prompt
if
size
is
not
None
else
text_only_prompts
[
0
]
for
size
in
sizes
],
[
image
.
resize
(
size
)
if
size
is
not
None
else
None
for
size
in
sizes
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
if
len
(
sizes
)
==
0
:
inputs_per_image
.
append
(
(
text_only_prompts
,
[
None
]
*
len
(
text_only_prompts
)))
else
:
raise
ValueError
(
"You must provide either `size_factors` or `sizes`"
)
return
inputs_per_image
@
overload
@
overload
def
run_test
(
def
run_test
(
hf_runner
:
Type
[
HfRunner
],
hf_runner
:
Type
[
HfRunner
],
...
@@ -103,39 +135,17 @@ def run_test(
...
@@ -103,39 +135,17 @@ def run_test(
tensor_parallel_size
:
int
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
):
):
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
_run_test
(
hf_runner
,
if
size_factors
is
not
None
:
vllm_runner
,
inputs_per_image
=
[(
_get_inputs
(
image_assets
,
size_factors
=
size_factors
,
sizes
=
sizes
),
[
prompt
for
_
in
size_factors
],
model
,
[
rescale_image_size
(
image
,
factor
)
for
factor
in
size_factors
],
dtype
=
dtype
,
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
max_tokens
=
max_tokens
,
elif
sizes
is
not
None
:
num_logprobs
=
num_logprobs
,
inputs_per_image
=
[(
tensor_parallel_size
=
tensor_parallel_size
,
[
distributed_executor_backend
=
distributed_executor_backend
,
prompt
if
size
is
not
None
else
text_only_prompts
[
0
]
)
for
size
in
sizes
],
[
image
.
resize
(
size
)
if
size
is
not
None
else
None
for
size
in
sizes
],
)
for
image
,
prompt
in
zip
(
images
,
HF_IMAGE_PROMPTS
)]
if
len
(
sizes
)
==
0
:
inputs_per_image
.
append
(
(
text_only_prompts
,
[
None
]
*
len
(
text_only_prompts
)))
else
:
raise
ValueError
(
"You must provide either `size_factors` or `sizes`"
)
_run_test
(
hf_runner
,
vllm_runner
,
inputs_per_image
,
model
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
)
def
_run_test
(
def
_run_test
(
...
@@ -167,8 +177,8 @@ def _run_test(
...
@@ -167,8 +177,8 @@ def _run_test(
# max_model_len should be greater than image_feature_size
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
with
vllm_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
max_num_seqs
=
16
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
,
enforce_eager
=
True
,
...
@@ -185,14 +195,9 @@ def _run_test(
...
@@ -185,14 +195,9 @@ def _run_test(
def
process
(
hf_inputs
:
BatchEncoding
):
def
process
(
hf_inputs
:
BatchEncoding
):
return
hf_inputs
return
hf_inputs
from
transformers
import
AutoConfig
from
transformers.models.mllama
import
MllamaConfig
as
MllamaConfigHf
# use transformer's MllamaConfig for hf_runner
# and vllm's MllamaConfig for vllm_runner
AutoConfig
.
register
(
"mllama"
,
MllamaConfigHf
,
exist_ok
=
True
)
with
hf_runner
(
model
,
with
hf_runner
(
model
,
dtype
=
dtype
,
dtype
=
dtype
,
model_kwargs
=
{
"device_map"
:
"auto"
},
postprocess_inputs
=
process
,
postprocess_inputs
=
process
,
auto_cls
=
AutoModelForVision2Seq
)
as
hf_model
:
auto_cls
=
AutoModelForVision2Seq
)
as
hf_model
:
hf_outputs_per_image
=
[
hf_outputs_per_image
=
[
...
@@ -203,8 +208,6 @@ def _run_test(
...
@@ -203,8 +208,6 @@ def _run_test(
for
prompts
,
images
in
inputs
for
prompts
,
images
in
inputs
]
]
from
vllm.transformers_utils.configs.mllama
import
MllamaConfig
AutoConfig
.
register
(
"mllama"
,
MllamaConfig
,
exist_ok
=
True
)
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_image
,
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_image
,
vllm_outputs_per_image
):
vllm_outputs_per_image
):
check_logprobs_close
(
check_logprobs_close
(
...
@@ -218,6 +221,7 @@ def _run_test(
...
@@ -218,6 +221,7 @@ def _run_test(
)
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"sizes"
,
"sizes"
,
...
@@ -236,13 +240,13 @@ def _run_test(
...
@@ -236,13 +240,13 @@ def _run_test(
(
1024
,
1024
),
(
512
,
1536
),
(
512
,
2028
),
None
],
(
1024
,
1024
),
(
512
,
1536
),
(
512
,
2028
),
None
],
# mllama has 8 possible aspect ratios, carefully set the sizes
# mllama has 8 possible aspect ratios, carefully set the sizes
# to cover all of them
# to cover all of them
],
])
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
sizes
,
dtype
,
def
test_models_single_leading_image
(
hf_runner
,
vllm_runner
,
image_assets
,
max_tokens
,
num_logprobs
)
->
None
:
model
,
sizes
,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
run_test
(
run_test
(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
...
@@ -256,28 +260,79 @@ def test_models(hf_runner, vllm_runner, image_assets, model, sizes, dtype,
...
@@ -256,28 +260,79 @@ def test_models(hf_runner, vllm_runner, image_assets, model, sizes, dtype,
)
)
@
multi
_gpu_test
(
num_gpus
=
2
)
@
large
_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"sizes"
,
[
[(
512
,
512
),
(
1024
,
512
),
(
1536
,
512
),
(
2048
,
512
),
(
512
,
1024
),
(
1024
,
1024
),
(
512
,
1536
),
(
512
,
2028
),
None
],
],
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models_distributed
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
sizes
,
def
test_models_multi_leading_images
(
hf_runner
,
vllm_runner
,
image_assets
,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
model
,
dtype
,
max_tokens
,
run_test
(
num_logprobs
)
->
None
:
stop_sign
=
image_assets
[
0
].
pil_image
cherry_blossom
=
image_assets
[
1
].
pil_image
inputs
=
[(
[
"<|image|><|image|><|begin_of_text|>Describe 2 images."
,
# noqa: E501
"<|image|><|image|><|begin_of_text|>Describe 2 images."
,
# noqa: E501
"<|image|><|image|><|image|><|begin_of_text|>Describe 3 images."
,
# noqa: E501
],
[
[
stop_sign
,
cherry_blossom
],
# Images with different sizes.
[
stop_sign
.
resize
((
512
,
512
)),
stop_sign
,
],
[
stop_sign
,
stop_sign
.
resize
((
512
,
1536
)),
cherry_blossom
.
resize
((
512
,
1024
)),
],
])]
_run_test
(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
image_assets
,
inputs
,
model
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
)
@
large_gpu_test
(
min_gb
=
48
)
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models_interleaved_images
(
hf_runner
,
vllm_runner
,
image_assets
,
model
,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
stop_sign
=
image_assets
[
0
].
pil_image
cherry_blossom
=
image_assets
[
1
].
pil_image
inputs
=
[(
[
"<|begin_of_text|>The content of the image <|image|> is"
,
# noqa: E501
"<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, "
# noqa: E501
"which is a stop sign and which is a cherry blossom?"
,
# noqa: E501
],
[
[
stop_sign
],
[
stop_sign
,
cherry_blossom
],
])]
_run_test
(
hf_runner
,
vllm_runner
,
inputs
,
model
,
model
,
sizes
=
sizes
,
dtype
=
dtype
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
2
,
tensor_parallel_size
=
1
,
)
)
tests/models/test_oot_registration.py
View file @
6d2051cc
...
@@ -2,7 +2,8 @@ import os
...
@@ -2,7 +2,8 @@ import os
import
pytest
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
PoolingParams
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
..utils
import
fork_new_process_for_each_test
from
..utils
import
fork_new_process_for_each_test
...
@@ -16,7 +17,7 @@ def test_plugin(dummy_opt_path):
...
@@ -16,7 +17,7 @@ def test_plugin(dummy_opt_path):
@
fork_new_process_for_each_test
@
fork_new_process_for_each_test
def
test_oot_registration
(
dummy_opt_path
):
def
test_oot_registration
_text_generation
(
dummy_opt_path
):
os
.
environ
[
"VLLM_PLUGINS"
]
=
"register_dummy_model"
os
.
environ
[
"VLLM_PLUGINS"
]
=
"register_dummy_model"
prompts
=
[
"Hello, my name is"
,
"The text does not matter"
]
prompts
=
[
"Hello, my name is"
,
"The text does not matter"
]
sampling_params
=
SamplingParams
(
temperature
=
0
)
sampling_params
=
SamplingParams
(
temperature
=
0
)
...
@@ -29,3 +30,52 @@ def test_oot_registration(dummy_opt_path):
...
@@ -29,3 +30,52 @@ def test_oot_registration(dummy_opt_path):
# make sure only the first token is generated
# make sure only the first token is generated
rest
=
generated_text
.
replace
(
first_token
,
""
)
rest
=
generated_text
.
replace
(
first_token
,
""
)
assert
rest
==
""
assert
rest
==
""
@
fork_new_process_for_each_test
def
test_oot_registration_embedding
(
dummy_gemma2_embedding_path
):
os
.
environ
[
"VLLM_PLUGINS"
]
=
"register_dummy_model"
prompts
=
[
"Hello, my name is"
,
"The text does not matter"
]
sampling_params
=
PoolingParams
()
llm
=
LLM
(
model
=
dummy_gemma2_embedding_path
,
load_format
=
"dummy"
)
outputs
=
llm
.
encode
(
prompts
,
sampling_params
)
for
output
in
outputs
:
assert
all
(
v
==
0
for
v
in
output
.
outputs
.
embedding
)
image
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
@
fork_new_process_for_each_test
def
test_oot_registration_multimodal
(
dummy_llava_path
):
os
.
environ
[
"VLLM_PLUGINS"
]
=
"register_dummy_model"
prompts
=
[{
"prompt"
:
"What's in the image?<image>"
,
"multi_modal_data"
:
{
"image"
:
image
},
},
{
"prompt"
:
"Describe the image<image>"
,
"multi_modal_data"
:
{
"image"
:
image
},
}]
sampling_params
=
SamplingParams
(
temperature
=
0
)
llm
=
LLM
(
model
=
dummy_llava_path
,
load_format
=
"dummy"
,
max_num_seqs
=
1
,
trust_remote_code
=
True
,
gpu_memory_utilization
=
0.98
,
max_model_len
=
4096
,
enforce_eager
=
True
,
limit_mm_per_prompt
=
{
"image"
:
1
})
first_token
=
llm
.
get_tokenizer
().
decode
(
0
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
in
outputs
:
generated_text
=
output
.
outputs
[
0
].
text
# make sure only the first token is generated
rest
=
generated_text
.
replace
(
first_token
,
""
)
assert
rest
==
""
tests/models/test_registry.py
View file @
6d2051cc
import
warnings
import
pytest
import
pytest
import
t
ransformers
import
t
orch.cuda
from
vllm.model_executor.models
import
_MODELS
,
ModelRegistry
from
vllm.model_executor.models
import
(
is_embedding_model
,
is_text_generation_model
,
supports_multimodal
)
from
vllm.model_executor.models.registry
import
(
_EMBEDDING_MODELS
,
_MULTIMODAL_MODELS
,
_SPECULATIVE_DECODING_MODELS
,
_TEXT_GENERATION_MODELS
,
ModelRegistry
)
from
vllm.platforms
import
current_platform
from
..utils
import
fork_new_process_for_each_test
@
pytest
.
mark
.
parametrize
(
"model_cls"
,
_MODELS
)
def
test_registry_imports
(
model_cls
):
if
(
model_cls
in
(
"LlavaOnevisionForConditionalGeneration"
,
"Qwen2VLForConditionalGeneration"
)
and
transformers
.
__version__
<
"4.45"
):
pytest
.
skip
(
"Waiting for next transformers release"
)
@
pytest
.
mark
.
parametrize
(
"model_arch"
,
ModelRegistry
.
get_supported_archs
())
def
test_registry_imports
(
model_arch
):
# Ensure all model classes can be imported successfully
# Ensure all model classes can be imported successfully
ModelRegistry
.
resolve_model_cls
([
model_cls
])
model_cls
,
_
=
ModelRegistry
.
resolve_model_cls
(
model_arch
)
if
model_arch
in
_SPECULATIVE_DECODING_MODELS
:
pass
# Ignore these models which do not have a unified format
else
:
assert
is_text_generation_model
(
model_cls
)
is
(
model_arch
in
_TEXT_GENERATION_MODELS
or
model_arch
in
_MULTIMODAL_MODELS
)
assert
is_embedding_model
(
model_cls
)
is
(
model_arch
in
_EMBEDDING_MODELS
)
assert
supports_multimodal
(
model_cls
)
is
(
model_arch
in
_MULTIMODAL_MODELS
)
@
fork_new_process_for_each_test
@
pytest
.
mark
.
parametrize
(
"model_arch,is_mm,init_cuda"
,
[
(
"LlamaForCausalLM"
,
False
,
False
),
(
"MllamaForConditionalGeneration"
,
True
,
False
),
(
"LlavaForConditionalGeneration"
,
True
,
True
),
])
def
test_registry_is_multimodal
(
model_arch
,
is_mm
,
init_cuda
):
assert
ModelRegistry
.
is_multimodal_model
(
model_arch
)
is
is_mm
if
init_cuda
and
current_platform
.
is_cuda_alike
():
assert
not
torch
.
cuda
.
is_initialized
()
ModelRegistry
.
resolve_model_cls
(
model_arch
)
if
not
torch
.
cuda
.
is_initialized
():
warnings
.
warn
(
"This model no longer initializes CUDA on import. "
"Please test using a different one."
,
stacklevel
=
2
)
@
fork_new_process_for_each_test
@
pytest
.
mark
.
parametrize
(
"model_arch,is_pp,init_cuda"
,
[
(
"MLPSpeculatorPreTrainedModel"
,
False
,
False
),
(
"DeepseekV2ForCausalLM"
,
True
,
False
),
(
"Qwen2VLForConditionalGeneration"
,
True
,
True
),
])
def
test_registry_is_pp
(
model_arch
,
is_pp
,
init_cuda
):
assert
ModelRegistry
.
is_pp_supported_model
(
model_arch
)
is
is_pp
if
init_cuda
and
current_platform
.
is_cuda_alike
():
assert
not
torch
.
cuda
.
is_initialized
()
ModelRegistry
.
resolve_model_cls
(
model_arch
)
if
not
torch
.
cuda
.
is_initialized
():
warnings
.
warn
(
"This model no longer initializes CUDA on import. "
"Please test using a different one."
,
stacklevel
=
2
)
tests/models/utils.py
View file @
6d2051cc
import
warnings
import
warnings
from
typing
import
Dict
,
List
,
Optional
,
Sequence
,
Tuple
,
Union
from
typing
import
Dict
,
List
,
Optional
,
Sequence
,
Tuple
,
Union
import
torch
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.inputs
import
InputContext
from
vllm.inputs
import
InputContext
from
vllm.sequence
import
Logprob
,
PromptLogprobs
,
SampleLogprobs
from
vllm.sequence
import
Logprob
,
PromptLogprobs
,
SampleLogprobs
from
vllm.utils
import
is_cpu
TokensText
=
Tuple
[
List
[
int
],
str
]
TokensText
=
Tuple
[
List
[
int
],
str
]
...
@@ -247,6 +250,7 @@ def check_logprobs_close(
...
@@ -247,6 +250,7 @@ def check_logprobs_close(
def
build_model_context
(
model_name
:
str
,
def
build_model_context
(
model_name
:
str
,
tokenizer_name
:
Optional
[
str
]
=
None
,
tokenizer_name
:
Optional
[
str
]
=
None
,
trust_remote_code
:
bool
=
False
,
trust_remote_code
:
bool
=
False
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
None
,
mm_processor_kwargs
:
Optional
[
Dict
]
=
None
,
mm_processor_kwargs
:
Optional
[
Dict
]
=
None
,
limit_mm_per_prompt
:
Optional
[
Dict
]
=
None
):
limit_mm_per_prompt
:
Optional
[
Dict
]
=
None
):
"""Creates an InputContext for a given model.
"""Creates an InputContext for a given model.
...
@@ -264,12 +268,15 @@ def build_model_context(model_name: str,
...
@@ -264,12 +268,15 @@ def build_model_context(model_name: str,
"""
"""
if
tokenizer_name
is
None
:
if
tokenizer_name
is
None
:
tokenizer_name
=
model_name
tokenizer_name
=
model_name
if
dtype
is
None
:
dtype
=
"bfloat16"
if
is_cpu
()
else
"half"
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model_name
,
model_name
,
tokenizer_name
,
tokenizer_name
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
dtype
=
"float32"
,
dtype
=
dtype
,
seed
=
0
,
seed
=
0
,
mm_processor_kwargs
=
mm_processor_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
...
...
tests/mq_llm_engine/test_error_handling.py
View file @
6d2051cc
...
@@ -61,7 +61,7 @@ async def test_evil_forward(tmp_socket):
...
@@ -61,7 +61,7 @@ async def test_evil_forward(tmp_socket):
# Throws an error in first forward pass.
# Throws an error in first forward pass.
with
pytest
.
raises
(
RAISED_ERROR
):
with
pytest
.
raises
(
RAISED_ERROR
):
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
sampling_params
=
SamplingParams
(),
request_id
=
uuid
.
uuid4
()):
request_id
=
uuid
.
uuid4
()):
pass
pass
...
@@ -69,7 +69,7 @@ async def test_evil_forward(tmp_socket):
...
@@ -69,7 +69,7 @@ async def test_evil_forward(tmp_socket):
# Engine is errored, should get ENGINE_DEAD_ERROR.
# Engine is errored, should get ENGINE_DEAD_ERROR.
with
pytest
.
raises
(
MQEngineDeadError
):
with
pytest
.
raises
(
MQEngineDeadError
):
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
sampling_params
=
SamplingParams
(),
request_id
=
uuid
.
uuid4
()):
request_id
=
uuid
.
uuid4
()):
pass
pass
...
@@ -118,7 +118,7 @@ async def test_failed_health_check(tmp_socket):
...
@@ -118,7 +118,7 @@ async def test_failed_health_check(tmp_socket):
# Generate call should throw ENGINE_DEAD_ERROR
# Generate call should throw ENGINE_DEAD_ERROR
with
pytest
.
raises
(
MQEngineDeadError
):
with
pytest
.
raises
(
MQEngineDeadError
):
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
sampling_params
=
SamplingParams
(),
request_id
=
uuid
.
uuid4
()):
request_id
=
uuid
.
uuid4
()):
pass
pass
...
@@ -160,7 +160,7 @@ async def test_failed_abort(tmp_socket):
...
@@ -160,7 +160,7 @@ async def test_failed_abort(tmp_socket):
# with reference to the original KeyError("foo")
# with reference to the original KeyError("foo")
with
pytest
.
raises
(
MQEngineDeadError
)
as
execinfo
:
with
pytest
.
raises
(
MQEngineDeadError
)
as
execinfo
:
async
for
_
in
client
.
generate
(
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(
max_tokens
=
10
),
sampling_params
=
SamplingParams
(
max_tokens
=
10
),
request_id
=
uuid
.
uuid4
()):
request_id
=
uuid
.
uuid4
()):
pass
pass
...
@@ -183,7 +183,7 @@ async def test_bad_request(tmp_socket):
...
@@ -183,7 +183,7 @@ async def test_bad_request(tmp_socket):
# Invalid request should fail, but not crash the server.
# Invalid request should fail, but not crash the server.
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
sampling_params
=
SamplingParams
(),
request_id
=
"abcd-1"
,
request_id
=
"abcd-1"
,
lora_request
=
LoRARequest
(
lora_request
=
LoRARequest
(
...
@@ -192,7 +192,7 @@ async def test_bad_request(tmp_socket):
...
@@ -192,7 +192,7 @@ async def test_bad_request(tmp_socket):
pass
pass
# This request should be okay.
# This request should be okay.
async
for
_
in
client
.
generate
(
inputs
=
"Hello my name is"
,
async
for
_
in
client
.
generate
(
prompt
=
"Hello my name is"
,
sampling_params
=
SamplingParams
(),
sampling_params
=
SamplingParams
(),
request_id
=
"abcd-2"
):
request_id
=
"abcd-2"
):
pass
pass
...
...
tests/mq_llm_engine/utils.py
View file @
6d2051cc
...
@@ -20,7 +20,7 @@ async def generate(
...
@@ -20,7 +20,7 @@ async def generate(
count
=
0
count
=
0
async
for
out
in
client
.
generate
(
async
for
out
in
client
.
generate
(
request_id
=
request_id
,
request_id
=
request_id
,
inputs
=
"Hello my name is Robert and"
,
prompt
=
"Hello my name is Robert and"
,
sampling_params
=
SamplingParams
(
max_tokens
=
num_tokens
,
sampling_params
=
SamplingParams
(
max_tokens
=
num_tokens
,
temperature
=
0
)):
temperature
=
0
)):
...
...
tests/multi_step/test_correctness_async_llm.py
View file @
6d2051cc
...
@@ -17,7 +17,6 @@ NUM_PROMPTS = [10]
...
@@ -17,7 +17,6 @@ NUM_PROMPTS = [10]
DEFAULT_SERVER_ARGS
:
List
[
str
]
=
[
DEFAULT_SERVER_ARGS
:
List
[
str
]
=
[
"--disable-log-requests"
,
"--disable-log-requests"
,
"--use-v2-block-manager"
,
"--worker-use-ray"
,
"--worker-use-ray"
,
"--gpu-memory-utilization"
,
"--gpu-memory-utilization"
,
"0.85"
,
"0.85"
,
...
@@ -37,6 +36,7 @@ DEFAULT_SERVER_ARGS: List[str] = [
...
@@ -37,6 +36,7 @@ DEFAULT_SERVER_ARGS: List[str] = [
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"is_async"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"is_async"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"attention_backend"
,
[
"FLASHINFER"
,
"FLASH_ATTN"
])
@
pytest
.
mark
.
parametrize
(
"attention_backend"
,
[
"FLASHINFER"
,
"FLASH_ATTN"
])
@
pytest
.
mark
.
parametrize
(
"enable_chunked_prefill"
,
[
True
,
False
])
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_multi_step
(
async
def
test_multi_step
(
example_prompts
,
example_prompts
,
...
@@ -49,6 +49,7 @@ async def test_multi_step(
...
@@ -49,6 +49,7 @@ async def test_multi_step(
is_async
:
bool
,
is_async
:
bool
,
num_logprobs
:
Optional
[
int
],
num_logprobs
:
Optional
[
int
],
attention_backend
:
str
,
attention_backend
:
str
,
enable_chunked_prefill
:
bool
,
monkeypatch
,
monkeypatch
,
)
->
None
:
)
->
None
:
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol
...
@@ -74,6 +75,10 @@ async def test_multi_step(
...
@@ -74,6 +75,10 @@ async def test_multi_step(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> no logprobs
completions endpoint; `None` -> no logprobs
"""
"""
if
enable_chunked_prefill
and
\
(
pp_size
>
1
or
attention_backend
!=
"FLASH_ATTN"
):
pytest
.
skip
(
"Multi-step with Chunked-Prefill only supports"
"PP=1 and FLASH_ATTN backend"
)
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
...
@@ -93,6 +98,9 @@ async def test_multi_step(
...
@@ -93,6 +98,9 @@ async def test_multi_step(
if
eager_mode
:
if
eager_mode
:
ms_server_args
.
append
(
"--enforce-eager"
)
ms_server_args
.
append
(
"--enforce-eager"
)
if
enable_chunked_prefill
:
ms_server_args
.
append
(
"--enable-chunked-prefill"
)
distributed_args
=
[
distributed_args
=
[
"--tensor-parallel-size"
,
"--tensor-parallel-size"
,
str
(
tp_size
),
str
(
tp_size
),
...
@@ -133,3 +141,85 @@ async def test_multi_step(
...
@@ -133,3 +141,85 @@ async def test_multi_step(
name_0
=
"hf"
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
name_1
=
"vllm"
,
)
)
@
pytest
.
mark
.
parametrize
((
"tp_size, pp_size"
),
[
(
1
,
2
),
])
@
pytest
.
mark
.
asyncio
async
def
test_multi_step_pp_smoke
(
tp_size
:
int
,
pp_size
:
int
,
monkeypatch
,
)
->
None
:
"""
Smoke test for the vLLM engine with multi-step scheduling in an
OpenAI-protocol client/server environment.
This tests compares the outputs between multi-step scheduling and
single-step scheduling. Notably, this test lets the engines generate
more tokens (default is 5) and test for an exact match over all the
tokens.
Args:
tp_size: degree of tensor-parallelism
pp_size: degree of pipeline-parallelism
eager_mode
"""
model
=
"JackFram/llama-160m"
num_scheduler_steps
=
8
attention_backend
=
"FLASH_ATTN"
max_num_seqs
=
3
override_backend_env_variable
(
monkeypatch
,
attention_backend
)
# Prompt from the ShareGPT dataset
prompts
=
[
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
"in the jtbd context whats a push?"
,
# codespell:ignore
]
# Use varying max_tokens to introduce scheduling randomness.
max_tokens
=
[
10
*
i
for
i
in
range
(
1
,
len
(
prompts
)
+
1
)]
assert
len
(
prompts
)
==
len
(
max_tokens
)
test_args
=
[
"--tensor-parallel-size"
,
str
(
tp_size
),
"--pipeline-parallel-size"
,
str
(
pp_size
),
"--max-num-seqs"
,
str
(
max_num_seqs
)
]
server_args
=
DEFAULT_SERVER_ARGS
+
test_args
ms_server_args
=
DEFAULT_SERVER_ARGS
+
\
[
"--num-scheduler-steps"
,
f
"
{
num_scheduler_steps
}
"
]
+
\
test_args
# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# was raised 3x to 720 *just for this test* due to
# observed timeouts in GHA CI
ref_completions
=
await
completions_with_server_args
(
prompts
=
prompts
,
model_name
=
model
,
server_cli_args
=
server_args
,
num_logprobs
=
None
,
max_wait_seconds
=
5
*
240
,
max_tokens
=
max_tokens
)
test_completions
=
await
completions_with_server_args
(
prompts
=
prompts
,
model_name
=
model
,
server_cli_args
=
ms_server_args
,
num_logprobs
=
None
,
max_wait_seconds
=
5
*
240
,
max_tokens
=
max_tokens
)
# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations
=
get_client_text_generations
(
ref_completions
)
test_generations
=
get_client_text_generations
(
test_completions
)
assert
ref_generations
==
test_generations
tests/multi_step/test_correctness_llm.py
View file @
6d2051cc
# Test the LLMEngine with multi-step-decoding
# Test the LLMEngine with multi-step-decoding
import
copy
from
typing
import
Optional
from
typing
import
Optional
import
pytest
import
pytest
...
@@ -16,6 +17,7 @@ NUM_PROMPTS = [10]
...
@@ -16,6 +17,7 @@ NUM_PROMPTS = [10]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"enable_chunked_prefill"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"num_scheduler_steps"
,
NUM_SCHEDULER_STEPS
)
@
pytest
.
mark
.
parametrize
(
"num_scheduler_steps"
,
NUM_SCHEDULER_STEPS
)
...
@@ -28,6 +30,7 @@ def test_multi_step_llm(
...
@@ -28,6 +30,7 @@ def test_multi_step_llm(
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
tp_size
:
int
,
tp_size
:
int
,
enable_chunked_prefill
:
bool
,
max_tokens
:
int
,
max_tokens
:
int
,
enforce_eager
:
int
,
enforce_eager
:
int
,
num_scheduler_steps
:
int
,
num_scheduler_steps
:
int
,
...
@@ -51,6 +54,7 @@ def test_multi_step_llm(
...
@@ -51,6 +54,7 @@ def test_multi_step_llm(
model: model under test (same for single- and multi-step engines)
model: model under test (same for single- and multi-step engines)
dtype: tensor datatype for engine to utilize
dtype: tensor datatype for engine to utilize
tp_size: degree of tensor-parallelism
tp_size: degree of tensor-parallelism
enable_chunked_prefill: chunked-prefill on/off
max_tokens: the maximum number of tokens to generate
max_tokens: the maximum number of tokens to generate
enforce_eager
enforce_eager
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
...
@@ -72,7 +76,7 @@ def test_multi_step_llm(
...
@@ -72,7 +76,7 @@ def test_multi_step_llm(
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
tensor_parallel_size
=
tp_size
,
use_v2_block_manager
=
True
,
enable_chunked_prefill
=
enable_chunked_prefill
,
num_scheduler_steps
=
num_scheduler_steps
,
num_scheduler_steps
=
num_scheduler_steps
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
(
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
vllm_outputs
=
(
vllm_model
.
generate_greedy
(
prompts
,
max_tokens
)
...
@@ -164,7 +168,6 @@ def test_multi_step_llm_w_prompt_logprobs(
...
@@ -164,7 +168,6 @@ def test_multi_step_llm_w_prompt_logprobs(
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
tensor_parallel_size
=
tp_size
,
use_v2_block_manager
=
True
,
num_scheduler_steps
=
num_scheduler_steps
,
num_scheduler_steps
=
num_scheduler_steps
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
...
@@ -192,3 +195,158 @@ def test_multi_step_llm_w_prompt_logprobs(
...
@@ -192,3 +195,158 @@ def test_multi_step_llm_w_prompt_logprobs(
name_0
=
"hf"
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
name_1
=
"vllm"
,
)
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"num_scheduler_steps"
,
NUM_SCHEDULER_STEPS
)
@
pytest
.
mark
.
parametrize
(
"num_prompts"
,
NUM_PROMPTS
)
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
None
,
5
])
def
test_multi_step_llm_chunked_prefill_prefix_cache
(
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
tp_size
:
int
,
max_tokens
:
int
,
enforce_eager
:
int
,
num_scheduler_steps
:
int
,
num_prompts
:
int
,
num_logprobs
:
Optional
[
int
],
)
->
None
:
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
Set up contrived scenario which tests for a possible failure mode of
scheduling with multi-step+"single-step chunked prefill"+APC
"single-step chunked prefill" here refers to the current vLLM multi-step+
chunked-prefill implementation, which requires that a prefill may only
be scheduled in the same step as decodes if the prefill prompt fits in a
single chunk (note that "complete" multi-step+chunked-prefill would allow
a prefill to span multiple chunks & multiple steps but that is not yet
the case.)
"APC" is short for "automatic prefix caching".
This test creates a scenario where the scheduler must decide whether/how
to schedule a prefill with a prompt that exceeds the available token budget.
The correct behavior for multi-step+"single-step chunked prefill"+APC is to
put off scheduling the prefill until a future step.
Validate that:
* Multi-step kernels do not raise an exception due to incorrect scheduler
behavior
* Generated tokens match between
multi-step+"single-step chunked prefill"+APC and
single-step scheduling.
* (If logprobs are enabled) check logprobs are close enough
Args:
vllm_runner: vLLM model runner fixture
example_prompts: test fixture providing example prompts
model: model under test (same for single- and multi-step engines)
dtype: tensor datatype for engine to utilize
tp_size: degree of tensor-parallelism
max_tokens: the maximum number of tokens to generate
enforce_eager
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
GPU -> CPU output transfer
num_prompts: number of example prompts under test
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned.
"""
# Set up contrived test for correct scheduling behavior with
# multi-step+"single-step chunked prefill"+APC.
#
# Assume block_size=16
#
# Assume max_num_batched_tokens=48
# => Per-step token budget=48
#
# 1. Scheduler schedules 0th prompt (24 tokens)
# => Remaining token budget=24
# 2. Scheduler attempts to schedule 1st prompt (30 tokens)
# * 30 tokens exceeds 24 token remaining budget
# * Correct behavior: do not schedule this prompt in this step
# * Incorrect behavior: schedule prompt chunk
# * `do_sample=False` for this prompt in this step
# * Chunk size = (remaining tokens // block size) * block size
#
# The Incorrect scheduling behavior - if it occurs - will cause an exception
# in the model runner resulting from `do_sample=False`.
assert
len
(
example_prompts
)
>=
2
challenge_prompts
=
copy
.
deepcopy
(
example_prompts
)
challenge_prompts
[
0
]
=
(
'vLLM is a high-throughput and memory-efficient '
'inference and serving engine for LLMs.
\n
'
)
# 24 tok
challenge_prompts
[
1
]
=
(
'Briefly describe the major milestones in the '
'development of artificial intelligence from 1950 to 2020.
\n
'
)
# 30 tok
# If necessary, adjust the length of `challenge_prompts` to match
# `num_prompts`
if
len
(
challenge_prompts
)
<
num_prompts
:
challenge_prompts
=
(
challenge_prompts
*
((
num_prompts
//
len
(
challenge_prompts
))
+
1
))
challenge_prompts
=
challenge_prompts
[:
num_prompts
]
assert
len
(
challenge_prompts
)
==
num_prompts
# Single-step scheduler baseline
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
num_scheduler_steps
=
num_scheduler_steps
,
max_model_len
=
48
,
max_num_batched_tokens
=
48
,
max_num_seqs
=
4
,
block_size
=
16
,
)
as
vllm_model
:
outputs_baseline
=
(
vllm_model
.
generate_greedy
(
challenge_prompts
,
max_tokens
)
if
num_logprobs
is
None
else
vllm_model
.
generate_greedy_logprobs
(
challenge_prompts
,
max_tokens
,
num_logprobs
))
# multi-step+"single-step chunked prefill"+APC
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
,
tensor_parallel_size
=
tp_size
,
enable_chunked_prefill
=
True
,
enable_prefix_caching
=
True
,
num_scheduler_steps
=
num_scheduler_steps
,
max_model_len
=
48
,
max_num_batched_tokens
=
48
,
max_num_seqs
=
4
,
block_size
=
16
,
)
as
vllm_model
:
outputs_w_features
=
(
vllm_model
.
generate_greedy
(
challenge_prompts
,
max_tokens
)
if
num_logprobs
is
None
else
vllm_model
.
generate_greedy_logprobs
(
challenge_prompts
,
max_tokens
,
num_logprobs
))
if
num_logprobs
is
None
:
# No-logprobs test
check_outputs_equal
(
outputs_0_lst
=
outputs_baseline
,
outputs_1_lst
=
outputs_w_features
,
name_0
=
"multi-step"
,
name_1
=
"multi-step+features"
,
)
else
:
# Yes-logprobs test
check_logprobs_close
(
outputs_0_lst
=
outputs_baseline
,
outputs_1_lst
=
outputs_w_features
,
name_0
=
"multi-step"
,
name_1
=
"multi-step+features"
,
)
tests/multimodal/test_processor_kwargs.py
View file @
6d2051cc
...
@@ -5,7 +5,7 @@ from unittest.mock import patch
...
@@ -5,7 +5,7 @@ from unittest.mock import patch
import
pytest
import
pytest
import
torch
import
torch
from
vllm.inputs
import
InputContext
,
LLMI
nputs
from
vllm.inputs
import
DecoderOnlyInputs
,
InputContext
,
token_i
nputs
from
vllm.inputs.registry
import
InputRegistry
from
vllm.inputs.registry
import
InputRegistry
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.sequence
import
VLLM_TOKEN_ID_ARRAY_TYPE
,
SequenceData
from
vllm.sequence
import
VLLM_TOKEN_ID_ARRAY_TYPE
,
SequenceData
...
@@ -31,7 +31,7 @@ def use_processor_mock():
...
@@ -31,7 +31,7 @@ def use_processor_mock():
"""Patches the internal model input processor with an override callable."""
"""Patches the internal model input processor with an override callable."""
def
custom_processor
(
ctx
:
InputContext
,
def
custom_processor
(
ctx
:
InputContext
,
llm_
inputs
:
LLM
Inputs
,
inputs
:
DecoderOnly
Inputs
,
*
,
*
,
num_crops
=
DEFAULT_NUM_CROPS
):
num_crops
=
DEFAULT_NUM_CROPS
):
# For testing purposes, we don't worry about the llm inputs / return
# For testing purposes, we don't worry about the llm inputs / return
...
@@ -74,38 +74,61 @@ def mm_model_cls():
...
@@ -74,38 +74,61 @@ def mm_model_cls():
# lambda whose signature matches max token calcs extra & mapper + extra kwargs
# lambda whose signature matches max token calcs extra & mapper + extra kwargs
get_num_crops
=
lambda
ctx
,
*
,
num_crops
=
DEFAULT_NUM_CROPS
:
num_crops
get_num_crops
=
lambda
ctx
,
*
,
num_crops
=
DEFAULT_NUM_CROPS
:
num_crops
custom_mapper
=
lambda
ctx
,
data
,
*
,
num_crops
=
DEFAULT_NUM_CROPS
:
{
custom_mapper
=
lambda
ctx
,
data
,
*
,
num_crops
=
DEFAULT_NUM_CROPS
:
{
"
num_
pixels"
:
torch
.
zeros
(
size
=
(
1
,
num_crops
+
1
,
3
,
336
,
336
))
"pixel
_value
s"
:
torch
.
zeros
(
size
=
(
1
,
num_crops
+
1
,
3
,
336
,
336
))
}
}
### Test for default processor logic & mm_processor_kwargs wrapping
### Test
s
for default processor logic & mm_processor_kwargs wrapping
def
test_default_processor_is_a_noop
():
def
test_default_processor_is_a_noop
():
"""Ensure that by default, there is no processor override."""
"""Ensure that by default, there is no processor override."""
dummy_registry
=
InputRegistry
()
dummy_registry
=
InputRegistry
()
ctx
=
build_model_context
(
DUMMY_MODEL_ID
)
ctx
=
build_model_context
(
DUMMY_MODEL_ID
)
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
proc_inputs
=
LLMI
nputs
(
prompt_token_ids
=
[],
prompt
=
""
)
proc_inputs
=
token_i
nputs
(
prompt_token_ids
=
[],
prompt
=
""
)
proc_outputs
=
processor
(
inputs
=
proc_inputs
)
proc_outputs
=
processor
(
inputs
=
proc_inputs
)
assert
proc_inputs
is
proc_outputs
assert
proc_inputs
is
proc_outputs
@
pytest
.
mark
.
parametrize
(
"num_crops"
,
[
None
,
NUM_CROPS_OVERRIDE
])
def
_get_num_crops_info
(
init_num_crops
:
int
,
inference_num_crops
:
int
):
def
test_processor_default_kwargs
(
use_processor_mock
,
num_crops
):
"""Get the init / inference kwargs and expected num_crops for this test."""
"""Ensure input processors can use processor kwargs."""
dummy_registry
=
InputRegistry
()
# If we have a value for num_crops, pass the override value and make
# If we have a value for num_crops, pass the override value and make
# sure we get that value as a return-value from our mock processor,
# sure we get that value as a return-value from our mock processor,
# otherwise fall back to the default value
# otherwise fall back to the default value
mm_processor
_kwargs
=
None
if
num_crops
is
None
else
{
init
_kwargs
=
None
if
init_
num_crops
is
None
else
{
"num_crops"
:
num_crops
"num_crops"
:
init_
num_crops
}
}
expected_num_crops
=
DEFAULT_NUM_CROPS
if
num_crops
is
None
else
num_crops
inference_kwargs
=
None
if
inference_num_crops
is
None
else
{
ctx
=
build_model_context
(
DUMMY_MODEL_ID
,
"num_crops"
:
inference_num_crops
mm_processor_kwargs
=
mm_processor_kwargs
)
}
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
if
inference_num_crops
is
not
None
:
expected_seq_count
=
inference_num_crops
elif
init_num_crops
is
not
None
:
expected_seq_count
=
init_num_crops
else
:
expected_seq_count
=
DEFAULT_NUM_CROPS
return
init_kwargs
,
inference_kwargs
,
expected_seq_count
@
pytest
.
mark
.
parametrize
(
"init_num_crops,inference_num_crops"
,
[
(
None
,
None
),
(
NUM_CROPS_OVERRIDE
,
None
),
(
DEFAULT_NUM_CROPS
,
NUM_CROPS_OVERRIDE
),
])
def
test_input_processor_kwargs
(
use_processor_mock
,
init_num_crops
,
inference_num_crops
):
"""Ensure input processors can use processor kwargs."""
dummy_registry
=
InputRegistry
()
init_kwargs
,
inference_kwargs
,
expected_seq_count
=
_get_num_crops_info
(
init_num_crops
,
inference_num_crops
)
num_crops_val
=
processor
(
LLMInputs
(
prompt_token_ids
=
[],
prompt
=
""
))
ctx
=
build_model_context
(
DUMMY_MODEL_ID
,
mm_processor_kwargs
=
init_kwargs
)
assert
num_crops_val
==
expected_num_crops
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
num_crops_val
=
processor
(
token_inputs
(
prompt_token_ids
=
[],
prompt
=
""
,
mm_processor_kwargs
=
inference_kwargs
))
assert
num_crops_val
==
expected_seq_count
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -124,11 +147,16 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock,
...
@@ -124,11 +147,16 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock,
mm_processor_kwargs
):
mm_processor_kwargs
):
"""Ensure that input processors filter out invalid mm_processor_kwargs"""
"""Ensure that input processors filter out invalid mm_processor_kwargs"""
dummy_registry
=
InputRegistry
()
dummy_registry
=
InputRegistry
()
# Should filter out the init time kwargs
ctx
=
build_model_context
(
DUMMY_MODEL_ID
,
ctx
=
build_model_context
(
DUMMY_MODEL_ID
,
mm_processor_kwargs
=
mm_processor_kwargs
)
mm_processor_kwargs
=
mm_processor_kwargs
)
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
num_crops_val
=
processor
(
LLMInputs
(
prompt_token_ids
=
[],
prompt
=
""
))
# Should filter out the inference time kwargs
num_crops_val
=
processor
(
token_inputs
(
prompt_token_ids
=
[],
prompt
=
""
,
mm_processor_kwargs
=
mm_processor_kwargs
))
assert
num_crops_val
==
DEFAULT_NUM_CROPS
assert
num_crops_val
==
DEFAULT_NUM_CROPS
...
@@ -271,32 +299,34 @@ def test_default_mapper_with_processer_kwargs(image_assets, num_crops):
...
@@ -271,32 +299,34 @@ def test_default_mapper_with_processer_kwargs(image_assets, num_crops):
assert
mapped_inputs
[
"pixel_values"
].
shape
[
1
]
==
num_crops
+
1
assert
mapped_inputs
[
"pixel_values"
].
shape
[
1
]
==
num_crops
+
1
@
pytest
.
mark
.
parametrize
(
"num_crops"
,
[
None
,
NUM_CROPS_OVERRIDE
])
@
pytest
.
mark
.
parametrize
(
"init_num_crops,inference_num_crops"
,
[
def
test_custom_mapper_kwarg_overrides
(
image_assets
,
num_crops
):
(
None
,
None
),
(
NUM_CROPS_OVERRIDE
,
None
),
(
DEFAULT_NUM_CROPS
,
NUM_CROPS_OVERRIDE
),
])
def
test_custom_mapper_kwarg_overrides
(
image_assets
,
init_num_crops
,
inference_num_crops
):
"""Ensure custom mappers can use processor kwargs."""
"""Ensure custom mappers can use processor kwargs."""
mm_processor_kwargs
=
None
if
num_crops
is
None
else
{
init_kwargs
,
inference_kwargs
,
expected_seq_count
=
_get_num_crops_info
(
"num_crops"
:
num_crops
init_num_crops
,
inference_num_crops
)
}
expected_seq_count
=
DEFAULT_NUM_CROPS
if
num_crops
is
None
else
num_crops
ctx
=
build_model_context
(
MULTIMODAL_MODEL_ID
,
ctx
=
build_model_context
(
MULTIMODAL_MODEL_ID
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
mm_processor
_kwargs
,
mm_processor_kwargs
=
init
_kwargs
,
limit_mm_per_prompt
=
{
"image"
:
1
})
limit_mm_per_prompt
=
{
"image"
:
1
})
mm_registry
=
MultiModalRegistry
()
mm_registry
=
MultiModalRegistry
()
mm_registry
.
init_mm_limits_per_prompt
(
ctx
.
model_config
)
mm_registry
.
init_mm_limits_per_prompt
(
ctx
.
model_config
)
# Patch the image registry for phi3v with our lambda that is compatible
# with overrides, then ensure that calling the method correctly echos
# our num_crops value back from the mm_processor_kwargs.
image
=
image_assets
[
0
].
pil_image
image
=
image_assets
[
0
].
pil_image
mm_inputs
=
{
"image"
:
image
}
mm_inputs
=
{
"image"
:
image
}
with
patch
.
object
(
# Patch the image registry for phi3v with our lambda that is compatible
mm_registry
.
_get_plugin
(
"image"
),
# with overrides, then ensure that calling the method correctly echos
"_default_input_mapper"
,
# our num_crops value back from the mm_processor_kwargs.
{
mm_model_cls
():
custom_mapper
},
mm_registry
.
_get_plugin
(
"image"
).
register_input_mapper
(
custom_mapper
)(
):
mm_model_cls
())
mapped_inputs
=
mm_registry
.
map_input
(
ctx
.
model_config
,
mm_inputs
)
mapped_inputs
=
mm_registry
.
map_input
(
ctx
.
model_config
,
mm_inputs
,
inference_kwargs
)
assert
mapped_inputs
[
"pixel_values"
].
shape
[
1
]
==
expected_seq_count
+
1
assert
mapped_inputs
[
"pixel_values"
].
shape
[
1
]
==
expected_seq_count
+
1
...
@@ -316,6 +346,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, num_crops):
...
@@ -316,6 +346,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, num_crops):
def
test_custom_mapper_with_sad_kwarg_overrides
(
image_assets
,
def
test_custom_mapper_with_sad_kwarg_overrides
(
image_assets
,
mm_processor_kwargs
):
mm_processor_kwargs
):
"""Ensure that custom mappers filters out invalid mm_processor_kwargs"""
"""Ensure that custom mappers filters out invalid mm_processor_kwargs"""
# Should filter out the init time kwargs
ctx
=
build_model_context
(
MULTIMODAL_MODEL_ID
,
ctx
=
build_model_context
(
MULTIMODAL_MODEL_ID
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
mm_processor_kwargs
=
mm_processor_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
...
@@ -323,17 +354,16 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
...
@@ -323,17 +354,16 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
mm_registry
=
MultiModalRegistry
()
mm_registry
=
MultiModalRegistry
()
mm_registry
.
init_mm_limits_per_prompt
(
ctx
.
model_config
)
mm_registry
.
init_mm_limits_per_prompt
(
ctx
.
model_config
)
# Patch the image registry for phi3v with our lambda that is compatible
# with overrides, then ensure that calling the method correctly echos
# our num_crops value back from the mm_processor_kwargs.
image
=
image_assets
[
0
].
pil_image
image
=
image_assets
[
0
].
pil_image
mm_inputs
=
{
"image"
:
image
}
mm_inputs
=
{
"image"
:
image
}
with
patch
.
object
(
# Patch the image registry for phi3v with our lambda that is compatible
mm_registry
.
_get_plugin
(
"image"
),
# with overrides, then ensure that calling the method correctly echos
"_default_input_mapper"
,
# our num_crops value back from the mm_processor_kwargs.
{
mm_model_cls
():
custom_mapper
},
mm_registry
.
_get_plugin
(
"image"
).
register_input_mapper
(
custom_mapper
)(
):
mm_model_cls
())
mapped_inputs
=
mm_registry
.
map_input
(
ctx
.
model_config
,
mm_inputs
)
# Should filter out the inference time kwargs
mapped_inputs
=
mm_registry
.
map_input
(
ctx
.
model_config
,
mm_inputs
,
mm_processor_kwargs
=
mm_processor_kwargs
)
assert
mapped_inputs
[
"pixel_values"
].
shape
[
1
]
==
DEFAULT_NUM_CROPS
+
1
assert
mapped_inputs
[
"pixel_values"
].
shape
[
1
]
==
DEFAULT_NUM_CROPS
+
1
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
View file @
6d2051cc
from
typing
import
Optional
import
torch
from
vllm
import
ModelRegistry
from
vllm
import
ModelRegistry
from
vllm.model_executor.models.opt
import
OPTForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
class
MyOPTForCausalLM
(
OPTForCausalLM
):
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
)
->
Optional
[
torch
.
Tensor
]:
# this dummy model always predicts the first token
logits
=
super
().
compute_logits
(
hidden_states
,
sampling_metadata
)
if
logits
is
not
None
:
logits
.
zero_
()
logits
[:,
0
]
+=
1.0
return
logits
def
register
():
def
register
():
# register our dummy model
# Test directly passing the model
from
.my_opt
import
MyOPTForCausalLM
if
"MyOPTForCausalLM"
not
in
ModelRegistry
.
get_supported_archs
():
if
"MyOPTForCausalLM"
not
in
ModelRegistry
.
get_supported_archs
():
ModelRegistry
.
register_model
(
"MyOPTForCausalLM"
,
MyOPTForCausalLM
)
ModelRegistry
.
register_model
(
"MyOPTForCausalLM"
,
MyOPTForCausalLM
)
# Test passing lazy model
if
"MyGemma2Embedding"
not
in
ModelRegistry
.
get_supported_archs
():
ModelRegistry
.
register_model
(
"MyGemma2Embedding"
,
"vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding"
,
)
if
"MyLlava"
not
in
ModelRegistry
.
get_supported_archs
():
ModelRegistry
.
register_model
(
"MyLlava"
,
"vllm_add_dummy_model.my_llava:MyLlava"
)
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
0 → 100644
View file @
6d2051cc
from
typing
import
List
,
Optional
,
Union
import
torch
from
vllm.attention
import
AttentionMetadata
from
vllm.model_executor.models.gemma2
import
Gemma2EmbeddingModel
from
vllm.sequence
import
IntermediateTensors
class MyGemma2Embedding(Gemma2EmbeddingModel):
    """Dummy Gemma2 embedding model whose tensor outputs are all zeros.

    The parent forward pass is still executed; only its tensor result is
    replaced by a zero tensor of the same shape and dtype.
    """

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the parent forward pass and zero out any tensor result.

        All arguments are forwarded unchanged to
        ``Gemma2EmbeddingModel.forward``.

        Returns:
            The parent's result untouched when it is an
            ``IntermediateTensors`` instance; otherwise a zero tensor
            shaped like the parent's output.
        """
        parent_output = super().forward(
            input_ids,
            positions,
            kv_caches,
            attn_metadata,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )

        if not isinstance(parent_output, IntermediateTensors):
            # A real tensor came back: replace it with all-zero embeddings.
            return torch.zeros_like(parent_output)

        # IntermediateTensors are handed back exactly as received.
        return parent_output
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
0 → 100644
View file @
6d2051cc
from
typing
import
Optional
import
torch
from
vllm.inputs
import
INPUT_REGISTRY
from
vllm.model_executor.models.llava
import
(
LlavaForConditionalGeneration
,
dummy_data_for_llava
,
get_max_llava_image_tokens
,
input_processor_for_llava
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
@MULTIMODAL_REGISTRY.register_image_input_mapper()
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
class MyLlava(LlavaForConditionalGeneration):
    """Dummy LLaVA variant that always predicts the first token.

    The class is registered with the multimodal and input registries using
    the stock LLaVA helpers imported from ``vllm.model_executor.models.llava``.
    """

    def compute_logits(
            self, hidden_states: torch.Tensor,
            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
        """Compute the parent's logits, then overwrite them in place.

        Returns:
            ``None`` when the parent returns ``None``; otherwise the same
            logits tensor, zeroed and with column 0 set to 1.0.
        """
        logits = super().compute_logits(hidden_states, sampling_metadata)
        if logits is None:
            return None

        # Wipe every score, then make column 0 the only non-zero entry so
        # this dummy model always predicts the first token.
        logits.zero_()
        logits[:, 0] += 1.0
        return logits
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
0 → 100644
View file @
6d2051cc
from
typing
import
Optional
import
torch
from
vllm.model_executor.models.opt
import
OPTForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
class MyOPTForCausalLM(OPTForCausalLM):
    """Dummy OPT variant that always predicts the first token."""

    def compute_logits(
            self, hidden_states: torch.Tensor,
            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
        """Compute the parent's logits, then overwrite them in place.

        Returns:
            ``None`` when the parent returns ``None``; otherwise the same
            logits tensor, zeroed and with column 0 set to 1.0.
        """
        logits = super().compute_logits(hidden_states, sampling_metadata)
        if logits is None:
            return None

        # Wipe every score, then make column 0 the only non-zero entry so
        # this dummy model always predicts the first token.
        logits.zero_()
        logits[:, 0] += 1.0
        return logits
Prev
1
…
8
9
10
11
12
13
14
15
16
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment