Commit 6d2051cc authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

parents 2c7f740a a2c71c54
...@@ -5,7 +5,7 @@ import pytest ...@@ -5,7 +5,7 @@ import pytest
import torch import torch
from PIL.Image import Image from PIL.Image import Image
from vllm.inputs import InputContext, LLMInputs from vllm.inputs import InputContext, token_inputs
from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
...@@ -71,12 +71,12 @@ def test_input_processor_valid_mm_data(input_processor_for_qwen, ...@@ -71,12 +71,12 @@ def test_input_processor_valid_mm_data(input_processor_for_qwen,
"""Happy cases for image inputs to Qwen's multimodal input processor.""" """Happy cases for image inputs to Qwen's multimodal input processor."""
prompt = "".join( prompt = "".join(
[f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)]) [f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
inputs = LLMInputs( inputs = token_inputs(
prompt=prompt, prompt=prompt,
# When processing multimodal data for a multimodal model, the qwen # When processing multimodal data for a multimodal model, the qwen
# input processor will overwrite the provided prompt_token_ids with # input processor will overwrite the provided prompt_token_ids with
# the image prompts # the image prompts
prompt_token_ids=None, prompt_token_ids=[],
multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)}, multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
) )
proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs) proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
...@@ -134,9 +134,9 @@ def test_input_processor_invalid_mm_data(input_processor_for_qwen, ...@@ -134,9 +134,9 @@ def test_input_processor_invalid_mm_data(input_processor_for_qwen,
trust_remote_code=True) trust_remote_code=True)
prompt = "Picture 1: <img></img>\n" prompt = "Picture 1: <img></img>\n"
prompt_token_ids = tokenizer.encode(prompt) prompt_token_ids = tokenizer.encode(prompt)
inputs = LLMInputs(prompt=prompt, inputs = token_inputs(prompt=prompt,
prompt_token_ids=prompt_token_ids, prompt_token_ids=prompt_token_ids,
multi_modal_data=mm_data) multi_modal_data=mm_data)
# Should fail since we have too many or too few dimensions for embeddings # Should fail since we have too many or too few dimensions for embeddings
with pytest.raises(ValueError): with pytest.raises(ValueError):
input_processor_for_qwen(qwen_vl_context, inputs) input_processor_for_qwen(qwen_vl_context, inputs)
...@@ -221,7 +221,7 @@ def run_test( ...@@ -221,7 +221,7 @@ def run_test(
# Qwen encodes each image into a fixed content size of 256 # Qwen encodes each image into a fixed content size of 256
with vllm_runner(model, with vllm_runner(model,
max_model_len=1024, max_model_len=1024,
max_num_seqs=1, max_num_seqs=2,
dtype=dtype, dtype=dtype,
limit_mm_per_prompt={"image": mm_limit}, limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
......
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling. """Compare the embedding outputs of HF and vLLM models.
Run `pytest tests/models/test_llama_embedding.py`. Run `pytest tests/models/embedding/language/test_embedding.py`.
""" """
import pytest import pytest
import torch
import torch.nn.functional as F from ..utils import check_embeddings_close
MODELS = [ MODELS = [
"intfloat/e5-mistral-7b-instruct", "intfloat/e5-mistral-7b-instruct",
"BAAI/bge-multilingual-gemma2",
] ]
def compare_embeddings(embeddings1, embeddings2):
similarities = [
F.cosine_similarity(torch.tensor(e1), torch.tensor(e2), dim=0)
for e1, e2 in zip(embeddings1, embeddings2)
]
return similarities
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_models( def test_models(
...@@ -28,15 +21,25 @@ def test_models( ...@@ -28,15 +21,25 @@ def test_models(
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model: # The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see:
# https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
# This makes the input_ids different between hf_model and vllm_model.
# So we need to strip the input texts to avoid test failing.
example_prompts = [str(s).strip() for s in example_prompts]
with hf_runner(model, dtype=dtype,
is_sentence_transformer=True) as hf_model:
hf_outputs = hf_model.encode(example_prompts) hf_outputs = hf_model.encode(example_prompts)
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts) vllm_outputs = vllm_model.encode(example_prompts)
similarities = compare_embeddings(hf_outputs, vllm_outputs) check_embeddings_close(
all_similarities = torch.stack(similarities) embeddings_0_lst=hf_outputs,
tolerance = 1e-2 embeddings_1_lst=vllm_outputs,
assert torch.all((all_similarities <= 1.0 + tolerance) name_0="hf",
& (all_similarities >= 1.0 - tolerance) name_1="vllm",
), f"Not all values are within {tolerance} of 1.0" tol=1e-2,
)
from typing import List, Sequence
import torch
import torch.nn.functional as F
def check_embeddings_close(
    *,
    embeddings_0_lst: Sequence[List[float]],
    embeddings_1_lst: Sequence[List[float]],
    name_0: str,
    name_1: str,
    tol: float = 1e-3,
) -> None:
    """Assert that two lists of embeddings agree pairwise.

    Each pair of embeddings (same index in both lists) is compared with
    cosine similarity along dim 0; the pair passes when the similarity is
    at least ``1 - tol``.

    Args:
        embeddings_0_lst: Embeddings produced by the first implementation.
        embeddings_1_lst: Embeddings produced by the second implementation.
        name_0: Label for the first implementation in failure messages.
        name_1: Label for the second implementation in failure messages.
        tol: Maximum allowed shortfall of the cosine similarity from 1.

    Raises:
        AssertionError: If the lists differ in length, a pair of embeddings
            differs in length, or a pair's similarity is below ``1 - tol``.
    """
    # Only a prefix of each vector is printed on failure; full embeddings
    # can be very long and would drown out the useful part of the message.
    max_shown = 16

    assert len(embeddings_0_lst) == len(embeddings_1_lst), (
        f"Number of embeddings differs: {name_0} has "
        f"{len(embeddings_0_lst)}, {name_1} has {len(embeddings_1_lst)}")

    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
            zip(embeddings_0_lst, embeddings_1_lst)):
        assert len(embeddings_0) == len(embeddings_1), (
            f"Test{prompt_idx}: embedding length mismatch: "
            f"{name_0} has {len(embeddings_0)}, "
            f"{name_1} has {len(embeddings_1)}")

        sim = F.cosine_similarity(torch.tensor(embeddings_0),
                                  torch.tensor(embeddings_1),
                                  dim=0)

        # Report the measured similarity so a failure shows how far off
        # the two implementations are, not just that they differ.
        fail_msg = (f"Test{prompt_idx}:"
                    f"\nCosine similarity:\t{sim.item():.6f} (tol={tol})"
                    f"\n{name_0}:\t{embeddings_0[:max_shown]!r}"
                    f"\n{name_1}:\t{embeddings_1[:max_shown]!r}")

        assert sim >= 1 - tol, fail_msg
import pytest
import torch.nn.functional as F
from ....conftest import IMAGE_ASSETS
from ..utils import check_embeddings_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501
"cherry_blossom":
"<|image_1|> Represent the given image with the following question: What is in the image", # noqa: E501
})
MODELS = ["TIGER-Lab/VLM2Vec-Full"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Compare vLLM embeddings against embeddings pooled from the HF model.

    vLLM encodes ``example_prompts`` directly. For HF, each prompt is run
    through the underlying model, the hidden state at index
    ``attention_mask[0].sum() - 1`` of the final layer is taken and
    L2-normalized, and the results are compared with
    ``check_embeddings_close``.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    vllm_options = {
        "max_model_len": 4096,
        "max_num_seqs": 2,
        "dtype": dtype,
        "enforce_eager": True,
    }
    with vllm_runner(model, **vllm_options) as vllm_model:
        vllm_outputs = vllm_model.encode(example_prompts)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = []
        for hf_inputs in hf_model.get_inputs(example_prompts):
            # Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
            on_device = hf_model.wrap_device(
                hf_inputs, device=hf_model.model.device.type)
            forward_out = hf_model.model(
                **on_device,
                return_dict=True,
                output_hidden_states=True,
            )
            final_layer = forward_out.hidden_states[-1][0]
            # presumably the position of the last non-padded token
            pooling_idx = hf_inputs.attention_mask[0].sum() - 1
            embedding = F.normalize(final_layer[pooling_idx], p=2, dim=-1)
            hf_outputs.append(embedding.tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
...@@ -4,220 +4,214 @@ Run `pytest tests/models/encoder_decoder/language/test_bart.py`. ...@@ -4,220 +4,214 @@ Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
""" """
from typing import List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
from vllm.utils import is_cpu import pytest
from transformers import AutoModelForSeq2SeqLM
if not is_cpu():
# CPU backend is not currently supported with encoder/decoder models from vllm.sequence import SampleLogprobs
# skip test definitions entirely to avoid importing GPU kernel libs
# (xFormers, etc.) from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
HfRunner, VllmRunner)
import pytest from ....utils import multi_gpu_test
from transformers import AutoModelForSeq2SeqLM from ...utils import check_logprobs_close
from vllm.sequence import SampleLogprobs MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
HfRunner, VllmRunner) def vllm_to_hf_output(
from ....utils import multi_gpu_test vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
from ...utils import check_logprobs_close decoder_prompt_type: DecoderPromptType,
):
MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] """Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
def vllm_to_hf_output(
vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], hf_output_str = output_str + "</s>"
decoder_prompt_type: DecoderPromptType, if decoder_prompt_type == DecoderPromptType.NONE:
): hf_output_str = "<s>" + hf_output_str
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output return output_ids, hf_output_str, out_logprobs
hf_output_str = output_str + "</s>"
if decoder_prompt_type == DecoderPromptType.NONE: def run_test(
hf_output_str = "<s>" + hf_output_str hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
return output_ids, hf_output_str, out_logprobs prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
decoder_prompt_type: DecoderPromptType,
def run_test( model: str,
hf_runner: Type[HfRunner], *,
vllm_runner: Type[VllmRunner], dtype: str,
prompts: List[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int,
decoder_prompt_type: DecoderPromptType, num_logprobs: int,
model: str, tensor_parallel_size: int,
*, distributed_executor_backend: Optional[str] = None,
dtype: str, ) -> None:
max_tokens: int, '''
num_logprobs: int, Test the vLLM BART model for a variety of encoder/decoder input prompts,
tensor_parallel_size: int, by validating it against HuggingFace (HF) BART.
distributed_executor_backend: Optional[str] = None,
) -> None: Arguments:
'''
Test the vLLM BART model for a variety of encoder/decoder input prompts, * hf_runner: HuggingFace (HF) test model runner
by validating it against HuggingFace (HF) BART. * vllm_runner: vLLM test model runner
* example_encoder_decoder_prompts: test fixture which provides a
Arguments: dictionary of dummy prompts
* model: the HF ID of the specific BART variant under test
* hf_runner: HuggingFace (HF) test model runner * dtype: the tensor datatype to employ
* vllm_runner: vLLM test model runner * max_tokens
* example_encoder_decoder_prompts: test fixture which provides a * num_logprobs
dictionary of dummy prompts * decoder_prompt_type: key into the example_encoder_decoder_prompts
* model: the HF ID of the specific BART variant under test dictionary; selects specific encoder/decoder
* dtype: the tensor datatype to employ prompt scenarios to test
* max_tokens
* num_logprobs A note on using HF BART as a baseline for validating vLLM BART,
* decoder_prompt_type: key into the example_encoder_decoder_prompts specifically when the decoder prompt is None.
dictionary; selects specific encoder/decoder
prompt scenarios to test The HF GenerationMixin's default behavior is to force the first
decoded token to be <BOS> if the prompt does not already contain
A note on using HF BART as a baseline for validating vLLM BART, <BOS> (this is accomplished using a logit
specifically when the decoder prompt is None. processor setting.)
The HF GenerationMixin's default behavior is to force the first So when we use HF BART as our baseline for comparison, note that
decoded token to be <BOS> if the prompt does not already contain when the user provides a request with a None decoder prompt
<BOS> (this is accomplished using a logit (i.e. a singleton encoder prompt, or else an explicit encoder/
processor setting.) decoder prompt with the decoder sub-prompt set to None), HF and
vLLM handle this in different ways:
So when we use HF BART as our baseline for comparison, note that
when the user provides a request with a None decoder prompt * HF will (1) tokenize the None prompt as an empty token-list,
(i.e. a singleton encoder prompt, or else an explicit encoder/ (2) append <decoder-start-token> to the beginning, yielding
decoder prompt with the decoder sub-prompt set to None), HF and [<decoder-start-token>], (3) pass this token list to the model, and
vLLM handle this in different ways: then (4) after computing logits during prefill, override the model
logits & force <BOS> to be the first generated token.
* HF will (1) tokenize the None prompt as an empty token-list,
(2) append <decoder-start-token> to the beginning, yielding * vLLM will (1) tokenize the None prompt as [<BOS>], (2) append decoder-
[<decoder-start-token>], (3) pass this token list to the model, and start-token to the beginning, yielding [<decoder-start-token><BOS>],
then (4) after computing logits during prefill, override the model (3) pass these tokens to the model & proceed with generation.
logits & force <BOS> to be the first generated token.
The net effect is that compared to vLLM, the list of HF *decoded* tokens
* vLLM will (1) tokenize the None prompt as [<BOS>], (2) append decoder- will contain one more initial <BOS> than the vLLM generated tokens,
start-token to the beginning, yielding [<decoder-start-token><BOS>], because vLLM's <BOS> token is injected into the prompt rather than into
(3) pass these tokens to the model & proceed with generation. the generated output. This is in spite of the fact that overall, the
complete sequences (prompt + decoded tokens) produced by vLLM will match
The net effect is that compared to vLLM, the list of HF *decoded* tokens HF.
will contain one more initial <BOS> than the vLLM generated tokens,
because vLLM's <BOS> token is injected into the prompt rather than into So when we use HF decoded token output to validate vLLM's decoded token
the generated output. This is in spite of the fact that overall, the output, the testing process must account for the difference in decoded
complete sequences (prompt + decoded tokens) produced by vLLM will match token sequences between vLLM and HF specifically in the
HF. decoder-prompt-is-None case.
So when we use HF decoded token output to validate vLLM's decoded token One option is to disable the logit processor feature that forces the
output, the testing process must account for the difference in decoded <BOS> token to be decoded (forced_bos_token_id = None), eliminating
token sequences between vLLM and HF specifically in the the problem entirely. However this is not "normal" BART usage.
decoder-prompt-is-None case.
The other option is - only in the decoder-prompt-is-None case - to
One option is to disable the logit processor feature that forces the discard the first decoded token from the HF output before comparing it
<BOS> token to be decoded (forced_bos_token_id = None), eliminating to vLLM.
the problem entirely. However this is not "normal" BART usage.
To that end, when testing the scenario where the decoder prompt is None
The other option is - only in the decoder-prompt-is-None case - to (and only in that one scenario), this test skips the first HF decoded
discard the first decoded token from the HF output before comparing it token during the process of validating the vLLM decoded output.
to vLLM. '''
To that end, when testing the scenario where the decoder prompt is None # NOTE: take care of the order. run vLLM first, and then run HF.
(and only in that one scenario), this test skips the first HF decoded # vLLM needs a fresh new process without cuda initialization.
token during the process of validating the vLLM decoded output. # if we run HF first, the cuda initialization will be done and it
''' # will hurt multiprocessing backend with fork method (the default).
# NOTE: take care of the order. run vLLM first, and then run HF. # Note: currently encoder/decoder models are only compatible with
# vLLM needs a fresh new process without cuda initialization. # enforce_eager=True. Normally this is not a problem because
# if we run HF first, the cuda initialization will be done and it # for encoder/decoder models vLLM will
# will hurt multiprocessing backend with fork method (the default). # default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# Note: currently encoder/decoder models are only compatible with # VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=True. Normally this is not a problem because # enforce_eager=False (a behavior which a number of already-existing
# for encoder/decoder models vLLM will # decoder-only unit tests expect), so when testing an encoder/decoder
# default to enforce_eager=True if enforce_eager # model we must explicitly specify enforce_eager=True in the VllmRunner
# is left unspecified. However, the # constructor.
# VllmRunner test fixture (which wraps around the LLM class) defaults to with vllm_runner(model,
# enforce_eager=False (a behavior which a number of already-existing dtype=dtype,
# decoder-only unit tests expect), so when testing an encoder/decoder tensor_parallel_size=tensor_parallel_size,
# model we must explicitly specify enforce_eager=True in the VllmRunner distributed_executor_backend=distributed_executor_backend,
# constructor. enforce_eager=True) as vllm_model:
with vllm_runner( vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
model, prompts, max_tokens, num_logprobs)
dtype=dtype,
tensor_parallel_size=tensor_parallel_size, # Configuration settings for HF baseline
distributed_executor_backend=distributed_executor_backend, hf_kwargs = {
enforce_eager=True) as vllm_model: "top_k": None,
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( "num_beams": 1,
prompts, max_tokens, num_logprobs) "repetition_penalty": 1.0,
"top_p": 1.0,
# Configuration settings for HF baseline "length_penalty": 1.0,
hf_kwargs = { "early_stopping": False,
"top_k": None, "no_repeat_ngram_size": None,
"num_beams": 1, "min_length": 0
"repetition_penalty": 1.0, }
"top_p": 1.0,
"length_penalty": 1.0, with hf_runner(model, dtype=dtype,
"early_stopping": False, auto_cls=AutoModelForSeq2SeqLM) as hf_model:
"no_repeat_ngram_size": None, hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
"min_length": 0 prompts,
} max_tokens,
num_logprobs,
with hf_runner(model, dtype=dtype, **hf_kwargs,
auto_cls=AutoModelForSeq2SeqLM) as hf_model: ))
hf_outputs = (
hf_model.generate_encoder_decoder_greedy_logprobs_limit( hf_skip_tokens = (1
prompts, if decoder_prompt_type == DecoderPromptType.NONE else 0)
max_tokens,
num_logprobs, check_logprobs_close(
**hf_kwargs, outputs_0_lst=hf_outputs,
)) outputs_1_lst=[
vllm_to_hf_output(vllm_output, decoder_prompt_type)
hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE for vllm_output in vllm_outputs
else 0) ],
name_0="hf",
check_logprobs_close( name_1="vllm",
outputs_0_lst=hf_outputs, num_outputs_0_skip_tokens=hf_skip_tokens,
outputs_1_lst=[ )
vllm_to_hf_output(vllm_output, decoder_prompt_type)
for vllm_output in vllm_outputs
], @pytest.mark.parametrize("model", MODELS)
name_0="hf", @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
name_1="vllm", @pytest.mark.parametrize("max_tokens", [64])
num_outputs_0_skip_tokens=hf_skip_tokens, @pytest.mark.parametrize("num_logprobs", [5])
) @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
@pytest.mark.parametrize("model", MODELS) dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64]) run_test(
@pytest.mark.parametrize("num_logprobs", [5]) hf_runner,
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) vllm_runner,
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, example_encoder_decoder_prompts[decoder_prompt_type],
model, dtype, max_tokens, num_logprobs, decoder_prompt_type,
decoder_prompt_type) -> None: model,
dtype=dtype,
run_test( max_tokens=max_tokens,
hf_runner, num_logprobs=num_logprobs,
vllm_runner, tensor_parallel_size=1,
example_encoder_decoder_prompts[decoder_prompt_type], )
decoder_prompt_type,
model,
dtype=dtype, @multi_gpu_test(num_gpus=2)
max_tokens=max_tokens, @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
num_logprobs=num_logprobs, @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
tensor_parallel_size=1, @pytest.mark.parametrize("dtype", ["float"])
) @pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) def test_models_distributed(hf_runner, vllm_runner,
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) example_encoder_decoder_prompts,
@pytest.mark.parametrize("dtype", ["float"]) distributed_executor_backend, model, dtype,
@pytest.mark.parametrize("max_tokens", [64]) max_tokens, num_logprobs,
@pytest.mark.parametrize("num_logprobs", [5]) decoder_prompt_type) -> None:
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM]) run_test(
def test_models_distributed(hf_runner, vllm_runner, hf_runner,
example_encoder_decoder_prompts, vllm_runner,
distributed_executor_backend, model, dtype, example_encoder_decoder_prompts[decoder_prompt_type],
max_tokens, num_logprobs, decoder_prompt_type,
decoder_prompt_type) -> None: model,
run_test( dtype=dtype,
hf_runner, max_tokens=max_tokens,
vllm_runner, num_logprobs=num_logprobs,
example_encoder_decoder_prompts[decoder_prompt_type], tensor_parallel_size=2,
decoder_prompt_type, distributed_executor_backend=distributed_executor_backend,
model, )
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
)
import pytest
from ....utils import multi_gpu_test
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", [
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
])
def test_models(hf_runner, vllm_runner, image_assets,
                distributed_executor_backend, model) -> None:
    """Run the mllama comparison test with tensor parallelism of 2.

    Delegates to ``run_test`` from ``test_mllama`` once per distributed
    executor backend, using fixed dtype, token and logprob settings.

    Raises:
        NotImplementedError: If ``model`` is not a supported model ID.
    """
    if not model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
        raise NotImplementedError(f"Unsupported model: {model}")

    # Imported lazily, only once the model is known to be supported.
    from .test_mllama import models, run_test

    run_kwargs = {
        # NOTE(review): the first entry of test_mllama's `models` list is
        # used here rather than the parametrized `model` argument.
        "model": models[0],
        "size_factors": [0.25, 0.5, 1.0],
        "dtype": "half",
        "max_tokens": 5,
        "num_logprobs": 5,
        "tensor_parallel_size": 2,
        "distributed_executor_backend": distributed_executor_backend,
    }
    run_test(hf_runner, vllm_runner, image_assets, **run_kwargs)
...@@ -9,10 +9,10 @@ from vllm.sequence import SampleLogprobs ...@@ -9,10 +9,10 @@ from vllm.sequence import SampleLogprobs
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets) _ImageAssets)
from ....utils import multi_gpu_test from ....utils import large_gpu_test
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
_LIMIT_IMAGE_PER_PROMPT = 1 _LIMIT_IMAGE_PER_PROMPT = 3
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "stop_sign":
...@@ -47,14 +47,46 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, ...@@ -47,14 +47,46 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
if token_id != image_token_id or output_ids[idx - 1] != image_token_id if token_id != image_token_id or output_ids[idx - 1] != image_token_id
] ]
assert output_str[0] == " " hf_output_str = output_str
hf_output_str = output_str[1:]
if hf_output_ids[-1] == eos_token_id: if hf_output_ids[-1] == eos_token_id:
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
return hf_output_ids, hf_output_str, out_logprobs return hf_output_ids, hf_output_str, out_logprobs
def _get_inputs(
    image_assets: _ImageAssets,
    *,
    size_factors: Optional[List[float]] = None,
    sizes: Optional[List[Tuple[int, int]]] = None,
) -> List[Tuple[List[str], PromptImageInput]]:
    """Build per-image (prompts, images) test inputs.

    Exactly one sizing mode is used; ``size_factors`` takes precedence.

    * ``size_factors``: each asset's prompt is repeated once per factor and
      paired with the image rescaled by that factor.
    * ``sizes``: each asset is resized to every entry; a ``None`` entry
      produces a text-only case (first text-only prompt, no image). When
      ``sizes`` is empty, one extra entry with all text-only prompts and
      no images is appended.

    Raises:
        ValueError: If neither ``size_factors`` nor ``sizes`` is given.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    if size_factors is not None:
        return [(
            [prompt] * len(size_factors),
            [rescale_image_size(image, factor) for factor in size_factors],
        ) for image, prompt in zip(pil_images, HF_IMAGE_PROMPTS)]

    if sizes is None:
        raise ValueError("You must provide either `size_factors` or `sizes`")

    inputs_per_image = []
    for image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        prompts_for_image = []
        images_for_image = []
        for size in sizes:
            if size is None:
                # Text-only case: no image accompanies the prompt.
                prompts_for_image.append(text_only_prompts[0])
                images_for_image.append(None)
            else:
                prompts_for_image.append(prompt)
                images_for_image.append(image.resize(size))
        inputs_per_image.append((prompts_for_image, images_for_image))

    if not sizes:
        inputs_per_image.append(
            (text_only_prompts, [None] * len(text_only_prompts)))

    return inputs_per_image
@overload @overload
def run_test( def run_test(
hf_runner: Type[HfRunner], hf_runner: Type[HfRunner],
...@@ -103,39 +135,17 @@ def run_test( ...@@ -103,39 +135,17 @@ def run_test(
tensor_parallel_size: int, tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None, distributed_executor_backend: Optional[str] = None,
): ):
images = [asset.pil_image for asset in image_assets] _run_test(
hf_runner,
if size_factors is not None: vllm_runner,
inputs_per_image = [( _get_inputs(image_assets, size_factors=size_factors, sizes=sizes),
[prompt for _ in size_factors], model,
[rescale_image_size(image, factor) for factor in size_factors], dtype=dtype,
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] max_tokens=max_tokens,
elif sizes is not None: num_logprobs=num_logprobs,
inputs_per_image = [( tensor_parallel_size=tensor_parallel_size,
[ distributed_executor_backend=distributed_executor_backend,
prompt if size is not None else text_only_prompts[0] )
for size in sizes
],
[
image.resize(size) if size is not None else None
for size in sizes
],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
if len(sizes) == 0:
inputs_per_image.append(
(text_only_prompts, [None] * len(text_only_prompts)))
else:
raise ValueError("You must provide either `size_factors` or `sizes`")
_run_test(hf_runner,
vllm_runner,
inputs_per_image,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend)
def _run_test( def _run_test(
...@@ -167,8 +177,8 @@ def _run_test( ...@@ -167,8 +177,8 @@ def _run_test(
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner(model, with vllm_runner(model,
dtype=dtype, dtype=dtype,
max_num_seqs=16,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
enforce_eager=True, enforce_eager=True,
...@@ -185,14 +195,9 @@ def _run_test( ...@@ -185,14 +195,9 @@ def _run_test(
def process(hf_inputs: BatchEncoding): def process(hf_inputs: BatchEncoding):
return hf_inputs return hf_inputs
from transformers import AutoConfig
from transformers.models.mllama import MllamaConfig as MllamaConfigHf
# use transformer's MllamaConfig for hf_runner
# and vllm's MllamaConfig for vllm_runner
AutoConfig.register("mllama", MllamaConfigHf, exist_ok=True)
with hf_runner(model, with hf_runner(model,
dtype=dtype, dtype=dtype,
model_kwargs={"device_map": "auto"},
postprocess_inputs=process, postprocess_inputs=process,
auto_cls=AutoModelForVision2Seq) as hf_model: auto_cls=AutoModelForVision2Seq) as hf_model:
hf_outputs_per_image = [ hf_outputs_per_image = [
...@@ -203,8 +208,6 @@ def _run_test( ...@@ -203,8 +208,6 @@ def _run_test(
for prompts, images in inputs for prompts, images in inputs
] ]
from vllm.transformers_utils.configs.mllama import MllamaConfig
AutoConfig.register("mllama", MllamaConfig, exist_ok=True)
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
vllm_outputs_per_image): vllm_outputs_per_image):
check_logprobs_close( check_logprobs_close(
...@@ -218,6 +221,7 @@ def _run_test( ...@@ -218,6 +221,7 @@ def _run_test(
) )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"sizes", "sizes",
...@@ -236,13 +240,13 @@ def _run_test( ...@@ -236,13 +240,13 @@ def _run_test(
(1024, 1024), (512, 1536), (512, 2028), None], (1024, 1024), (512, 1536), (512, 2028), None],
# mllama has 8 possible aspect ratios, carefully set the sizes # mllama has 8 possible aspect ratios, carefully set the sizes
# to cover all of them # to cover all of them
], ])
)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, sizes, dtype, def test_models_single_leading_image(hf_runner, vllm_runner, image_assets,
max_tokens, num_logprobs) -> None: model, sizes, dtype, max_tokens,
num_logprobs) -> None:
run_test( run_test(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
...@@ -256,28 +260,79 @@ def test_models(hf_runner, vllm_runner, image_assets, model, sizes, dtype, ...@@ -256,28 +260,79 @@ def test_models(hf_runner, vllm_runner, image_assets, model, sizes, dtype,
) )
@multi_gpu_test(num_gpus=2) @large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"sizes",
[
[(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024),
(1024, 1024), (512, 1536), (512, 2028), None],
],
)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
def test_models_distributed(hf_runner, vllm_runner, image_assets, model, sizes, def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets,
dtype, max_tokens, num_logprobs) -> None: model, dtype, max_tokens,
run_test( num_logprobs) -> None:
stop_sign = image_assets[0].pil_image
cherry_blossom = image_assets[1].pil_image
inputs = [(
[
"<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501
"<|image|><|image|><|begin_of_text|>Describe 2 images.", # noqa: E501
"<|image|><|image|><|image|><|begin_of_text|>Describe 3 images.", # noqa: E501
],
[
[stop_sign, cherry_blossom],
# Images with different sizes.
[
stop_sign.resize((512, 512)),
stop_sign,
],
[
stop_sign,
stop_sign.resize((512, 1536)),
cherry_blossom.resize((512, 1024)),
],
])]
_run_test(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
image_assets, inputs,
model,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
dtype, max_tokens, num_logprobs) -> None:
stop_sign = image_assets[0].pil_image
cherry_blossom = image_assets[1].pil_image
inputs = [(
[
"<|begin_of_text|>The content of the image <|image|> is", # noqa: E501
"<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, " # noqa: E501
"which is a stop sign and which is a cherry blossom?", # noqa: E501
],
[
[stop_sign],
[stop_sign, cherry_blossom],
])]
_run_test(
hf_runner,
vllm_runner,
inputs,
model, model,
sizes=sizes,
dtype=dtype, dtype=dtype,
max_tokens=max_tokens, max_tokens=max_tokens,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
tensor_parallel_size=2, tensor_parallel_size=1,
) )
...@@ -2,7 +2,8 @@ import os ...@@ -2,7 +2,8 @@ import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, PoolingParams, SamplingParams
from vllm.assets.image import ImageAsset
from ..utils import fork_new_process_for_each_test from ..utils import fork_new_process_for_each_test
...@@ -16,7 +17,7 @@ def test_plugin(dummy_opt_path): ...@@ -16,7 +17,7 @@ def test_plugin(dummy_opt_path):
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_oot_registration(dummy_opt_path): def test_oot_registration_text_generation(dummy_opt_path):
os.environ["VLLM_PLUGINS"] = "register_dummy_model" os.environ["VLLM_PLUGINS"] = "register_dummy_model"
prompts = ["Hello, my name is", "The text does not matter"] prompts = ["Hello, my name is", "The text does not matter"]
sampling_params = SamplingParams(temperature=0) sampling_params = SamplingParams(temperature=0)
...@@ -29,3 +30,52 @@ def test_oot_registration(dummy_opt_path): ...@@ -29,3 +30,52 @@ def test_oot_registration(dummy_opt_path):
# make sure only the first token is generated # make sure only the first token is generated
rest = generated_text.replace(first_token, "") rest = generated_text.replace(first_token, "")
assert rest == "" assert rest == ""
@fork_new_process_for_each_test
def test_oot_registration_embedding(dummy_gemma2_embedding_path):
    """Encode two prompts with the dummy embedding model registered through
    the ``register_dummy_model`` plugin and require every embedding value
    to be zero.

    Args:
        dummy_gemma2_embedding_path: fixture value passed as ``model`` to
            ``LLM``.
    """
    # Select the plugin through the VLLM_PLUGINS environment variable.
    os.environ["VLLM_PLUGINS"] = "register_dummy_model"

    texts = ["Hello, my name is", "The text does not matter"]
    pooling_params = PoolingParams()
    engine = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")

    for result in engine.encode(texts, pooling_params):
        embedding = result.outputs.embedding
        assert all(value == 0 for value in embedding)
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
@fork_new_process_for_each_test
def test_oot_registration_multimodal(dummy_llava_path):
    """Generate from two image prompts with the dummy multimodal model
    registered through the ``register_dummy_model`` plugin and require the
    output text to consist solely of repetitions of the token decoded
    from id 0.

    Args:
        dummy_llava_path: fixture value passed as ``model`` to ``LLM``.
    """
    # Select the plugin through the VLLM_PLUGINS environment variable.
    os.environ["VLLM_PLUGINS"] = "register_dummy_model"

    # Both requests reuse the module-level ``image``.
    questions = ("What's in the image?<image>", "Describe the image<image>")
    requests = [{
        "prompt": question,
        "multi_modal_data": {
            "image": image
        },
    } for question in questions]

    greedy_params = SamplingParams(temperature=0)

    engine = LLM(model=dummy_llava_path,
                 load_format="dummy",
                 max_num_seqs=1,
                 trust_remote_code=True,
                 gpu_memory_utilization=0.98,
                 max_model_len=4096,
                 enforce_eager=True,
                 limit_mm_per_prompt={"image": 1})
    token_zero_text = engine.get_tokenizer().decode(0)

    for result in engine.generate(requests, greedy_params):
        # Stripping every occurrence of the id-0 token text must leave
        # nothing behind.
        leftover = result.outputs[0].text.replace(token_zero_text, "")
        assert leftover == ""
import warnings
import pytest import pytest
import transformers import torch.cuda
from vllm.model_executor.models import _MODELS, ModelRegistry from vllm.model_executor.models import (is_embedding_model,
is_text_generation_model,
supports_multimodal)
from vllm.model_executor.models.registry import (_EMBEDDING_MODELS,
_MULTIMODAL_MODELS,
_SPECULATIVE_DECODING_MODELS,
_TEXT_GENERATION_MODELS,
ModelRegistry)
from vllm.platforms import current_platform
from ..utils import fork_new_process_for_each_test
@pytest.mark.parametrize("model_cls", _MODELS)
def test_registry_imports(model_cls):
if (model_cls in ("LlavaOnevisionForConditionalGeneration",
"Qwen2VLForConditionalGeneration")
and transformers.__version__ < "4.45"):
pytest.skip("Waiting for next transformers release")
@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
def test_registry_imports(model_arch):
# Ensure all model classes can be imported successfully # Ensure all model classes can be imported successfully
ModelRegistry.resolve_model_cls([model_cls]) model_cls, _ = ModelRegistry.resolve_model_cls(model_arch)
if model_arch in _SPECULATIVE_DECODING_MODELS:
pass # Ignore these models which do not have a unified format
else:
assert is_text_generation_model(model_cls) is (
model_arch in _TEXT_GENERATION_MODELS
or model_arch in _MULTIMODAL_MODELS)
assert is_embedding_model(model_cls) is (model_arch
in _EMBEDDING_MODELS)
assert supports_multimodal(model_cls) is (model_arch
in _MULTIMODAL_MODELS)
@fork_new_process_for_each_test
@pytest.mark.parametrize("model_arch,is_mm,init_cuda", [
    ("LlamaForCausalLM", False, False),
    ("MllamaForConditionalGeneration", True, False),
    ("LlavaForConditionalGeneration", True, True),
])
def test_registry_is_multimodal(model_arch, is_mm, init_cuda):
    """Check ``ModelRegistry.is_multimodal_model`` against the expected flag.

    For cases flagged ``init_cuda`` on a CUDA-alike platform, additionally
    assert that CUDA is uninitialized after the query, then resolve the
    model class and warn if CUDA is still uninitialized afterwards.

    Args:
        model_arch: architecture name passed to the registry.
        is_mm: expected result of ``is_multimodal_model``.
        init_cuda: whether resolving this architecture is expected to
            initialize CUDA.
    """
    assert ModelRegistry.is_multimodal_model(model_arch) is is_mm

    if not (init_cuda and current_platform.is_cuda_alike()):
        return

    # The registry query above must not have initialized CUDA.
    assert not torch.cuda.is_initialized()

    ModelRegistry.resolve_model_cls(model_arch)
    if torch.cuda.is_initialized():
        return

    # The parametrized case no longer exercises the CUDA-initializing
    # path; flag it so a different model can be chosen.
    warnings.warn(
        "This model no longer initializes CUDA on import. "
        "Please test using a different one.",
        stacklevel=2)
@fork_new_process_for_each_test
@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
    ("MLPSpeculatorPreTrainedModel", False, False),
    ("DeepseekV2ForCausalLM", True, False),
    ("Qwen2VLForConditionalGeneration", True, True),
])
def test_registry_is_pp(model_arch, is_pp, init_cuda):
    """Check ``ModelRegistry.is_pp_supported_model`` against the expected flag.

    For cases flagged ``init_cuda`` on a CUDA-alike platform, additionally
    assert that CUDA is uninitialized after the query, then resolve the
    model class and warn if CUDA is still uninitialized afterwards.

    Args:
        model_arch: architecture name passed to the registry.
        is_pp: expected result of ``is_pp_supported_model``.
        init_cuda: whether resolving this architecture is expected to
            initialize CUDA.
    """
    assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp

    if not (init_cuda and current_platform.is_cuda_alike()):
        return

    # The registry query above must not have initialized CUDA.
    assert not torch.cuda.is_initialized()

    ModelRegistry.resolve_model_cls(model_arch)
    if torch.cuda.is_initialized():
        return

    # The parametrized case no longer exercises the CUDA-initializing
    # path; flag it so a different model can be chosen.
    warnings.warn(
        "This model no longer initializes CUDA on import. "
        "Please test using a different one.",
        stacklevel=2)
import warnings import warnings
from typing import Dict, List, Optional, Sequence, Tuple, Union from typing import Dict, List, Optional, Sequence, Tuple, Union
import torch
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.inputs import InputContext from vllm.inputs import InputContext
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
from vllm.utils import is_cpu
TokensText = Tuple[List[int], str] TokensText = Tuple[List[int], str]
...@@ -247,6 +250,7 @@ def check_logprobs_close( ...@@ -247,6 +250,7 @@ def check_logprobs_close(
def build_model_context(model_name: str, def build_model_context(model_name: str,
tokenizer_name: Optional[str] = None, tokenizer_name: Optional[str] = None,
trust_remote_code: bool = False, trust_remote_code: bool = False,
dtype: Optional[Union[str, torch.dtype]] = None,
mm_processor_kwargs: Optional[Dict] = None, mm_processor_kwargs: Optional[Dict] = None,
limit_mm_per_prompt: Optional[Dict] = None): limit_mm_per_prompt: Optional[Dict] = None):
"""Creates an InputContext for a given model. """Creates an InputContext for a given model.
...@@ -264,12 +268,15 @@ def build_model_context(model_name: str, ...@@ -264,12 +268,15 @@ def build_model_context(model_name: str,
""" """
if tokenizer_name is None: if tokenizer_name is None:
tokenizer_name = model_name tokenizer_name = model_name
if dtype is None:
dtype = "bfloat16" if is_cpu() else "half"
model_config = ModelConfig( model_config = ModelConfig(
model_name, model_name,
tokenizer_name, tokenizer_name,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
dtype="float32", dtype=dtype,
seed=0, seed=0,
mm_processor_kwargs=mm_processor_kwargs, mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=limit_mm_per_prompt,
......
...@@ -61,7 +61,7 @@ async def test_evil_forward(tmp_socket): ...@@ -61,7 +61,7 @@ async def test_evil_forward(tmp_socket):
# Throws an error in first forward pass. # Throws an error in first forward pass.
with pytest.raises(RAISED_ERROR): with pytest.raises(RAISED_ERROR):
async for _ in client.generate(inputs="Hello my name is", async for _ in client.generate(prompt="Hello my name is",
sampling_params=SamplingParams(), sampling_params=SamplingParams(),
request_id=uuid.uuid4()): request_id=uuid.uuid4()):
pass pass
...@@ -69,7 +69,7 @@ async def test_evil_forward(tmp_socket): ...@@ -69,7 +69,7 @@ async def test_evil_forward(tmp_socket):
# Engine is errored, should get ENGINE_DEAD_ERROR. # Engine is errored, should get ENGINE_DEAD_ERROR.
with pytest.raises(MQEngineDeadError): with pytest.raises(MQEngineDeadError):
async for _ in client.generate(inputs="Hello my name is", async for _ in client.generate(prompt="Hello my name is",
sampling_params=SamplingParams(), sampling_params=SamplingParams(),
request_id=uuid.uuid4()): request_id=uuid.uuid4()):
pass pass
...@@ -118,7 +118,7 @@ async def test_failed_health_check(tmp_socket): ...@@ -118,7 +118,7 @@ async def test_failed_health_check(tmp_socket):
# Generate call should throw ENGINE_DEAD_ERROR # Generate call should throw ENGINE_DEAD_ERROR
with pytest.raises(MQEngineDeadError): with pytest.raises(MQEngineDeadError):
async for _ in client.generate(inputs="Hello my name is", async for _ in client.generate(prompt="Hello my name is",
sampling_params=SamplingParams(), sampling_params=SamplingParams(),
request_id=uuid.uuid4()): request_id=uuid.uuid4()):
pass pass
...@@ -160,7 +160,7 @@ async def test_failed_abort(tmp_socket): ...@@ -160,7 +160,7 @@ async def test_failed_abort(tmp_socket):
# with reference to the original KeyError("foo") # with reference to the original KeyError("foo")
with pytest.raises(MQEngineDeadError) as execinfo: with pytest.raises(MQEngineDeadError) as execinfo:
async for _ in client.generate( async for _ in client.generate(
inputs="Hello my name is", prompt="Hello my name is",
sampling_params=SamplingParams(max_tokens=10), sampling_params=SamplingParams(max_tokens=10),
request_id=uuid.uuid4()): request_id=uuid.uuid4()):
pass pass
...@@ -183,7 +183,7 @@ async def test_bad_request(tmp_socket): ...@@ -183,7 +183,7 @@ async def test_bad_request(tmp_socket):
# Invalid request should fail, but not crash the server. # Invalid request should fail, but not crash the server.
with pytest.raises(ValueError): with pytest.raises(ValueError):
async for _ in client.generate(inputs="Hello my name is", async for _ in client.generate(prompt="Hello my name is",
sampling_params=SamplingParams(), sampling_params=SamplingParams(),
request_id="abcd-1", request_id="abcd-1",
lora_request=LoRARequest( lora_request=LoRARequest(
...@@ -192,7 +192,7 @@ async def test_bad_request(tmp_socket): ...@@ -192,7 +192,7 @@ async def test_bad_request(tmp_socket):
pass pass
# This request should be okay. # This request should be okay.
async for _ in client.generate(inputs="Hello my name is", async for _ in client.generate(prompt="Hello my name is",
sampling_params=SamplingParams(), sampling_params=SamplingParams(),
request_id="abcd-2"): request_id="abcd-2"):
pass pass
......
...@@ -20,7 +20,7 @@ async def generate( ...@@ -20,7 +20,7 @@ async def generate(
count = 0 count = 0
async for out in client.generate( async for out in client.generate(
request_id=request_id, request_id=request_id,
inputs="Hello my name is Robert and", prompt="Hello my name is Robert and",
sampling_params=SamplingParams(max_tokens=num_tokens, sampling_params=SamplingParams(max_tokens=num_tokens,
temperature=0)): temperature=0)):
......
...@@ -17,7 +17,6 @@ NUM_PROMPTS = [10] ...@@ -17,7 +17,6 @@ NUM_PROMPTS = [10]
DEFAULT_SERVER_ARGS: List[str] = [ DEFAULT_SERVER_ARGS: List[str] = [
"--disable-log-requests", "--disable-log-requests",
"--use-v2-block-manager",
"--worker-use-ray", "--worker-use-ray",
"--gpu-memory-utilization", "--gpu-memory-utilization",
"0.85", "0.85",
...@@ -37,6 +36,7 @@ DEFAULT_SERVER_ARGS: List[str] = [ ...@@ -37,6 +36,7 @@ DEFAULT_SERVER_ARGS: List[str] = [
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("is_async", [True]) @pytest.mark.parametrize("is_async", [True])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_multi_step( async def test_multi_step(
example_prompts, example_prompts,
...@@ -49,6 +49,7 @@ async def test_multi_step( ...@@ -49,6 +49,7 @@ async def test_multi_step(
is_async: bool, is_async: bool,
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
enable_chunked_prefill: bool,
monkeypatch, monkeypatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
...@@ -74,6 +75,10 @@ async def test_multi_step( ...@@ -74,6 +75,10 @@ async def test_multi_step(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> no logprobs completions endpoint; `None` -> no logprobs
""" """
if enable_chunked_prefill and \
(pp_size > 1 or attention_backend != "FLASH_ATTN"):
pytest.skip("Multi-step with Chunked-Prefill only supports"
"PP=1 and FLASH_ATTN backend")
override_backend_env_variable(monkeypatch, attention_backend) override_backend_env_variable(monkeypatch, attention_backend)
...@@ -93,6 +98,9 @@ async def test_multi_step( ...@@ -93,6 +98,9 @@ async def test_multi_step(
if eager_mode: if eager_mode:
ms_server_args.append("--enforce-eager") ms_server_args.append("--enforce-eager")
if enable_chunked_prefill:
ms_server_args.append("--enable-chunked-prefill")
distributed_args = [ distributed_args = [
"--tensor-parallel-size", "--tensor-parallel-size",
str(tp_size), str(tp_size),
...@@ -133,3 +141,85 @@ async def test_multi_step( ...@@ -133,3 +141,85 @@ async def test_multi_step(
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@pytest.mark.parametrize(("tp_size, pp_size"), [
    (1, 2),
])
@pytest.mark.asyncio
async def test_multi_step_pp_smoke(
    tp_size: int,
    pp_size: int,
    monkeypatch,
) -> None:
    """
    Smoke test for multi-step scheduling in an OpenAI-protocol
    client/server environment.

    Two servers are launched one after the other with the same arguments,
    except that the second one additionally receives
    `--num-scheduler-steps`. The same prompts are sent to both, with a
    different `max_tokens` per prompt (10, 20, 30, 40), and the text
    generations extracted from the two sets of completions must be
    identical.

    Args:
      tp_size: degree of tensor-parallelism
      pp_size: degree of pipeline-parallelism
      monkeypatch: pytest fixture, used to override the attention backend
    """

    model = "JackFram/llama-160m"
    num_scheduler_steps = 8
    attention_backend = "FLASH_ATTN"
    max_num_seqs = 3

    override_backend_env_variable(monkeypatch, attention_backend)

    # Prompt from the ShareGPT dataset, repeated four times.
    prompts = ["in the jtbd context whats a push?"] * 4  # codespell:ignore
    # Use varying max_tokens to introduce scheduling randomness.
    max_tokens = [10 * (idx + 1) for idx in range(len(prompts))]
    assert len(prompts) == len(max_tokens)

    parallel_args = [
        "--tensor-parallel-size",
        str(tp_size),
        "--pipeline-parallel-size",
        str(pp_size),
        "--max-num-seqs",
        str(max_num_seqs),
    ]

    single_step_args = [*DEFAULT_SERVER_ARGS, *parallel_args]
    multi_step_args = [
        *DEFAULT_SERVER_ARGS,
        "--num-scheduler-steps",
        str(num_scheduler_steps),
        *parallel_args,
    ]

    async def _collect(cli_args):
        # Spin up client/server & issue completion API requests.
        # Extended wait budget (5 * 240 seconds), raised for this test
        # because of timeouts observed in GHA CI.
        return await completions_with_server_args(
            prompts=prompts,
            model_name=model,
            server_cli_args=cli_args,
            num_logprobs=None,
            max_wait_seconds=5 * 240,
            max_tokens=max_tokens)

    ref_completions = await _collect(single_step_args)
    test_completions = await _collect(multi_step_args)

    # Multi-step scheduling must produce the same generations as the
    # reference run.
    ref_generations = get_client_text_generations(ref_completions)
    test_generations = get_client_text_generations(test_completions)
    assert ref_generations == test_generations
# Test the LLMEngine with multi-step-decoding # Test the LLMEngine with multi-step-decoding
import copy
from typing import Optional from typing import Optional
import pytest import pytest
...@@ -16,6 +17,7 @@ NUM_PROMPTS = [10] ...@@ -16,6 +17,7 @@ NUM_PROMPTS = [10]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1]) @pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
@pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
...@@ -28,6 +30,7 @@ def test_multi_step_llm( ...@@ -28,6 +30,7 @@ def test_multi_step_llm(
model: str, model: str,
dtype: str, dtype: str,
tp_size: int, tp_size: int,
enable_chunked_prefill: bool,
max_tokens: int, max_tokens: int,
enforce_eager: int, enforce_eager: int,
num_scheduler_steps: int, num_scheduler_steps: int,
...@@ -51,6 +54,7 @@ def test_multi_step_llm( ...@@ -51,6 +54,7 @@ def test_multi_step_llm(
model: model under test (same for single- and multi-step engines) model: model under test (same for single- and multi-step engines)
dtype: tensor datatype for engine to utilize dtype: tensor datatype for engine to utilize
tp_size: degree of tensor-parallelism tp_size: degree of tensor-parallelism
enable_chunked_prefill: chunked-prefill on/off
max_tokens: the maximum number of tokens to generate max_tokens: the maximum number of tokens to generate
enforce_eager enforce_eager
num_scheduler_steps: for multi-step scheduling, GPU-side steps per num_scheduler_steps: for multi-step scheduling, GPU-side steps per
...@@ -72,7 +76,7 @@ def test_multi_step_llm( ...@@ -72,7 +76,7 @@ def test_multi_step_llm(
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
use_v2_block_manager=True, enable_chunked_prefill=enable_chunked_prefill,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
) as vllm_model: ) as vllm_model:
vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
...@@ -164,7 +168,6 @@ def test_multi_step_llm_w_prompt_logprobs( ...@@ -164,7 +168,6 @@ def test_multi_step_llm_w_prompt_logprobs(
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
use_v2_block_manager=True,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs( vllm_outputs = vllm_model.generate_greedy_logprobs(
...@@ -192,3 +195,158 @@ def test_multi_step_llm_w_prompt_logprobs( ...@@ -192,3 +195,158 @@ def test_multi_step_llm_w_prompt_logprobs(
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [None, 5])
def test_multi_step_llm_chunked_prefill_prefix_cache(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    tp_size: int,
    max_tokens: int,
    enforce_eager: int,
    num_scheduler_steps: int,
    num_prompts: int,
    num_logprobs: Optional[int],
) -> None:
    """Test vLLM engine with multi-step+"single-step chunked prefill"+APC.

    "APC" is short for "automatic prefix caching".

    "single-step chunked prefill" refers to the current vLLM multi-step+
    chunked-prefill implementation, in which a prefill may only be
    scheduled in the same step as decodes if its prompt fits in a single
    chunk. ("Complete" multi-step+chunked-prefill would let a prefill span
    multiple chunks & steps; that is not yet the case.)

    The test builds a contrived workload in which the scheduler has to
    decide whether/how to schedule a prefill whose prompt exceeds the
    remaining token budget. The correct behavior is to defer that prefill
    to a future step.

    Two engines are run on the same prompts with the same
    `num_scheduler_steps`: a baseline without chunked prefill / prefix
    caching flags, and one with both enabled. The test validates that:

    * Multi-step kernels do not raise an exception due to incorrect
      scheduler behavior
    * Generated tokens match between the two engines
    * (If logprobs are enabled) logprobs are close enough

    Args:
      vllm_runner: vLLM model runner fixture
      example_prompts: test fixture providing example prompts
      model: model under test (same for both engines)
      dtype: tensor datatype for engine to utilize
      tp_size: degree of tensor-parallelism
      max_tokens: the maximum number of tokens to generate
      enforce_eager
      num_scheduler_steps: for multi-step scheduling, GPU-side steps per
                           GPU -> CPU output transfer
      num_prompts: number of example prompts under test
      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
                    completions endpoint; `None` -> 1 logprob returned.
    """

    # Contrived scheduling scenario (block_size=16,
    # max_num_batched_tokens=48 => per-step token budget of 48):
    #
    # 1. The scheduler schedules the 0th prompt (24 tokens)
    #    => remaining token budget is 24
    # 2. The scheduler attempts to schedule the 1st prompt (30 tokens)
    #    * 30 tokens exceed the 24-token remaining budget
    #    * Correct behavior: do not schedule this prompt in this step
    #    * Incorrect behavior: schedule a prompt chunk
    #      * `do_sample=False` for this prompt in this step
    #      * Chunk size = (remaining tokens // block size) * block size
    #
    # The incorrect behavior - if it occurs - causes an exception in the
    # model runner resulting from `do_sample=False`.

    assert len(example_prompts) >= 2
    challenge_prompts = copy.deepcopy(example_prompts)
    challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient '
                            'inference and serving engine for LLMs.\n'
                            )  # 24 tok
    challenge_prompts[1] = (
        'Briefly describe the major milestones in the '
        'development of artificial intelligence from 1950 to 2020.\n'
    )  # 30 tok

    # Repeat the prompt list when it is shorter than `num_prompts`, then
    # trim it to exactly `num_prompts` entries.
    if len(challenge_prompts) < num_prompts:
        repeats = (num_prompts // len(challenge_prompts)) + 1
        challenge_prompts = challenge_prompts * repeats
    challenge_prompts = challenge_prompts[:num_prompts]
    assert len(challenge_prompts) == num_prompts

    # Engine settings shared by both runs.
    shared_engine_kwargs = dict(
        dtype=dtype,
        enforce_eager=enforce_eager,
        gpu_memory_utilization=0.7,
        tensor_parallel_size=tp_size,
        num_scheduler_steps=num_scheduler_steps,
        max_model_len=48,
        max_num_batched_tokens=48,
        max_num_seqs=4,
        block_size=16,
    )

    def _generate(runner):
        # Greedy generation, with logprobs only when requested.
        if num_logprobs is None:
            return runner.generate_greedy(challenge_prompts, max_tokens)
        return runner.generate_greedy_logprobs(challenge_prompts, max_tokens,
                                               num_logprobs)

    # Baseline: chunked prefill and prefix caching flags are not passed.
    with vllm_runner(model, **shared_engine_kwargs) as vllm_model:
        outputs_baseline = _generate(vllm_model)

    # multi-step+"single-step chunked prefill"+APC
    with vllm_runner(
            model,
            enable_chunked_prefill=True,
            enable_prefix_caching=True,
            **shared_engine_kwargs,
    ) as vllm_model:
        outputs_w_features = _generate(vllm_model)

    # Exact comparison without logprobs, closeness check with logprobs.
    compare = (check_outputs_equal
               if num_logprobs is None else check_logprobs_close)
    compare(
        outputs_0_lst=outputs_baseline,
        outputs_1_lst=outputs_w_features,
        name_0="multi-step",
        name_1="multi-step+features",
    )
...@@ -5,7 +5,7 @@ from unittest.mock import patch ...@@ -5,7 +5,7 @@ from unittest.mock import patch
import pytest import pytest
import torch import torch
from vllm.inputs import InputContext, LLMInputs from vllm.inputs import DecoderOnlyInputs, InputContext, token_inputs
from vllm.inputs.registry import InputRegistry from vllm.inputs.registry import InputRegistry
from vllm.multimodal import MultiModalRegistry from vllm.multimodal import MultiModalRegistry
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
...@@ -31,7 +31,7 @@ def use_processor_mock(): ...@@ -31,7 +31,7 @@ def use_processor_mock():
"""Patches the internal model input processor with an override callable.""" """Patches the internal model input processor with an override callable."""
def custom_processor(ctx: InputContext, def custom_processor(ctx: InputContext,
llm_inputs: LLMInputs, inputs: DecoderOnlyInputs,
*, *,
num_crops=DEFAULT_NUM_CROPS): num_crops=DEFAULT_NUM_CROPS):
# For testing purposes, we don't worry about the llm inputs / return # For testing purposes, we don't worry about the llm inputs / return
...@@ -74,38 +74,61 @@ def mm_model_cls(): ...@@ -74,38 +74,61 @@ def mm_model_cls():
# lambda whose signature matches max token calcs extra & mapper + extra kwargs # lambda whose signature matches max token calcs extra & mapper + extra kwargs
get_num_crops = lambda ctx, *, num_crops=DEFAULT_NUM_CROPS: num_crops get_num_crops = lambda ctx, *, num_crops=DEFAULT_NUM_CROPS: num_crops
custom_mapper = lambda ctx, data, *, num_crops=DEFAULT_NUM_CROPS: { custom_mapper = lambda ctx, data, *, num_crops=DEFAULT_NUM_CROPS: {
"num_pixels": torch.zeros(size=(1, num_crops + 1, 3, 336, 336)) "pixel_values": torch.zeros(size=(1, num_crops + 1, 3, 336, 336))
} }
### Test for default processor logic & mm_processor_kwargs wrapping ### Tests for default processor logic & mm_processor_kwargs wrapping
def test_default_processor_is_a_noop(): def test_default_processor_is_a_noop():
"""Ensure that by default, there is no processor override.""" """Ensure that by default, there is no processor override."""
dummy_registry = InputRegistry() dummy_registry = InputRegistry()
ctx = build_model_context(DUMMY_MODEL_ID) ctx = build_model_context(DUMMY_MODEL_ID)
processor = dummy_registry.create_input_processor(ctx.model_config) processor = dummy_registry.create_input_processor(ctx.model_config)
proc_inputs = LLMInputs(prompt_token_ids=[], prompt="") proc_inputs = token_inputs(prompt_token_ids=[], prompt="")
proc_outputs = processor(inputs=proc_inputs) proc_outputs = processor(inputs=proc_inputs)
assert proc_inputs is proc_outputs assert proc_inputs is proc_outputs
@pytest.mark.parametrize("num_crops", [None, NUM_CROPS_OVERRIDE]) def _get_num_crops_info(init_num_crops: int, inference_num_crops: int):
def test_processor_default_kwargs(use_processor_mock, num_crops): """Get the init / inference kwargs and expected num_crops for this test."""
"""Ensure input processors can use processor kwargs."""
dummy_registry = InputRegistry()
# If we have a value for num_crops, pass the override value and make # If we have a value for num_crops, pass the override value and make
# sure we get that value as a return-value from out mock processor, # sure we get that value as a return-value from out mock processor,
# otherwise fall back to the default value # otherwise fall back to the default value
mm_processor_kwargs = None if num_crops is None else { init_kwargs = None if init_num_crops is None else {
"num_crops": num_crops "num_crops": init_num_crops
} }
expected_num_crops = DEFAULT_NUM_CROPS if num_crops is None else num_crops inference_kwargs = None if inference_num_crops is None else {
ctx = build_model_context(DUMMY_MODEL_ID, "num_crops": inference_num_crops
mm_processor_kwargs=mm_processor_kwargs) }
processor = dummy_registry.create_input_processor(ctx.model_config) if inference_num_crops is not None:
expected_seq_count = inference_num_crops
elif init_num_crops is not None:
expected_seq_count = init_num_crops
else:
expected_seq_count = DEFAULT_NUM_CROPS
return init_kwargs, inference_kwargs, expected_seq_count
@pytest.mark.parametrize("init_num_crops,inference_num_crops", [
(None, None),
(NUM_CROPS_OVERRIDE, None),
(DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE),
])
def test_input_processor_kwargs(use_processor_mock, init_num_crops,
inference_num_crops):
"""Ensure input processors can use processor kwargs."""
dummy_registry = InputRegistry()
init_kwargs, inference_kwargs, expected_seq_count = _get_num_crops_info(
init_num_crops, inference_num_crops)
num_crops_val = processor(LLMInputs(prompt_token_ids=[], prompt="")) ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs)
assert num_crops_val == expected_num_crops processor = dummy_registry.create_input_processor(ctx.model_config)
num_crops_val = processor(
token_inputs(prompt_token_ids=[],
prompt="",
mm_processor_kwargs=inference_kwargs))
assert num_crops_val == expected_seq_count
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -124,11 +147,16 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock, ...@@ -124,11 +147,16 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock,
mm_processor_kwargs): mm_processor_kwargs):
"""Ensure that input processors filter out invalid mm_processor_kwargs""" """Ensure that input processors filter out invalid mm_processor_kwargs"""
dummy_registry = InputRegistry() dummy_registry = InputRegistry()
# Should filter out the init time kwargs
ctx = build_model_context(DUMMY_MODEL_ID, ctx = build_model_context(DUMMY_MODEL_ID,
mm_processor_kwargs=mm_processor_kwargs) mm_processor_kwargs=mm_processor_kwargs)
processor = dummy_registry.create_input_processor(ctx.model_config) processor = dummy_registry.create_input_processor(ctx.model_config)
num_crops_val = processor(LLMInputs(prompt_token_ids=[], prompt="")) # Should filter out the inference time kwargs
num_crops_val = processor(
token_inputs(prompt_token_ids=[],
prompt="",
mm_processor_kwargs=mm_processor_kwargs))
assert num_crops_val == DEFAULT_NUM_CROPS assert num_crops_val == DEFAULT_NUM_CROPS
...@@ -271,32 +299,34 @@ def test_default_mapper_with_processer_kwargs(image_assets, num_crops): ...@@ -271,32 +299,34 @@ def test_default_mapper_with_processer_kwargs(image_assets, num_crops):
assert mapped_inputs["pixel_values"].shape[1] == num_crops + 1 assert mapped_inputs["pixel_values"].shape[1] == num_crops + 1
@pytest.mark.parametrize("num_crops", [None, NUM_CROPS_OVERRIDE]) @pytest.mark.parametrize("init_num_crops,inference_num_crops", [
def test_custom_mapper_kwarg_overrides(image_assets, num_crops): (None, None),
(NUM_CROPS_OVERRIDE, None),
(DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE),
])
def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops,
inference_num_crops):
"""Ensure custom mappers can use processor kwargs.""" """Ensure custom mappers can use processor kwargs."""
mm_processor_kwargs = None if num_crops is None else { init_kwargs, inference_kwargs, expected_seq_count = _get_num_crops_info(
"num_crops": num_crops init_num_crops, inference_num_crops)
}
expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops
ctx = build_model_context(MULTIMODAL_MODEL_ID, ctx = build_model_context(MULTIMODAL_MODEL_ID,
trust_remote_code=True, trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs, mm_processor_kwargs=init_kwargs,
limit_mm_per_prompt={"image": 1}) limit_mm_per_prompt={"image": 1})
mm_registry = MultiModalRegistry() mm_registry = MultiModalRegistry()
mm_registry.init_mm_limits_per_prompt(ctx.model_config) mm_registry.init_mm_limits_per_prompt(ctx.model_config)
# Patch the image registry for phi3v with our lambda that is compatible
# with overrides, then ensure that calling the method correctly echos
# our num_crops value back from the mm_processor_kwargs.
image = image_assets[0].pil_image image = image_assets[0].pil_image
mm_inputs = {"image": image} mm_inputs = {"image": image}
with patch.object( # Patch the image registry for phi3v with our lambda that is compatible
mm_registry._get_plugin("image"), # with overrides, then ensure that calling the method correctly echos
"_default_input_mapper", # our num_crops value back from the mm_processor_kwargs.
{mm_model_cls(): custom_mapper}, mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
): mm_model_cls())
mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs) mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs,
inference_kwargs)
assert mapped_inputs["pixel_values"].shape[1] == expected_seq_count + 1 assert mapped_inputs["pixel_values"].shape[1] == expected_seq_count + 1
...@@ -316,6 +346,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, num_crops): ...@@ -316,6 +346,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, num_crops):
def test_custom_mapper_with_sad_kwarg_overrides(image_assets, def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
mm_processor_kwargs): mm_processor_kwargs):
"""Ensure that custom mappers filters out invalid mm_processor_kwargs""" """Ensure that custom mappers filters out invalid mm_processor_kwargs"""
# Should filter out the init time kwargs
ctx = build_model_context(MULTIMODAL_MODEL_ID, ctx = build_model_context(MULTIMODAL_MODEL_ID,
trust_remote_code=True, trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs, mm_processor_kwargs=mm_processor_kwargs,
...@@ -323,17 +354,16 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets, ...@@ -323,17 +354,16 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
mm_registry = MultiModalRegistry() mm_registry = MultiModalRegistry()
mm_registry.init_mm_limits_per_prompt(ctx.model_config) mm_registry.init_mm_limits_per_prompt(ctx.model_config)
# Patch the image registry for phi3v with our lambda that is compatible
# with overrides, then ensure that calling the method correctly echos
# our num_crops value back from the mm_processor_kwargs.
image = image_assets[0].pil_image image = image_assets[0].pil_image
mm_inputs = {"image": image} mm_inputs = {"image": image}
with patch.object( # Patch the image registry for phi3v with our lambda that is compatible
mm_registry._get_plugin("image"), # with overrides, then ensure that calling the method correctly echos
"_default_input_mapper", # our num_crops value back from the mm_processor_kwargs.
{mm_model_cls(): custom_mapper}, mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
): mm_model_cls())
mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs) # Should filter out the inference time kwargs
mapped_inputs = mm_registry.map_input(
ctx.model_config, mm_inputs, mm_processor_kwargs=mm_processor_kwargs)
assert mapped_inputs["pixel_values"].shape[1] == DEFAULT_NUM_CROPS + 1 assert mapped_inputs["pixel_values"].shape[1] == DEFAULT_NUM_CROPS + 1
from typing import Optional
import torch
from vllm import ModelRegistry from vllm import ModelRegistry
from vllm.model_executor.models.opt import OPTForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata
class MyOPTForCausalLM(OPTForCausalLM):
    """OPT subclass used as a dummy model whose logits always favour index 0."""

    def compute_logits(
            self, hidden_states: torch.Tensor,
            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
        """Compute the parent's logits, then overwrite them in place.

        Returns ``None`` unchanged when the parent returns ``None``;
        otherwise returns the same tensor after zeroing it and adding 1.0
        at index 0 of its second dimension.
        """
        scores = super().compute_logits(hidden_states, sampling_metadata)
        if scores is None:
            return None
        # this dummy model always predicts the first token
        scores.zero_()
        scores[:, 0] += 1.0
        return scores
def register(): def register():
# register our dummy model # Test directly passing the model
from .my_opt import MyOPTForCausalLM
if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs(): if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs():
ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM)
# Test passing lazy model
if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs():
ModelRegistry.register_model(
"MyGemma2Embedding",
"vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding",
)
if "MyLlava" not in ModelRegistry.get_supported_archs():
ModelRegistry.register_model("MyLlava",
"vllm_add_dummy_model.my_llava:MyLlava")
from typing import List, Optional, Union
import torch
from vllm.attention import AttentionMetadata
from vllm.model_executor.models.gemma2 import Gemma2EmbeddingModel
from vllm.sequence import IntermediateTensors
class MyGemma2Embedding(Gemma2EmbeddingModel):
    """Gemma2 embedding subclass used as a dummy model with all-zero output."""

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the parent forward pass and replace tensor results with zeros.

        An ``IntermediateTensors`` result from the parent is returned
        untouched; any other result is replaced by ``torch.zeros_like`` of it.
        """
        parent_output = super().forward(
            input_ids,
            positions,
            kv_caches,
            attn_metadata,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )
        # Return all-zero embeddings unless the parent handed back
        # intermediate tensors, which are passed through as-is.
        return (parent_output
                if isinstance(parent_output, IntermediateTensors) else
                torch.zeros_like(parent_output))
from typing import Optional
import torch
from vllm.inputs import INPUT_REGISTRY
from vllm.model_executor.models.llava import (LlavaForConditionalGeneration,
dummy_data_for_llava,
get_max_llava_image_tokens,
input_processor_for_llava)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
@MULTIMODAL_REGISTRY.register_image_input_mapper()
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
class MyLlava(LlavaForConditionalGeneration):
    """Llava subclass used as a dummy model whose logits always favour index 0.

    The class is registered with the multimodal and input registries using
    the Llava helpers imported from ``vllm.model_executor.models.llava``.
    """

    def compute_logits(
            self, hidden_states: torch.Tensor,
            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
        """Compute the parent's logits, then overwrite them in place.

        Returns ``None`` unchanged when the parent returns ``None``;
        otherwise returns the same tensor after zeroing it and adding 1.0
        at index 0 of its second dimension.
        """
        scores = super().compute_logits(hidden_states, sampling_metadata)
        if scores is None:
            return None
        # this dummy model always predicts the first token
        scores.zero_()
        scores[:, 0] += 1.0
        return scores
from typing import Optional
import torch
from vllm.model_executor.models.opt import OPTForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata
class MyOPTForCausalLM(OPTForCausalLM):
    """OPT subclass used as a dummy model whose logits always favour index 0."""

    def compute_logits(
            self, hidden_states: torch.Tensor,
            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
        """Compute the parent's logits, then overwrite them in place.

        Returns ``None`` unchanged when the parent returns ``None``;
        otherwise returns the same tensor after zeroing it and adding 1.0
        at index 0 of its second dimension.
        """
        result = super().compute_logits(hidden_states, sampling_metadata)
        if result is None:
            return None
        # this dummy model always predicts the first token
        result.zero_()
        result[:, 0] += 1.0
        return result
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment