Commit af7f4372 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1

parents 5e19cdef 09c77926
...@@ -20,12 +20,12 @@ from vllm.triton_utils.libentry import LibEntry ...@@ -20,12 +20,12 @@ from vllm.triton_utils.libentry import LibEntry
from .utils import (generate_data, generate_data_for_expand_nslices, from .utils import (generate_data, generate_data_for_expand_nslices,
ref_torch_groupgemm) ref_torch_groupgemm)
HIDDEN_SIZES = [3424, 4096, 4097] HIDDEN_SIZES = [4097]
BATCHES = [1, 4, 16, 32] BATCHES = [1, 4, 16, 32]
NUM_LORA = [1, 4, 8, 16, 32, 64, 128] NUM_LORA = [1, 8, 32, 128]
DTYPES = [torch.float16, torch.bfloat16] DTYPES = [torch.float16, torch.bfloat16]
MAX_RANKS = [1, 4, 8, 16, 32, 64, 128] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256]
SCALES = [0.5] SCALES = [0.5]
SEED = [0] SEED = [0]
CUDA_DEVICES = [f"cuda:{0}"] CUDA_DEVICES = [f"cuda:{0}"]
...@@ -321,22 +321,3 @@ def test_punica_expand_nslices( ...@@ -321,22 +321,3 @@ def test_punica_expand_nslices(
slice_offset += hidden_size slice_offset += hidden_size
assert_close(our_outputs, ref_outputs) assert_close(our_outputs, ref_outputs)
if __name__ == "__main__":
from itertools import product
lst = list(
product(
BATCHES,
NUM_LORA,
MAX_RANKS,
[1.0],
[torch.float16],
["expand"],
SEED,
CUDA_DEVICES,
))
for ele in lst:
test_punica_bgmv(*ele)
print(f"{ele},pass")
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
Run `pytest tests/models/test_bart.py`.
"""
from typing import List, Optional, Tuple
from vllm.utils import is_cpu
if not is_cpu():
# CPU backend is not currently supported with encoder/decoder models
# skip test definitions entirely to avoid importing GPU kernel libs
# (xFormers, etc.)
import pytest
from transformers import AutoModelForSeq2SeqLM
from vllm.sequence import SampleLogprobs
from ..conftest import DecoderPromptType
from .utils import check_logprobs_close
MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
def vllm_to_hf_output(
vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
decoder_prompt_type: DecoderPromptType,
):
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
hf_output_str = output_str + "</s>"
if decoder_prompt_type == DecoderPromptType.NONE:
hf_output_str = "<s>" + hf_output_str
return output_ids, hf_output_str, out_logprobs
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
def test_models(
hf_runner,
vllm_runner,
example_encoder_decoder_prompts,
model: str,
dtype: str,
max_tokens: int,
num_logprobs: int,
decoder_prompt_type: DecoderPromptType,
) -> None:
'''
Test the vLLM BART model for a variety of encoder/decoder input prompts,
by validating it against HuggingFace (HF) BART.
Arguments:
* hf_runner: HuggingFace (HF) test model runner
* vllm_runner: vLLM test model runner
* example_encoder_decoder_prompts: test fixture which provides a
dictionary of dummy prompts
* model: the HF ID of the specific BART variant under test
* dtype: the tensor datatype to employ
* max_tokens
* num_logprobs
* decoder_prompt_type: key into the example_encoder_decoder_prompts
dictionary; selects specific encoder/decoder
prompt scenarios to test
A note on using HF BART as a baseline for validating vLLM BART,
specifically when the decoder prompt is None.
The HF GenerationMixin's default behavior is to force the first
decoded token to be <BOS> if the prompt does not already contain
<BOS> (this is accomplished using a logit
processor setting.)
So when we use HF BART as our baseline for comparison, note that
when the user provides a request with a None decoder prompt
(i.e. a singleton encoder prompt, or else an explicit encoder/
decoder prompt with the decoder sub-prompt set to None), HF and
vLLM handle this in different ways:
* HF will (1) tokenize the None prompt as an empty token-list,
(2) append <decoder-start-token> to the beginning, yielding
[<decoder-start-token>], (3) pass this token list to the model, and
then (4) after computing logits during prefill, override the model
logits & force <BOS> to be the first generated token.
* vLLM will (1) tokenize the None prompt as [<BOS>], (2) append decoder-
start-token to the beginning, yielding [<decoder-start-token><BOS>],
(3) pass these tokens to the model & proceed with generation.
The net effect is that compared to vLLM, the list of HF *decoded* tokens
will contain one more initial <BOS> than the vLLM generated tokens,
because vLLM's <BOS> token is injected into the prompt rather than into
the generated output. This is in spite of the fact that overall, the
complete sequences (prompt + decoded tokens) produced by vLLM will match
HF.
So when we use HF decoded token output to validate vLLM's decoded token
output, the testing process must account for the difference in decoded
token sequences between vLLM and HF specifically in the
decoder-prompt-is-None case.
One option is to disable the logit processor feature that forces the
<BOS> token to be decoded (forced_bos_token_id = None), eliminating
the problem entirely. However this is not "normal" BART usage.
The other option is - only in the decoder-prompt-is-None case - to
discard the first decoded token from the HF output before comparing it
to vLLM.
To that end, when testing the scenario where the decoder prompt is None
(and only in that one scenario), this test skips the first HF decoded
token during the process of validating the vLLM decoded output.
'''
test_case_prompts = example_encoder_decoder_prompts[
decoder_prompt_type]
# Configuration settings for HF baseline
hf_kwargs = {
"top_k": None,
"num_beams": 1,
"repetition_penalty": 1.0,
"top_p": 1.0,
"length_penalty": 1.0,
"early_stopping": False,
"no_repeat_ngram_size": None,
"min_length": 0
}
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
hf_outputs = (
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
test_case_prompts,
max_tokens,
num_logprobs,
**hf_kwargs,
))
# Note: currently encoder/decoder models are only compatible with
# enforce_eager=True. Normally this is not a problem because
# for encoder/decoder models vLLM will
# default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=False (a behavior which a number of already-exisitng
# decoder-only unit tests expect), so when testing an encoder/decoder
# model we must explicitly specify enforce_eager=True in the VllmRunner
# constructor.
with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
test_case_prompts, max_tokens, num_logprobs)
hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
else 0)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
vllm_to_hf_output(vllm_output, decoder_prompt_type)
for vllm_output in vllm_outputs
],
name_0="hf",
name_1="vllm",
num_outputs_0_skip_tokens=hf_skip_tokens,
)
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoModelForVision2Seq, AutoTokenizer
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
...@@ -59,7 +59,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, ...@@ -59,7 +59,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
All the image fixtures for the test is under tests/images. All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalData objects and corresponding For vllm runner, we provide MultiModalData objects and corresponding
vision language config as input. MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
""" """
...@@ -80,7 +80,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, ...@@ -80,7 +80,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
for prompts, images in inputs_per_image for prompts, images in inputs_per_image
] ]
with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForVision2Seq) as hf_model:
hf_outputs_per_image = [ hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts, hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens, max_tokens,
......
import re
from typing import List, Optional, Type from typing import List, Optional, Type
import pytest import pytest
from transformers import AutoModelForVision2Seq, BatchEncoding
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ..conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_outputs_equal
pytestmark = pytest.mark.vlm pytestmark = pytest.mark.vlm
...@@ -19,9 +21,8 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -19,9 +21,8 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
models = ["facebook/chameleon-7b"] models = ["facebook/chameleon-7b"]
#TODO (ywang96): Add correctness test when chameleon is
# available on transformers.
def run_test( def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets, image_assets: _ImageAssets,
model: str, model: str,
...@@ -29,13 +30,20 @@ def run_test( ...@@ -29,13 +30,20 @@ def run_test(
size_factors: List[float], size_factors: List[float],
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int, tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None, distributed_executor_backend: Optional[str] = None,
): ):
"""Test if the model can generate text given """Inference result should be the same between hf and vllm.
a batch of images and prompts.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
""" """
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
images = [asset.pil_image for asset in image_assets] images = [asset.pil_image for asset in image_assets]
inputs_per_image = [( inputs_per_image = [(
...@@ -50,35 +58,49 @@ def run_test( ...@@ -50,35 +58,49 @@ def run_test(
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model: enforce_eager=True) as vllm_model:
for prompts, images in inputs_per_image: vllm_outputs_per_image = [
vllm_outputs = vllm_model.generate_greedy(prompts, vllm_model.generate_greedy_logprobs(prompts,
max_tokens, max_tokens,
images=images) num_logprobs=num_logprobs,
for i in range(len(vllm_outputs)): images=images)
for prompts, images in inputs_per_image
# format prompt back to original ]
replacements = {
"<racm3:break>": "", def process(hf_inputs: BatchEncoding):
"<eoss>": "", hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
"<reserved08706>": "" .to(torch_dtype) # type: ignore
} return hf_inputs
pattern = '|'.join(replacements.keys())
vllm_result = re.sub( with hf_runner(model,
pattern, dtype=dtype,
lambda match: replacements[match.group(0)], #noqa B023 postprocess_inputs=process,
vllm_outputs[i][1]) auto_cls=AutoModelForVision2Seq) as hf_model:
vllm_result = vllm_result.replace("<image>", "", 1023) hf_outputs_per_image = [
assert vllm_result[:len(prompts[i])] == prompts[i] hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
# assert at least 10 new characters are generated num_logprobs=num_logprobs,
# (to take stop token into account) images=images)
assert len(vllm_outputs[i][1]) - len(prompts[i]) > 10 for prompts, images in inputs_per_image
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
vllm_outputs_per_image):
# HF Logprobs include image tokens, unlike vLLM, so we don't directly
# compare them
check_outputs_equal(
outputs_0_lst=[outputs[:2] for outputs in hf_outputs],
outputs_1_lst=[outputs[:2] for outputs in vllm_outputs],
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"size_factors", "size_factors",
[ [
# No image
[],
# Single-scale # Single-scale
[1.0], [1.0],
# Single-scale, batched # Single-scale, batched
...@@ -88,15 +110,18 @@ def run_test( ...@@ -88,15 +110,18 @@ def run_test(
], ],
) )
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [8])
def test_models(vllm_runner, image_assets, model, size_factors, dtype: str, @pytest.mark.parametrize("num_logprobs", [5])
max_tokens: int) -> None: def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype, max_tokens, num_logprobs) -> None:
run_test( run_test(
hf_runner,
vllm_runner, vllm_runner,
image_assets, image_assets,
model, model,
size_factors=size_factors, size_factors=size_factors,
dtype=dtype, dtype=dtype,
max_tokens=max_tokens, max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1, tensor_parallel_size=1,
) )
...@@ -49,7 +49,7 @@ def run_test( ...@@ -49,7 +49,7 @@ def run_test(
All the image fixtures for the test is under tests/images. All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input. and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
""" """
......
"""
Tests gguf models against unquantized models generations
Note: To pass the test, quantization higher than Q4 should be used
"""
import os
import pytest
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported
from .utils import check_logprobs_close
os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN = 1024
# FIXME: Move this to confest
MODELS = [
("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")),
("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF",
filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")),
("Qwen/Qwen2-1.5B-Instruct",
hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
("Qwen/Qwen2-1.5B-Instruct",
hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
]
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_models(
num_gpus_available,
vllm_runner,
example_prompts,
model,
dtype: str,
max_tokens: int,
num_logprobs: int,
tp_size: int,
) -> None:
if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
original_model, gguf_model = model
tokenizer = AutoTokenizer.from_pretrained(original_model)
messages = [[{
'role': 'user',
'content': prompt
}] for prompt in example_prompts]
example_prompts = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
# Run unquantized model.
with vllm_runner(model_name=original_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as original_model:
original_outputs = original_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
# Run gguf model.
with vllm_runner(model_name=gguf_model,
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as gguf_model:
gguf_outputs = gguf_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=original_outputs,
outputs_1_lst=gguf_outputs,
name_0="original",
name_1="gguf",
)
from typing import Optional
import pytest
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
from vllm.model_executor.models.intern_vit import InternVisionModel
from ..conftest import _ImageAssets, cleanup
pytestmark = pytest.mark.vlm
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
models = [
snapshot_download("OpenGVLab/InternViT-300M-448px",
allow_patterns=DOWNLOAD_PATTERN),
snapshot_download("OpenGVLab/InternViT-6B-448px-V1-5",
allow_patterns=DOWNLOAD_PATTERN),
]
def run_intern_vit_test(
image_assets: _ImageAssets,
model: str,
*,
dtype: str,
distributed_executor_backend: Optional[str] = None,
):
img_processor = CLIPImageProcessor.from_pretrained(model)
images = [asset.pil_image for asset in image_assets]
pixel_values = [
img_processor(images, return_tensors='pt').pixel_values.to(dtype)
for images in images
]
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
if not getattr(config, "norm_type", None):
config.norm_type = "rms_norm"
hf_model = AutoModel.from_pretrained(model,
torch_dtype=dtype,
trust_remote_code=True).to("cuda")
hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).last_hidden_state
for pixel_value in pixel_values
]
vllm_model = InternVisionModel(config)
vllm_model.load_weights(hf_model.state_dict().items())
del hf_model
cleanup()
vllm_model = vllm_model.to("cuda", dtype)
vllm_outputs_per_image = [
vllm_model(pixel_values=pixel_value.to("cuda"))
for pixel_value in pixel_values
]
del vllm_model
cleanup()
cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image,
hf_outputs_per_image):
assert cos_similar(vllm_output, hf_output).mean() > 0.99
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [torch.half])
@torch.inference_mode()
def test_models(dist_init, image_assets, model, dtype: str) -> None:
run_intern_vit_test(
image_assets,
model,
dtype=dtype,
)
import types import types
from typing import List, Optional, Type from typing import List, Optional, Tuple, Type
import pytest import pytest
import torch import torch
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from PIL.Image import Image from PIL.Image import Image
from transformers import AutoConfig
from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END, from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END,
IMG_START, IMG_START,
...@@ -26,10 +27,15 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -26,10 +27,15 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
# we use snapshot_download to prevent conflicts between # we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner # dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
models = [ models = [
snapshot_download("OpenGVLab/InternVL2-1B"), snapshot_download("OpenGVLab/InternVL2-1B",
snapshot_download("OpenGVLab/InternVL2-2B"), allow_patterns=DOWNLOAD_PATTERN),
# snapshot_download("OpenGVLab/InternVL2-4B"), # broken snapshot_download("OpenGVLab/InternVL2-2B",
allow_patterns=DOWNLOAD_PATTERN),
# Broken due to outdated implementation of Phi-3
# See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3
# snapshot_download("OpenGVLab/InternVL2-4B"),
] ]
...@@ -41,8 +47,17 @@ class InternVLProcessor: ...@@ -41,8 +47,17 @@ class InternVLProcessor:
self.tokenizer = hf_runner.tokenizer self.tokenizer = hf_runner.tokenizer
self.dtype = hf_runner.model.dtype self.dtype = hf_runner.model.dtype
self.config = AutoConfig.from_pretrained(hf_runner.model_name)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Image, **kwargs): def __call__(self, text: str, images: Image, **kwargs):
pixel_values = image_to_pixel_values(images).to(self.dtype) pixel_values = image_to_pixel_values(images, self.image_size,
self.min_num, self.max_num,
self.use_thumbnail).to(self.dtype)
num_patches_list = [pixel_values.shape[0]] num_patches_list = [pixel_values.shape[0]]
for num_patches in num_patches_list: for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
...@@ -102,7 +117,7 @@ def run_test( ...@@ -102,7 +117,7 @@ def run_test(
All the image fixtures for the test is under tests/images. All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input. and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
""" """
...@@ -163,6 +178,74 @@ def run_test( ...@@ -163,6 +178,74 @@ def run_test(
) )
def run_awq_test(
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
models: Tuple[str, str],
*,
size_factors: List[float],
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
source_model, quant_model = models
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(source_model,
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
source_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs_per_image
]
with vllm_runner(quant_model,
quantization="awq",
max_model_len=4096,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
quant_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs_per_image
]
for source_outputs, quant_outputs in zip(source_outputs_per_image,
quant_outputs_per_image):
# TODO: Check whether using original CLIPVisionModel can improve
# consistency against HF
check_logprobs_close(
outputs_0_lst=source_outputs,
outputs_1_lst=quant_outputs,
name_0="source",
name_1="awq",
)
target_dtype = "half" target_dtype = "half"
if is_cpu(): if is_cpu():
target_dtype = "bfloat16" target_dtype = "bfloat16"
...@@ -199,3 +282,36 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, ...@@ -199,3 +282,36 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
tensor_parallel_size=1, tensor_parallel_size=1,
) )
@pytest.mark.parametrize(
"models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_awq_models(vllm_runner, image_assets, models, size_factors,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
run_awq_test(
vllm_runner,
image_assets,
models,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
...@@ -6,9 +6,12 @@ from vllm.worker.model_runner import _get_graph_batch_size ...@@ -6,9 +6,12 @@ from vllm.worker.model_runner import _get_graph_batch_size
MODELS = ["ai21labs/Jamba-tiny-random"] MODELS = ["ai21labs/Jamba-tiny-random"]
# Fails due to usage of MoE as MLP(E=1_, which is different than the HF impl
# TODO: Fix this with trained model
@pytest.mark.skip()
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20]) @pytest.mark.parametrize("max_tokens", [10])
def test_models( def test_models(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
...@@ -17,8 +20,6 @@ def test_models( ...@@ -17,8 +20,6 @@ def test_models(
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
) -> None: ) -> None:
# To pass the small model tests, we need full precision.
assert dtype == "float"
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
...@@ -36,8 +37,8 @@ def test_models( ...@@ -36,8 +37,8 @@ def test_models(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [20]) @pytest.mark.parametrize("max_tokens", [5])
def test_batching( def test_batching(
vllm_runner, vllm_runner,
example_prompts, example_prompts,
......
from typing import List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
import pytest import pytest
from transformers import AutoTokenizer from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
BatchEncoding)
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close from .utils import check_logprobs_close
...@@ -18,9 +20,11 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -18,9 +20,11 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"USER: <image>\nWhat is the season?\nASSISTANT:", "USER: <image>\nWhat is the season?\nASSISTANT:",
}) })
IMAGE_TOKEN_ID = 32000 models = [
"llava-hf/llava-1.5-7b-hf",
models = ["llava-hf/llava-1.5-7b-hf"] # TODO: Get this model to produce meaningful output in vLLM
# "TIGER-Lab/Mantis-8B-siglip-llama3",
]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
...@@ -29,12 +33,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, ...@@ -29,12 +33,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
"""Sanitize vllm output to be comparable with hf output.""" """Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output output_ids, output_str, out_logprobs = vllm_output
config = AutoConfig.from_pretrained(model)
image_token_id = config.image_token_index
tokenizer = AutoTokenizer.from_pretrained(model) tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id eos_token_id = tokenizer.eos_token_id
hf_output_ids = [ hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids) token_id for idx, token_id in enumerate(output_ids)
if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID if token_id != image_token_id or output_ids[idx - 1] != image_token_id
] ]
assert output_str[0] == " " assert output_str[0] == " "
...@@ -63,10 +70,21 @@ def run_test( ...@@ -63,10 +70,21 @@ def run_test(
All the image fixtures for the test is under tests/images. All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input. and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
""" """
# NOTE: For local use; this isn't tested in CI yet (see TODO above)
if model.startswith("TIGER-Lab/Mantis"):
from mantis.models.mllava import MLlavaProcessor
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
mantis_processor = MLlavaProcessor.from_pretrained(
model, torch_dtype=torch_dtype)
assert isinstance(mantis_processor, MLlavaProcessor)
else:
mantis_processor = None
images = [asset.pil_image for asset in image_assets] images = [asset.pil_image for asset in image_assets]
inputs_per_image = [( inputs_per_image = [(
...@@ -93,7 +111,21 @@ def run_test( ...@@ -93,7 +111,21 @@ def run_test(
for prompts, images in inputs_per_image for prompts, images in inputs_per_image
] ]
with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: if mantis_processor is not None:
def process(hf_inputs: BatchEncoding):
hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
.to(torch_dtype) # type: ignore
return hf_inputs
else:
def process(hf_inputs: BatchEncoding):
return hf_inputs
with hf_runner(model,
dtype=dtype,
postprocess_inputs=process,
auto_cls=AutoModelForVision2Seq) as hf_model:
hf_outputs_per_image = [ hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts, hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens, max_tokens,
......
from typing import List, Optional, Tuple, Type
import pytest
from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from vllm.sequence import SampleLogprobs
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"USER: <image>\nWhat's the content of the image?\nASSISTANT:",
"cherry_blossom":
"USER: <image>\nWhat is the season?\nASSISTANT:",
})
models = [
"llava-hf/llava-1.5-7b-hf",
]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
Optional[SampleLogprobs]],
model: str):
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
config = AutoConfig.from_pretrained(model)
image_token_id = config.image_token_index
tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
]
assert output_str[0] == " "
hf_output_str = output_str[1:]
if hf_output_ids[-1] == eos_token_id:
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
return hf_output_ids, hf_output_str, out_logprobs
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
model: str,
*,
size_factors: List[float],
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
# vLLM to load from image embeddings
vllm_images = [asset.image_embeds for asset in image_assets]
# transformers to load from PIL images
hf_images = [asset.pil_image for asset in image_assets]
vllm_inputs_per_image = [(
[prompt for _ in size_factors],
[image for _ in size_factors],
) for image, prompt in zip(vllm_images, HF_IMAGE_PROMPTS)]
hf_inputs_per_image = [(
[prompt for _ in size_factors],
[image for _ in size_factors],
) for image, prompt in zip(hf_images, HF_IMAGE_PROMPTS)]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(model,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in vllm_inputs_per_image
]
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForVision2Seq) as hf_model:
hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in hf_inputs_per_image
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
vllm_outputs_per_image):
# TODO: Check whether using original CLIPVisionModel can improve
# consistency against HF
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
vllm_to_hf_output(vllm_output, model)
for vllm_output in vllm_outputs
],
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
run_test(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
from typing import List, Optional, Tuple, Type, overload from typing import List, Optional, Tuple, Type, overload
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
...@@ -23,8 +23,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -23,8 +23,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:", f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:",
}) })
IMAGE_TOKEN_ID = 32000
models = ["llava-hf/llava-v1.6-vicuna-7b-hf"] models = ["llava-hf/llava-v1.6-vicuna-7b-hf"]
...@@ -34,12 +32,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, ...@@ -34,12 +32,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
"""Sanitize vllm output to be comparable with hf output.""" """Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output output_ids, output_str, out_logprobs = vllm_output
config = AutoConfig.from_pretrained(model)
image_token_id = config.image_token_index
tokenizer = AutoTokenizer.from_pretrained(model) tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id eos_token_id = tokenizer.eos_token_id
hf_output_ids = [ hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids) token_id for idx, token_id in enumerate(output_ids)
if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID if token_id != image_token_id or output_ids[idx - 1] != image_token_id
] ]
assert output_str[0] == " " assert output_str[0] == " "
...@@ -128,7 +129,8 @@ def run_test( ...@@ -128,7 +129,8 @@ def run_test(
for prompts, images in inputs_per_image for prompts, images in inputs_per_image
] ]
with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForVision2Seq) as hf_model:
hf_outputs_per_image = [ hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts, hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens, max_tokens,
...@@ -176,7 +178,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, ...@@ -176,7 +178,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
All the image fixtures for the test is under tests/images. All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input. and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
""" """
......
from collections import UserDict
from typing import List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
import pytest import pytest
import torch import torch
import torch.types import torch.types
from transformers import BatchFeature from transformers import BatchEncoding
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
...@@ -14,18 +13,6 @@ from .utils import check_logprobs_close ...@@ -14,18 +13,6 @@ from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm pytestmark = pytest.mark.vlm
class NestedInputs(UserDict):
def __init__(self, model_inputs: BatchFeature):
super().__init__({"model_inputs": model_inputs})
self.model_inputs = model_inputs
def to(self, device: torch.types.Device):
return NestedInputs(self.model_inputs.to(device))
# The image token is placed before "user" on purpose so that the test can pass # The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "stop_sign":
...@@ -41,6 +28,10 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -41,6 +28,10 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
models = ["openbmb/MiniCPM-Llama3-V-2_5"] models = ["openbmb/MiniCPM-Llama3-V-2_5"]
def _wrap_inputs(hf_inputs: BatchEncoding) -> BatchEncoding:
return BatchEncoding({"model_inputs": hf_inputs})
def trunc_hf_output(hf_output: Tuple[List[int], str, def trunc_hf_output(hf_output: Tuple[List[int], str,
Optional[SampleLogprobs]]): Optional[SampleLogprobs]]):
output_ids, output_str, out_logprobs = hf_output output_ids, output_str, out_logprobs = hf_output
...@@ -70,7 +61,7 @@ def run_test( ...@@ -70,7 +61,7 @@ def run_test(
All the image fixtures for the test is under tests/images. All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input. and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
""" """
...@@ -105,11 +96,8 @@ def run_test( ...@@ -105,11 +96,8 @@ def run_test(
for prompts, images in inputs_per_image for prompts, images in inputs_per_image
] ]
with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad(): hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
hf_processor = hf_model.processor with hf_model, torch.no_grad():
hf_model.processor = lambda **kw: NestedInputs(
hf_processor(**kw) # type: ignore
)
hf_outputs_per_image = [ hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts, hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens, max_tokens,
...@@ -188,7 +176,7 @@ def run_multi_image_test( ...@@ -188,7 +176,7 @@ def run_multi_image_test(
All the image fixtures for the test is under tests/images. All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input. and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
""" """
...@@ -209,6 +197,7 @@ def run_multi_image_test( ...@@ -209,6 +197,7 @@ def run_multi_image_test(
with vllm_runner(model, with vllm_runner(model,
max_model_len=4096, max_model_len=4096,
max_num_seqs=1, max_num_seqs=1,
limit_mm_per_prompt={"image": len(images)},
dtype=dtype, dtype=dtype,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
...@@ -224,11 +213,8 @@ def run_multi_image_test( ...@@ -224,11 +213,8 @@ def run_multi_image_test(
for prompts, images in inputs_per_case for prompts, images in inputs_per_case
] ]
with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad(): hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
hf_processor = hf_model.processor with hf_model, torch.no_grad():
hf_model.processor = lambda **kw: NestedInputs(
hf_processor(**kw) # type: ignore
)
hf_outputs_per_case = [ hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts, hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens, max_tokens,
......
import torch import os
from vllm import LLM, ModelRegistry, SamplingParams import pytest
from vllm.model_executor.models.opt import OPTForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm import LLM, SamplingParams
class MyOPTForCausalLM(OPTForCausalLM): from ..utils import fork_new_process_for_each_test
def compute_logits(self, hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata) -> torch.Tensor:
# this dummy model always predicts the first token
logits = super().compute_logits(hidden_states, sampling_metadata)
logits.zero_()
logits[:, 0] += 1.0
return logits
@fork_new_process_for_each_test
def test_plugin(dummy_opt_path):
os.environ["VLLM_PLUGINS"] = ""
with pytest.raises(Exception) as excinfo:
LLM(model=dummy_opt_path, load_format="dummy")
assert "are not supported for now" in str(excinfo.value)
def test_oot_registration():
# register our dummy model @fork_new_process_for_each_test
ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) def test_oot_registration(dummy_opt_path):
os.environ["VLLM_PLUGINS"] = "register_dummy_model"
prompts = ["Hello, my name is", "The text does not matter"] prompts = ["Hello, my name is", "The text does not matter"]
sampling_params = SamplingParams(temperature=0) sampling_params = SamplingParams(temperature=0)
llm = LLM(model="facebook/opt-125m") llm = LLM(model=dummy_opt_path, load_format="dummy")
first_token = llm.get_tokenizer().decode(0) first_token = llm.get_tokenizer().decode(0)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
......
...@@ -2,7 +2,7 @@ import os ...@@ -2,7 +2,7 @@ import os
from typing import List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
...@@ -20,8 +20,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -20,8 +20,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"What is in the picture?", "What is in the picture?",
}) })
IMAGE_TOKEN_ID = 257152
models = ["google/paligemma-3b-mix-224"] models = ["google/paligemma-3b-mix-224"]
# ROCm Triton FA can run into compilation issues with these models due to, # ROCm Triton FA can run into compilation issues with these models due to,
...@@ -37,12 +35,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, ...@@ -37,12 +35,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
"""Sanitize vllm output to be comparable with hf output.""" """Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output output_ids, output_str, out_logprobs = vllm_output
config = AutoConfig.from_pretrained(model)
image_token_id = config.image_token_index
tokenizer = AutoTokenizer.from_pretrained(model) tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id eos_token_id = tokenizer.eos_token_id
hf_output_ids = [ hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids) token_id for idx, token_id in enumerate(output_ids)
if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID if token_id != image_token_id or output_ids[idx - 1] != image_token_id
] ]
hf_output_str = output_str hf_output_str = output_str
...@@ -71,7 +72,7 @@ def run_test( ...@@ -71,7 +72,7 @@ def run_test(
All the image fixtures for the test is under tests/images. All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input. and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
""" """
...@@ -101,7 +102,8 @@ def run_test( ...@@ -101,7 +102,8 @@ def run_test(
for prompts, images in inputs_per_image for prompts, images in inputs_per_image
] ]
with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForVision2Seq) as hf_model:
hf_outputs_per_image = [ hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts, hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens, max_tokens,
......
...@@ -21,7 +21,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -21,7 +21,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n", "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
}) })
models = ["microsoft/Phi-3-vision-128k-instruct"] models = ["microsoft/Phi-3.5-vision-instruct"]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
...@@ -73,7 +73,7 @@ def run_test( ...@@ -73,7 +73,7 @@ def run_test(
All the image fixtures for the test is under tests/images. All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input. and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
""" """
...@@ -81,7 +81,10 @@ def run_test( ...@@ -81,7 +81,10 @@ def run_test(
inputs_per_image = [( inputs_per_image = [(
[prompt for _ in size_factors], [prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors], [
rescale_image_size(image, factor, transpose=idx)
for idx, factor in enumerate(size_factors)
],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
# NOTE: take care of the order. run vLLM first, and then run HF. # NOTE: take care of the order. run vLLM first, and then run HF.
......
from typing import Type
import pytest
from ..conftest import HfRunner, VllmRunner
from .utils import check_logprobs_close
models = ["qwen/qwen-vl"]
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("model", models)
def test_text_only_qwen_model(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
example_prompts,
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
):
# This test checks language inputs only, since the visual component
# for qwen-vl is still unsupported in VLLM. In the near-future, the
# implementation and this test will be extended to consider
# visual inputs as well.
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts,
max_tokens,
num_logprobs=num_logprobs,
)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts,
max_tokens,
num_logprobs=num_logprobs,
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
...@@ -6,4 +6,4 @@ from vllm.model_executor.models import _MODELS, ModelRegistry ...@@ -6,4 +6,4 @@ from vllm.model_executor.models import _MODELS, ModelRegistry
@pytest.mark.parametrize("model_cls", _MODELS) @pytest.mark.parametrize("model_cls", _MODELS)
def test_registry_imports(model_cls): def test_registry_imports(model_cls):
# Ensure all model classes can be imported successfully # Ensure all model classes can be imported successfully
ModelRegistry.load_model_cls(model_cls) ModelRegistry.resolve_model_cls([model_cls])
from typing import List, Optional, Tuple, Type
import librosa
import numpy as np
import pytest
from transformers import AutoModel, AutoTokenizer, BatchEncoding
from vllm.assets.audio import AudioAsset
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ..conftest import HfRunner, VllmRunner
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
MODEL_NAME = "fixie-ai/ultravox-v0_3"
AudioTuple = Tuple[np.ndarray, int]
@pytest.fixture(scope="session")
def audio_and_sample_rate():
return AudioAsset("mary_had_lamb").audio_and_sample_rate
@pytest.fixture
def prompts_and_audios(audio_and_sample_rate):
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
vllm_placeholder = "<|reserved_special_token_0|>"
hf_placeholder = "<|audio|>"
question = "What's in the audio?"
vllm_prompt = tokenizer.apply_chat_template(
[{
'role': 'user',
'content': f"{vllm_placeholder}\n{question}"
}],
tokenize=False,
add_generation_prompt=True)
hf_prompt = tokenizer.apply_chat_template(
[{
'role': 'user',
'content': f"{hf_placeholder}\n{question}"
}],
tokenize=False,
add_generation_prompt=True)
return [(vllm_prompt, hf_prompt, audio_and_sample_rate)]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
Optional[SampleLogprobs]],
model: str):
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
tokenizer = AutoTokenizer.from_pretrained(model)
eos_token_id = tokenizer.eos_token_id
hf_output_ids = output_ids[:]
hf_output_str = output_str
if hf_output_ids[-1] == eos_token_id:
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
return hf_output_ids, hf_output_str, out_logprobs
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
prompts_and_audios: List[Tuple[str, str, AudioTuple]],
model: str,
*,
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm."""
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs_per_audio = [
vllm_model.generate_greedy_logprobs([vllm_prompt],
max_tokens,
num_logprobs=num_logprobs,
audios=[audio])
for vllm_prompt, _, audio in prompts_and_audios
]
def process(hf_inputs: BatchEncoding):
hf_inputs["audio_values"] = hf_inputs["audio_values"] \
.to(torch_dtype) # type: ignore
return hf_inputs
with hf_runner(model,
dtype=dtype,
postprocess_inputs=process,
auto_cls=AutoModel) as hf_model:
hf_outputs_per_audio = [
hf_model.generate_greedy_logprobs_limit(
[hf_prompt],
max_tokens,
num_logprobs=num_logprobs,
audios=[(librosa.resample(audio[0],
orig_sr=audio[1],
target_sr=16000), 16000)])
for _, hf_prompt, audio in prompts_and_audios
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio,
vllm_outputs_per_audio):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
vllm_to_hf_output(vllm_output, model)
for vllm_output in vllm_outputs
],
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, prompts_and_audios, dtype: str,
max_tokens: int, num_logprobs: int) -> None:
run_test(
hf_runner,
vllm_runner,
prompts_and_audios,
MODEL_NAME,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
...@@ -45,11 +45,27 @@ def check_logprobs_close( ...@@ -45,11 +45,27 @@ def check_logprobs_close(
outputs_1_lst: Sequence[TokensTextLogprobs], outputs_1_lst: Sequence[TokensTextLogprobs],
name_0: str, name_0: str,
name_1: str, name_1: str,
num_outputs_0_skip_tokens: int = 0,
warn_on_mismatch: bool = True, warn_on_mismatch: bool = True,
): ):
""" """
Compare the logprobs of two sequences generated by different models, Compare the logprobs of two sequences generated by different models,
which should be similar but not necessarily equal. which should be similar but not necessarily equal.
Arguments:
* outputs_0_lst: First sequence to compare
* outputs_0_lst: Second sequence to compare
* name_0: sequence #0 name
* name_1: sequence #1 name
* num_outputs_0_skip_tokens: If > 0, specifies the number of initial
sequence #0 tokens & logprobs to discard
before comparison, i.e. all
of sequence #1 will be compared to
sequence #0 beginning at index
num_outputs_0_skip_tokens
* warn_on_mismatch: Issue a warning if there is token-wise or text-wise
mismatch between the two sequences
""" """
assert len(outputs_0_lst) == len(outputs_1_lst) assert len(outputs_0_lst) == len(outputs_1_lst)
...@@ -65,6 +81,15 @@ def check_logprobs_close( ...@@ -65,6 +81,15 @@ def check_logprobs_close(
if logprobs_1 is None: if logprobs_1 is None:
logprobs_1 = [None] * len(output_ids_1) logprobs_1 = [None] * len(output_ids_1)
# Skip specified number of initial sequence #0 tokens
# & logprobs, leaving output text as-is for simplicity
# (text mismatches may generate warnings but do not
# cause the test to fail.)
if num_outputs_0_skip_tokens < 0:
raise ValueError("num_outputs_0_skip_tokens must be non-negative")
output_ids_0 = output_ids_0[num_outputs_0_skip_tokens:]
logprobs_0 = logprobs_0[num_outputs_0_skip_tokens:]
# Loop through generated tokens. # Loop through generated tokens.
for idx, (output_id_0, for idx, (output_id_0,
output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment