Unverified commit 8f37be38, authored by Cyrus Leung and committed by GitHub
Browse files

[Bugfix] Comprehensively test and fix LLaVA-NeXT feature size calculation (#11800)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 8082ad79
...@@ -13,6 +13,7 @@ einops # required for MPT, qwen-vl and Mamba ...@@ -13,6 +13,7 @@ einops # required for MPT, qwen-vl and Mamba
httpx httpx
librosa # required for audio tests librosa # required for audio tests
peft peft
pqdm
ray[adag]==2.40.0 ray[adag]==2.40.0
sentence-transformers # required for embedding tests sentence-transformers # required for embedding tests
soundfile # required for audio tests soundfile # required for audio tests
......
...@@ -48,6 +48,8 @@ botocore==1.35.57 ...@@ -48,6 +48,8 @@ botocore==1.35.57
# awscli # awscli
# boto3 # boto3
# s3transfer # s3transfer
bounded-pool-executor==0.0.3
# via pqdm
buildkite-test-collector==0.1.9 buildkite-test-collector==0.1.9
# via -r requirements-test.in # via -r requirements-test.in
certifi==2024.8.30 certifi==2024.8.30
...@@ -342,6 +344,8 @@ pooch==1.8.2 ...@@ -342,6 +344,8 @@ pooch==1.8.2
# via librosa # via librosa
portalocker==2.10.1 portalocker==2.10.1
# via sacrebleu # via sacrebleu
pqdm==0.2.0
# via -r requirements-test.in
propcache==0.2.0 propcache==0.2.0
# via yarl # via yarl
protobuf==5.28.3 protobuf==5.28.3
......
import itertools
from functools import partial
import pytest import pytest
from PIL import Image from PIL import Image
from pqdm.threads import pqdm
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.inputs import InputProcessingContext from vllm.inputs import InputProcessingContext
from vllm.multimodal.parse import ImageSize
from ....utils import build_model_context from ....utils import build_model_context
...@@ -15,20 +20,69 @@ def processor_for_llava_next(): ...@@ -15,20 +20,69 @@ def processor_for_llava_next():
return LlavaNextMultiModalProcessor return LlavaNextMultiModalProcessor
def _validate_image_prompt_replacements_one(
    processor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """
    Check the image placeholders produced for one image size.

    Builds a prompt of ``num_imgs`` ``<image>`` tags paired with
    ``num_imgs`` references to a single blank RGB image of ``image_size``,
    runs ``processor.apply`` on it and checks the resulting placeholders.

    Any exception raised by ``processor.apply`` or by the checks is not
    propagated; it is appended to ``failed_size_excs`` together with
    ``image_size`` so that the caller can report all failures at once.
    """
    text = "<image>" * num_imgs
    blank_image = Image.new("RGB", size=image_size)
    mm_inputs = {"image": [blank_image] * num_imgs}

    try:
        # The processor will throw an error if there is a mismatch
        # in the prompt replacements
        outputs = processor.apply(text, mm_inputs, {})

        placeholders = outputs["mm_placeholders"]["image"]
        assert len(placeholders) == num_imgs

        head = placeholders[0]

        # NOTE: There is a BOS token, hence the offset of 1 and the
        # token that is subtracted before splitting between the images
        assert head["offset"] == 1

        actual_length = head["length"]
        num_prompt_tokens = len(outputs["prompt_token_ids"])
        assert actual_length == (num_prompt_tokens - 1) // num_imgs
    except Exception as err:
        failed_size_excs.append((image_size, err))
def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """
    Ensure LlavaNextMultiModalProcessor
    handles prompt replacement properly for input images.

    Every entry of ``image_sizes`` is validated through ``pqdm`` with
    ``n_jobs=8``. Failures are collected rather than raised one by one, so
    that a single run reports every failing image size.

    Raises:
        AssertionError: If validation failed for at least one image size.
            The message lists each failing size with its exception.
    """
    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_prompt_replacements_one,
        processor,
        num_imgs,
        failed_size_excs,
    )
    results = pqdm(image_sizes,
                   validate_one,
                   n_jobs=8,
                   desc="Validating image sizes")

    # validate_one returns None, so an Exception instance among the results
    # is an error that escaped its own try block. This relies on pqdm's
    # default exception behaviour (worker exceptions are returned in place
    # of results, in submission order) -- NOTE(review): confirm against the
    # pinned pqdm version.
    for size, result in zip(image_sizes, results):
        if isinstance(result, Exception):
            failed_size_excs.append((size, result))

    if failed_size_excs:
        # Put the header on its own line so that the first failing size
        # is not glued to it.
        msg = "Found failing image sizes:\n" + "\n========\n".join(
            f"[{size}]\n{exc}" for size, exc in failed_size_excs)
        raise AssertionError(msg)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
(488, 183), (198, 176), (176, 198),
(161, 184), (184, 161)])
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements( def test_processor_prompt_replacements_regression(
processor_for_llava_next, processor_for_llava_next,
model_id: str, model_id: str,
image_size: tuple[int, int],
num_imgs: int, num_imgs: int,
): ):
"""
Ensure LlavaNextMultiModalProcessor handles prompt replacement properly.
"""
ctx = build_model_context( ctx = build_model_context(
model_name=model_id, model_name=model_id,
tokenizer_name=model_id, tokenizer_name=model_id,
...@@ -37,22 +91,55 @@ def test_processor_prompt_replacements( ...@@ -37,22 +91,55 @@ def test_processor_prompt_replacements(
) )
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
ctx = InputProcessingContext(ctx.model_config, tokenizer) ctx = InputProcessingContext(ctx.model_config, tokenizer)
processor = processor_for_llava_next(ctx)
# Build the image str / prompt based on the number of images we pass image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
prompt = "<image>" * num_imgs (488, 183), (2560, 1669)]
mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} image_sizes = [
size for w, h in image_ratios
for size in [ImageSize(w, h), ImageSize(h, w)]
]
# The processor will throw an error if there is a mismatch _test_image_prompt_replacements(
# in the prompt replacements processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)
@pytest.mark.skip("This test takes around 2 hours to run. "
"Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(
processor_for_llava_next,
model_id: str,
num_imgs: int,
):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
ctx = InputProcessingContext(ctx.model_config, tokenizer)
processor = processor_for_llava_next(ctx) processor = processor_for_llava_next(ctx)
processed_inputs = processor.apply(prompt, mm_data, {})
image_placeholders = processed_inputs["mm_placeholders"]["image"] seen_aspect_ratios = set[float]()
assert len(image_placeholders) == num_imgs image_sizes = list[ImageSize]()
first_placeholder = image_placeholders[0] # The aspect ratio of the grid layout is between 1 and 2
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(64, 1024), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)
# NOTE: There is a BOS token _test_image_prompt_replacements(
assert first_placeholder["offset"] == 1 processor,
assert first_placeholder["length"] == ( num_imgs=num_imgs,
len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs image_sizes=image_sizes,
)
import itertools
from functools import partial
import pytest import pytest
from PIL import Image from PIL import Image
from pqdm.threads import pqdm
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.inputs import InputProcessingContext from vllm.inputs import InputProcessingContext
from vllm.multimodal.parse import ImageSize
from ....utils import build_model_context from ....utils import build_model_context
...@@ -15,22 +20,68 @@ def processor_for_llava_onevision(): ...@@ -15,22 +20,68 @@ def processor_for_llava_onevision():
return LlavaOnevisionMultiModalProcessor return LlavaOnevisionMultiModalProcessor
def _validate_image_prompt_replacements_one(
    processor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """
    Check the image placeholders produced for one image size.

    A prompt made of ``num_imgs`` ``<image>`` tags is processed together
    with ``num_imgs`` references to one blank RGB image of ``image_size``.
    The first placeholder is expected to start at offset 0 and to span
    the number of prompt tokens divided by ``num_imgs``.

    Exceptions raised by ``processor.apply`` or by the checks are recorded
    in ``failed_size_excs`` (paired with ``image_size``) instead of being
    propagated.
    """
    image_prompt = "<image>" * num_imgs
    dummy_image = Image.new("RGB", size=image_size)
    multimodal_data = {"image": [dummy_image] * num_imgs}

    try:
        # The processor will throw an error if there is a mismatch
        # in the prompt replacements
        result = processor.apply(image_prompt, multimodal_data, {})

        found_placeholders = result["mm_placeholders"]["image"]
        assert len(found_placeholders) == num_imgs

        leading = found_placeholders[0]
        assert leading["offset"] == 0

        leading_length = leading["length"]
        total_tokens = len(result["prompt_token_ids"])
        assert leading_length == total_tokens // num_imgs
    except Exception as error:
        failed_size_excs.append((image_size, error))
def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """
    Ensure LlavaOnevisionMultiModalProcessor
    handles prompt replacement properly for input images.

    Every entry of ``image_sizes`` is validated through ``pqdm`` with
    ``n_jobs=8``. Failures are collected rather than raised one by one, so
    that a single run reports every failing image size.

    Raises:
        AssertionError: If validation failed for at least one image size.
            The message lists each failing size with its exception.
    """
    failed_size_excs = list[tuple[ImageSize, Exception]]()

    validate_one = partial(
        _validate_image_prompt_replacements_one,
        processor,
        num_imgs,
        failed_size_excs,
    )
    results = pqdm(image_sizes,
                   validate_one,
                   n_jobs=8,
                   desc="Validating image sizes")

    # validate_one returns None, so an Exception instance among the results
    # is an error that escaped its own try block. This relies on pqdm's
    # default exception behaviour (worker exceptions are returned in place
    # of results, in submission order) -- NOTE(review): confirm against the
    # pinned pqdm version.
    for size, result in zip(image_sizes, results):
        if isinstance(result, Exception):
            failed_size_excs.append((size, result))

    if failed_size_excs:
        # Put the header on its own line so that the first failing size
        # is not glued to it.
        msg = "Found failing image sizes:\n" + "\n========\n".join(
            f"[{size}]\n{exc}" for size, exc in failed_size_excs)
        raise AssertionError(msg)
@pytest.mark.parametrize("model_id", @pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488),
(488, 183), (198, 176), (176, 198),
(161, 184), (184, 161)])
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements( def test_processor_prompt_replacements_regression(
processor_for_llava_onevision, processor_for_llava_onevision,
model_id: str, model_id: str,
image_size: tuple[int, int],
num_imgs: int, num_imgs: int,
): ):
"""
Ensure LlavaOnevisionMultiModalProcessor handles prompt replacement
properly.
"""
ctx = build_model_context( ctx = build_model_context(
model_name=model_id, model_name=model_id,
tokenizer_name=model_id, tokenizer_name=model_id,
...@@ -39,22 +90,56 @@ def test_processor_prompt_replacements( ...@@ -39,22 +90,56 @@ def test_processor_prompt_replacements(
) )
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
ctx = InputProcessingContext(ctx.model_config, tokenizer) ctx = InputProcessingContext(ctx.model_config, tokenizer)
processor = processor_for_llava_onevision(ctx)
# Build the image str / prompt based on the number of images we pass image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
prompt = "<image>" * num_imgs (488, 183), (2560, 1669)]
mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} image_sizes = [
size for w, h in image_ratios
for size in [ImageSize(w, h), ImageSize(h, w)]
]
# The processor will throw an error if there is a mismatch _test_image_prompt_replacements(
# in the prompt replacements processor,
num_imgs=num_imgs,
image_sizes=image_sizes,
)
@pytest.mark.skip("This test takes around 2 hours to run. "
"Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(
processor_for_llava_onevision,
model_id: str,
num_imgs: int,
):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
ctx = InputProcessingContext(ctx.model_config, tokenizer)
processor = processor_for_llava_onevision(ctx) processor = processor_for_llava_onevision(ctx)
processed_inputs = processor.apply(prompt, mm_data, {})
image_placeholders = processed_inputs["mm_placeholders"]["image"] seen_aspect_ratios = set[float]()
assert len(image_placeholders) == num_imgs image_sizes = list[ImageSize]()
first_placeholder = image_placeholders[0] # The aspect ratio of the grid layout is between 1 and 6
# NOTE: Assumes that feature size calculation is the same if we
# swap the width and height of the image
for w, h in itertools.product(range(64, 1024), repeat=2):
aspect_ratio = w / h
if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
image_sizes.append(ImageSize(w, h))
seen_aspect_ratios.add(aspect_ratio)
# NOTE: There is a BOS token _test_image_prompt_replacements(
assert first_placeholder["offset"] == 0 processor,
assert first_placeholder["length"] == len( num_imgs=num_imgs,
processed_inputs["prompt_token_ids"]) // num_imgs image_sizes=image_sizes,
)
...@@ -2,7 +2,6 @@ from functools import cached_property ...@@ -2,7 +2,6 @@ from functools import cached_property
from typing import (Final, Iterable, List, Literal, Mapping, Optional, from typing import (Final, Iterable, List, Literal, Mapping, Optional,
Protocol, Set, Tuple, TypedDict, Union) Protocol, Set, Tuple, TypedDict, Union)
import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor
...@@ -74,7 +73,7 @@ class LlavaNextProcessingMixin(BaseLlavaProcessingMixin): ...@@ -74,7 +73,7 @@ class LlavaNextProcessingMixin(BaseLlavaProcessingMixin):
def _get_hf_processor(self): def _get_hf_processor(self):
return self.ctx.get_hf_processor(LlavaNextProcessor) return self.ctx.get_hf_processor(LlavaNextProcessor)
# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106 # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113
def _get_num_image_tokens( def _get_num_image_tokens(
self, self,
*, *,
...@@ -111,7 +110,7 @@ class LlavaNextProcessingMixin(BaseLlavaProcessingMixin): ...@@ -111,7 +110,7 @@ class LlavaNextProcessingMixin(BaseLlavaProcessingMixin):
return unpadded_feature_size + newline_feature_size + base_feature_size return unpadded_feature_size + newline_feature_size + base_feature_size
# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79 # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
def _get_num_unpadded_features( def _get_num_unpadded_features(
self, self,
*, *,
...@@ -121,29 +120,23 @@ class LlavaNextProcessingMixin(BaseLlavaProcessingMixin): ...@@ -121,29 +120,23 @@ class LlavaNextProcessingMixin(BaseLlavaProcessingMixin):
num_patch_height: int, num_patch_height: int,
num_patch_width: int, num_patch_width: int,
) -> tuple[int, int]: ) -> tuple[int, int]:
# NOTE: Use float32 to remain consistent with HF output current_height = npatches * num_patch_height
current_height_f = np.float32(npatches * num_patch_height) current_width = npatches * num_patch_width
current_width_f = np.float32(npatches * num_patch_width)
original_width_f = np.float32(original_width) aspect_ratio = original_width / original_height
original_height_f = np.float32(original_height) current_aspect_ratio = current_width / current_height
original_aspect_ratio = original_width_f / original_height_f if aspect_ratio > current_aspect_ratio:
current_aspect_ratio = current_width_f / current_height_f new_height = (original_height * current_width) // original_width
padding = (current_height - new_height) // 2
if original_aspect_ratio > current_aspect_ratio: current_height = current_height - (2 * padding)
scale_factor = current_width_f / original_width_f
new_height = int(original_height_f * scale_factor)
padding = (current_height_f - new_height) // 2
current_height_f -= 2 * padding
else: else:
scale_factor = current_height_f / original_height_f new_width = (original_width * current_height) // original_height
new_width = int(original_width_f * scale_factor) padding = (current_width - new_width) // 2
padding = (current_width_f - new_width) // 2 current_width = current_width - (2 * padding)
current_width_f -= 2 * padding
unpadded_features = int(current_height_f * current_width_f) unpadded_features = current_height * current_width
newline_features = int(current_height_f) newline_features = current_height
return (unpadded_features, newline_features) return (unpadded_features, newline_features)
......
...@@ -3,7 +3,6 @@ from functools import cached_property ...@@ -3,7 +3,6 @@ from functools import cached_property
from typing import (Final, Iterable, List, Literal, Mapping, Optional, from typing import (Final, Iterable, List, Literal, Mapping, Optional,
Protocol, Set, Tuple, TypedDict, Union) Protocol, Set, Tuple, TypedDict, Union)
import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
from transformers import (BatchFeature, LlavaOnevisionConfig, from transformers import (BatchFeature, LlavaOnevisionConfig,
...@@ -98,6 +97,8 @@ class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin): ...@@ -98,6 +97,8 @@ class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin):
def _get_hf_processor(self): def _get_hf_processor(self):
return self.ctx.get_hf_processor(LlavaOnevisionProcessor) return self.ctx.get_hf_processor(LlavaOnevisionProcessor)
# Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
# with additional logic afterwards taken from LlavaOnevisionProcessor
def _get_num_unpadded_features( def _get_num_unpadded_features(
self, self,
*, *,
...@@ -107,35 +108,28 @@ class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin): ...@@ -107,35 +108,28 @@ class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin):
num_patch_height: int, num_patch_height: int,
num_patch_width: int, num_patch_width: int,
) -> tuple[int, int]: ) -> tuple[int, int]:
# NOTE: Use float32 to remain consistent with HF output current_height = npatches * num_patch_height
current_height_f = np.float32(npatches * num_patch_height) current_width = npatches * num_patch_width
current_width_f = np.float32(npatches * num_patch_width)
original_width_f = np.float32(original_width) aspect_ratio = original_width / original_height
original_height_f = np.float32(original_height) current_aspect_ratio = current_width / current_height
original_aspect_ratio = original_width_f / original_height_f if aspect_ratio > current_aspect_ratio:
current_aspect_ratio = current_width_f / current_height_f new_height = (original_height * current_width) // original_width
padding = (current_height - new_height) // 2
if original_aspect_ratio > current_aspect_ratio: current_height = current_height - (2 * padding)
scale_factor = current_width_f / original_width_f
new_height = int(original_height_f * scale_factor)
padding = (current_height_f - new_height) // 2
current_height_f -= 2 * padding
else: else:
scale_factor = current_height_f / original_height_f new_width = (original_width * current_height) // original_height
new_width = int(original_width_f * scale_factor) padding = (current_width - new_width) // 2
padding = (current_width_f - new_width) // 2 current_width = current_width - (2 * padding)
current_width_f -= 2 * padding
unpadded_features = int(current_height_f * current_width_f) unpadded_features = current_height * current_width
newline_features = int(current_height_f) newline_features = current_height
ratio = math.sqrt(current_height_f * current_width_f / ratio = math.sqrt(current_height * current_width / (9 * npatches**2))
(9 * npatches**2))
if ratio > 1.1: if ratio > 1.1:
height_factor = int(current_height_f // ratio) height_factor = int(current_height // ratio)
width_factor = int(current_width_f // ratio) width_factor = int(current_width // ratio)
unpadded_features = height_factor * width_factor unpadded_features = height_factor * width_factor
newline_features = height_factor newline_features = height_factor
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment