Unverified Commit 874f7c29 authored by Roger Wang's avatar Roger Wang Committed by GitHub
Browse files

[Bugfix] Fix max image feature size for Llava-one-vision (#12104)


Signed-off-by: Roger Wang <ywang@roblox.com>
parent 92e793d9
...@@ -13,6 +13,67 @@ from vllm.multimodal.utils import cached_get_tokenizer ...@@ -13,6 +13,67 @@ from vllm.multimodal.utils import cached_get_tokenizer
from ...utils import build_model_context from ...utils import build_model_context
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Record ``image_size`` as failing if it needs more than ``max_tokens``.

    The number of image tokens is obtained from
    ``processor.info.get_num_image_tokens`` for the given width and height.
    Nothing is raised: every failure is appended to ``failed_size_excs`` as
    an ``(image_size, exception)`` pair so the caller can report all failing
    sizes together.

    Args:
        processor: Processor whose ``info`` computes the image token count.
        max_tokens: Upper bound the token count must not exceed.
        failed_size_excs: Output list that collects the failures.
        image_size: Image size to check.
    """
    try:
        feature_size = processor.info.get_num_image_tokens(
            image_width=image_size.width,
            image_height=image_size.height,
        )
    except Exception as exc:
        # An error while computing the size is a failure for this image size
        # too; record it here instead of letting it escape the worker.
        failed_size_excs.append((image_size, exc))
        return

    # Explicit comparison rather than `assert`, so the check still runs when
    # assertions are disabled (`python -O`).
    if feature_size > max_tokens:
        # The message states the expectation that was violated.
        failed_size_excs.append(
            (image_size, AssertionError(f"{feature_size} <= {max_tokens}")))
@pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
    """Check that no sampled image size exceeds the reported max image tokens.

    Builds the multimodal processor for ``model_id``, enumerates one image
    size per distinct aspect ratio in the range [1, 2], and validates each
    size against ``info.get_max_image_tokens()``. All failing sizes are
    collected and reported in a single ``AssertionError``.
    """
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
    )
    info = processor.info

    seen_aspect_ratios = set[float]()
    image_sizes = list[ImageSize]()

    # The aspect ratio of the grid layout is between 1 and 2
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for w, h in itertools.product(range(32, 4096), repeat=2):
        aspect_ratio = w / h
        # Keep only the first size seen for each aspect ratio.
        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
            image_sizes.append(ImageSize(w, h))
            seen_aspect_ratios.add(aspect_ratio)

    failed_size_excs = list[tuple[ImageSize, Exception]]()

    # Bind everything except the image size, which pqdm supplies per item.
    validate_one = partial(
        _validate_image_max_tokens_one,
        processor,
        info.get_max_image_tokens(),  # type: ignore
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        details = "\n========\n".join(f"[{size}]\n{exc}"
                                      for size, exc in failed_size_excs)
        # Put the header on its own line so it does not run into the
        # first failing entry.
        raise AssertionError(f"Found failing image sizes:\n{details}")
def _validate_image_prompt_replacements_one( def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor, processor: BaseMultiModalProcessor,
num_imgs: int, num_imgs: int,
......
...@@ -13,6 +13,68 @@ from vllm.multimodal.utils import cached_get_tokenizer ...@@ -13,6 +13,68 @@ from vllm.multimodal.utils import cached_get_tokenizer
from ...utils import build_model_context from ...utils import build_model_context
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Record ``image_size`` as failing if it needs more than ``max_tokens``.

    The number of image tokens is obtained from
    ``processor.info.get_num_image_tokens`` for the given width and height.
    Nothing is raised: every failure is appended to ``failed_size_excs`` as
    an ``(image_size, exception)`` pair so the caller can report all failing
    sizes together.

    Args:
        processor: Processor whose ``info`` computes the image token count.
        max_tokens: Upper bound the token count must not exceed.
        failed_size_excs: Output list that collects the failures.
        image_size: Image size to check.
    """
    try:
        feature_size = processor.info.get_num_image_tokens(
            image_width=image_size.width,
            image_height=image_size.height,
        )
    except Exception as exc:
        # An error while computing the size is a failure for this image size
        # too; record it here instead of letting it escape the worker.
        failed_size_excs.append((image_size, exc))
        return

    # Explicit comparison rather than `assert`, so the check still runs when
    # assertions are disabled (`python -O`).
    if feature_size > max_tokens:
        # The message states the expectation that was violated.
        failed_size_excs.append(
            (image_size, AssertionError(f"{feature_size} <= {max_tokens}")))
@pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
    """Check that no sampled image size exceeds the reported max image tokens.

    Builds the multimodal processor for ``model_id``, enumerates one image
    size per distinct aspect ratio in the range [1, 6], and validates each
    size against ``info.get_max_image_tokens()``. All failing sizes are
    collected and reported in a single ``AssertionError``.
    """
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
    )
    info = processor.info

    seen_aspect_ratios = set[float]()
    image_sizes = list[ImageSize]()

    # The aspect ratio of the grid layout is between 1 and 6
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for w, h in itertools.product(range(32, 4096), repeat=2):
        aspect_ratio = w / h
        # Keep only the first size seen for each aspect ratio.
        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
            image_sizes.append(ImageSize(w, h))
            seen_aspect_ratios.add(aspect_ratio)

    failed_size_excs = list[tuple[ImageSize, Exception]]()

    # Bind everything except the image size, which pqdm supplies per item.
    validate_one = partial(
        _validate_image_max_tokens_one,
        processor,
        info.get_max_image_tokens(),  # type: ignore
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        details = "\n========\n".join(f"[{size}]\n{exc}"
                                      for size, exc in failed_size_excs)
        # Put the header on its own line so it does not run into the
        # first failing entry.
        raise AssertionError(f"Found failing image sizes:\n{details}")
def _validate_image_prompt_replacements_one( def _validate_image_prompt_replacements_one(
processor: BaseMultiModalProcessor, processor: BaseMultiModalProcessor,
num_imgs: int, num_imgs: int,
......
...@@ -19,8 +19,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata ...@@ -19,8 +19,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors) NestedTensors)
from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
VideoProcessorItems) VideoEmbeddingItems, VideoProcessorItems)
from vllm.multimodal.processing import PromptReplacement from vllm.multimodal.processing import PromptReplacement
from vllm.multimodal.profiling import ProcessorInputs from vllm.multimodal.profiling import ProcessorInputs
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
...@@ -145,6 +145,10 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): ...@@ -145,6 +145,10 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
return (unpadded_features, newline_features) return (unpadded_features, newline_features)
def get_image_size_with_most_features(self) -> ImageSize:
    """Return the fixed image size used as the one with the most features.

    The dimensions are not computed here; they are a hardcoded value.
    """
    # NOTE: This hardcoded value is found via processor tests
    width, height = 1153, 944
    return ImageSize(width=width, height=height)
def _get_num_frame_tokens( def _get_num_frame_tokens(
self, self,
*, *,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment