Unverified commit 377d10bd, authored by Cyrus Leung and committed by GitHub
Browse files

[VLM][Bugfix] Pass processor kwargs properly on init (#13516)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 52ce14d3
...@@ -85,6 +85,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -85,6 +85,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"max_dynamic_patch": 4},
) )
placeholders = "\n".join(f"Image-{i}: <image>\n" placeholders = "\n".join(f"Image-{i}: <image>\n"
......
...@@ -10,7 +10,7 @@ from vllm.config import ModelConfig ...@@ -10,7 +10,7 @@ from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import ProcessingCache from vllm.multimodal.processing import ProcessingCache
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....multimodal.utils import random_audio, random_image, random_video from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import HF_EXAMPLE_MODELS from ...registry import HF_EXAMPLE_MODELS
...@@ -42,10 +42,7 @@ def _test_processing_correctness( ...@@ -42,10 +42,7 @@ def _test_processing_correctness(
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
ctx = InputProcessingContext( ctx = InputProcessingContext(
model_config, model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(model_config),
model_config.tokenizer,
trust_remote_code=model_info.trust_remote_code,
),
) )
# Ensure that it can fit all of the data # Ensure that it can fit all of the data
cache = ProcessingCache(capacity=1 << 30) cache = ProcessingCache(capacity=1 << 30)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Tests for H2OVL's multimodal preprocessing kwargs.""" """Tests for H2OVL's multimodal preprocessing kwargs."""
from typing import Optional from typing import Mapping, Optional
import pytest import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets from ....conftest import _ImageAssets
from ...utils import build_model_context from ...utils import build_model_context
@pytest.mark.parametrize("model_id", [ def _get_expected_num_patches(
"h2oai/h2ovl-mississippi-800m", config: PretrainedConfig,
"h2oai/h2ovl-mississippi-2b", image: Image.Image,
])
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(
model_id: str,
image_assets: _ImageAssets,
size_factors: list[int],
max_dynamic_patch: int,
dynamic_image_size: Optional[bool],
num_imgs: int, num_imgs: int,
min_num: int,
max_num: int,
): ):
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets, from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
get_h2ovl_target_ratios) get_h2ovl_target_ratios)
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
trust_remote_code=True,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=tokenizer,
)
config = processor.info.get_hf_config()
use_msac = config.use_msac
mm_processor_kwargs = {
"max_dynamic_patch": max_dynamic_patch,
}
if dynamic_image_size is not None:
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
min_num = config.min_dynamic_patch
max_num = max_dynamic_patch if dynamic_image_size else 1
# Build the image str / prompt based on the number of images we pass
prompt = "<image>" * num_imgs
for asset in image_assets:
for factor in size_factors:
image = rescale_image_size(asset.pil_image, factor)
mm_data = {"image": [image] * num_imgs}
width, height = image.size width, height = image.size
# Calculate the expected number of blocks # Calculate the expected number of blocks
if num_imgs == 1 and use_msac: if num_imgs == 1 and config.use_msac:
# First pass # First pass
blocks1, _, _, aspect_ratio = calculate_h2ovl_targets( blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
orig_width=width, orig_width=width,
orig_height=height, orig_height=height,
target_ratios=get_h2ovl_target_ratios( target_ratios=get_h2ovl_target_ratios(
min_num, min_num=1,
max_num, max_num=max_num,
prior_aspect_ratio=None, prior_aspect_ratio=None,
), ),
image_size=config.vision_config.image_size, image_size=config.vision_config.image_size,
...@@ -99,8 +47,8 @@ def test_processor_override( ...@@ -99,8 +47,8 @@ def test_processor_override(
orig_width=width, orig_width=width,
orig_height=height, orig_height=height,
target_ratios=get_h2ovl_target_ratios( target_ratios=get_h2ovl_target_ratios(
min_num, min_num=3,
max_num, max_num=max_num,
prior_aspect_ratio=aspect_ratio, prior_aspect_ratio=aspect_ratio,
), ),
image_size=config.vision_config.image_size, image_size=config.vision_config.image_size,
...@@ -116,8 +64,8 @@ def test_processor_override( ...@@ -116,8 +64,8 @@ def test_processor_override(
# overlapping # overlapping
total_blocks = blocks1 + blocks2 - 1 total_blocks = blocks1 + blocks2 - 1
expected_num_patches = total_blocks return total_blocks
else:
blocks, _, _, _ = calculate_h2ovl_targets( blocks, _, _, _ = calculate_h2ovl_targets(
orig_width=width, orig_width=width,
orig_height=height, orig_height=height,
...@@ -131,12 +79,101 @@ def test_processor_override( ...@@ -131,12 +79,101 @@ def test_processor_override(
) )
expected_num_patches = blocks expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches != 1: if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1 expected_num_patches += 1
processed_inputs = processor.apply(prompt, mm_data, return expected_num_patches
def _run_check(
    processor: BaseMultiModalProcessor,
    images: list[Image.Image],
    min_num: int,
    max_num: int,
    mm_processor_kwargs: Mapping[str, object],
):
    """Apply ``processor`` to ``images`` and verify the resulting patch count.

    Args:
        processor: Multimodal processor under test.
        images: Images sent together in a single prompt.
        min_num: Lower patch bound forwarded to ``_get_expected_num_patches``.
        max_num: Upper patch bound forwarded to ``_get_expected_num_patches``.
        mm_processor_kwargs: Per-call kwargs passed to ``processor.apply``.

    Raises:
        AssertionError: If the number of ``<IMG_CONTEXT>`` tokens or the
            leading dimension of ``pixel_values_flat`` differs from the
            expected total.
    """
    tokenizer = processor.info.get_tokenizer()
    config = processor.info.get_hf_config()

    mm_data = {"image": images}

    # Expected patches are computed per image and summed, because the images
    # may have different sizes.
    total_expected_num_patches = sum(
        _get_expected_num_patches(config, image, len(images), min_num, max_num)
        for image in images)

    processed_inputs = processor.apply("<image>" * len(images), mm_data,
                                       mm_processor_kwargs)

    # Ensure we have the right number of placeholder tokens and pixel patches
    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape

    # The test expects 256 placeholder tokens per patch.
    assert img_tok_count == 256 * total_expected_num_patches
    assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", [
    "h2oai/h2ovl-mississippi-800m",
    "h2oai/h2ovl-mississippi-2b",
])
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
        [4.0, 2.0, 1.0],
    ],
)
@pytest.mark.parametrize(
    ("min_dynamic_patch", "max_dynamic_patch"),
    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    model_id: str,
    image_assets: _ImageAssets,
    size_factors: list[float],
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: Optional[bool],
    kwargs_on_init: bool,
):
    """Check that H2OVL processor kwargs take effect at init and per call.

    When ``kwargs_on_init`` is true the kwargs are given to
    ``build_model_context`` and the ``processor.apply`` call receives an
    empty mapping; otherwise the context gets ``None`` and the kwargs are
    passed to ``processor.apply`` instead.
    """
    mm_processor_kwargs = {
        "min_dynamic_patch": min_dynamic_patch,
        "max_dynamic_patch": max_dynamic_patch,
        "dynamic_image_size": dynamic_image_size,
    }

    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        trust_remote_code=True,
        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": len(size_factors)},
    )
    tokenizer = cached_tokenizer_from_config(ctx.model_config)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )
    # Kwargs already supplied on init must not be repeated per call.
    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

    # With dynamic sizing disabled, both expected bounds collapse to 1.
    min_num = min_dynamic_patch if dynamic_image_size else 1
    max_num = max_dynamic_patch if dynamic_image_size else 1

    _run_check(
        processor,
        [
            rescale_image_size(image_assets[0].pil_image, f)
            for f in size_factors
        ],
        min_num,
        max_num,
        hf_processor_mm_kwargs,
    )
...@@ -4,7 +4,7 @@ import pytest ...@@ -4,7 +4,7 @@ import pytest
from transformers import Idefics3Config from transformers import Idefics3Config
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets from ....conftest import _ImageAssets
from ...utils import build_model_context from ...utils import build_model_context
...@@ -22,9 +22,15 @@ models = ["HuggingFaceM4/Idefics3-8B-Llama3"] ...@@ -22,9 +22,15 @@ models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
]) ])
# yapf: enable # yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(image_assets: _ImageAssets, model: str, @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
model: str,
mm_processor_kwargs: dict[str, object], mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int, num_imgs: int): expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure input_processor_for_idefics3 handles num_crops properly.""" """Ensure input_processor_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs # Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by # in this test and assume that the kwargs will be correctly expanded by
...@@ -33,15 +39,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str, ...@@ -33,15 +39,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
model_name=model, model_name=model,
tokenizer_name=model, tokenizer_name=model,
trust_remote_code=True, trust_remote_code=True,
mm_processor_kwargs=None, mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs}, limit_mm_per_prompt={"image": num_imgs},
) )
tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=tokenizer, tokenizer=tokenizer,
) )
hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass # Build the image str / prompt based on the number of images we pass
placeholders = "<image>" if num_imgs == 1 else "\n".join( placeholders = "<image>" if num_imgs == 1 else "\n".join(
...@@ -54,8 +60,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str, ...@@ -54,8 +60,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
dummy_image = image_assets[0].pil_image.resize(dummy_image_size) dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs} mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure the placeholders format are correct # Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"]) hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[ assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
"input_ids"][0] "input_ids"][0]
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Tests for InternVL's multimodal preprocessing kwargs.""" """Tests for InternVL's multimodal preprocessing kwargs."""
from typing import Optional from typing import Mapping, Optional
import pytest import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets from ....conftest import _ImageAssets
from ...utils import build_model_context from ...utils import build_model_context
def _get_expected_num_patches(
    config: PretrainedConfig,
    image: Image.Image,
    num_imgs: int,
    min_num: int,
    max_num: int,
):
    """Return the number of patches expected for a single image.

    Args:
        config: HF config; ``vision_config.image_size`` and ``use_thumbnail``
            are read from it.
        image: Image whose ``size`` drives the block calculation.
        num_imgs: Number of images in the prompt. Accepted but not used in
            this helper.
        min_num: Lower bound passed to ``get_internvl_target_ratios``.
        max_num: Upper bound passed to ``get_internvl_target_ratios``.

    Returns:
        The block count from ``calculate_internvl_targets``, plus one when
        ``config.use_thumbnail`` is set and there is more than one block.
    """
    # Imported inside the function, as the original helper does.
    from vllm.model_executor.models.internvl import (
        calculate_internvl_targets, get_internvl_target_ratios)

    img_width, img_height = image.size
    candidate_ratios = get_internvl_target_ratios(min_num, max_num)

    num_blocks, _, _ = calculate_internvl_targets(
        orig_width=img_width,
        orig_height=img_height,
        target_ratios=candidate_ratios,
        image_size=config.vision_config.image_size,
        use_thumbnail=False,
    )

    # The thumbnail is counted here rather than inside the target helper
    # (which is called with use_thumbnail=False above).
    if config.use_thumbnail and num_blocks > 1:
        return num_blocks + 1

    return num_blocks
def _run_check(
    processor: BaseMultiModalProcessor,
    images: list[Image.Image],
    min_num: int,
    max_num: int,
    mm_processor_kwargs: Mapping[str, object],
):
    """Run ``processor`` on ``images`` and assert the expected patch totals.

    Args:
        processor: Multimodal processor under test.
        images: Images sent together in a single prompt.
        min_num: Lower patch bound forwarded to ``_get_expected_num_patches``.
        max_num: Upper patch bound forwarded to ``_get_expected_num_patches``.
        mm_processor_kwargs: Per-call kwargs passed to ``processor.apply``.

    Raises:
        AssertionError: If the ``<IMG_CONTEXT>`` token count or the leading
            dimension of ``pixel_values_flat`` is not the expected total.
    """
    tokenizer = processor.info.get_tokenizer()
    config = processor.info.get_hf_config()

    num_images = len(images)
    mm_data = {"image": images}

    # Accumulate per-image expectations; images may differ in size.
    total_expected_num_patches = 0
    for img in images:
        total_expected_num_patches += _get_expected_num_patches(
            config, img, num_images, min_num, max_num)

    prompt = "<image>" * num_images
    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

    # Count placeholder tokens and read the patch dimension of the pixels
    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    token_ids = processed_inputs["prompt_token_ids"]
    img_tok_count = token_ids.count(image_token_id)
    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape

    # The test expects 256 placeholder tokens per patch.
    assert img_tok_count == 256 * total_expected_num_patches
    assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", ["OpenGVLab/InternVL2-2B"])
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
        [4.0, 2.0, 1.0],
    ],
)
@pytest.mark.parametrize(
    ("min_dynamic_patch", "max_dynamic_patch"),
    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    model_id: str,
    image_assets: _ImageAssets,
    size_factors: list[float],
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: Optional[bool],
    kwargs_on_init: bool,
):
    """Check that InternVL processor kwargs take effect at init and per call.

    When ``kwargs_on_init`` is true the kwargs are given to
    ``build_model_context`` and the ``processor.apply`` call receives an
    empty mapping; otherwise the context gets ``None`` and the kwargs are
    passed to ``processor.apply`` instead.
    """
    mm_processor_kwargs = {
        "min_dynamic_patch": min_dynamic_patch,
        "max_dynamic_patch": max_dynamic_patch,
        "dynamic_image_size": dynamic_image_size,
    }

    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        trust_remote_code=True,
        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": len(size_factors)},
    )
    tokenizer = cached_tokenizer_from_config(ctx.model_config)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )
    # Kwargs already supplied on init must not be repeated per call.
    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

    # With dynamic sizing disabled, both expected bounds collapse to 1.
    min_num = min_dynamic_patch if dynamic_image_size else 1
    max_num = max_dynamic_patch if dynamic_image_size else 1

    # Token-count and pixel-shape assertions live in _run_check.
    _run_check(
        processor,
        [
            rescale_image_size(image_assets[0].pil_image, f)
            for f in size_factors
        ],
        min_num,
        max_num,
        hf_processor_mm_kwargs,
    )
...@@ -10,7 +10,7 @@ from pqdm.threads import pqdm ...@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ...utils import build_model_context from ...utils import build_model_context
...@@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id): ...@@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
info = processor.info info = processor.info
...@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): ...@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
...@@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): ...@@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
seen_aspect_ratios = set[float]() seen_aspect_ratios = set[float]()
......
...@@ -10,7 +10,7 @@ from pqdm.threads import pqdm ...@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ...utils import build_model_context from ...utils import build_model_context
...@@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id): ...@@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
info = processor.info info = processor.info
...@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): ...@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
...@@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): ...@@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
seen_aspect_ratios = set[float]() seen_aspect_ratios = set[float]()
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import pytest import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets from ....conftest import _ImageAssets
from ...utils import build_model_context from ...utils import build_model_context
...@@ -21,12 +21,14 @@ from ...utils import build_model_context ...@@ -21,12 +21,14 @@ from ...utils import build_model_context
]) ])
# yapf: enable # yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(
image_assets: _ImageAssets, image_assets: _ImageAssets,
model_id: str, model_id: str,
mm_processor_kwargs: dict[str, int], mm_processor_kwargs: dict[str, int],
expected_toks_per_img: int, expected_toks_per_img: int,
num_imgs: int, num_imgs: int,
kwargs_on_init: bool,
): ):
"""Ensure input_processor_for_phi3v handles num_crops properly.""" """Ensure input_processor_for_phi3v handles num_crops properly."""
# Avoid initializing CUDA early # Avoid initializing CUDA early
...@@ -36,23 +38,22 @@ def test_processor_override( ...@@ -36,23 +38,22 @@ def test_processor_override(
model_name=model_id, model_name=model_id,
tokenizer_name=model_id, tokenizer_name=model_id,
trust_remote_code=True, trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs}, limit_mm_per_prompt={"image": num_imgs},
) )
tokenizer = cached_get_tokenizer( tokenizer = cached_tokenizer_from_config(ctx.model_config)
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
)
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=tokenizer, tokenizer=tokenizer,
) )
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass # Build the image str / prompt based on the number of images we pass
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
mm_data = {"image": [image_assets[0].pil_image] * num_imgs} mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size # Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import pytest import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets from ....conftest import _ImageAssets
from ...utils import build_model_context from ...utils import build_model_context
...@@ -18,6 +18,7 @@ from ...utils import build_model_context ...@@ -18,6 +18,7 @@ from ...utils import build_model_context
]) ])
# yapf: enable # yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(
image_assets: _ImageAssets, image_assets: _ImageAssets,
model_id: str, model_id: str,
...@@ -25,31 +26,30 @@ def test_processor_override( ...@@ -25,31 +26,30 @@ def test_processor_override(
expected_toks_per_img: int, expected_toks_per_img: int,
expected_pixels_shape: tuple[int, int], expected_pixels_shape: tuple[int, int],
num_imgs: int, num_imgs: int,
kwargs_on_init: bool,
): ):
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly.""" """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
ctx = build_model_context( ctx = build_model_context(
model_name=model_id, model_name=model_id,
tokenizer_name=model_id, tokenizer_name=model_id,
mm_processor_kwargs=None, mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs}, limit_mm_per_prompt={"image": num_imgs},
) )
tokenizer = cached_get_tokenizer( tokenizer = cached_tokenizer_from_config(ctx.model_config)
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
)
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=tokenizer, tokenizer=tokenizer,
) )
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass # Build the image str / prompt based on the number of images we pass
prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
mm_data = {"image": [image_assets[0].pil_image] * num_imgs} mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size # Ensure we have the right number of placeholders per num_crops size
hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs) hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token) image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
......
...@@ -248,13 +248,16 @@ def check_logprobs_close( ...@@ -248,13 +248,16 @@ def check_logprobs_close(
warnings.warn(fail_msg, stacklevel=2) warnings.warn(fail_msg, stacklevel=2)
def build_model_context(model_name: str, def build_model_context(
model_name: str,
task: TaskOption = "auto", task: TaskOption = "auto",
tokenizer_name: Optional[str] = None, tokenizer_name: Optional[str] = None,
trust_remote_code: bool = False, trust_remote_code: bool = False,
dtype: Optional[Union[str, torch.dtype]] = None, dtype: Optional[Union[str, torch.dtype]] = None,
mm_processor_kwargs: Optional[Dict] = None, mm_processor_kwargs: Optional[Dict] = None,
limit_mm_per_prompt: Optional[Dict] = None): limit_mm_per_prompt: Optional[Dict] = None,
disable_mm_preprocessor_cache: bool = True,
):
"""Creates an InputContext for a given model. """Creates an InputContext for a given model.
Args: Args:
...@@ -283,5 +286,6 @@ def build_model_context(model_name: str, ...@@ -283,5 +286,6 @@ def build_model_context(model_name: str,
seed=0, seed=0,
mm_processor_kwargs=mm_processor_kwargs, mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=limit_mm_per_prompt,
disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
) )
return InputContext(model_config) return InputContext(model_config)
...@@ -22,8 +22,8 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo, ...@@ -22,8 +22,8 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
replace_token_matches) replace_token_matches)
# yapf: enable # yapf: enable
from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.profiling import MultiModalProfiler
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import (AnyTokenizer,
from vllm.transformers_utils.tokenizer import AnyTokenizer cached_tokenizer_from_config)
from vllm.utils import full_groupby from vllm.utils import full_groupby
from .utils import random_image from .utils import random_image
...@@ -577,7 +577,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): ...@@ -577,7 +577,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
model_config, model_config,
tokenizer=cached_get_tokenizer(model_config.tokenizer), tokenizer=cached_tokenizer_from_config(model_config),
) )
profiler = MultiModalProfiler(processor) profiler = MultiModalProfiler(processor)
...@@ -617,7 +617,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): ...@@ -617,7 +617,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
model_config, model_config,
tokenizer=cached_get_tokenizer(model_config.tokenizer), tokenizer=cached_tokenizer_from_config(model_config),
) )
rng = np.random.RandomState(0) rng = np.random.RandomState(0)
...@@ -689,7 +689,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): ...@@ -689,7 +689,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
model_config, model_config,
tokenizer=cached_get_tokenizer(model_config.tokenizer), tokenizer=cached_tokenizer_from_config(model_config),
) )
orig_get_hf_processor = processor.info.get_hf_processor orig_get_hf_processor = processor.info.get_hf_processor
......
...@@ -11,8 +11,9 @@ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin ...@@ -11,8 +11,9 @@ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
from typing_extensions import TypeVar, assert_never from typing_extensions import TypeVar, assert_never
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import (AnyTokenizer,
cached_tokenizer_from_config)
from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
resolve_mm_processor_kwargs) resolve_mm_processor_kwargs)
...@@ -27,19 +28,9 @@ if TYPE_CHECKING: ...@@ -27,19 +28,9 @@ if TYPE_CHECKING:
logger = init_logger(__name__) logger = init_logger(__name__)
C = TypeVar("C", bound=PretrainedConfig, default=PretrainedConfig) _T = TypeVar("_T")
P = TypeVar("P", bound=ProcessorMixin, default=ProcessorMixin) _C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
class HashableDict(dict):
"""
A dictionary that can be hashed by lru_cache.
"""
# NOTE: pythonic dict is not hashable,
# we override on it directly for simplicity
def __hash__(self) -> int: # type: ignore[override]
return hash(frozenset(self.items()))
@dataclass(frozen=True) @dataclass(frozen=True)
...@@ -54,9 +45,9 @@ class InputContext: ...@@ -54,9 +45,9 @@ class InputContext:
def get_hf_config( def get_hf_config(
self, self,
typ: Union[type[C], tuple[type[C], ...]] = PretrainedConfig, typ: Union[type[_C], tuple[type[_C], ...]] = PretrainedConfig,
/, /,
) -> C: ) -> _C:
""" """
Get the HuggingFace configuration Get the HuggingFace configuration
(:class:`transformers.PretrainedConfig`) of the model, (:class:`transformers.PretrainedConfig`) of the model,
...@@ -94,10 +85,10 @@ class InputContext: ...@@ -94,10 +85,10 @@ class InputContext:
def get_hf_processor( def get_hf_processor(
self, self,
typ: Union[type[P], tuple[type[P], ...]] = ProcessorMixin, typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
/, /,
**kwargs: object, **kwargs: object,
) -> P: ) -> _P:
""" """
Get the HuggingFace processor Get the HuggingFace processor
(:class:`transformers.ProcessorMixin`) of the model, (:class:`transformers.ProcessorMixin`) of the model,
...@@ -106,33 +97,29 @@ class InputContext: ...@@ -106,33 +97,29 @@ class InputContext:
Raises: Raises:
TypeError: If the processor is not of the specified type. TypeError: If the processor is not of the specified type.
""" """
return cached_processor_from_config(
self.model_config,
processor_cls=typ,
**kwargs,
)
def init_processor(
self,
typ: type[_T],
/,
**kwargs: object,
) -> _T:
"""
Initialize a HuggingFace-like processor class, merging the
keyword arguments with those in the model's configuration.
"""
base_kwargs = self.model_config.mm_processor_kwargs base_kwargs = self.model_config.mm_processor_kwargs
if base_kwargs is None: if base_kwargs is None:
base_kwargs = {} base_kwargs = {}
merged_kwargs = {**base_kwargs, **kwargs} merged_kwargs = {**base_kwargs, **kwargs}
if isinstance(typ, type): return typ(**merged_kwargs)
merged_kwargs["processor_cls"] = typ
# NOTE: Pythonic dict is not hashable and will raise unhashable type
# error when calling `cached_get_processor`, therefore we need to
# wrap it to a hashable dict.
for key, value in merged_kwargs.items():
if isinstance(value, dict):
merged_kwargs[key] = HashableDict(value)
hf_processor = cached_get_processor(
self.model_config.model,
trust_remote_code=self.model_config.trust_remote_code,
**merged_kwargs,
)
if not isinstance(hf_processor, typ):
raise TypeError("Invalid type of HuggingFace processor. "
f"Expected type: {typ}, but "
f"found type: {type(hf_processor)}")
return hf_processor
@dataclass(frozen=True) @dataclass(frozen=True)
...@@ -142,10 +129,10 @@ class InputProcessingContext(InputContext): ...@@ -142,10 +129,10 @@ class InputProcessingContext(InputContext):
def get_hf_processor( def get_hf_processor(
self, self,
typ: Union[type[P], tuple[type[P], ...]] = ProcessorMixin, typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
/, /,
**kwargs: object, **kwargs: object,
) -> P: ) -> _P:
return super().get_hf_processor( return super().get_hf_processor(
typ, typ,
tokenizer=self.tokenizer, tokenizer=self.tokenizer,
...@@ -341,13 +328,9 @@ class InputRegistry: ...@@ -341,13 +328,9 @@ class InputRegistry:
from vllm.model_executor.model_loader import get_model_architecture from vllm.model_executor.model_loader import get_model_architecture
from vllm.multimodal import MultiModalKwargs from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.profiling import MultiModalProfiler
from vllm.multimodal.utils import cached_get_tokenizer
if mm_registry.has_processor(model_config): if mm_registry.has_processor(model_config):
tokenizer = cached_get_tokenizer( tokenizer = cached_tokenizer_from_config(model_config)
model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code,
)
processor = mm_registry.create_processor(model_config, tokenizer) processor = mm_registry.create_processor(model_config, tokenizer)
profiler = MultiModalProfiler(processor) profiler = MultiModalProfiler(processor)
dummy_data = profiler.get_dummy_data( dummy_data = profiler.get_dummy_data(
......
...@@ -400,8 +400,8 @@ class AriaProcessingInfo(BaseProcessingInfo): ...@@ -400,8 +400,8 @@ class AriaProcessingInfo(BaseProcessingInfo):
def get_vision_config(self): def get_vision_config(self):
return self.get_hf_config().vision_config return self.get_hf_config().vision_config
def get_hf_processor(self): def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(AriaProcessor) return self.ctx.get_hf_processor(AriaProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None} return {"image": None}
......
...@@ -58,8 +58,8 @@ class ChameleonProcessingInfo(BaseProcessingInfo): ...@@ -58,8 +58,8 @@ class ChameleonProcessingInfo(BaseProcessingInfo):
def get_hf_config(self): def get_hf_config(self):
return self.ctx.get_hf_config(ChameleonConfig) return self.ctx.get_hf_config(ChameleonConfig)
def get_hf_processor(self): def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(ChameleonProcessor) return self.ctx.get_hf_processor(ChameleonProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": 1} return {"image": 1}
......
...@@ -28,13 +28,13 @@ from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ...@@ -28,13 +28,13 @@ from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement) BaseProcessingInfo, PromptReplacement)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config, from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
MlpProjectorConfig, MlpProjectorConfig,
VisionEncoderConfig) VisionEncoderConfig)
from vllm.transformers_utils.processors.deepseek_vl2 import ( from vllm.transformers_utils.processors.deepseek_vl2 import (
DeepseekVLV2Processor) DeepseekVLV2Processor)
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.utils import is_list_of from vllm.utils import is_list_of
from .interfaces import SupportsMultiModal, SupportsPP from .interfaces import SupportsMultiModal, SupportsPP
...@@ -133,8 +133,8 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo): ...@@ -133,8 +133,8 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
def get_hf_config(self): def get_hf_config(self):
return self.ctx.get_hf_config(DeepseekVLV2Config) return self.ctx.get_hf_config(DeepseekVLV2Config)
def get_hf_processor(self) -> DeepseekVLV2Processor: def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(DeepseekVLV2Processor) return self.ctx.get_hf_processor(DeepseekVLV2Processor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None} return {"image": None}
...@@ -308,13 +308,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -308,13 +308,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self.text_config = config.text_config self.text_config = config.text_config
model_config = vllm_config.model_config model_config = vllm_config.model_config
tokenizer = cached_get_tokenizer( tokenizer = cached_tokenizer_from_config(model_config)
model_config.tokenizer, self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
tokenizer_mode=model_config.tokenizer_mode,
tokenizer_revision=model_config.tokenizer_revision,
trust_remote_code=model_config.trust_remote_code,
)
self.image_token_id = tokenizer.vocab.get(_IMAGE_TOKEN)
self.vision = self._init_vision_module(self.vision_config, self.vision = self._init_vision_module(self.vision_config,
quant_config, quant_config,
......
...@@ -71,8 +71,8 @@ class FuyuProcessingInfo(BaseProcessingInfo): ...@@ -71,8 +71,8 @@ class FuyuProcessingInfo(BaseProcessingInfo):
def get_hf_config(self): def get_hf_config(self):
return self.ctx.get_hf_config(FuyuConfig) return self.ctx.get_hf_config(FuyuConfig)
def get_hf_processor(self): def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(FuyuProcessor) return self.ctx.get_hf_processor(FuyuProcessor, **kwargs)
def get_image_processor(self) -> FuyuImageProcessor: def get_image_processor(self) -> FuyuImageProcessor:
return self.get_hf_processor().image_processor return self.get_hf_processor().image_processor
......
...@@ -416,18 +416,15 @@ class GLM4VProcessor: ...@@ -416,18 +416,15 @@ class GLM4VProcessor:
class GLM4VProcessingInfo(BaseProcessingInfo): class GLM4VProcessingInfo(BaseProcessingInfo):
def get_tokenizer(self):
tokenizer = self.ctx.tokenizer
assert isinstance(tokenizer, PreTrainedTokenizer)
return tokenizer
def get_hf_config(self): def get_hf_config(self):
return self.ctx.get_hf_config(ChatGLMConfig) return self.ctx.get_hf_config(ChatGLMConfig)
def get_hf_processor(self) -> GLM4VProcessor: def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
return GLM4VProcessor( return self.ctx.init_processor(
self.get_hf_config(), GLM4VProcessor,
self.get_tokenizer(), config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
) )
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
......
...@@ -15,9 +15,9 @@ from vllm.model_executor.layers.pooler import PoolerHead ...@@ -15,9 +15,9 @@ from vllm.model_executor.layers.pooler import PoolerHead
from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.model_executor.pooling_metadata import (PoolingMetadata, from vllm.model_executor.pooling_metadata import (PoolingMetadata,
PoolingTensors) PoolingTensors)
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (IntermediateTensors, PoolerOutput, from vllm.sequence import (IntermediateTensors, PoolerOutput,
PoolingSequenceGroupOutput) PoolingSequenceGroupOutput)
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -29,12 +29,7 @@ class GritLMPooler(nn.Module): ...@@ -29,12 +29,7 @@ class GritLMPooler(nn.Module):
self.model_config = model_config self.model_config = model_config
tokenizer = cached_get_tokenizer( tokenizer = cached_tokenizer_from_config(self.model_config)
self.model_config.tokenizer,
tokenizer_mode=self.model_config.tokenizer_mode,
tokenizer_revision=self.model_config.tokenizer_revision,
trust_remote_code=self.model_config.trust_remote_code,
)
# Collect the tokens needed for pattern matching. # Collect the tokens needed for pattern matching.
# "▁<" is different from "_<". The former uses "▁" to indicate that # "▁<" is different from "_<". The former uses "▁" to indicate that
......
...@@ -41,6 +41,7 @@ def resolve_h2ovl_min_max_num( ...@@ -41,6 +41,7 @@ def resolve_h2ovl_min_max_num(
dynamic_image_size: bool, dynamic_image_size: bool,
use_thumbnail: bool, use_thumbnail: bool,
) -> tuple[int, int]: ) -> tuple[int, int]:
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
if use_thumbnail and max_dynamic_patch != 1: if use_thumbnail and max_dynamic_patch != 1:
...@@ -190,7 +191,7 @@ def image_to_pixel_values_h2ovl( ...@@ -190,7 +191,7 @@ def image_to_pixel_values_h2ovl(
pixel_values1, aspect_ratio1 = _preprocess_image( pixel_values1, aspect_ratio1 = _preprocess_image(
image, image,
input_size=input_size, input_size=input_size,
min_num=min_num, min_num=1,
max_num=max_num, max_num=max_num,
use_thumbnail=True, use_thumbnail=True,
prior_aspect_ratio=None, prior_aspect_ratio=None,
...@@ -199,7 +200,7 @@ def image_to_pixel_values_h2ovl( ...@@ -199,7 +200,7 @@ def image_to_pixel_values_h2ovl(
pixel_values2, _ = _preprocess_image( pixel_values2, _ = _preprocess_image(
image, image,
input_size=input_size, input_size=input_size,
min_num=3, # Hardcoded value min_num=3,
max_num=max_num, max_num=max_num,
use_thumbnail=True, use_thumbnail=True,
prior_aspect_ratio=aspect_ratio1, prior_aspect_ratio=aspect_ratio1,
...@@ -228,6 +229,7 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -228,6 +229,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
config: PretrainedConfig, config: PretrainedConfig,
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
*, *,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
use_msac: Optional[bool] = None, use_msac: Optional[bool] = None,
...@@ -235,6 +237,7 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -235,6 +237,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
super().__init__( super().__init__(
config, config,
tokenizer, tokenizer,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch, max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size, dynamic_image_size=dynamic_image_size,
) )
...@@ -267,11 +270,13 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -267,11 +270,13 @@ class H2OVLProcessor(BaseInternVLProcessor):
def resolve_min_max_num( def resolve_min_max_num(
self, self,
*, *,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
use_thumbnail: Optional[bool] = None, use_thumbnail: Optional[bool] = None,
) -> tuple[int, int]: ) -> tuple[int, int]:
min_dynamic_patch = self.min_dynamic_patch min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch
is None else min_dynamic_patch)
max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
is None else max_dynamic_patch) is None else max_dynamic_patch)
dynamic_image_size = (self.dynamic_image_size if dynamic_image_size dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
...@@ -289,18 +294,21 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -289,18 +294,21 @@ class H2OVLProcessor(BaseInternVLProcessor):
def resolve_target_ratios( def resolve_target_ratios(
self, self,
*, *,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
use_thumbnail: Optional[bool] = None, use_thumbnail: Optional[bool] = None,
prior_aspect_ratio: Optional[tuple[int, int]] = None, prior_aspect_ratio: Optional[tuple[int, int]] = None,
override_min_num: Optional[int] = None,
) -> list[tuple[int, int]]: ) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num( min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch, max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size, dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail, use_thumbnail=use_thumbnail,
) )
if prior_aspect_ratio: # hardcoded value for second pass of use_msac if override_min_num is not None:
min_num = 3 min_num = override_min_num
return get_h2ovl_target_ratios( return get_h2ovl_target_ratios(
min_num, min_num,
...@@ -322,6 +330,7 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -322,6 +330,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
if use_msac: if use_msac:
target_ratios_1 = self.resolve_target_ratios( target_ratios_1 = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets use_thumbnail=False, # Applied in calculate_targets
override_min_num=1,
) )
num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets( num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
orig_width=image_width, orig_width=image_width,
...@@ -334,6 +343,7 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -334,6 +343,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
target_ratios_2 = self.resolve_target_ratios( target_ratios_2 = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets use_thumbnail=False, # Applied in calculate_targets
prior_aspect_ratio=aspect_ratio_1, prior_aspect_ratio=aspect_ratio_1,
override_min_num=3,
) )
num_patches_2, _, _, _ = calculate_h2ovl_targets( num_patches_2, _, _, _ = calculate_h2ovl_targets(
orig_width=image_width, orig_width=image_width,
...@@ -361,12 +371,14 @@ class H2OVLProcessor(BaseInternVLProcessor): ...@@ -361,12 +371,14 @@ class H2OVLProcessor(BaseInternVLProcessor):
def _images_to_pixel_values_lst( def _images_to_pixel_values_lst(
self, self,
images: list[Image.Image], images: list[Image.Image],
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
) -> list[torch.Tensor]: ) -> list[torch.Tensor]:
use_msac = self.use_msac if len(images) == 1 else False use_msac = self.use_msac if len(images) == 1 else False
min_num, max_num = self.resolve_min_max_num( min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch, max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size, dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values use_thumbnail=False, # Applied in image_to_pixel_values
...@@ -389,14 +401,23 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo): ...@@ -389,14 +401,23 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor( def get_hf_processor(
self, self,
*, *,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> H2OVLProcessor: ) -> H2OVLProcessor:
return H2OVLProcessor( if min_dynamic_patch is not None:
self.get_hf_config(), kwargs["min_dynamic_patch"] = min_dynamic_patch
self.get_tokenizer(), if max_dynamic_patch is not None:
max_dynamic_patch=max_dynamic_patch, kwargs["max_dynamic_patch"] = max_dynamic_patch
dynamic_image_size=dynamic_image_size, if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
return self.ctx.init_processor(
H2OVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
) )
def get_mm_max_tokens_per_item( def get_mm_max_tokens_per_item(
......
...@@ -85,11 +85,13 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): ...@@ -85,11 +85,13 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
def get_hf_processor( def get_hf_processor(
self, self,
*, *,
size: Optional[Dict[str, int]] = None) -> Idefics3Processor: size: Optional[Dict[str, int]] = None,
**kwargs: object,
) -> Idefics3Processor:
if size is not None: if size is not None:
return self.ctx.get_hf_processor(Idefics3Processor, size=size) kwargs["size"] = size
return self.ctx.get_hf_processor(Idefics3Processor) return self.ctx.get_hf_processor(Idefics3Processor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None} return {"image": None}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment