Unverified Commit 82de9b9d authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Misc] Automatically resolve HF processor init kwargs (#22005)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent ad57f23f
......@@ -449,25 +449,6 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
)
# omni-research/Tarsier-7b
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "omni-research/Tarsier-7b"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={modality: 1},
)
prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Intern-S1
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
model_name = "internlm/Intern-S1"
......@@ -1293,6 +1274,25 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
)
# omni-research/Tarsier-7b
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "omni-research/Tarsier-7b"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={modality: 1},
)
prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b"
......
......@@ -4,8 +4,6 @@ from dataclasses import dataclass
from typing import Optional
import pytest
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION
import vllm
from vllm.assets.image import ImageAsset
......@@ -185,10 +183,6 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
current_platform.is_rocm(),
reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
)
@pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) < Version("4.49.0"),
reason="Qwen2.5-VL require transformers version no lower than 4.49.0",
)
def test_qwen25vl_lora(qwen25vl_lora_files):
"""Test Qwen 2.5 VL model with LoRA"""
config = TestConfig(model_path=QWEN25VL_MODEL_PATH,
......
......@@ -702,13 +702,38 @@ VLM_TEST_SETTINGS = {
"smolvlm": VLMTestInfo(
models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
),
"tarsier": VLMTestInfo(
models=["omni-research/Tarsier-7b"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
patch_hf_runner=model_utils.tarsier_patch_hf_runner,
),
"tarsier2": VLMTestInfo(
models=["omni-research/Tarsier2-Recap-7b"],
test_type=(
VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO,
),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.skip("Model initialization hangs")],
),
### Tensor parallel / multi-gpu broadcast tests
"chameleon-broadcast": VLMTestInfo(
models=["facebook/chameleon-7b"],
......
......@@ -818,3 +818,15 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
thinker.get_output_embeddings = lambda: thinker.lm_head
hf_model.model = thinker
return hf_model
def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
from vllm.model_executor.models.tarsier import get_vision_encoder_info
vision_encoder_info = get_vision_encoder_info(hf_model.config)
hf_processor = hf_model.processor
if hf_processor.patch_size is None:
hf_processor.patch_size = vision_encoder_info.get_patch_size()
return hf_model
......@@ -16,7 +16,7 @@ def test_multimodal_processor(model_id):
model_impl="transformers",
)
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, )
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
image_pil = ImageAsset('cherry_blossom').pil_image
mm_data = {"image": image_pil}
......
......@@ -465,8 +465,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online=False),
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
trust_remote_code=True),
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501
hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501
"TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501
"Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501
"VoxtralForConditionalGeneration": _HfExamplesInfo(
......
......@@ -2,16 +2,15 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import nullcontext
from types import MethodType
from typing import cast
from typing import Optional, cast
from unittest.mock import MagicMock
import numpy as np
import pytest
import torch
from transformers import ProcessorMixin
from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
MultiModalKwargsItem,
......@@ -1013,57 +1012,91 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
)
class _ProcessorProxy:
class DummyProcessor:
def __init__(self, processor: ProcessorMixin) -> None:
def __init__(self, a: int = 0, b: int = 0) -> None:
super().__init__()
self.__processor = processor
def __getattr__(self, key: str):
return getattr(self.__processor, key)
self.a = a
self.b = b
def __call__(
self,
text=None,
images=None,
videos=None,
exists=None,
return_tensors=None,
):
return dict(exists=exists)
a: int = 0,
c: int = 0,
return_tensors: Optional[str] = None,
) -> dict[str, int]:
return dict(a=a, c=c)
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
# yapf: disable
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
@pytest.mark.parametrize(
("call_kwargs", "expected_kwargs"),
("config_kwargs", "inference_kwargs", "expected_kwargs"),
[
# Should ignore invalid kwargs
({"does_not_exist": 100}, {"exists": None}),
({"exists": 1}, {"exists": 1}),
({"does_not_exist": 100, "exists": 1}, {"exists": 1}),
({"a": 1}, {}, {"a": 1, "b": 0}),
({}, {"a": 1}, {"a": 1, "b": 0}),
# inference_kwargs should take precedence
({"a": 1}, {"a": 2}, {"a": 2, "b": 0}),
# Should ignore extra kwargs
({"a": 1, "c": 1}, {}, {"a": 1, "b": 0}),
({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}),
],
)
# yapf: enable
def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
model_config = ModelConfig(model_id)
def test_hf_processor_init_kwargs(
model_id,
config_kwargs,
inference_kwargs,
expected_kwargs,
):
# Should not be used since there is nothing to convert to tokens
mock_tokenizer = cast(AnyTokenizer, object())
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
orig_get_hf_processor = processor.info.get_hf_processor
ctx = InputProcessingContext(
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
tokenizer=mock_tokenizer,
)
processor = ctx.get_hf_processor(
DummyProcessor, # type: ignore[arg-type]
**inference_kwargs,
)
for k, v in expected_kwargs.items():
assert getattr(processor, k) == v
def get_hf_processor(self, **kwargs):
assert kwargs == call_kwargs
return _ProcessorProxy(orig_get_hf_processor())
processor.info.get_hf_processor = MethodType(get_hf_processor,
processor.info)
# yapf: disable
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
@pytest.mark.parametrize(
("config_kwargs", "inference_kwargs", "expected_kwargs"),
[
({"a": 1}, {}, {"a": 1, "c": 0}),
({}, {"a": 1}, {"a": 1, "c": 0}),
# inference_kwargs should take precedence
({"a": 1}, {"a": 2}, {"a": 2, "c": 0}),
# Should ignore extra kwargs
({"a": 1, "c": 1}, {}, {"a": 1, "c": 1}),
({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}),
],
)
# yapf: enable
def test_hf_processor_call_kwargs(
model_id,
config_kwargs,
inference_kwargs,
expected_kwargs,
):
# Should not be used since there is nothing to convert to tokens
mock_tokenizer = cast(AnyTokenizer, object())
out_kwargs = processor._call_hf_processor(
prompt="",
mm_data={},
mm_kwargs=call_kwargs,
tok_kwargs={},
ctx = InputProcessingContext(
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
tokenizer=mock_tokenizer,
)
assert out_kwargs == expected_kwargs
processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type]
result = ctx.call_hf_processor(processor, {}, inference_kwargs)
assert result == expected_kwargs
......@@ -11,6 +11,7 @@ import textwrap
import uuid
import warnings
from collections import Counter
from collections.abc import Mapping
from contextlib import contextmanager
from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass,
replace)
......@@ -3332,7 +3333,16 @@ class MultiModalConfig:
999 if envs.VLLM_USE_V1 else 1,
)
# TODO: Add configs to init vision tower or not.
def merge_mm_processor_kwargs(
self,
inference_kwargs: Mapping[str, object],
) -> dict[str, object]:
"""
Get the keyword arguments to pass to the multi-modal processor
according to the extra arguments passed during inference.
"""
kwargs = self.mm_processor_kwargs or {}
return kwargs | dict(inference_kwargs)
@config
......
......@@ -11,7 +11,7 @@ from typing_extensions import TypeVar
from vllm.jsontree import JSONTree, json_map_leaves
from vllm.logger import init_logger
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.utils import resolve_mm_processor_kwargs
from vllm.utils import get_allowed_kwarg_only_overrides
if TYPE_CHECKING:
from vllm.config import ModelConfig
......@@ -154,14 +154,11 @@ class InputProcessingContext(InputContext):
assert callable(hf_processor)
mm_config = self.model_config.get_multimodal_config()
base_kwargs = mm_config.mm_processor_kwargs
if base_kwargs is None:
base_kwargs = {}
merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)
merged_kwargs = resolve_mm_processor_kwargs(
base_kwargs,
kwargs,
allowed_kwargs = get_allowed_kwarg_only_overrides(
hf_processor,
merged_kwargs,
requires_kw_only=False,
allow_var_kwargs=True,
)
......@@ -173,7 +170,9 @@ class InputProcessingContext(InputContext):
return x
try:
output = hf_processor(**data, **merged_kwargs, return_tensors="pt")
output = hf_processor(**data,
**allowed_kwargs,
return_tensors="pt")
# this emulates output.to(dtype=self.model_config.dtype)
if isinstance(output, BatchFeature):
cast_output = json_map_leaves(maybe_cast_dtype, output.data)
......@@ -189,7 +188,7 @@ class InputProcessingContext(InputContext):
except Exception as exc:
msg = (f"Failed to apply {type(hf_processor).__name__} "
f"on data={data} with kwargs={merged_kwargs}")
f"on data={data} with kwargs={allowed_kwargs}")
raise ValueError(msg) from exc
......
......@@ -123,16 +123,10 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
return self.ctx.get_hf_config(AyaVisionConfig)
def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor:
processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)
# Temporary workaround since this processor has multiple image tokens
# See https://github.com/huggingface/transformers/issues/38350
processor._check_special_mm_tokens = lambda *args, **kwargs: None
return processor
def get_image_processor(self) -> GotOcr2ImageProcessor:
return self.get_hf_processor().image_processor
def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor:
return self.get_hf_processor(**kwargs).image_processor
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
......
......@@ -214,25 +214,25 @@ class DeepseekVL2MultiModalProcessor(
mm_kwargs: Mapping[str, object],
tok_kwargs: Mapping[str, object],
) -> BatchFeature:
if mm_data:
processed_outputs = self.info.ctx.call_hf_processor(
self.info.get_hf_processor(**mm_kwargs),
dict(prompt=prompt, **mm_data),
dict(**mm_kwargs, **tok_kwargs),
)
pixel_values = processed_outputs["pixel_values"]
# split pixel values into patches corresponding to each image
images_spatial_crop = processed_outputs["images_spatial_crop"]
patches_per_image = [
x.prod().item() + 1 for x in images_spatial_crop
]
pixel_values = pixel_values.split(patches_per_image)
processed_outputs["pixel_values"] = pixel_values
else:
if not mm_data:
tokenizer = self.info.get_tokenizer()
processed_outputs = tokenizer(prompt,
add_special_tokens=True,
return_tensors="pt")
return tokenizer(prompt,
add_special_tokens=True,
return_tensors="pt")
processed_outputs = super()._call_hf_processor(
prompt=prompt,
mm_data=mm_data,
mm_kwargs=mm_kwargs,
tok_kwargs=tok_kwargs,
)
pixel_values = processed_outputs["pixel_values"]
# split pixel values into patches corresponding to each image
images_spatial_crop = processed_outputs["images_spatial_crop"]
patches_per_image = [x.prod().item() + 1 for x in images_spatial_crop]
pixel_values = pixel_values.split(patches_per_image)
processed_outputs["pixel_values"] = pixel_values
return processed_outputs
......
......@@ -761,12 +761,6 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):
class Florence2ProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config()
def get_hf_processor(self):
return self.ctx.get_hf_processor()
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": 1}
......
......@@ -83,8 +83,8 @@ class FuyuProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(FuyuProcessor, **kwargs)
def get_image_processor(self) -> FuyuImageProcessor:
return self.get_hf_processor().image_processor
def get_image_processor(self, **kwargs: object) -> FuyuImageProcessor:
return self.get_hf_processor(**kwargs).image_processor
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": 1}
......
......@@ -809,11 +809,11 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": 1}
def get_image_processor(self) -> Glm4vImageProcessor:
return self.get_hf_processor().image_processor
def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor:
return self.get_hf_processor(**kwargs).image_processor
def get_video_processor(self) -> Glm4vVideoProcessor:
return self.get_hf_processor().video_processor
def get_video_processor(self, **kwargs: object) -> Glm4vVideoProcessor:
return self.get_hf_processor(**kwargs).video_processor
def _get_vision_info(
self,
......
......@@ -392,21 +392,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> H2OVLProcessor:
if min_dynamic_patch is not None:
kwargs["min_dynamic_patch"] = min_dynamic_patch
if max_dynamic_patch is not None:
kwargs["max_dynamic_patch"] = max_dynamic_patch
if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
return self.ctx.init_processor(
H2OVLProcessor,
config=self.get_hf_config(),
......
......@@ -25,8 +25,7 @@ import torch
import torch.nn as nn
from timm.layers import LayerNorm, LayerNorm2d
from timm.models.regnet import RegStage
from transformers import (AutoProcessor, BatchFeature, CLIPVisionConfig,
SiglipVisionConfig)
from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig
from transformers.modeling_utils import no_init_weights
from vllm.config import VllmConfig
......@@ -80,26 +79,9 @@ HCXVisionMultimodalInputs = Union[HCXVisionMultimodalPixelInputs]
class HCXVisionProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config()
def get_vision_encoder_info(self):
return get_vision_encoder_info(self.get_hf_config())
def get_hf_processor(
self,
**kwargs: object,
):
processor_cls = type(
AutoProcessor.from_pretrained(
self.ctx.model_config.model,
trust_remote_code=self.ctx.model_config.trust_remote_code,
))
return self.ctx.get_hf_processor(
processor_cls,
**kwargs,
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": None}
......
......@@ -88,15 +88,7 @@ ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs]
class Idefics3ProcessingInfo(BaseProcessingInfo):
def get_hf_processor(
self,
*,
size: Optional[dict[str, int]] = None,
**kwargs: object,
) -> Idefics3Processor:
if size is not None:
kwargs["size"] = size
def get_hf_processor(self, **kwargs: object) -> Idefics3Processor:
return self.ctx.get_hf_processor(Idefics3Processor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
......
......@@ -665,14 +665,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
"""Basic image-only ProcessingInfo for InternVL-style models."""
@abstractmethod
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> BaseInternVLProcessor:
def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor:
raise NotImplementedError
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
......@@ -882,27 +875,12 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
return max(max_frames_per_video, 1)
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> InternVLProcessor:
if min_dynamic_patch is not None:
kwargs["min_dynamic_patch"] = min_dynamic_patch
if max_dynamic_patch is not None:
kwargs["max_dynamic_patch"] = max_dynamic_patch
if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
kwargs["video_token"] = self.get_video_token()
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
return self.ctx.init_processor(
InternVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
video_token=self.get_video_token(),
**kwargs,
)
......
......@@ -44,8 +44,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.platforms import _Backend
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import uses_mrope
from vllm.transformers_utils.processor import (
cached_image_processor_from_config)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
......@@ -980,72 +978,8 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
class KeyeProcessingInfo(BaseProcessingInfo):
def get_hf_processor(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
):
return self.ctx.get_hf_processor(
image_processor=self.get_image_processor(
min_pixels=min_pixels,
max_pixels=max_pixels,
size=size,
),
**kwargs,
)
def _get_image_processor_kwargs(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
):
if self.ctx.model_config.mm_processor_kwargs:
kwargs.update(self.ctx.model_config.mm_processor_kwargs)
if min_pixels is not None:
kwargs["min_pixels"] = min_pixels
if size is None:
size = {"shortest_edge": min_pixels}
else:
size["shortest_edge"] = min_pixels
if max_pixels is not None:
kwargs["max_pixels"] = max_pixels
if size is None:
size = {"longest_edge": max_pixels}
else:
size["longest_edge"] = max_pixels
if size is not None:
kwargs["size"] = size
return kwargs
def get_image_processor(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
):
return cached_image_processor_from_config(
self.ctx.model_config,
**self._get_image_processor_kwargs(
min_pixels=min_pixels,
max_pixels=max_pixels,
size=size,
**kwargs,
),
)
def get_image_processor(self, **kwargs: object):
return self.get_hf_processor(**kwargs).image_processor
def get_supported_mm_limits(self, ) -> Mapping[str, Optional[int]]:
return {"image": None, "video": None}
......@@ -1246,20 +1180,6 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
def _get_data_parser(self) -> MultiModalDataParser:
return KeyeMultiModalDataParser()
def _call_hf_processor(
self,
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
tok_kwargs: Mapping[str, object],
) -> BatchFeature:
mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs)
return self.info.ctx.call_hf_processor(
self.info.get_hf_processor(**mm_kwargs),
dict(text=prompt, **mm_data),
dict(**mm_kwargs, **tok_kwargs),
)
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
......
......@@ -8,11 +8,9 @@ from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar,
import torch
import torch.nn as nn
from packaging.version import Version
from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig,
PixtralVisionConfig, PretrainedConfig,
SiglipVisionConfig)
from transformers import __version__ as TRANSFORMERS_VERSION
from transformers.models.llava import LlavaProcessor
from transformers.models.pixtral import PixtralProcessor
......@@ -307,29 +305,14 @@ class PixtralHFMultiModalProcessor(
pixel_values = processed_outputs.get("pixel_values")
if pixel_values is not None:
# Before/after https://github.com/huggingface/transformers/pull/35122
if Version(TRANSFORMERS_VERSION) <= Version("4.48.3"):
images = mm_data["images"]
assert isinstance(images, list)
# Original output: (1, num_images, C, H, W)
# New output: (num_images, C, H, W)
assert (isinstance(pixel_values, list)
and len(pixel_values) == 1)
assert (isinstance(pixel_values[0], list)
and len(pixel_values[0]) == len(images))
processed_outputs["pixel_values"] = pixel_values[0]
else:
# Avoid padding since we need the output for each image to be
# independent of other images for the cache to work correctly
image_sizes = processed_outputs["image_sizes"]
assert len(pixel_values) == len(image_sizes)
# Avoid padding since we need the output for each image to be
# independent of other images for the cache to work correctly
image_sizes = processed_outputs["image_sizes"]
assert len(pixel_values) == len(image_sizes)
processed_outputs["pixel_values"] = [
p[:, :h, :w]
for p, (h, w) in zip(pixel_values, image_sizes)
]
processed_outputs["pixel_values"] = [
p[:, :h, :w] for p, (h, w) in zip(pixel_values, image_sizes)
]
return processed_outputs
......@@ -784,17 +767,10 @@ class MantisProcessingInfo(LlavaProcessingInfo):
vision_info = self.get_vision_encoder_info()
kwargs.setdefault("patch_size", vision_info.get_patch_size())
if Version(TRANSFORMERS_VERSION) < Version("4.48"):
# BUG: num_additional_image_tokens = 0 but treated as 1,
# so we set vision_feature_select_strategy to None to offset this
kwargs.setdefault("vision_feature_select_strategy", None)
else:
# FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150
kwargs.setdefault(
"vision_feature_select_strategy",
hf_config.vision_feature_select_strategy,
)
kwargs.setdefault(
"vision_feature_select_strategy",
hf_config.vision_feature_select_strategy,
)
return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment