Unverified commit 377d10bd, authored by Cyrus Leung and committed by GitHub
Browse files

[VLM][Bugfix] Pass processor kwargs properly on init (#13516)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 52ce14d3
......@@ -120,6 +120,7 @@ def resolve_internvl_min_max_num(
dynamic_image_size: bool,
use_thumbnail: bool,
) -> tuple[int, int]:
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
if use_thumbnail and max_dynamic_patch != 1:
......@@ -247,6 +248,7 @@ class BaseInternVLProcessor(ABC):
config: PretrainedConfig,
tokenizer: AnyTokenizer,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
) -> None:
......@@ -258,18 +260,22 @@ class BaseInternVLProcessor(ABC):
image_size: int = config.vision_config.image_size
patch_size: int = config.vision_config.patch_size
if dynamic_image_size is None:
dynamic_image_size = config.dynamic_image_size
assert isinstance(dynamic_image_size, bool)
if min_dynamic_patch is None:
min_dynamic_patch = config.min_dynamic_patch
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = config.max_dynamic_patch
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = config.dynamic_image_size
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size)**2 * (config.downsample_ratio**2))
self.image_size = image_size
self.min_dynamic_patch: int = config.min_dynamic_patch
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail: bool = config.use_thumbnail
......@@ -298,11 +304,13 @@ class BaseInternVLProcessor(ABC):
def resolve_min_max_num(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
use_thumbnail: Optional[bool] = None,
) -> tuple[int, int]:
min_dynamic_patch = self.min_dynamic_patch
min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch
is None else min_dynamic_patch)
max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
is None else max_dynamic_patch)
dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
......@@ -320,11 +328,13 @@ class BaseInternVLProcessor(ABC):
def resolve_target_ratios(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
use_thumbnail: Optional[bool] = None,
) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
......@@ -355,10 +365,12 @@ class BaseInternVLProcessor(ABC):
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
......@@ -378,6 +390,7 @@ class BaseInternVLProcessor(ABC):
self,
text: Optional[Union[str, list[str]]] = None,
images: Optional[Union[Image.Image, list[Image.Image]]] = None,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
......@@ -396,6 +409,7 @@ class BaseInternVLProcessor(ABC):
else:
pixel_values_lst = self._images_to_pixel_values_lst(
images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
......@@ -451,8 +465,10 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> BaseInternVLProcessor:
raise NotImplementedError
......@@ -642,14 +658,23 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> InternVLProcessor:
return InternVLProcessor(
self.get_hf_config(),
self.get_tokenizer(),
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
if min_dynamic_patch is not None:
kwargs["min_dynamic_patch"] = min_dynamic_patch
if max_dynamic_patch is not None:
kwargs["max_dynamic_patch"] = max_dynamic_patch
if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
return self.ctx.init_processor(
InternVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
)
......
......@@ -119,7 +119,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
return get_vision_encoder_info(self.get_hf_config())
@abstractmethod
def get_hf_processor(self) -> LlavaLikeProcessor:
def get_hf_processor(self, **kwargs: object) -> LlavaLikeProcessor:
raise NotImplementedError
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
......@@ -208,8 +208,8 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
class LlavaProcessingInfo(BaseLlavaProcessingInfo):
def get_hf_processor(self):
return self.ctx.get_hf_processor(LlavaProcessor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
......@@ -272,8 +272,8 @@ class LlavaMultiModalProcessor(
class PixtralHFProcessingInfo(BaseLlavaProcessingInfo):
def get_hf_processor(self):
return self.ctx.get_hf_processor(PixtralProcessor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(PixtralProcessor, **kwargs)
class PixtralHFMultiModalProcessor(
......@@ -742,23 +742,24 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
class MantisProcessingInfo(LlavaProcessingInfo):
def get_hf_processor(self):
def get_hf_processor(self, **kwargs: object):
hf_config = self.get_hf_config()
vision_info = self.get_vision_encoder_info()
kwargs.setdefault("patch_size", vision_info.get_patch_size())
if Version(TRANSFORMERS_VERSION) < Version("4.48"):
# BUG: num_additional_image_tokens = 0 but treated as 1,
# so we set vision_feature_select_strategy to None to offset this
vision_feature_select_strategy = None
kwargs.setdefault("vision_feature_select_strategy", None)
else:
# FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150
vision_feature_select_strategy = hf_config.vision_feature_select_strategy # noqa: E501
kwargs.setdefault(
"vision_feature_select_strategy",
hf_config.vision_feature_select_strategy,
)
return self.ctx.get_hf_processor(
LlavaProcessor,
patch_size=vision_info.get_patch_size(),
vision_feature_select_strategy=vision_feature_select_strategy,
)
return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
class MantisMultiModalProcessor(LlavaMultiModalProcessor):
......
......@@ -72,8 +72,8 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
def get_hf_config(self) -> LlavaNextLikeConfig:
return self.ctx.get_hf_config(LlavaNextConfig)
def get_hf_processor(self):
hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor)
def get_hf_processor(self, **kwargs: object):
hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor, **kwargs)
# In case patch_size is omitted from `processor_config.json`
# e.g. for E5-V: https://huggingface.co/royokong/e5-v
......
......@@ -56,8 +56,8 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
def get_vision_encoder_info(self):
return get_vision_encoder_info(self.get_hf_config())
def get_hf_processor(self):
return self.ctx.get_hf_processor(LlavaNextVideoProcessor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(LlavaNextVideoProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"video": 1}
......
......@@ -97,8 +97,8 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
def get_hf_config(self) -> LlavaOnevisionLikeConfig:
return self.ctx.get_hf_config(LlavaOnevisionConfig)
def get_hf_processor(self):
return self.ctx.get_hf_processor(LlavaOnevisionProcessor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(LlavaOnevisionProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": None}
......
......@@ -331,11 +331,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config()
def get_hf_processor(
self,
**kwargs: object,
):
hf_processor = self.ctx.get_hf_processor()
def get_hf_processor(self, **kwargs: object):
hf_processor = self.ctx.get_hf_processor(**kwargs)
# NumPy arrays are considered as Iterable but not Sequence in
# https://github.com/huggingface/transformers/blob/main/src/transformers/image_transforms.py#L428
......
......@@ -94,8 +94,8 @@ class MllamaProcessingInfo(BaseProcessingInfo):
def get_hf_config(self) -> MllamaConfig:
return self.ctx.get_hf_config(MllamaConfig)
def get_hf_processor(self) -> MllamaProcessor:
return self.ctx.get_hf_processor(MllamaProcessor)
def get_hf_processor(self, **kwargs: object) -> MllamaProcessor:
return self.ctx.get_hf_processor(MllamaProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
......
......@@ -1200,8 +1200,8 @@ class MolmoProcessorWrapper:
class MolmoProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self) -> MolmoProcessorWrapper:
processor = self.ctx.get_hf_processor()
def get_hf_processor(self, **kwargs: object) -> MolmoProcessorWrapper:
processor = self.ctx.get_hf_processor(**kwargs)
return MolmoProcessorWrapper(processor)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
......
......@@ -69,14 +69,23 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> NVLMProcessor:
return NVLMProcessor(
self.get_hf_config(),
self.get_tokenizer(),
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
if min_dynamic_patch is not None:
kwargs["min_dynamic_patch"] = min_dynamic_patch
if max_dynamic_patch is not None:
kwargs["max_dynamic_patch"] = max_dynamic_patch
if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
return self.ctx.init_processor(
NVLMProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
)
def get_max_image_tokens(self) -> int:
......
......@@ -16,8 +16,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from .interfaces import SupportsMultiModal, SupportsPP
from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
......@@ -88,7 +88,7 @@ def input_processor_for_paligemma(ctx: InputContext,
model_config = ctx.model_config
hf_config = ctx.get_hf_config(PaliGemmaConfig)
tokenizer = cached_get_tokenizer(model_config.tokenizer)
tokenizer = cached_tokenizer_from_config(model_config)
image_feature_size = hf_config.text_config.num_image_tokens
image_token_str = tokenizer.decode(hf_config.image_token_index)
bos_token = tokenizer.decode(hf_config.bos_token_id)
......
......@@ -313,11 +313,12 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
self,
*,
num_crops: Optional[int] = None,
**kwargs: object,
) -> ProcessorMixin:
if num_crops is not None:
return self.ctx.get_hf_processor(num_crops=num_crops)
kwargs["num_crops"] = num_crops
return self.ctx.get_hf_processor()
return self.ctx.get_hf_processor(**kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
......
......@@ -32,9 +32,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges)
from vllm.multimodal.utils import consecutive_placeholder_ranges
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from .interfaces import SupportsMultiModal, SupportsPP
from .utils import (init_vllm_registered_model, maybe_prefix,
......@@ -49,9 +49,7 @@ except ImportError:
def get_max_pixtral_image_tokens(ctx: InputContext):
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
tokenizer_mode=ctx.model_config.tokenizer_mode)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
mm_encoder = tokenizer.instruct.mm_encoder
image_config = mm_encoder.mm_config if hasattr(
......@@ -65,9 +63,7 @@ def get_max_pixtral_image_tokens(ctx: InputContext):
def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
tokenizer_mode=ctx.model_config.tokenizer_mode)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
image_token_id = mm_encoder.special_ids.img
......@@ -109,9 +105,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
MultiModalKwargs containing the stacked normalized images tensor or
image embeddings.
"""
model_config = ctx.model_config
tokenizer = cached_get_tokenizer(
model_config.tokenizer, tokenizer_mode=model_config.tokenizer_mode)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
data_list = data if isinstance(data, list) else [data]
......@@ -138,9 +132,7 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
prompt_token_ids = inputs.get("prompt_token_ids")
prompt = inputs.get("prompt")
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
tokenizer_mode=ctx.model_config.tokenizer_mode)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
image_token_id = mm_encoder.special_ids.img
......
......@@ -36,8 +36,6 @@ from transformers import BatchFeature
from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig)
from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
Qwen2VLImageProcessorFast)
from vllm.attention import AttentionMetadata
from vllm.config import VllmConfig
......@@ -690,41 +688,20 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo):
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
fps: Optional[float] = 2.0,
size: Optional[dict[str, int]] = None,
fps: Optional[float] = None,
**kwargs: object,
) -> Qwen2_5_VLProcessor:
hf_processor = self.ctx.get_hf_processor(Qwen2_5_VLProcessor)
image_processor = hf_processor.image_processor # type: ignore
assert isinstance(image_processor,
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast))
if min_pixels:
image_processor.min_pixels = min_pixels
if max_pixels:
image_processor.max_pixels = max_pixels
if max_pixels or min_pixels:
image_processor.size = {
"min_pixels": image_processor.min_pixels,
"max_pixels": image_processor.max_pixels,
}
return hf_processor
def get_image_processor(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
fps: Optional[float] = 2.0,
) -> Union[Qwen2VLImageProcessor, Qwen2VLImageProcessorFast]:
hf_processor = self.get_hf_processor(
min_pixels=min_pixels,
max_pixels=max_pixels,
fps=fps,
if fps is not None:
kwargs["fps"] = fps
return self.ctx.get_hf_processor(
Qwen2_5_VLProcessor,
image_processor=self.get_image_processor(min_pixels=min_pixels,
max_pixels=max_pixels,
size=size),
**kwargs,
)
image_processor = hf_processor.image_processor # type: ignore
assert isinstance(image_processor,
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast))
return image_processor
class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
......
......@@ -93,8 +93,9 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
*,
# Ignored in initialization
sampling_rate: Optional[int] = None,
**kwargs: object,
) -> Qwen2AudioProcessor:
return self.ctx.get_hf_processor(Qwen2AudioProcessor)
return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs)
def get_feature_extractor(
self,
......
......@@ -31,9 +31,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from packaging.version import Version
from transformers import BatchFeature
from transformers import __version__ as TRANSFORMERS_VERSION
from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
Qwen2VLProcessor)
from transformers.models.qwen2_vl.configuration_qwen2_vl import (
......@@ -69,6 +67,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.platforms import _Backend
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import uses_mrope
from vllm.transformers_utils.processor import (
cached_image_processor_from_config)
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
from .utils import (AutoWeightsLoader, WeightsMapper,
......@@ -722,40 +722,64 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
) -> Qwen2VLProcessor:
hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor)
image_processor = hf_processor.image_processor # type: ignore
assert isinstance(image_processor, Qwen2VLImageProcessor)
if min_pixels:
image_processor.min_pixels = min_pixels
if max_pixels:
image_processor.max_pixels = max_pixels
if max_pixels or min_pixels:
image_processor.size = {
"min_pixels": image_processor.min_pixels,
"max_pixels": image_processor.max_pixels,
}
return hf_processor
return self.ctx.get_hf_processor(
Qwen2VLProcessor,
image_processor=self.get_image_processor(min_pixels=min_pixels,
max_pixels=max_pixels,
size=size),
**kwargs,
)
def _get_image_processor_kwargs(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
):
if self.ctx.model_config.mm_processor_kwargs:
kwargs.update(self.ctx.model_config.mm_processor_kwargs)
if min_pixels is not None:
kwargs["min_pixels"] = min_pixels
if size is None:
size = {"shortest_edge": min_pixels}
else:
size["shortest_edge"] = min_pixels
if max_pixels is not None:
kwargs["max_pixels"] = max_pixels
if size is None:
size = {"longest_edge": max_pixels}
else:
size["longest_edge"] = max_pixels
if size is not None:
kwargs["size"] = size
return kwargs
def get_image_processor(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
):
hf_processor = self.get_hf_processor(min_pixels=min_pixels,
max_pixels=max_pixels)
image_processor = hf_processor.image_processor # type: ignore
if Version(TRANSFORMERS_VERSION) >= Version("4.49"):
from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast
assert isinstance(
image_processor,
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast))
else:
assert isinstance(image_processor, Qwen2VLImageProcessor)
return image_processor
return cached_image_processor_from_config(
self.ctx.model_config,
**self._get_image_processor_kwargs(min_pixels=min_pixels,
max_pixels=max_pixels,
size=size,
**kwargs),
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": None}
......@@ -952,6 +976,18 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
def _get_data_parser(self) -> MultiModalDataParser:
return Qwen2VLMultiModalDataParser()
def _call_hf_processor(
self,
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
) -> BatchFeature:
return self.info.ctx.call_hf_processor(
self.info.get_hf_processor(**mm_kwargs),
dict(text=prompt, **mm_data),
self.info._get_image_processor_kwargs(**mm_kwargs),
)
def _get_prompt_replacements(
self,
mm_items: MultiModalDataItems,
......@@ -964,8 +1000,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
tokenizer = self.info.get_tokenizer()
vocab = tokenizer.get_vocab()
# NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
# image_token and video_token registered
placeholder = {
"image": vocab[hf_processor.image_token],
"video": vocab[hf_processor.video_token],
......
......@@ -519,8 +519,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
return _get_tokenizer_without_image_pad(tokenizer)
def get_hf_processor(self) -> QwenVLProcessor:
return QwenVLProcessor(self.get_hf_config(), self.get_tokenizer())
def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
return self.ctx.init_processor(
QwenVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
......
......@@ -68,8 +68,9 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
*,
# Ignored in initialization
sampling_rate: Optional[int] = None,
**kwargs: object,
) -> ProcessorMixin:
hf_processor = self.ctx.get_hf_processor()
hf_processor = self.ctx.get_hf_processor(**kwargs)
# NOTE: Ultravox processing definition uses '<|eot_id|>' as the
# placeholder that will cause confusion with the actual end of turn
......
......@@ -29,7 +29,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.audio import resample_audio
from vllm.sequence import SequenceData
from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.processor import cached_processor_from_config
from .interfaces import SupportsMultiModal, SupportsTranscription
from .utils import AutoWeightsLoader, WeightsMapper, make_layers
......@@ -579,7 +579,7 @@ def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
assert mm_counts["audio"] == 1
num_tokens = get_max_whisper_audio_tokens(ctx)
processor = cached_get_processor(ctx.model_config.model)
processor = cached_processor_from_config(ctx.model_config)
chunk_length = processor.feature_extractor.chunk_length
sampling_rate = processor.feature_extractor.sampling_rate
num_samples = chunk_length * sampling_rate
......@@ -596,7 +596,7 @@ def input_processor_for_whisper(ctx: InputContext, inputs):
multi_modal_data["audio"] = multi_modal_data["audio"][0]
# Resample and process audio
audio, orig_sr = multi_modal_data["audio"]
processor = cached_get_processor(ctx.model_config.model)
processor = cached_processor_from_config(ctx.model_config)
target_sr = processor.feature_extractor.sampling_rate
audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr)
multi_modal_data["audio"] = (audio, target_sr)
......@@ -618,7 +618,7 @@ def input_mapper_for_whisper(
if len(multi_modal_data) == 0:
return MultiModalKwargs()
processor = cached_get_processor(ctx.model_config.model)
processor = cached_processor_from_config(ctx.model_config)
sampling_rate = processor.feature_extractor.sampling_rate
audios = [audio for audio, _ in multi_modal_data]
......
# SPDX-License-Identifier: Apache-2.0
import base64
from functools import lru_cache
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Optional
......@@ -11,7 +10,7 @@ from PIL import Image
from vllm.inputs.registry import InputContext
from vllm.logger import init_logger
from vllm.transformers_utils.processor import get_image_processor
from vllm.transformers_utils.processor import cached_get_image_processor
from vllm.utils import is_list_of
from .base import MediaIO, MultiModalPlugin
......@@ -22,8 +21,6 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
cached_get_image_processor = lru_cache(get_image_processor)
class ImagePlugin(MultiModalPlugin):
"""Plugin for image data."""
......
......@@ -11,7 +11,8 @@ import torch.nn as nn
from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE
from vllm.inputs import InputProcessingContext
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
cached_tokenizer_from_config)
from vllm.utils import ClassRegistry
from .audio import AudioPlugin
......@@ -21,7 +22,6 @@ from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
ProcessingCache)
from .profiling import BaseDummyInputsBuilder, MultiModalProfiler
from .utils import cached_get_tokenizer
from .video import VideoPlugin
if TYPE_CHECKING:
......@@ -256,10 +256,7 @@ class MultiModalRegistry:
on underlying model configuration.
"""
if self.has_processor(model_config):
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code,
)
tokenizer = cached_tokenizer_from_config(model_config)
processor = self.create_processor(model_config, tokenizer)
seq_len = model_config.max_model_len
mm_limits = self.get_mm_limits_per_prompt(model_config)
......@@ -374,10 +371,7 @@ class MultiModalRegistry:
This should be called after :meth:`init_mm_limits_per_prompt`.
"""
if self.has_processor(model_config):
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code,
)
tokenizer = cached_tokenizer_from_config(model_config)
processor = self.create_processor(model_config, tokenizer)
profiler = MultiModalProfiler(processor)
return profiler.get_mm_limits()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment