"csrc/vscode:/vscode.git/clone" did not exist on "1d5922fadeebc5ec133dc1c88eb1e85605a5510c"
Unverified Commit 377d10bd authored by Cyrus Leung Committed by GitHub
Browse files

[VLM][Bugfix] Pass processor kwargs properly on init (#13516)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 52ce14d3
...@@ -120,6 +120,7 @@ def resolve_internvl_min_max_num( ...@@ -120,6 +120,7 @@ def resolve_internvl_min_max_num(
dynamic_image_size: bool, dynamic_image_size: bool,
use_thumbnail: bool, use_thumbnail: bool,
) -> tuple[int, int]: ) -> tuple[int, int]:
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
if use_thumbnail and max_dynamic_patch != 1: if use_thumbnail and max_dynamic_patch != 1:
...@@ -247,6 +248,7 @@ class BaseInternVLProcessor(ABC): ...@@ -247,6 +248,7 @@ class BaseInternVLProcessor(ABC):
config: PretrainedConfig, config: PretrainedConfig,
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
*, *,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
) -> None: ) -> None:
...@@ -258,18 +260,22 @@ class BaseInternVLProcessor(ABC): ...@@ -258,18 +260,22 @@ class BaseInternVLProcessor(ABC):
image_size: int = config.vision_config.image_size image_size: int = config.vision_config.image_size
patch_size: int = config.vision_config.patch_size patch_size: int = config.vision_config.patch_size
if dynamic_image_size is None: if min_dynamic_patch is None:
dynamic_image_size = config.dynamic_image_size min_dynamic_patch = config.min_dynamic_patch
assert isinstance(dynamic_image_size, bool) assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None: if max_dynamic_patch is None:
max_dynamic_patch = config.max_dynamic_patch max_dynamic_patch = config.max_dynamic_patch
assert isinstance(max_dynamic_patch, int) assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = config.dynamic_image_size
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int( self.num_image_token = int(
(image_size // patch_size)**2 * (config.downsample_ratio**2)) (image_size // patch_size)**2 * (config.downsample_ratio**2))
self.image_size = image_size self.image_size = image_size
self.min_dynamic_patch: int = config.min_dynamic_patch self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size self.dynamic_image_size = dynamic_image_size
self.use_thumbnail: bool = config.use_thumbnail self.use_thumbnail: bool = config.use_thumbnail
...@@ -298,11 +304,13 @@ class BaseInternVLProcessor(ABC): ...@@ -298,11 +304,13 @@ class BaseInternVLProcessor(ABC):
def resolve_min_max_num( def resolve_min_max_num(
self, self,
*, *,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
use_thumbnail: Optional[bool] = None, use_thumbnail: Optional[bool] = None,
) -> tuple[int, int]: ) -> tuple[int, int]:
min_dynamic_patch = self.min_dynamic_patch min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch
is None else min_dynamic_patch)
max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
is None else max_dynamic_patch) is None else max_dynamic_patch)
dynamic_image_size = (self.dynamic_image_size if dynamic_image_size dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
...@@ -320,11 +328,13 @@ class BaseInternVLProcessor(ABC): ...@@ -320,11 +328,13 @@ class BaseInternVLProcessor(ABC):
def resolve_target_ratios( def resolve_target_ratios(
self, self,
*, *,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
use_thumbnail: Optional[bool] = None, use_thumbnail: Optional[bool] = None,
) -> list[tuple[int, int]]: ) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num( min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch, max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size, dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail, use_thumbnail=use_thumbnail,
...@@ -355,10 +365,12 @@ class BaseInternVLProcessor(ABC): ...@@ -355,10 +365,12 @@ class BaseInternVLProcessor(ABC):
def _images_to_pixel_values_lst( def _images_to_pixel_values_lst(
self, self,
images: list[Image.Image], images: list[Image.Image],
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
) -> list[torch.Tensor]: ) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num( min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch, max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size, dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values use_thumbnail=False, # Applied in image_to_pixel_values
...@@ -378,6 +390,7 @@ class BaseInternVLProcessor(ABC): ...@@ -378,6 +390,7 @@ class BaseInternVLProcessor(ABC):
self, self,
text: Optional[Union[str, list[str]]] = None, text: Optional[Union[str, list[str]]] = None,
images: Optional[Union[Image.Image, list[Image.Image]]] = None, images: Optional[Union[Image.Image, list[Image.Image]]] = None,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None, return_tensors: Optional[Union[str, TensorType]] = None,
...@@ -396,6 +409,7 @@ class BaseInternVLProcessor(ABC): ...@@ -396,6 +409,7 @@ class BaseInternVLProcessor(ABC):
else: else:
pixel_values_lst = self._images_to_pixel_values_lst( pixel_values_lst = self._images_to_pixel_values_lst(
images, images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch, max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size, dynamic_image_size=dynamic_image_size,
) )
...@@ -451,8 +465,10 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): ...@@ -451,8 +465,10 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
def get_hf_processor( def get_hf_processor(
self, self,
*, *,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> BaseInternVLProcessor: ) -> BaseInternVLProcessor:
raise NotImplementedError raise NotImplementedError
...@@ -642,14 +658,23 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo): ...@@ -642,14 +658,23 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor( def get_hf_processor(
self, self,
*, *,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> InternVLProcessor: ) -> InternVLProcessor:
return InternVLProcessor( if min_dynamic_patch is not None:
self.get_hf_config(), kwargs["min_dynamic_patch"] = min_dynamic_patch
self.get_tokenizer(), if max_dynamic_patch is not None:
max_dynamic_patch=max_dynamic_patch, kwargs["max_dynamic_patch"] = max_dynamic_patch
dynamic_image_size=dynamic_image_size, if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
return self.ctx.init_processor(
InternVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
) )
......
...@@ -119,7 +119,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo): ...@@ -119,7 +119,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
return get_vision_encoder_info(self.get_hf_config()) return get_vision_encoder_info(self.get_hf_config())
@abstractmethod @abstractmethod
def get_hf_processor(self) -> LlavaLikeProcessor: def get_hf_processor(self, **kwargs: object) -> LlavaLikeProcessor:
raise NotImplementedError raise NotImplementedError
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
...@@ -208,8 +208,8 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -208,8 +208,8 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
class LlavaProcessingInfo(BaseLlavaProcessingInfo): class LlavaProcessingInfo(BaseLlavaProcessingInfo):
def get_hf_processor(self): def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(LlavaProcessor) return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]): class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
...@@ -272,8 +272,8 @@ class LlavaMultiModalProcessor( ...@@ -272,8 +272,8 @@ class LlavaMultiModalProcessor(
class PixtralHFProcessingInfo(BaseLlavaProcessingInfo): class PixtralHFProcessingInfo(BaseLlavaProcessingInfo):
def get_hf_processor(self): def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(PixtralProcessor) return self.ctx.get_hf_processor(PixtralProcessor, **kwargs)
class PixtralHFMultiModalProcessor( class PixtralHFMultiModalProcessor(
...@@ -742,23 +742,24 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -742,23 +742,24 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
class MantisProcessingInfo(LlavaProcessingInfo): class MantisProcessingInfo(LlavaProcessingInfo):
def get_hf_processor(self): def get_hf_processor(self, **kwargs: object):
hf_config = self.get_hf_config() hf_config = self.get_hf_config()
vision_info = self.get_vision_encoder_info() vision_info = self.get_vision_encoder_info()
kwargs.setdefault("patch_size", vision_info.get_patch_size())
if Version(TRANSFORMERS_VERSION) < Version("4.48"): if Version(TRANSFORMERS_VERSION) < Version("4.48"):
# BUG: num_additional_image_tokens = 0 but treated as 1, # BUG: num_additional_image_tokens = 0 but treated as 1,
# so we set vision_feature_select_strategy to None to offset this # so we set vision_feature_select_strategy to None to offset this
vision_feature_select_strategy = None kwargs.setdefault("vision_feature_select_strategy", None)
else: else:
# FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150
vision_feature_select_strategy = hf_config.vision_feature_select_strategy # noqa: E501 kwargs.setdefault(
"vision_feature_select_strategy",
hf_config.vision_feature_select_strategy,
)
return self.ctx.get_hf_processor( return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
LlavaProcessor,
patch_size=vision_info.get_patch_size(),
vision_feature_select_strategy=vision_feature_select_strategy,
)
class MantisMultiModalProcessor(LlavaMultiModalProcessor): class MantisMultiModalProcessor(LlavaMultiModalProcessor):
......
...@@ -72,8 +72,8 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): ...@@ -72,8 +72,8 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
def get_hf_config(self) -> LlavaNextLikeConfig: def get_hf_config(self) -> LlavaNextLikeConfig:
return self.ctx.get_hf_config(LlavaNextConfig) return self.ctx.get_hf_config(LlavaNextConfig)
def get_hf_processor(self): def get_hf_processor(self, **kwargs: object):
hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor) hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor, **kwargs)
# In case patch_size is omitted from `processor_config.json` # In case patch_size is omitted from `processor_config.json`
# e.g. for E5-V: https://huggingface.co/royokong/e5-v # e.g. for E5-V: https://huggingface.co/royokong/e5-v
......
...@@ -56,8 +56,8 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo): ...@@ -56,8 +56,8 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
def get_vision_encoder_info(self): def get_vision_encoder_info(self):
return get_vision_encoder_info(self.get_hf_config()) return get_vision_encoder_info(self.get_hf_config())
def get_hf_processor(self): def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(LlavaNextVideoProcessor) return self.ctx.get_hf_processor(LlavaNextVideoProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"video": 1} return {"video": 1}
......
...@@ -97,8 +97,8 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): ...@@ -97,8 +97,8 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
def get_hf_config(self) -> LlavaOnevisionLikeConfig: def get_hf_config(self) -> LlavaOnevisionLikeConfig:
return self.ctx.get_hf_config(LlavaOnevisionConfig) return self.ctx.get_hf_config(LlavaOnevisionConfig)
def get_hf_processor(self): def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(LlavaOnevisionProcessor) return self.ctx.get_hf_processor(LlavaOnevisionProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": None} return {"image": None, "video": None}
......
...@@ -331,11 +331,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): ...@@ -331,11 +331,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
def get_hf_config(self): def get_hf_config(self):
return self.ctx.get_hf_config() return self.ctx.get_hf_config()
def get_hf_processor( def get_hf_processor(self, **kwargs: object):
self, hf_processor = self.ctx.get_hf_processor(**kwargs)
**kwargs: object,
):
hf_processor = self.ctx.get_hf_processor()
# NumPy arrays are considered as Iterable but not Sequence in # NumPy arrays are considered as Iterable but not Sequence in
# https://github.com/huggingface/transformers/blob/main/src/transformers/image_transforms.py#L428 # https://github.com/huggingface/transformers/blob/main/src/transformers/image_transforms.py#L428
......
...@@ -94,8 +94,8 @@ class MllamaProcessingInfo(BaseProcessingInfo): ...@@ -94,8 +94,8 @@ class MllamaProcessingInfo(BaseProcessingInfo):
def get_hf_config(self) -> MllamaConfig: def get_hf_config(self) -> MllamaConfig:
return self.ctx.get_hf_config(MllamaConfig) return self.ctx.get_hf_config(MllamaConfig)
def get_hf_processor(self) -> MllamaProcessor: def get_hf_processor(self, **kwargs: object) -> MllamaProcessor:
return self.ctx.get_hf_processor(MllamaProcessor) return self.ctx.get_hf_processor(MllamaProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None} return {"image": None}
......
...@@ -1200,8 +1200,8 @@ class MolmoProcessorWrapper: ...@@ -1200,8 +1200,8 @@ class MolmoProcessorWrapper:
class MolmoProcessingInfo(BaseProcessingInfo): class MolmoProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self) -> MolmoProcessorWrapper: def get_hf_processor(self, **kwargs: object) -> MolmoProcessorWrapper:
processor = self.ctx.get_hf_processor() processor = self.ctx.get_hf_processor(**kwargs)
return MolmoProcessorWrapper(processor) return MolmoProcessorWrapper(processor)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
......
...@@ -69,14 +69,23 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo): ...@@ -69,14 +69,23 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor( def get_hf_processor(
self, self,
*, *,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None, dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> NVLMProcessor: ) -> NVLMProcessor:
return NVLMProcessor( if min_dynamic_patch is not None:
self.get_hf_config(), kwargs["min_dynamic_patch"] = min_dynamic_patch
self.get_tokenizer(), if max_dynamic_patch is not None:
max_dynamic_patch=max_dynamic_patch, kwargs["max_dynamic_patch"] = max_dynamic_patch
dynamic_image_size=dynamic_image_size, if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
return self.ctx.init_processor(
NVLMProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
) )
def get_max_image_tokens(self) -> int: def get_max_image_tokens(self) -> int:
......
...@@ -16,8 +16,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput ...@@ -16,8 +16,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from .interfaces import SupportsMultiModal, SupportsPP from .interfaces import SupportsMultiModal, SupportsPP
from .siglip import (SiglipVisionModel, dummy_image_for_siglip, from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
...@@ -88,7 +88,7 @@ def input_processor_for_paligemma(ctx: InputContext, ...@@ -88,7 +88,7 @@ def input_processor_for_paligemma(ctx: InputContext,
model_config = ctx.model_config model_config = ctx.model_config
hf_config = ctx.get_hf_config(PaliGemmaConfig) hf_config = ctx.get_hf_config(PaliGemmaConfig)
tokenizer = cached_get_tokenizer(model_config.tokenizer) tokenizer = cached_tokenizer_from_config(model_config)
image_feature_size = hf_config.text_config.num_image_tokens image_feature_size = hf_config.text_config.num_image_tokens
image_token_str = tokenizer.decode(hf_config.image_token_index) image_token_str = tokenizer.decode(hf_config.image_token_index)
bos_token = tokenizer.decode(hf_config.bos_token_id) bos_token = tokenizer.decode(hf_config.bos_token_id)
......
...@@ -313,11 +313,12 @@ class Phi3VProcessingInfo(BaseProcessingInfo): ...@@ -313,11 +313,12 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
self, self,
*, *,
num_crops: Optional[int] = None, num_crops: Optional[int] = None,
**kwargs: object,
) -> ProcessorMixin: ) -> ProcessorMixin:
if num_crops is not None: if num_crops is not None:
return self.ctx.get_hf_processor(num_crops=num_crops) kwargs["num_crops"] = num_crops
return self.ctx.get_hf_processor() return self.ctx.get_hf_processor(**kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None} return {"image": None}
......
...@@ -32,9 +32,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -32,9 +32,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import consecutive_placeholder_ranges
consecutive_placeholder_ranges)
from vllm.sequence import IntermediateTensors, SequenceData from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from .interfaces import SupportsMultiModal, SupportsPP from .interfaces import SupportsMultiModal, SupportsPP
from .utils import (init_vllm_registered_model, maybe_prefix, from .utils import (init_vllm_registered_model, maybe_prefix,
...@@ -49,9 +49,7 @@ except ImportError: ...@@ -49,9 +49,7 @@ except ImportError:
def get_max_pixtral_image_tokens(ctx: InputContext): def get_max_pixtral_image_tokens(ctx: InputContext):
tokenizer = cached_get_tokenizer( tokenizer = cached_tokenizer_from_config(ctx.model_config)
ctx.model_config.tokenizer,
tokenizer_mode=ctx.model_config.tokenizer_mode)
mm_encoder = tokenizer.instruct.mm_encoder mm_encoder = tokenizer.instruct.mm_encoder
image_config = mm_encoder.mm_config if hasattr( image_config = mm_encoder.mm_config if hasattr(
...@@ -65,9 +63,7 @@ def get_max_pixtral_image_tokens(ctx: InputContext): ...@@ -65,9 +63,7 @@ def get_max_pixtral_image_tokens(ctx: InputContext):
def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]): mm_counts: Mapping[str, int]):
tokenizer = cached_get_tokenizer( tokenizer = cached_tokenizer_from_config(ctx.model_config)
ctx.model_config.tokenizer,
tokenizer_mode=ctx.model_config.tokenizer_mode)
mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
image_token_id = mm_encoder.special_ids.img image_token_id = mm_encoder.special_ids.img
...@@ -109,9 +105,7 @@ def input_mapper_for_pixtral(ctx: InputContext, ...@@ -109,9 +105,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
MultiModalKwargs containing the stacked normalized images tensor or MultiModalKwargs containing the stacked normalized images tensor or
image embeddings. image embeddings.
""" """
model_config = ctx.model_config tokenizer = cached_tokenizer_from_config(ctx.model_config)
tokenizer = cached_get_tokenizer(
model_config.tokenizer, tokenizer_mode=model_config.tokenizer_mode)
data_list = data if isinstance(data, list) else [data] data_list = data if isinstance(data, list) else [data]
...@@ -138,9 +132,7 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): ...@@ -138,9 +132,7 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
prompt_token_ids = inputs.get("prompt_token_ids") prompt_token_ids = inputs.get("prompt_token_ids")
prompt = inputs.get("prompt") prompt = inputs.get("prompt")
tokenizer = cached_get_tokenizer( tokenizer = cached_tokenizer_from_config(ctx.model_config)
ctx.model_config.tokenizer,
tokenizer_mode=ctx.model_config.tokenizer_mode)
mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
image_token_id = mm_encoder.special_ids.img image_token_id = mm_encoder.special_ids.img
......
...@@ -36,8 +36,6 @@ from transformers import BatchFeature ...@@ -36,8 +36,6 @@ from transformers import BatchFeature
from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig)
from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
Qwen2VLImageProcessorFast)
from vllm.attention import AttentionMetadata from vllm.attention import AttentionMetadata
from vllm.config import VllmConfig from vllm.config import VllmConfig
...@@ -690,41 +688,20 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo): ...@@ -690,41 +688,20 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo):
*, *,
min_pixels: Optional[int] = None, min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None, max_pixels: Optional[int] = None,
fps: Optional[float] = 2.0, size: Optional[dict[str, int]] = None,
fps: Optional[float] = None,
**kwargs: object,
) -> Qwen2_5_VLProcessor: ) -> Qwen2_5_VLProcessor:
hf_processor = self.ctx.get_hf_processor(Qwen2_5_VLProcessor) if fps is not None:
image_processor = hf_processor.image_processor # type: ignore kwargs["fps"] = fps
assert isinstance(image_processor,
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast)) return self.ctx.get_hf_processor(
Qwen2_5_VLProcessor,
if min_pixels: image_processor=self.get_image_processor(min_pixels=min_pixels,
image_processor.min_pixels = min_pixels max_pixels=max_pixels,
if max_pixels: size=size),
image_processor.max_pixels = max_pixels **kwargs,
if max_pixels or min_pixels:
image_processor.size = {
"min_pixels": image_processor.min_pixels,
"max_pixels": image_processor.max_pixels,
}
return hf_processor
def get_image_processor(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
fps: Optional[float] = 2.0,
) -> Union[Qwen2VLImageProcessor, Qwen2VLImageProcessorFast]:
hf_processor = self.get_hf_processor(
min_pixels=min_pixels,
max_pixels=max_pixels,
fps=fps,
) )
image_processor = hf_processor.image_processor # type: ignore
assert isinstance(image_processor,
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast))
return image_processor
class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor): class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
......
...@@ -93,8 +93,9 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo): ...@@ -93,8 +93,9 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
*, *,
# Ignored in initialization # Ignored in initialization
sampling_rate: Optional[int] = None, sampling_rate: Optional[int] = None,
**kwargs: object,
) -> Qwen2AudioProcessor: ) -> Qwen2AudioProcessor:
return self.ctx.get_hf_processor(Qwen2AudioProcessor) return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs)
def get_feature_extractor( def get_feature_extractor(
self, self,
......
...@@ -31,9 +31,7 @@ import torch ...@@ -31,9 +31,7 @@ import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from einops import rearrange, repeat from einops import rearrange, repeat
from packaging.version import Version
from transformers import BatchFeature from transformers import BatchFeature
from transformers import __version__ as TRANSFORMERS_VERSION
from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
Qwen2VLProcessor) Qwen2VLProcessor)
from transformers.models.qwen2_vl.configuration_qwen2_vl import ( from transformers.models.qwen2_vl.configuration_qwen2_vl import (
...@@ -69,6 +67,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs ...@@ -69,6 +67,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.platforms import _Backend from vllm.platforms import _Backend
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.config import uses_mrope
from vllm.transformers_utils.processor import (
cached_image_processor_from_config)
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
from .utils import (AutoWeightsLoader, WeightsMapper, from .utils import (AutoWeightsLoader, WeightsMapper,
...@@ -722,40 +722,64 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): ...@@ -722,40 +722,64 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
*, *,
min_pixels: Optional[int] = None, min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None, max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
) -> Qwen2VLProcessor: ) -> Qwen2VLProcessor:
hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor) return self.ctx.get_hf_processor(
image_processor = hf_processor.image_processor # type: ignore Qwen2VLProcessor,
assert isinstance(image_processor, Qwen2VLImageProcessor) image_processor=self.get_image_processor(min_pixels=min_pixels,
max_pixels=max_pixels,
if min_pixels: size=size),
image_processor.min_pixels = min_pixels **kwargs,
if max_pixels: )
image_processor.max_pixels = max_pixels
if max_pixels or min_pixels: def _get_image_processor_kwargs(
image_processor.size = { self,
"min_pixels": image_processor.min_pixels, *,
"max_pixels": image_processor.max_pixels, min_pixels: Optional[int] = None,
} max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
return hf_processor **kwargs: object,
):
if self.ctx.model_config.mm_processor_kwargs:
kwargs.update(self.ctx.model_config.mm_processor_kwargs)
if min_pixels is not None:
kwargs["min_pixels"] = min_pixels
if size is None:
size = {"shortest_edge": min_pixels}
else:
size["shortest_edge"] = min_pixels
if max_pixels is not None:
kwargs["max_pixels"] = max_pixels
if size is None:
size = {"longest_edge": max_pixels}
else:
size["longest_edge"] = max_pixels
if size is not None:
kwargs["size"] = size
return kwargs
def get_image_processor( def get_image_processor(
self, self,
*, *,
min_pixels: Optional[int] = None, min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None, max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
): ):
hf_processor = self.get_hf_processor(min_pixels=min_pixels, return cached_image_processor_from_config(
max_pixels=max_pixels) self.ctx.model_config,
image_processor = hf_processor.image_processor # type: ignore **self._get_image_processor_kwargs(min_pixels=min_pixels,
if Version(TRANSFORMERS_VERSION) >= Version("4.49"): max_pixels=max_pixels,
from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast size=size,
assert isinstance( **kwargs),
image_processor, )
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast))
else:
assert isinstance(image_processor, Qwen2VLImageProcessor)
return image_processor
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": None} return {"image": None, "video": None}
...@@ -952,6 +976,18 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] ...@@ -952,6 +976,18 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
def _get_data_parser(self) -> MultiModalDataParser: def _get_data_parser(self) -> MultiModalDataParser:
return Qwen2VLMultiModalDataParser() return Qwen2VLMultiModalDataParser()
def _call_hf_processor(
self,
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
) -> BatchFeature:
return self.info.ctx.call_hf_processor(
self.info.get_hf_processor(**mm_kwargs),
dict(text=prompt, **mm_data),
self.info._get_image_processor_kwargs(**mm_kwargs),
)
def _get_prompt_replacements( def _get_prompt_replacements(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
...@@ -964,8 +1000,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] ...@@ -964,8 +1000,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
tokenizer = self.info.get_tokenizer() tokenizer = self.info.get_tokenizer()
vocab = tokenizer.get_vocab() vocab = tokenizer.get_vocab()
# NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
# image_token and video_token registered
placeholder = { placeholder = {
"image": vocab[hf_processor.image_token], "image": vocab[hf_processor.image_token],
"video": vocab[hf_processor.video_token], "video": vocab[hf_processor.video_token],
......
...@@ -519,8 +519,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo): ...@@ -519,8 +519,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
return _get_tokenizer_without_image_pad(tokenizer) return _get_tokenizer_without_image_pad(tokenizer)
def get_hf_processor(self) -> QwenVLProcessor: def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
return QwenVLProcessor(self.get_hf_config(), self.get_tokenizer()) return self.ctx.init_processor(
QwenVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None} return {"image": None}
......
...@@ -68,8 +68,9 @@ class UltravoxProcessingInfo(BaseProcessingInfo): ...@@ -68,8 +68,9 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
*, *,
# Ignored in initialization # Ignored in initialization
sampling_rate: Optional[int] = None, sampling_rate: Optional[int] = None,
**kwargs: object,
) -> ProcessorMixin: ) -> ProcessorMixin:
hf_processor = self.ctx.get_hf_processor() hf_processor = self.ctx.get_hf_processor(**kwargs)
# NOTE: Ultravox processing definition uses '<|eot_id|>' as the # NOTE: Ultravox processing definition uses '<|eot_id|>' as the
# placeholder that will cause confusion with the actual end of turn # placeholder that will cause confusion with the actual end of turn
......
...@@ -29,7 +29,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, ...@@ -29,7 +29,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
NestedTensors) NestedTensors)
from vllm.multimodal.audio import resample_audio from vllm.multimodal.audio import resample_audio
from vllm.sequence import SequenceData from vllm.sequence import SequenceData
from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.processor import cached_processor_from_config
from .interfaces import SupportsMultiModal, SupportsTranscription from .interfaces import SupportsMultiModal, SupportsTranscription
from .utils import AutoWeightsLoader, WeightsMapper, make_layers from .utils import AutoWeightsLoader, WeightsMapper, make_layers
...@@ -579,7 +579,7 @@ def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int, ...@@ -579,7 +579,7 @@ def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]): mm_counts: Mapping[str, int]):
assert mm_counts["audio"] == 1 assert mm_counts["audio"] == 1
num_tokens = get_max_whisper_audio_tokens(ctx) num_tokens = get_max_whisper_audio_tokens(ctx)
processor = cached_get_processor(ctx.model_config.model) processor = cached_processor_from_config(ctx.model_config)
chunk_length = processor.feature_extractor.chunk_length chunk_length = processor.feature_extractor.chunk_length
sampling_rate = processor.feature_extractor.sampling_rate sampling_rate = processor.feature_extractor.sampling_rate
num_samples = chunk_length * sampling_rate num_samples = chunk_length * sampling_rate
...@@ -596,7 +596,7 @@ def input_processor_for_whisper(ctx: InputContext, inputs): ...@@ -596,7 +596,7 @@ def input_processor_for_whisper(ctx: InputContext, inputs):
multi_modal_data["audio"] = multi_modal_data["audio"][0] multi_modal_data["audio"] = multi_modal_data["audio"][0]
# Resample and process audio # Resample and process audio
audio, orig_sr = multi_modal_data["audio"] audio, orig_sr = multi_modal_data["audio"]
processor = cached_get_processor(ctx.model_config.model) processor = cached_processor_from_config(ctx.model_config)
target_sr = processor.feature_extractor.sampling_rate target_sr = processor.feature_extractor.sampling_rate
audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr) audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr)
multi_modal_data["audio"] = (audio, target_sr) multi_modal_data["audio"] = (audio, target_sr)
...@@ -618,7 +618,7 @@ def input_mapper_for_whisper( ...@@ -618,7 +618,7 @@ def input_mapper_for_whisper(
if len(multi_modal_data) == 0: if len(multi_modal_data) == 0:
return MultiModalKwargs() return MultiModalKwargs()
processor = cached_get_processor(ctx.model_config.model) processor = cached_processor_from_config(ctx.model_config)
sampling_rate = processor.feature_extractor.sampling_rate sampling_rate = processor.feature_extractor.sampling_rate
audios = [audio for audio, _ in multi_modal_data] audios = [audio for audio, _ in multi_modal_data]
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import base64 import base64
from functools import lru_cache
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Optional from typing import TYPE_CHECKING, Any, Dict, Optional
...@@ -11,7 +10,7 @@ from PIL import Image ...@@ -11,7 +10,7 @@ from PIL import Image
from vllm.inputs.registry import InputContext from vllm.inputs.registry import InputContext
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.transformers_utils.processor import get_image_processor from vllm.transformers_utils.processor import cached_get_image_processor
from vllm.utils import is_list_of from vllm.utils import is_list_of
from .base import MediaIO, MultiModalPlugin from .base import MediaIO, MultiModalPlugin
...@@ -22,8 +21,6 @@ if TYPE_CHECKING: ...@@ -22,8 +21,6 @@ if TYPE_CHECKING:
logger = init_logger(__name__) logger = init_logger(__name__)
cached_get_image_processor = lru_cache(get_image_processor)
class ImagePlugin(MultiModalPlugin): class ImagePlugin(MultiModalPlugin):
"""Plugin for image data.""" """Plugin for image data."""
......
...@@ -11,7 +11,8 @@ import torch.nn as nn ...@@ -11,7 +11,8 @@ import torch.nn as nn
from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE
from vllm.inputs import InputProcessingContext from vllm.inputs import InputProcessingContext
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import (AnyTokenizer,
cached_tokenizer_from_config)
from vllm.utils import ClassRegistry from vllm.utils import ClassRegistry
from .audio import AudioPlugin from .audio import AudioPlugin
...@@ -21,7 +22,6 @@ from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors ...@@ -21,7 +22,6 @@ from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
ProcessingCache) ProcessingCache)
from .profiling import BaseDummyInputsBuilder, MultiModalProfiler from .profiling import BaseDummyInputsBuilder, MultiModalProfiler
from .utils import cached_get_tokenizer
from .video import VideoPlugin from .video import VideoPlugin
if TYPE_CHECKING: if TYPE_CHECKING:
...@@ -256,10 +256,7 @@ class MultiModalRegistry: ...@@ -256,10 +256,7 @@ class MultiModalRegistry:
on underlying model configuration. on underlying model configuration.
""" """
if self.has_processor(model_config): if self.has_processor(model_config):
tokenizer = cached_get_tokenizer( tokenizer = cached_tokenizer_from_config(model_config)
model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code,
)
processor = self.create_processor(model_config, tokenizer) processor = self.create_processor(model_config, tokenizer)
seq_len = model_config.max_model_len seq_len = model_config.max_model_len
mm_limits = self.get_mm_limits_per_prompt(model_config) mm_limits = self.get_mm_limits_per_prompt(model_config)
...@@ -374,10 +371,7 @@ class MultiModalRegistry: ...@@ -374,10 +371,7 @@ class MultiModalRegistry:
This should be called after :meth:`init_mm_limits_per_prompt`. This should be called after :meth:`init_mm_limits_per_prompt`.
""" """
if self.has_processor(model_config): if self.has_processor(model_config):
tokenizer = cached_get_tokenizer( tokenizer = cached_tokenizer_from_config(model_config)
model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code,
)
processor = self.create_processor(model_config, tokenizer) processor = self.create_processor(model_config, tokenizer)
profiler = MultiModalProfiler(processor) profiler = MultiModalProfiler(processor)
return profiler.get_mm_limits() return profiler.get_mm_limits()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment