Unverified Commit 0ab06100 authored by Isotr0py's avatar Isotr0py Committed by GitHub
Browse files

[Multimodal] Expose `mm_processor_kwargs` for `DummyInputsBuilder` (#34330)


Signed-off-by: default avatarIsotr0py <mozf@mail2.sysu.edu.cn>
parent ffb3d553
...@@ -493,6 +493,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): ...@@ -493,6 +493,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
vision_config = hf_config.vision_config vision_config = hf_config.vision_config
......
...@@ -727,8 +727,11 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]): ...@@ -727,8 +727,11 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]):
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
feature_extractor = self.info.get_feature_extractor() feature_extractor = self.info.get_feature_extractor(
**(mm_processor_kwargs or {})
)
sampling_rate = feature_extractor.sampling_rate sampling_rate = feature_extractor.sampling_rate
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_overrides = mm_options.get("audio") if mm_options else None audio_overrides = mm_options.get("audio") if mm_options else None
......
...@@ -217,6 +217,7 @@ class GraniteSpeechDummyInputsBuilder( ...@@ -217,6 +217,7 @@ class GraniteSpeechDummyInputsBuilder(
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_overrides = mm_options.get("audio") if mm_options else None audio_overrides = mm_options.get("audio") if mm_options else None
......
...@@ -702,6 +702,7 @@ class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo ...@@ -702,6 +702,7 @@ class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 1) num_images = mm_counts.get("image", 1)
......
...@@ -166,6 +166,7 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo ...@@ -166,6 +166,7 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
......
...@@ -42,7 +42,7 @@ from vllm.multimodal.inputs import ( ...@@ -42,7 +42,7 @@ from vllm.multimodal.inputs import (
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
from vllm.multimodal.processing import ( from vllm.multimodal.processing import (
BaseDummyInputsBuilder, BaseDummyInputsBuilder,
BaseMultiModalProcessor, BaseMultiModalProcessor,
...@@ -285,15 +285,6 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): ...@@ -285,15 +285,6 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
return num_patches * processor.image_seq_len return num_patches * processor.image_seq_len
def get_image_size_with_most_features(self) -> ImageSize:
processor = self.get_hf_processor()
image_processor: Idefics3ImageProcessor = processor.image_processor
return ImageSize(
width=image_processor.size["longest_edge"],
height=image_processor.size["longest_edge"],
)
class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]): class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]):
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
...@@ -309,9 +300,10 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]) ...@@ -309,9 +300,10 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo])
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
hf_processor = self.info.get_hf_processor() hf_processor = self.info.get_hf_processor(**(mm_processor_kwargs or {}))
image_processor: Idefics3ImageProcessor = hf_processor.image_processor image_processor: Idefics3ImageProcessor = hf_processor.image_processor
longest_edge = image_processor.max_image_size["longest_edge"] longest_edge = image_processor.max_image_size["longest_edge"]
......
...@@ -298,6 +298,7 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]) ...@@ -298,6 +298,7 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo])
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
target_num_frames = self.info.get_num_frames_with_most_features( target_num_frames = self.info.get_num_frames_with_most_features(
......
...@@ -766,6 +766,7 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -766,6 +766,7 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
...@@ -938,6 +939,7 @@ class InternVLDummyInputsBuilder( ...@@ -938,6 +939,7 @@ class InternVLDummyInputsBuilder(
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
dummy_image = super().get_dummy_mm_data( dummy_image = super().get_dummy_mm_data(
seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
......
...@@ -850,6 +850,7 @@ class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]): ...@@ -850,6 +850,7 @@ class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]):
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str] | None = None, mm_options: Mapping[str] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
......
...@@ -445,6 +445,7 @@ class KananaVDummyInputsBuilder(BaseDummyInputsBuilder[KananaVProcessingInfo]): ...@@ -445,6 +445,7 @@ class KananaVDummyInputsBuilder(BaseDummyInputsBuilder[KananaVProcessingInfo]):
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
return { return {
......
...@@ -1159,6 +1159,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -1159,6 +1159,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
......
...@@ -238,6 +238,7 @@ class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]): ...@@ -238,6 +238,7 @@ class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]):
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
# TODO: Support mm_options for vision_chunk to allow user configuration # TODO: Support mm_options for vision_chunk to allow user configuration
dummy_items = self.get_dummy_mm_items() dummy_items = self.get_dummy_mm_items()
......
...@@ -216,6 +216,7 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]): ...@@ -216,6 +216,7 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
......
...@@ -319,6 +319,7 @@ class Lfm2VLDummyInputsBuilder(BaseDummyInputsBuilder[Lfm2VLProcessingInfo]): ...@@ -319,6 +319,7 @@ class Lfm2VLDummyInputsBuilder(BaseDummyInputsBuilder[Lfm2VLProcessingInfo]):
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
......
...@@ -232,6 +232,7 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -232,6 +232,7 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
......
...@@ -166,6 +166,7 @@ class LlavaNextVideoDummyInputsBuilder( ...@@ -166,6 +166,7 @@ class LlavaNextVideoDummyInputsBuilder(
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
......
...@@ -277,6 +277,7 @@ class LlavaOnevisionDummyInputsBuilder( ...@@ -277,6 +277,7 @@ class LlavaOnevisionDummyInputsBuilder(
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
......
...@@ -566,6 +566,7 @@ class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessing ...@@ -566,6 +566,7 @@ class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessing
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
......
...@@ -302,6 +302,7 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn ...@@ -302,6 +302,7 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_len = ( audio_len = (
......
...@@ -708,6 +708,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -708,6 +708,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment