Unverified Commit 987506bc authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Refactor] Simplify dummy data generation (#35025)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent c645e9a2
...@@ -746,23 +746,22 @@ class FunASRDummyInputsBuilder(BaseDummyInputsBuilder[FunASRProcessingInfo]): ...@@ -746,23 +746,22 @@ class FunASRDummyInputsBuilder(BaseDummyInputsBuilder[FunASRProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
feature_extractor = self.info.get_feature_extractor( feature_extractor = self.info.get_feature_extractor()
**(mm_processor_kwargs or {})
)
sampling_rate = feature_extractor.sampling_rate sampling_rate = feature_extractor.sampling_rate
audio_len = feature_extractor.chunk_length * sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_overrides = mm_options.get("audio") if mm_options else None audio_overrides = mm_options.get("audio")
return { return {
"audio": self._get_dummy_audios( "audio": self._get_dummy_audios(
length=audio_len, num_audios=num_audios, overrides=audio_overrides length=audio_len,
) num_audios=num_audios,
overrides=audio_overrides,
),
} }
......
...@@ -610,12 +610,9 @@ class FunAudioChatDummyInputsBuilder( ...@@ -610,12 +610,9 @@ class FunAudioChatDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
feature_extractor = self.info.get_feature_extractor( feature_extractor = self.info.get_feature_extractor()
**(mm_processor_kwargs or {})
)
sampling_rate = int(feature_extractor.sampling_rate) sampling_rate = int(feature_extractor.sampling_rate)
# Dummy inputs are used for profiling; construct the worst-case audio # Dummy inputs are used for profiling; construct the worst-case audio
...@@ -632,7 +629,7 @@ class FunAudioChatDummyInputsBuilder( ...@@ -632,7 +629,7 @@ class FunAudioChatDummyInputsBuilder(
) )
num_audios = int(mm_counts.get("audio", 0)) num_audios = int(mm_counts.get("audio", 0))
audio_overrides = mm_options.get("audio") if mm_options else None audio_overrides = mm_options.get("audio")
return { return {
"audio": self._get_dummy_audios( "audio": self._get_dummy_audios(
length=audio_len, length=audio_len,
......
...@@ -142,13 +142,12 @@ class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]): ...@@ -142,13 +142,12 @@ class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -241,14 +241,13 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]): ...@@ -241,14 +241,13 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -175,8 +175,7 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]): ...@@ -175,8 +175,7 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
...@@ -189,8 +188,8 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]): ...@@ -189,8 +188,8 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]):
img_width = image_processor.size.get("width", 224) img_width = image_processor.size.get("width", 224)
img_height = image_processor.size.get("height", 224) img_height = image_processor.size.get("height", 224)
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
audio_overrides = mm_options.get("audio") if mm_options else None audio_overrides = mm_options.get("audio")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
...@@ -200,7 +199,9 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]): ...@@ -200,7 +199,9 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]):
overrides=image_overrides, overrides=image_overrides,
), ),
"audio": self._get_dummy_audios( "audio": self._get_dummy_audios(
length=audio_len, num_audios=num_audios, overrides=audio_overrides length=audio_len,
num_audios=num_audios,
overrides=audio_overrides,
), ),
} }
......
...@@ -1163,8 +1163,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): ...@@ -1163,8 +1163,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -1174,8 +1173,8 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): ...@@ -1174,8 +1173,8 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
seq_len, mm_counts seq_len, mm_counts
) )
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
video_overrides = mm_options.get("video") if mm_options else None video_overrides = mm_options.get("video")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -492,8 +492,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): ...@@ -492,8 +492,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
vision_config = hf_config.vision_config vision_config = hf_config.vision_config
...@@ -501,7 +500,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): ...@@ -501,7 +500,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
target_width = target_height = vision_config["image_size"] target_width = target_height = vision_config["image_size"]
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -726,15 +726,12 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]): ...@@ -726,15 +726,12 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
feature_extractor = self.info.get_feature_extractor( feature_extractor = self.info.get_feature_extractor()
**(mm_processor_kwargs or {})
)
sampling_rate = feature_extractor.sampling_rate sampling_rate = feature_extractor.sampling_rate
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_overrides = mm_options.get("audio") if mm_options else None audio_overrides = mm_options.get("audio")
max_audio_len = getattr( max_audio_len = getattr(
self.info.get_hf_processor(), "max_audio_len", DEFAULT_MAX_AUDIO_LEN_S self.info.get_hf_processor(), "max_audio_len", DEFAULT_MAX_AUDIO_LEN_S
...@@ -743,7 +740,9 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]): ...@@ -743,7 +740,9 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]):
return { return {
"audio": self._get_dummy_audios( "audio": self._get_dummy_audios(
length=audio_len, num_audios=num_audios, overrides=audio_overrides length=audio_len,
num_audios=num_audios,
overrides=audio_overrides,
) )
} }
......
...@@ -216,11 +216,10 @@ class GraniteSpeechDummyInputsBuilder( ...@@ -216,11 +216,10 @@ class GraniteSpeechDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_overrides = mm_options.get("audio") if mm_options else None audio_overrides = mm_options.get("audio")
return { return {
"audio": self._get_dummy_audios( "audio": self._get_dummy_audios(
......
...@@ -713,8 +713,7 @@ class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo ...@@ -713,8 +713,7 @@ class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 1) num_images = mm_counts.get("image", 1)
......
...@@ -165,8 +165,7 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo ...@@ -165,8 +165,7 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -174,8 +173,8 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo ...@@ -174,8 +173,8 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
target_num_frames = 32 target_num_frames = 32
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
video_overrides = mm_options.get("video") if mm_options else None video_overrides = mm_options.get("video")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -277,15 +277,14 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]) ...@@ -277,15 +277,14 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo])
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
hf_processor = self.info.get_hf_processor(**(mm_processor_kwargs or {})) hf_processor = self.info.get_hf_processor()
image_processor: Idefics3ImageProcessor = hf_processor.image_processor image_processor: Idefics3ImageProcessor = hf_processor.image_processor
longest_edge = image_processor.max_image_size["longest_edge"] longest_edge = image_processor.max_image_size["longest_edge"]
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -297,8 +297,7 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]) ...@@ -297,8 +297,7 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo])
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
target_num_frames = self.info.get_num_frames_with_most_features( target_num_frames = self.info.get_num_frames_with_most_features(
...@@ -310,8 +309,8 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]) ...@@ -310,8 +309,8 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo])
config = self.info.get_hf_config() config = self.info.get_hf_config()
image_size_h, image_size_w = config.vision_config.image_size image_size_h, image_size_w = config.vision_config.image_size
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
video_overrides = mm_options.get("video") if mm_options else None video_overrides = mm_options.get("video")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -762,13 +762,12 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -762,13 +762,12 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
...@@ -935,12 +934,9 @@ class InternVLDummyInputsBuilder( ...@@ -935,12 +934,9 @@ class InternVLDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
dummy_image = super().get_dummy_mm_data( dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options)
seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
)
if self.info.supports_video: if self.info.supports_video:
config = self.info.get_hf_config() config = self.info.get_hf_config()
image_size: int = config.vision_config.image_size image_size: int = config.vision_config.image_size
...@@ -948,7 +944,7 @@ class InternVLDummyInputsBuilder( ...@@ -948,7 +944,7 @@ class InternVLDummyInputsBuilder(
seq_len, mm_counts seq_len, mm_counts
) )
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
video_overrides = mm_options.get("video") if mm_options else None video_overrides = mm_options.get("video")
dummy_video = { dummy_video = {
"video": self._get_dummy_videos( "video": self._get_dummy_videos(
width=image_size, width=image_size,
......
...@@ -18,6 +18,7 @@ from typing_extensions import TypedDict, Unpack ...@@ -18,6 +18,7 @@ from typing_extensions import TypedDict, Unpack
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.model import ModelConfig from vllm.config.model import ModelConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import parallel_state from vllm.distributed import parallel_state
from vllm.distributed import utils as dist_utils from vllm.distributed import utils as dist_utils
from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.attention import MMEncoderAttention
...@@ -849,13 +850,12 @@ class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]): ...@@ -849,13 +850,12 @@ class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -444,8 +444,7 @@ class KananaVDummyInputsBuilder(BaseDummyInputsBuilder[KananaVProcessingInfo]): ...@@ -444,8 +444,7 @@ class KananaVDummyInputsBuilder(BaseDummyInputsBuilder[KananaVProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
return { return {
......
...@@ -1170,8 +1170,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -1170,8 +1170,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -1179,8 +1178,8 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -1179,8 +1178,8 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
target_num_frames = self.info.get_num_frames_with_most_features(seq_len) target_num_frames = self.info.get_num_frames_with_most_features(seq_len)
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
video_overrides = mm_options.get("video") if mm_options else None video_overrides = mm_options.get("video")
mm_data = { mm_data = {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -240,8 +240,7 @@ class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]): ...@@ -240,8 +240,7 @@ class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
# TODO: Support mm_options for vision_chunk to allow user configuration # TODO: Support mm_options for vision_chunk to allow user configuration
dummy_items = self.get_dummy_mm_items() dummy_items = self.get_dummy_mm_items()
......
...@@ -215,12 +215,11 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]): ...@@ -215,12 +215,11 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -343,14 +343,13 @@ class Lfm2VLDummyInputsBuilder(BaseDummyInputsBuilder[Lfm2VLProcessingInfo]): ...@@ -343,14 +343,13 @@ class Lfm2VLDummyInputsBuilder(BaseDummyInputsBuilder[Lfm2VLProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment