Unverified commit 987506bc, authored by Cyrus Leung, committed by GitHub
Browse files

[Refactor] Simplify dummy data generation (#35025)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent c645e9a2
...@@ -232,14 +232,13 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -232,14 +232,13 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -165,8 +165,7 @@ class LlavaNextVideoDummyInputsBuilder( ...@@ -165,8 +165,7 @@ class LlavaNextVideoDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -175,7 +174,7 @@ class LlavaNextVideoDummyInputsBuilder( ...@@ -175,7 +174,7 @@ class LlavaNextVideoDummyInputsBuilder(
seq_len, mm_counts seq_len, mm_counts
) )
video_overrides = mm_options.get("video") if mm_options else None video_overrides = mm_options.get("video")
return { return {
"video": self._get_dummy_videos( "video": self._get_dummy_videos(
......
...@@ -276,8 +276,7 @@ class LlavaOnevisionDummyInputsBuilder( ...@@ -276,8 +276,7 @@ class LlavaOnevisionDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -287,8 +286,8 @@ class LlavaOnevisionDummyInputsBuilder( ...@@ -287,8 +286,8 @@ class LlavaOnevisionDummyInputsBuilder(
seq_len, mm_counts seq_len, mm_counts
) )
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
video_overrides = mm_options.get("video") if mm_options else None video_overrides = mm_options.get("video")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -565,12 +565,11 @@ class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessing ...@@ -565,12 +565,11 @@ class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessing
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_overrides = mm_options.get("audio") if mm_options else None audio_overrides = mm_options.get("audio")
return { return {
"audio": self._get_dummy_audios( "audio": self._get_dummy_audios(
......
...@@ -301,8 +301,7 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn ...@@ -301,8 +301,7 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_len = ( audio_len = (
...@@ -310,11 +309,13 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn ...@@ -310,11 +309,13 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn
* self.info.get_default_audio_sampling_rate() * self.info.get_default_audio_sampling_rate()
) )
audio_overrides = mm_options.get("audio") if mm_options else None audio_overrides = mm_options.get("audio")
audio_mm_data = { audio_mm_data = {
"audio": self._get_dummy_audios( "audio": self._get_dummy_audios(
length=audio_len, num_audios=num_audios, overrides=audio_overrides length=audio_len,
num_audios=num_audios,
overrides=audio_overrides,
) )
} }
......
...@@ -707,8 +707,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -707,8 +707,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -719,8 +718,8 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -719,8 +718,8 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
seq_len, mm_counts seq_len, mm_counts
) )
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
video_overrides = mm_options.get("video") if mm_options else None video_overrides = mm_options.get("video")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -236,14 +236,13 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -236,14 +236,13 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -707,14 +707,13 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]): ...@@ -707,14 +707,13 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
(target_width, target_height) = self.info.get_image_size_with_most_features() (target_width, target_height) = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -1274,13 +1274,12 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]): ...@@ -1274,13 +1274,12 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -2082,8 +2082,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]): ...@@ -2082,8 +2082,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -2094,7 +2093,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]): ...@@ -2094,7 +2093,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
if num_images > 0: if num_images > 0:
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
dummy_images = self._get_dummy_images( dummy_images = self._get_dummy_images(
width=target_width, width=target_width,
...@@ -2110,7 +2109,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]): ...@@ -2110,7 +2109,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
seq_len, mm_counts seq_len, mm_counts
) )
video_overrides = mm_options.get("video") if mm_options else None video_overrides = mm_options.get("video")
if video_overrides: if video_overrides:
assert isinstance(video_overrides, VideoDummyOptions) assert isinstance(video_overrides, VideoDummyOptions)
......
...@@ -1388,8 +1388,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -1388,8 +1388,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
processor = self.info.get_hf_processor() processor = self.info.get_hf_processor()
...@@ -1404,7 +1403,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): ...@@ -1404,7 +1403,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
max_num_tiles max_num_tiles
) )
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
...@@ -1461,12 +1460,9 @@ class NanoNemotronVLDummyInputsBuilder( ...@@ -1461,12 +1460,9 @@ class NanoNemotronVLDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
dummy_image = super().get_dummy_mm_data( dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options)
seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
)
if self.info.supports_video: if self.info.supports_video:
config = self.info.get_hf_config() config = self.info.get_hf_config()
image_size: int = config.force_image_size image_size: int = config.force_image_size
...@@ -1474,7 +1470,7 @@ class NanoNemotronVLDummyInputsBuilder( ...@@ -1474,7 +1470,7 @@ class NanoNemotronVLDummyInputsBuilder(
seq_len, mm_counts seq_len, mm_counts
) )
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
video_overrides = mm_options.get("video") if mm_options else None video_overrides = mm_options.get("video")
dummy_video = { dummy_video = {
"video": self._get_dummy_videos( "video": self._get_dummy_videos(
width=image_size, width=image_size,
......
...@@ -645,8 +645,7 @@ class NemotronParseDummyInputsBuilder( ...@@ -645,8 +645,7 @@ class NemotronParseDummyInputsBuilder(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
......
...@@ -92,13 +92,12 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo]) ...@@ -92,13 +92,12 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo])
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -306,14 +306,13 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]): ...@@ -306,14 +306,13 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
mm_data = { mm_data = {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -287,8 +287,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]): ...@@ -287,8 +287,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0) num_videos = mm_counts.get("video", 0)
...@@ -298,8 +297,8 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]): ...@@ -298,8 +297,8 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
seq_len, mm_counts seq_len, mm_counts
) )
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
video_overrides = mm_options.get("video") if mm_options else None video_overrides = mm_options.get("video")
mm_data = { mm_data = {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -206,13 +206,12 @@ class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessing ...@@ -206,13 +206,12 @@ class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessing
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
max_image_size = self.info.get_image_size_with_most_features() max_image_size = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -131,8 +131,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo ...@@ -131,8 +131,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
vision_config = hf_config.vision_config vision_config = hf_config.vision_config
...@@ -140,7 +139,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo ...@@ -140,7 +139,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -376,14 +376,13 @@ class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]): ...@@ -376,14 +376,13 @@ class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -822,16 +822,15 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]): ...@@ -822,16 +822,15 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
audio_overrides = mm_options.get("audio") if mm_options else None audio_overrides = mm_options.get("audio")
mm_data = { mm_data = {
"image": self._get_dummy_images( "image": self._get_dummy_images(
......
...@@ -249,14 +249,13 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]): ...@@ -249,14 +249,13 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features() target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None image_overrides = mm_options.get("image")
return { return {
"image": self._get_dummy_images( "image": self._get_dummy_images(
...@@ -271,8 +270,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]): ...@@ -271,8 +270,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None, mm_options: Mapping[str, BaseDummyOptions],
mm_processor_kwargs: Mapping[str, object] | None = None,
) -> ProcessorInputs: ) -> ProcessorInputs:
tokenizer = self.info.get_tokenizer() tokenizer = self.info.get_tokenizer()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment