Unverified Commit 987506bc authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Refactor] Simplify dummy data generation (#35025)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent c645e9a2
......@@ -232,14 +232,13 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
return {
"image": self._get_dummy_images(
......
......@@ -165,8 +165,7 @@ class LlavaNextVideoDummyInputsBuilder(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_videos = mm_counts.get("video", 0)
......@@ -175,7 +174,7 @@ class LlavaNextVideoDummyInputsBuilder(
seq_len, mm_counts
)
video_overrides = mm_options.get("video") if mm_options else None
video_overrides = mm_options.get("video")
return {
"video": self._get_dummy_videos(
......
......@@ -276,8 +276,7 @@ class LlavaOnevisionDummyInputsBuilder(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0)
......@@ -287,8 +286,8 @@ class LlavaOnevisionDummyInputsBuilder(
seq_len, mm_counts
)
image_overrides = mm_options.get("image") if mm_options else None
video_overrides = mm_options.get("video") if mm_options else None
image_overrides = mm_options.get("image")
video_overrides = mm_options.get("video")
return {
"image": self._get_dummy_images(
......
......@@ -565,12 +565,11 @@ class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessing
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0)
audio_overrides = mm_options.get("audio") if mm_options else None
audio_overrides = mm_options.get("audio")
return {
"audio": self._get_dummy_audios(
......
......@@ -301,8 +301,7 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0)
audio_len = (
......@@ -310,11 +309,13 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn
* self.info.get_default_audio_sampling_rate()
)
audio_overrides = mm_options.get("audio") if mm_options else None
audio_overrides = mm_options.get("audio")
audio_mm_data = {
"audio": self._get_dummy_audios(
length=audio_len, num_audios=num_audios, overrides=audio_overrides
length=audio_len,
num_audios=num_audios,
overrides=audio_overrides,
)
}
......
......@@ -707,8 +707,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0)
......@@ -719,8 +718,8 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
seq_len, mm_counts
)
image_overrides = mm_options.get("image") if mm_options else None
video_overrides = mm_options.get("video") if mm_options else None
image_overrides = mm_options.get("image")
video_overrides = mm_options.get("video")
return {
"image": self._get_dummy_images(
......
......@@ -236,14 +236,13 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
return {
"image": self._get_dummy_images(
......
......@@ -707,14 +707,13 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
(target_width, target_height) = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
return {
"image": self._get_dummy_images(
......
......@@ -1274,13 +1274,12 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
target_width, target_height = self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
return {
"image": self._get_dummy_images(
......
......@@ -2082,8 +2082,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0)
......@@ -2094,7 +2093,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
if num_images > 0:
target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
dummy_images = self._get_dummy_images(
width=target_width,
......@@ -2110,7 +2109,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
seq_len, mm_counts
)
video_overrides = mm_options.get("video") if mm_options else None
video_overrides = mm_options.get("video")
if video_overrides:
assert isinstance(video_overrides, VideoDummyOptions)
......
......@@ -1388,8 +1388,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
processor = self.info.get_hf_processor()
......@@ -1404,7 +1403,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
max_num_tiles
)
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
return {
"image": self._get_dummy_images(
......@@ -1461,12 +1460,9 @@ class NanoNemotronVLDummyInputsBuilder(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
dummy_image = super().get_dummy_mm_data(
seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
)
dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options)
if self.info.supports_video:
config = self.info.get_hf_config()
image_size: int = config.force_image_size
......@@ -1474,7 +1470,7 @@ class NanoNemotronVLDummyInputsBuilder(
seq_len, mm_counts
)
num_videos = mm_counts.get("video", 0)
video_overrides = mm_options.get("video") if mm_options else None
video_overrides = mm_options.get("video")
dummy_video = {
"video": self._get_dummy_videos(
width=image_size,
......
......@@ -645,8 +645,7 @@ class NemotronParseDummyInputsBuilder(
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
......
......@@ -92,13 +92,12 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo])
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
target_width, target_height = self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
return {
"image": self._get_dummy_images(
......
......@@ -306,14 +306,13 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
mm_data = {
"image": self._get_dummy_images(
......
......@@ -287,8 +287,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
num_videos = mm_counts.get("video", 0)
......@@ -298,8 +297,8 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
seq_len, mm_counts
)
image_overrides = mm_options.get("image") if mm_options else None
video_overrides = mm_options.get("video") if mm_options else None
image_overrides = mm_options.get("image")
video_overrides = mm_options.get("video")
mm_data = {
"image": self._get_dummy_images(
......
......@@ -206,13 +206,12 @@ class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessing
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
max_image_size = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
return {
"image": self._get_dummy_images(
......
......@@ -131,8 +131,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
hf_config = self.info.get_hf_config()
vision_config = hf_config.vision_config
......@@ -140,7 +139,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo
num_images = mm_counts.get("image", 0)
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
return {
"image": self._get_dummy_images(
......
......@@ -376,14 +376,13 @@ class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
return {
"image": self._get_dummy_images(
......
......@@ -822,16 +822,15 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0)
num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
audio_overrides = mm_options.get("audio") if mm_options else None
image_overrides = mm_options.get("image")
audio_overrides = mm_options.get("audio")
mm_data = {
"image": self._get_dummy_images(
......
......@@ -249,14 +249,13 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size_with_most_features()
image_overrides = mm_options.get("image") if mm_options else None
image_overrides = mm_options.get("image")
return {
"image": self._get_dummy_images(
......@@ -271,8 +270,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Mapping[str, BaseDummyOptions] | None = None,
mm_processor_kwargs: Mapping[str, object] | None = None,
mm_options: Mapping[str, BaseDummyOptions],
) -> ProcessorInputs:
tokenizer = self.info.get_tokenizer()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment