Unverified Commit 4395c87a authored by Mick's avatar Mick Committed by GitHub
Browse files

refactor: unify names of the feature field of MultimodalDataItem (#8075)

parent c28ad199
...@@ -237,7 +237,7 @@ class VILAForConditionalGeneration(nn.Module): ...@@ -237,7 +237,7 @@ class VILAForConditionalGeneration(nn.Module):
return cast(LogitsProcessorOutput, output) return cast(LogitsProcessorOutput, output)
def get_image_feature(self, mm_input: List[MultimodalDataItem]) -> Tensor: def get_image_feature(self, mm_input: List[MultimodalDataItem]) -> Tensor:
pixel_values = cast(Tensor, mm_input[0].pixel_values) pixel_values = cast(Tensor, mm_input[0].feature)
##### BEGIN COPY modeling_vila.py ##### ##### BEGIN COPY modeling_vila.py #####
......
...@@ -5,7 +5,6 @@ import multiprocessing as mp ...@@ -5,7 +5,6 @@ import multiprocessing as mp
import os import os
import re import re
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from functools import lru_cache
from typing import Any, Dict, List, Optional, Tuple, Union from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np import numpy as np
...@@ -156,6 +155,10 @@ class BaseMultimodalProcessor(ABC): ...@@ -156,6 +155,10 @@ class BaseMultimodalProcessor(ABC):
# "precomputed_features" - handled specially as it can be any modality # "precomputed_features" - handled specially as it can be any modality
} }
# name of the feature field
# TODO: pass from processors
self.FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features"]
def process_mm_data( def process_mm_data(
self, input_text, images=None, videos=None, audios=None, **kwargs self, input_text, images=None, videos=None, audios=None, **kwargs
): ):
...@@ -524,6 +527,9 @@ class BaseMultimodalProcessor(ABC): ...@@ -524,6 +527,9 @@ class BaseMultimodalProcessor(ABC):
if modality not in items: if modality not in items:
items[modality] = MultimodalDataItem(modality=modality) items[modality] = MultimodalDataItem(modality=modality)
if attr_name in self.FEATURE_NAMES:
attr_name = "feature"
# Set attribute # Set attribute
setattr(items[modality], attr_name, value) setattr(items[modality], attr_name, value)
......
...@@ -26,7 +26,7 @@ class ClipImageProcessor(BaseMultimodalProcessor): ...@@ -26,7 +26,7 @@ class ClipImageProcessor(BaseMultimodalProcessor):
image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0] image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
image_inputs["mm_items"] = [ image_inputs["mm_items"] = [
MultimodalDataItem( MultimodalDataItem(
pixel_values=image_inputs["pixel_values"], modality=Modality.IMAGE feature=image_inputs["pixel_values"], modality=Modality.IMAGE
) )
] ]
......
...@@ -68,7 +68,7 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor): ...@@ -68,7 +68,7 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
input_ids=input_ids, mm_token_id=self._processor.image_token_id input_ids=input_ids, mm_token_id=self._processor.image_token_id
) )
item = MultimodalDataItem( item = MultimodalDataItem(
pixel_values=res["images"], feature=res["images"],
offsets=image_offsets, offsets=image_offsets,
modality=Modality.IMAGE, modality=Modality.IMAGE,
image_emb_mask=images_seq_mask, image_emb_mask=images_seq_mask,
......
...@@ -223,7 +223,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor): ...@@ -223,7 +223,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
) )
items = [ items = [
MultimodalDataItem( MultimodalDataItem(
pixel_values=pixel_values, feature=pixel_values,
modality=Modality.IMAGE, modality=Modality.IMAGE,
offsets=image_offsets, offsets=image_offsets,
) )
......
...@@ -47,7 +47,7 @@ class JanusProImageProcessor(BaseMultimodalProcessor): ...@@ -47,7 +47,7 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
return { return {
"mm_items": [ "mm_items": [
MultimodalDataItem( MultimodalDataItem(
pixel_values=res["pixel_values"], feature=res["pixel_values"],
image_emb_mask=res["images_emb_mask"], image_emb_mask=res["images_emb_mask"],
offsets=image_offsets, offsets=image_offsets,
modality=Modality.IMAGE, modality=Modality.IMAGE,
......
...@@ -158,7 +158,7 @@ class LlavaImageProcessor(BaseMultimodalProcessor): ...@@ -158,7 +158,7 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
return { return {
"mm_items": [ "mm_items": [
MultimodalDataItem( MultimodalDataItem(
pixel_values=pixel_values, feature=pixel_values,
image_sizes=image_sizes, image_sizes=image_sizes,
modality=modality, modality=modality,
) )
......
...@@ -114,7 +114,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor): ...@@ -114,7 +114,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
if len(pixel_values) != 0: if len(pixel_values) != 0:
item = MultimodalDataItem( item = MultimodalDataItem(
pixel_values=pixel_values, feature=pixel_values,
offsets=image_offsets, offsets=image_offsets,
tgt_size=tgt_sizes_flat, tgt_size=tgt_sizes_flat,
modality=Modality.IMAGE, modality=Modality.IMAGE,
...@@ -135,7 +135,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor): ...@@ -135,7 +135,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
else: else:
audio_offsets = None audio_offsets = None
item = MultimodalDataItem( item = MultimodalDataItem(
audio_features=[res["audio_features"]], feature=[res["audio_features"]],
audio_feature_lens=res["audio_feature_lens"], audio_feature_lens=res["audio_feature_lens"],
offsets=audio_offsets, offsets=audio_offsets,
modality=Modality.AUDIO, modality=Modality.AUDIO,
......
...@@ -24,7 +24,7 @@ class MllamaImageProcessor(BaseMultimodalProcessor): ...@@ -24,7 +24,7 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0] image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
image_inputs["mm_items"] = [ image_inputs["mm_items"] = [
MultimodalDataItem( MultimodalDataItem(
pixel_values=image_inputs["pixel_values"], feature=image_inputs["pixel_values"],
aspect_ratio_id=image_inputs["aspect_ratio_ids"], aspect_ratio_id=image_inputs["aspect_ratio_ids"],
aspect_ratio_mask=image_inputs["aspect_ratio_mask"], aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
modality=Modality.IMAGE, modality=Modality.IMAGE,
......
...@@ -142,7 +142,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor): ...@@ -142,7 +142,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
# Add metadata for image processing # Add metadata for image processing
processor_output["mm_items"] = [ processor_output["mm_items"] = [
MultimodalDataItem( MultimodalDataItem(
pixel_values=processor_output["pixel_values"], feature=processor_output["pixel_values"],
modality=Modality.IMAGE, modality=Modality.IMAGE,
offsets=image_offsets, offsets=image_offsets,
) )
......
...@@ -62,7 +62,7 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor): ...@@ -62,7 +62,7 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
items = [ items = [
MultimodalDataItem( MultimodalDataItem(
pixel_values=res["input_image_embeds"], feature=res["input_image_embeds"],
image_sizes=res["image_sizes"], image_sizes=res["image_sizes"],
image_emb_mask=res["image_attention_mask"], image_emb_mask=res["image_attention_mask"],
offsets=image_offsets, offsets=image_offsets,
......
...@@ -103,7 +103,7 @@ class PixtralProcessor(BaseMultimodalProcessor): ...@@ -103,7 +103,7 @@ class PixtralProcessor(BaseMultimodalProcessor):
) )
mm_items = [ mm_items = [
MultimodalDataItem( MultimodalDataItem(
pixel_values=processor_output["pixel_values"], feature=processor_output["pixel_values"],
image_sizes=processor_output["image_sizes"], image_sizes=processor_output["image_sizes"],
modality=Modality.IMAGE, modality=Modality.IMAGE,
offsets=image_offsets, offsets=image_offsets,
......
...@@ -245,7 +245,7 @@ class TestMiniCPMVLogits(VisionLLMLogitsBase): ...@@ -245,7 +245,7 @@ class TestMiniCPMVLogits(VisionLLMLogitsBase):
MultimodalInputs( MultimodalInputs(
mm_items=[ mm_items=[
MultimodalDataItem( MultimodalDataItem(
pixel_values=pixel_values_flat, feature=pixel_values_flat,
offsets=image_offsets, offsets=image_offsets,
tgt_size=tgt_sizes_flat, tgt_size=tgt_sizes_flat,
modality=Modality.IMAGE, modality=Modality.IMAGE,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment