Unverified Commit 4395c87a authored by Mick, committed by GitHub

refactor: unify names of the feature field of MultimodalDataItem (#8075)

parent c28ad199
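The refactor collapses the modality-specific tensor fields of MultimodalDataItem (pixel_values, pixel_values_videos, audio_features) into a single feature attribute, so model code can read one field regardless of modality. A minimal sketch of the idea, using a simplified stand-in for MultimodalDataItem (the real class carries more fields; the example values are illustrative):

```python
from dataclasses import dataclass
from enum import Enum, auto
from typing import Any, List, Optional


class Modality(Enum):
    IMAGE = auto()
    VIDEO = auto()
    AUDIO = auto()


@dataclass
class MultimodalDataItem:
    # A single generic payload replaces the modality-specific fields
    # pixel_values / pixel_values_videos / audio_features.
    modality: Modality
    feature: Any = None
    offsets: Optional[List[int]] = None


def get_image_feature(mm_input: List[MultimodalDataItem]) -> Any:
    # After the rename, callers read the same attribute for every modality.
    return mm_input[0].feature


# An image item and an audio item expose their tensors identically.
image_item = MultimodalDataItem(modality=Modality.IMAGE, feature=[[0.1, 0.2]])
audio_item = MultimodalDataItem(modality=Modality.AUDIO, feature=[[0.3]])
print(get_image_feature([image_item]), audio_item.feature)
```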
......@@ -237,7 +237,7 @@ class VILAForConditionalGeneration(nn.Module):
return cast(LogitsProcessorOutput, output)
def get_image_feature(self, mm_input: List[MultimodalDataItem]) -> Tensor:
-pixel_values = cast(Tensor, mm_input[0].pixel_values)
+pixel_values = cast(Tensor, mm_input[0].feature)
##### BEGIN COPY modeling_vila.py #####
......
......@@ -5,7 +5,6 @@ import multiprocessing as mp
import os
import re
from abc import ABC, abstractmethod
-from functools import lru_cache
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
......@@ -156,6 +155,10 @@ class BaseMultimodalProcessor(ABC):
# "precomputed_features" - handled specially as it can be any modality
}
+# name of the feature field
+# TODO: pass from processors
+self.FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features"]
def process_mm_data(
self, input_text, images=None, videos=None, audios=None, **kwargs
):
......@@ -524,6 +527,9 @@ class BaseMultimodalProcessor(ABC):
if modality not in items:
items[modality] = MultimodalDataItem(modality=modality)
+if attr_name in self.FEATURE_NAMES:
+    attr_name = "feature"
# Set attribute
setattr(items[modality], attr_name, value)
......
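Besides the renames, the one behavioral addition is the remapping above: when a processor output uses one of the known feature field names, the value is stored on the unified feature attribute. A hedged, stand-alone sketch of that mapping (set_mm_attr and _Item are illustrative names; in the commit the logic lives inside BaseMultimodalProcessor's item-collection code):

```python
# Feature names that should be folded onto the unified attribute.
FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features"]


def set_mm_attr(item: object, attr_name: str, value: object) -> None:
    # Modality-specific feature names land on "feature"; every other
    # attribute (offsets, image_sizes, ...) keeps its original name.
    if attr_name in FEATURE_NAMES:
        attr_name = "feature"
    setattr(item, attr_name, value)


class _Item:
    pass


item = _Item()
set_mm_attr(item, "pixel_values", [[1.0, 2.0]])
set_mm_attr(item, "offsets", [0])
print(item.feature, item.offsets)  # the pixel tensor is stored on .feature
```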
......@@ -26,7 +26,7 @@ class ClipImageProcessor(BaseMultimodalProcessor):
image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
image_inputs["mm_items"] = [
MultimodalDataItem(
-pixel_values=image_inputs["pixel_values"], modality=Modality.IMAGE
+feature=image_inputs["pixel_values"], modality=Modality.IMAGE
)
]
......
......@@ -68,7 +68,7 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
input_ids=input_ids, mm_token_id=self._processor.image_token_id
)
item = MultimodalDataItem(
-pixel_values=res["images"],
+feature=res["images"],
offsets=image_offsets,
modality=Modality.IMAGE,
image_emb_mask=images_seq_mask,
......
......@@ -223,7 +223,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
)
items = [
MultimodalDataItem(
-pixel_values=pixel_values,
+feature=pixel_values,
modality=Modality.IMAGE,
offsets=image_offsets,
)
......
......@@ -47,7 +47,7 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
return {
"mm_items": [
MultimodalDataItem(
-pixel_values=res["pixel_values"],
+feature=res["pixel_values"],
image_emb_mask=res["images_emb_mask"],
offsets=image_offsets,
modality=Modality.IMAGE,
......
......@@ -158,7 +158,7 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
return {
"mm_items": [
MultimodalDataItem(
-pixel_values=pixel_values,
+feature=pixel_values,
image_sizes=image_sizes,
modality=modality,
)
......
......@@ -114,7 +114,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
if len(pixel_values) != 0:
item = MultimodalDataItem(
-pixel_values=pixel_values,
+feature=pixel_values,
offsets=image_offsets,
tgt_size=tgt_sizes_flat,
modality=Modality.IMAGE,
......@@ -135,7 +135,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
else:
audio_offsets = None
item = MultimodalDataItem(
-audio_features=[res["audio_features"]],
+feature=[res["audio_features"]],
audio_feature_lens=res["audio_feature_lens"],
offsets=audio_offsets,
modality=Modality.AUDIO,
......
......@@ -24,7 +24,7 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
image_inputs["mm_items"] = [
MultimodalDataItem(
-pixel_values=image_inputs["pixel_values"],
+feature=image_inputs["pixel_values"],
aspect_ratio_id=image_inputs["aspect_ratio_ids"],
aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
modality=Modality.IMAGE,
......
......@@ -142,7 +142,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
# Add metadata for image processing
processor_output["mm_items"] = [
MultimodalDataItem(
-pixel_values=processor_output["pixel_values"],
+feature=processor_output["pixel_values"],
modality=Modality.IMAGE,
offsets=image_offsets,
)
......
......@@ -62,7 +62,7 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
items = [
MultimodalDataItem(
-pixel_values=res["input_image_embeds"],
+feature=res["input_image_embeds"],
image_sizes=res["image_sizes"],
image_emb_mask=res["image_attention_mask"],
offsets=image_offsets,
......
......@@ -103,7 +103,7 @@ class PixtralProcessor(BaseMultimodalProcessor):
)
mm_items = [
MultimodalDataItem(
-pixel_values=processor_output["pixel_values"],
+feature=processor_output["pixel_values"],
image_sizes=processor_output["image_sizes"],
modality=Modality.IMAGE,
offsets=image_offsets,
......
......@@ -245,7 +245,7 @@ class TestMiniCPMVLogits(VisionLLMLogitsBase):
MultimodalInputs(
mm_items=[
MultimodalDataItem(
-pixel_values=pixel_values_flat,
+feature=pixel_values_flat,
offsets=image_offsets,
tgt_size=tgt_sizes_flat,
modality=Modality.IMAGE,
......