Unverified commit 41d71ca4 authored by Mick, committed by GitHub
Browse files

fix: fix obsolete qwen-audio processor arg (#9003)

parent 20cfc5a2
......@@ -208,7 +208,7 @@ class BaseMultimodalProcessor(ABC):
def process_mm_data(
self, input_text, images=None, videos=None, audios=None, **kwargs
):
) -> dict:
"""
process multimodal data with transformers AutoProcessor
"""
......@@ -217,10 +217,14 @@ class BaseMultimodalProcessor(ABC):
if videos:
kwargs["videos"] = videos
if audios:
kwargs["audios"] = audios
if self.__class__.__name__ == "Gemma3nSGLangProcessor":
if self.arch in {
"Gemma3nForConditionalGeneration",
"Qwen2AudioForConditionalGeneration",
}:
# Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
kwargs["audio"] = audios
else:
kwargs["audios"] = audios
processor = self._processor
if (
......@@ -607,12 +611,6 @@ class BaseMultimodalProcessor(ABC):
all_collected_items: list[MultimodalDataItem] = []
input_ids = None
# Handle dict items (already processed)
for dict_item in dict_items:
all_collected_items.extend(
self.collect_mm_items_from_processor_output(dict_item)
)
# Handle raw items (need processing)
if raw_images or raw_audios or raw_videos:
collected_items, input_ids, ret = self._process_and_collect_mm_items(
......@@ -622,10 +620,16 @@ class BaseMultimodalProcessor(ABC):
videos=raw_videos,
**kwargs,
)
all_collected_items.extend(collected_items)
all_collected_items = collected_items
else:
ret = None
# Handle dict items (already processed)
for dict_item in dict_items:
all_collected_items.extend(
self.collect_mm_items_from_processor_output(dict_item)
)
# Fallback tokenization if no raw items were processed
if input_ids is None:
input_ids = self._processor.tokenizer(
......
import re
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
from sglang.srt.managers.schedule_batch import Modality
from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor,
......@@ -29,6 +29,8 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
audio_token_id=self.audio_token_id,
).build(_processor)
self.ATTR_NAME_TO_MODALITY.update({"feature_attention_mask": Modality.AUDIO})
async def process_mm_data_async(
self,
audio_data,
......@@ -54,7 +56,7 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
input_lengths = (input_lengths - 1) // 2 + 1
output_lengths = (input_lengths - 2) // 2 + 1
mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths
mm_items[0].audio_feature_lens = output_lengths
return {
"mm_items": mm_items,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment