"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "14b97549230ce4a40c0775e075a11c3891dc0446"
Unverified Commit 41d71ca4 authored by Mick's avatar Mick Committed by GitHub
Browse files

fix: fix obsolete qwen-audio processor arg (#9003)

parent 20cfc5a2
...@@ -208,7 +208,7 @@ class BaseMultimodalProcessor(ABC): ...@@ -208,7 +208,7 @@ class BaseMultimodalProcessor(ABC):
def process_mm_data( def process_mm_data(
self, input_text, images=None, videos=None, audios=None, **kwargs self, input_text, images=None, videos=None, audios=None, **kwargs
): ) -> dict:
""" """
process multimodal data with transformers AutoProcessor process multimodal data with transformers AutoProcessor
""" """
...@@ -217,10 +217,14 @@ class BaseMultimodalProcessor(ABC): ...@@ -217,10 +217,14 @@ class BaseMultimodalProcessor(ABC):
if videos: if videos:
kwargs["videos"] = videos kwargs["videos"] = videos
if audios: if audios:
kwargs["audios"] = audios if self.arch in {
if self.__class__.__name__ == "Gemma3nSGLangProcessor": "Gemma3nForConditionalGeneration",
"Qwen2AudioForConditionalGeneration",
}:
# Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107 # Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
kwargs["audio"] = audios kwargs["audio"] = audios
else:
kwargs["audios"] = audios
processor = self._processor processor = self._processor
if ( if (
...@@ -607,12 +611,6 @@ class BaseMultimodalProcessor(ABC): ...@@ -607,12 +611,6 @@ class BaseMultimodalProcessor(ABC):
all_collected_items: list[MultimodalDataItem] = [] all_collected_items: list[MultimodalDataItem] = []
input_ids = None input_ids = None
# Handle dict items (already processed)
for dict_item in dict_items:
all_collected_items.extend(
self.collect_mm_items_from_processor_output(dict_item)
)
# Handle raw items (need processing) # Handle raw items (need processing)
if raw_images or raw_audios or raw_videos: if raw_images or raw_audios or raw_videos:
collected_items, input_ids, ret = self._process_and_collect_mm_items( collected_items, input_ids, ret = self._process_and_collect_mm_items(
...@@ -622,10 +620,16 @@ class BaseMultimodalProcessor(ABC): ...@@ -622,10 +620,16 @@ class BaseMultimodalProcessor(ABC):
videos=raw_videos, videos=raw_videos,
**kwargs, **kwargs,
) )
all_collected_items.extend(collected_items) all_collected_items = collected_items
else: else:
ret = None ret = None
# Handle dict items (already processed)
for dict_item in dict_items:
all_collected_items.extend(
self.collect_mm_items_from_processor_output(dict_item)
)
# Fallback tokenization if no raw items were processed # Fallback tokenization if no raw items were processed
if input_ids is None: if input_ids is None:
input_ids = self._processor.tokenizer( input_ids = self._processor.tokenizer(
......
import re import re
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem from sglang.srt.managers.schedule_batch import Modality
from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
from sglang.srt.multimodal.processors.base_processor import ( from sglang.srt.multimodal.processors.base_processor import (
BaseMultimodalProcessor, BaseMultimodalProcessor,
...@@ -29,6 +29,8 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor): ...@@ -29,6 +29,8 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
audio_token_id=self.audio_token_id, audio_token_id=self.audio_token_id,
).build(_processor) ).build(_processor)
self.ATTR_NAME_TO_MODALITY.update({"feature_attention_mask": Modality.AUDIO})
async def process_mm_data_async( async def process_mm_data_async(
self, self,
audio_data, audio_data,
...@@ -54,7 +56,7 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor): ...@@ -54,7 +56,7 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
input_lengths = (input_lengths - 1) // 2 + 1 input_lengths = (input_lengths - 1) // 2 + 1
output_lengths = (input_lengths - 2) // 2 + 1 output_lengths = (input_lengths - 2) // 2 + 1
mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths mm_items[0].audio_feature_lens = output_lengths
return { return {
"mm_items": mm_items, "mm_items": mm_items,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment