Unverified commit 535de06c, authored by Muhammad Hashmi, committed by GitHub
Browse files

[Model] Add transcription support for Qwen3-Omni (#29828)


Signed-off-by: Muhammad Hashmi <mhashmi@berkeley.edu>
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: NickLucche <nlucches@redhat.com>
parent 4292c90a
...@@ -251,6 +251,7 @@ No extra registration is required beyond having your model class available via t ...@@ -251,6 +251,7 @@ No extra registration is required beyond having your model class available via t
- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py) - Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`. - Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`.
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py) - Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
- Qwen3-Omni multimodal with audio embeddings: [vllm/model_executor/models/qwen3_omni_moe_thinker.py](../../../vllm/model_executor/models/qwen3_omni_moe_thinker.py)
## Test with the API ## Test with the API
......
...@@ -781,6 +781,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition. ...@@ -781,6 +781,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
| `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ | | `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
| `Qwen3ASRForConditionalGeneration` | Qwen3-ASR | `Qwen/Qwen3-ASR-1.7B`, etc. | | ✅︎ | | `Qwen3ASRForConditionalGeneration` | Qwen3-ASR | `Qwen/Qwen3-ASR-1.7B`, etc. | | ✅︎ |
| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, etc. | | ✅︎ |
| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | | `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |
| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | |
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from functools import partial from functools import partial
from typing import Any from typing import Any, Literal, cast
import numpy as np import numpy as np
import torch import torch
...@@ -48,8 +48,9 @@ from transformers import __version__ as TRANSFORMERS_VERSION ...@@ -48,8 +48,9 @@ from transformers import __version__ as TRANSFORMERS_VERSION
# isort: on # isort: on
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.inputs.data import PromptType
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
from vllm.model_executor.layers.attention.mm_encoder_attention import ( from vllm.model_executor.layers.attention.mm_encoder_attention import (
...@@ -79,6 +80,7 @@ from vllm.multimodal.processing.processor import ( ...@@ -79,6 +80,7 @@ from vllm.multimodal.processing.processor import (
PromptUpdateDetails, PromptUpdateDetails,
) )
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.registry import AttentionBackendEnum
from .interfaces import ( from .interfaces import (
...@@ -86,6 +88,7 @@ from .interfaces import ( ...@@ -86,6 +88,7 @@ from .interfaces import (
SupportsMRoPE, SupportsMRoPE,
SupportsMultiModal, SupportsMultiModal,
SupportsPP, SupportsPP,
SupportsTranscription,
) )
from .qwen2_5_omni_thinker import ( from .qwen2_5_omni_thinker import (
Qwen2_5OmniAudioFeatureInputs, Qwen2_5OmniAudioFeatureInputs,
...@@ -110,6 +113,29 @@ from .vision import get_vit_attn_backend ...@@ -110,6 +113,29 @@ from .vision import get_vit_attn_backend
logger = init_logger(__name__) logger = init_logger(__name__)
# Speech input languages supported by Qwen3-Omni
# From: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct
#
# Maps a language code to the English language name. The names (not the
# codes) are what get interpolated into the transcription/translation
# instruction text, so the values must be human-readable language names.
# NOTE(review): despite the constant's name, "yue" (Cantonese) is a
# three-letter ISO 639-3 code; ISO 639-1 defines two-letter codes only.
ISO639_1_SUPPORTED_LANGS = {
"en": "English",
"zh": "Chinese",
"ko": "Korean",
"ja": "Japanese",
"de": "German",
"ru": "Russian",
"it": "Italian",
"fr": "French",
"es": "Spanish",
"pt": "Portuguese",
"ms": "Malay",
"nl": "Dutch",
"id": "Indonesian",
"tr": "Turkish",
"vi": "Vietnamese",
# Not an ISO 639-1 code (see note above).
"yue": "Cantonese",
"ar": "Arabic",
"ur": "Urdu",
}
def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
input_lengths_leave = input_lengths % 100 input_lengths_leave = input_lengths % 100
...@@ -1572,6 +1598,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( ...@@ -1572,6 +1598,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
SupportsPP, SupportsPP,
SupportsMRoPE, SupportsMRoPE,
Qwen3OmniMoeConditionalGenerationMixin, Qwen3OmniMoeConditionalGenerationMixin,
SupportsTranscription,
): ):
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={ orig_to_new_prefix={
...@@ -1593,6 +1620,8 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( ...@@ -1593,6 +1620,8 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
], ],
} }
supported_languages = ISO639_1_SUPPORTED_LANGS
@classmethod @classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None: def get_placeholder_str(cls, modality: str, i: int) -> str | None:
if modality.startswith("image"): if modality.startswith("image"):
...@@ -2085,6 +2114,77 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( ...@@ -2085,6 +2114,77 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
total_tokens = num_video + audio_len total_tokens = num_video + audio_len
return np.concatenate(pos_ids_list, axis=1), total_tokens return np.concatenate(pos_ids_list, axis=1), total_tokens
@classmethod
def get_speech_to_text_config(
    cls, model_config: ModelConfig, task_type: str
) -> SpeechToTextConfig:
    """Build the speech-to-text configuration for this model.

    The maximum clip length and the sample rate are taken from the feature
    extractor of the cached ``Qwen3OmniMoeProcessor`` for ``model_config``.

    Args:
        model_config: Model configuration used to look up the processor.
        task_type: Accepted for interface compatibility; not consulted here.

    Returns:
        A ``SpeechToTextConfig`` whose ``min_energy_split_window_size`` is
        explicitly set to ``None``.
    """
    feature_extractor = cached_processor_from_config(
        model_config, processor_cls=Qwen3OmniMoeProcessor
    ).feature_extractor

    chunk_length = feature_extractor.chunk_length
    sampling_rate = feature_extractor.sampling_rate

    # NOTE(review): None presumably turns off energy-based splitting of
    # long audio - confirm against SpeechToTextConfig.
    return SpeechToTextConfig(
        max_audio_clip_s=chunk_length,
        sample_rate=sampling_rate,
        min_energy_split_window_size=None,
    )
@classmethod
def get_generation_prompt(
    cls,
    audio: np.ndarray,
    stt_config: SpeechToTextConfig,
    model_config: ModelConfig,
    language: str | None,
    task_type: Literal["transcribe", "translate"],
    request_prompt: str,
    to_language: str | None,
) -> PromptType:
    """Build the chat-templated prompt for a Qwen3-Omni speech request.

    The instruction text has one of these shapes (bracketed parts are only
    emitted when the language code is found in ``cls.supported_languages``):

    * ``Transcribe this audio[ into <language>].``
    * ``Translate this audio[ from <language>][ into <to_language>].``

    A non-empty ``request_prompt`` is appended after the final period,
    separated by a single space.

    Args:
        audio: Audio samples, paired with ``stt_config.sample_rate`` in the
            returned multimodal data.
        stt_config: Supplies the sample rate attached to the audio.
        model_config: Used to look up the cached processor whose chat
            template renders the prompt.
        language: Source-language code, or ``None``.
        task_type: ``"transcribe"`` or ``"translate"``.
        request_prompt: Extra user text appended to the instruction.
        to_language: Target-language code; defaults to ``"en"`` when
            translating and ``None`` is given.

    Returns:
        A dict with ``"multi_modal_data"`` and ``"prompt"`` entries, cast
        to ``PromptType``.
    """
    is_transcribe = task_type == "transcribe"
    is_translate = task_type == "translate"

    # Translation without an explicit target falls back to English.
    if is_translate and to_language is None:
        to_language = "en"

    # Unknown (or missing) codes resolve to "" and are left out below.
    source_name = cls.supported_languages.get(language, "")
    target_name = cls.supported_languages.get(to_language, "")

    # Collect the instruction as fragments and join once at the end.
    fragments = ["Transcribe" if is_transcribe else "Translate", " this audio"]
    if is_transcribe:
        if source_name:
            fragments.append(f" into {source_name}")
    elif is_translate:
        if source_name:
            fragments.append(f" from {source_name}")
        if target_name:
            fragments.append(f" into {target_name}")
    fragments.append(".")
    if request_prompt:
        fragments.append(f" {request_prompt}")
    instruction = "".join(fragments)

    hf_processor = cached_processor_from_config(
        model_config, processor_cls=Qwen3OmniMoeProcessor
    )

    # Audio placeholder format: <|audio_start|><|audio_pad|><|audio_end|>
    audio_placeholder = "<|audio_start|><|audio_pad|><|audio_end|>"
    chat = [{"role": "user", "content": audio_placeholder + instruction}]
    rendered = hf_processor.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

    return cast(
        PromptType,
        {
            "multi_modal_data": {"audio": (audio, stt_config.sample_rate)},
            "prompt": rendered,
        },
    )
def get_mrope_input_positions( def get_mrope_input_positions(
self, self,
input_tokens: list[int], input_tokens: list[int],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment