Unverified commit 535de06c, authored by Muhammad Hashmi and committed by GitHub
Browse files

[Model] Add transcription support for Qwen3-Omni (#29828)


Signed-off-by: Muhammad Hashmi <mhashmi@berkeley.edu>
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: NickLucche <nlucches@redhat.com>
parent 4292c90a
......@@ -251,6 +251,7 @@ No extra registration is required beyond having your model class available via t
- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py). Make sure to have installed `mistral-common[audio]`.
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
- Qwen3-Omni multimodal with audio embeddings: [vllm/model_executor/models/qwen3_omni_moe_thinker.py](../../../vllm/model_executor/models/qwen3_omni_moe_thinker.py)

## Test with the API
......
......@@ -781,6 +781,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
| `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
| `Qwen3ASRForConditionalGeneration` | Qwen3-ASR | `Qwen/Qwen3-ASR-1.7B`, etc. | | ✅︎ |
| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, etc. | | ✅︎ |
| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |
| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | |
......
......@@ -24,7 +24,7 @@
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from functools import partial
from typing import Any
from typing import Any, Literal, cast
import numpy as np
import torch
......@@ -48,8 +48,9 @@ from transformers import __version__ as TRANSFORMERS_VERSION
# isort: on
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
from vllm.model_executor.layers.attention.mm_encoder_attention import (
......@@ -79,6 +80,7 @@ from vllm.multimodal.processing.processor import (
PromptUpdateDetails,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from .interfaces import (
......@@ -86,6 +88,7 @@ from .interfaces import (
SupportsMRoPE,
SupportsMultiModal,
SupportsPP,
SupportsTranscription,
)
from .qwen2_5_omni_thinker import (
Qwen2_5OmniAudioFeatureInputs,
......@@ -110,6 +113,29 @@ from .vision import get_vit_attn_backend
logger = init_logger(__name__)
# Speech input languages supported by Qwen3-Omni.
# From: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct
#
# Maps a language code (as passed in a transcription/translation request) to
# the English language name that is spliced into the instruction prompt.
# NOTE: despite the constant's name, not every key is a two-letter ISO 639-1
# code -- "yue" (Cantonese) is a three-letter code.
ISO639_1_SUPPORTED_LANGS = {
"en": "English",
"zh": "Chinese",
"ko": "Korean",
"ja": "Japanese",
"de": "German",
"ru": "Russian",
"it": "Italian",
"fr": "French",
"es": "Spanish",
"pt": "Portuguese",
"ms": "Malay",
"nl": "Dutch",
"id": "Indonesian",
"tr": "Turkish",
"vi": "Vietnamese",
"yue": "Cantonese",  # three-letter code; no two-letter equivalent is used here
"ar": "Arabic",
"ur": "Urdu",
}
def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
input_lengths_leave = input_lengths % 100
......@@ -1572,6 +1598,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
SupportsPP,
SupportsMRoPE,
Qwen3OmniMoeConditionalGenerationMixin,
SupportsTranscription,
):
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
......@@ -1593,6 +1620,8 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
],
}
supported_languages = ISO639_1_SUPPORTED_LANGS
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
if modality.startswith("image"):
......@@ -2085,6 +2114,77 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
total_tokens = num_video + audio_len
return np.concatenate(pos_ids_list, axis=1), total_tokens
@classmethod
def get_speech_to_text_config(
    cls, model_config: ModelConfig, task_type: str
) -> SpeechToTextConfig:
    """Derive the speech-to-text settings from the model's processor.

    Args:
        model_config: Model configuration used to look up the cached
            ``Qwen3OmniMoeProcessor``.
        task_type: Accepted for interface compatibility; this
            implementation does not consult it.

    Returns:
        A ``SpeechToTextConfig`` whose maximum clip length and sample rate
        are taken from the processor's feature extractor.
    """
    # Both audio parameters live on the feature extractor, so fetch it once.
    feature_extractor = cached_processor_from_config(
        model_config, processor_cls=Qwen3OmniMoeProcessor
    ).feature_extractor

    return SpeechToTextConfig(
        max_audio_clip_s=feature_extractor.chunk_length,
        sample_rate=feature_extractor.sampling_rate,
        # NOTE(review): presumably None turns off energy-based splitting of
        # long audio -- confirm against SpeechToTextConfig.
        min_energy_split_window_size=None,
    )
@classmethod
def get_generation_prompt(
    cls,
    audio: np.ndarray,
    stt_config: SpeechToTextConfig,
    model_config: ModelConfig,
    language: str | None,
    task_type: Literal["transcribe", "translate"],
    request_prompt: str,
    to_language: str | None,
) -> PromptType:
    """Build the chat-formatted prompt for a Qwen3-Omni speech request.

    The user turn consists of an audio placeholder followed by an
    instruction of one of these shapes:

    * ``Transcribe this audio[ into <language>].``
    * ``Translate this audio[ from <language>][ into <to_language>].``

    Language clauses are added only when the corresponding code is found
    in ``cls.supported_languages``. A non-empty ``request_prompt`` is
    appended after the instruction, separated by a single space.

    Args:
        audio: Audio samples to attach to the prompt.
        stt_config: Supplies the sample rate paired with ``audio``.
        model_config: Used to look up the cached processor whose chat
            template renders the prompt.
        language: Code of the spoken language, or ``None`` if unknown.
        task_type: ``"transcribe"`` or ``"translate"``.
        request_prompt: Extra user text appended to the instruction.
        to_language: Target language code for translation; ``None``
            falls back to ``"en"`` when translating.

    Returns:
        A prompt mapping with the rendered text under ``"prompt"`` and the
        ``(audio, sample_rate)`` pair under ``"multi_modal_data"``.
    """
    transcribing = task_type == "transcribe"
    translating = task_type == "translate"

    # Translation targets English unless the caller names another language.
    if translating and to_language is None:
        to_language = "en"

    # Unknown or missing codes resolve to "" and are simply left out below.
    source_name = cls.supported_languages.get(language, "")
    target_name = cls.supported_languages.get(to_language, "")

    pieces = ["Transcribe" if transcribing else "Translate", " this audio"]
    if transcribing:
        if source_name:
            pieces.append(f" into {source_name}")
    elif translating:
        if source_name:
            pieces.append(f" from {source_name}")
        if target_name:
            pieces.append(f" into {target_name}")
    pieces.append(".")
    if request_prompt:
        pieces.append(f" {request_prompt}")
    instruction = "".join(pieces)

    processor = cached_processor_from_config(
        model_config, processor_cls=Qwen3OmniMoeProcessor
    )

    # The audio placeholder goes first in the user turn, then the instruction.
    chat = [
        {
            "role": "user",
            "content": f"<|audio_start|><|audio_pad|><|audio_end|>{instruction}",
        }
    ]
    rendered = processor.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

    return cast(
        PromptType,
        {
            "multi_modal_data": {"audio": (audio, stt_config.sample_rate)},
            "prompt": rendered,
        },
    )
def get_mrope_input_positions(
self,
input_tokens: list[int],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment