添加qwen3-omni支持

f39afa4a · cx · a9c37628 · f39afa4a · f39afa4a · f39afa4a
Commit f39afa4a authored Oct 27, 2025 by cx
8 changed files
--- a/vllm/model_executor/layers/rotary_embedding/mrope.py
+++ b/vllm/model_executor/layers/rotary_embedding/mrope.py
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterable, Mapping, MutableSequence
+from collections.abc import Iterable, Mapping, MutableSequence, Callable
 from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
                    Union, overload, runtime_checkable)
@@ -20,7 +20,7 @@ from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
 from vllm.utils import supports_kw
-from .interfaces_base import is_pooling_model
+from .interfaces_base import is_pooling_model, VllmModel
 if TYPE_CHECKING:
    from vllm.config import VllmConfig
@@ -81,10 +81,9 @@ class SupportsMultiModal(Protocol):
        """
        ...
-    def get_multimodal_embeddings(self,
+    def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings:
-                                  **kwargs: object) -> MultiModalEmbeddings:
        """
-        Returns multimodal embeddings generated from multimodal kwargs 
+        Returns multimodal embeddings generated from multimodal kwargs
        to be merged with text embeddings.
        Note:
@@ -94,11 +93,11 @@ class SupportsMultiModal(Protocol):
        """
        ...
-    def get_language_model(self) -> torch.nn.Module:
+    def get_language_model(self) -> VllmModel:
        """
        Returns the underlying language model used for text generation.
-        This is typically the `torch.nn.Module` instance responsible for 
+        This is typically the `torch.nn.Module` instance responsible for
        processing the merged multimodal embeddings and producing hidden states
        Returns:
@@ -106,19 +105,83 @@ class SupportsMultiModal(Protocol):
        """
        ...
+    @overload
+    def get_input_embeddings(self, input_ids: Tensor) -> Tensor: ...
+    @overload
+    def get_input_embeddings(
+        self,
+        input_ids: Tensor,
+        multimodal_embeddings: MultiModalEmbeddings,
+        *,
+        is_multimodal: torch.Tensor,
+        handle_oov_mm_token: bool = False,
+    ) -> Tensor: ...
+    def _get_text_embeddings(
+        self,
+        input_ids: Tensor,
+        get_input_embeddings: Callable[[Tensor], Tensor],
+        *,
+        is_multimodal: Optional[Tensor],
+        handle_oov_mm_token: bool,
+    ) -> Tensor:
+        if handle_oov_mm_token and is_multimodal is not None:
+            is_text = ~is_multimodal
+            text_embeds = get_input_embeddings(input_ids[is_text])
+            return torch.empty(
+                (input_ids.shape[0], text_embeds.shape[1]),
+                dtype=text_embeds.dtype,
+                device=text_embeds.device,
+            ).masked_scatter_(is_text.unsqueeze_(-1), text_embeds)
+        return get_input_embeddings(input_ids)
    def get_input_embeddings(
        self,
        input_ids: Tensor,
        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        *,
+        is_multimodal: Optional[Tensor] = None,
+        handle_oov_mm_token: bool = False,
    ) -> Tensor:
        """
-        Returns the input embeddings merged from the text embeddings from 
+        Apply token embeddings to `input_ids`.
-        input_ids and the multimodal embeddings generated from multimodal 
-        kwargs.
+        If `multimodal_embeddings` is passed, scatter them into
+        `input_ids` according to the mask `is_multimodal`.
+        In case the multi-modal token IDs exceed the vocabulary size of
+        the language model, you can set `handle_oov_mm_token=False`
+        to avoid calling the language model's `get_input_embeddings` method
+        on those tokens. Note however that doing so increases memory usage
+        as an additional buffer is needed to hold the input embeddings.
        """
-        ...
+        from .utils import _merge_multimodal_embeddings
+        inputs_embeds = self._get_text_embeddings(
+            input_ids,
+            self.get_language_model().get_input_embeddings,
+            is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
+        )
+        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+            return inputs_embeds
+        if is_multimodal is None:
+            raise ValueError(
+                "`get_input_embeddings` now requires `is_multimodal` arg, "
+                "please update your model runner according to "
+                "https://github.com/vllm-project/vllm/pull/16229."
+            )
+        return _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds,
+            multimodal_embeddings=multimodal_embeddings,
+            is_multimodal=is_multimodal,
+        )
 @runtime_checkable
 class SupportsMultiModalPruning(Protocol):
    """The interface required for models that support returning both input

--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -269,6 +269,7 @@ _MULTIMODAL_MODELS = {
    "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"),  # noqa: E501
    "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
    "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
+    "Qwen3OmniMoeForConditionalGeneration": ("qwen3_omni_moe_thinker","Qwen3OmniMoeThinkerForConditionalGeneration"),
    "Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"),  # noqa: E501
    "Qwen3VLMoeForConditionalGeneration": ("qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"),  # noqa: E501
    "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),

--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -10,6 +10,7 @@ import torch
 import torch.nn as nn
 from torch.func import functional_call
 from transformers import PretrainedConfig
+from typing_extensions import deprecated
 import vllm.envs as envs
 from vllm.config import VllmConfig
@@ -391,92 +392,79 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str:
    return " + ".join(
        _embedding_count_expression(inner) for inner in embeddings)
+def split_list_into_ranges(lst: torch.Tensor, interval: int) -> list[list[int]]:
+    ranges: list[list[int]] = [[] for _ in range((max(lst) // interval) + 1)]
+    for num in lst:
+        index = num // interval
+        ranges[index].append(num)
+    return ranges
 def _merge_multimodal_embeddings(
    inputs_embeds: torch.Tensor,
-    is_multimodal: torch.Tensor,
    multimodal_embeddings: NestedTensors,
+    is_multimodal: torch.Tensor,
 ) -> torch.Tensor:
    """
-    Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
+    Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
-    positions in ``inputs_embeds`` corresponding to placeholder tokens in
+    positions in `inputs_embeds` corresponding to placeholder tokens in
-    ``input_ids``.
+    `input_ids`.
    Note:
-        This updates ``inputs_embeds`` in place.
+        This updates `inputs_embeds` in place.
    """
-    flattened = _flatten_embeddings(multimodal_embeddings)
+    if len(multimodal_embeddings) == 0:
+        return inputs_embeds
+    mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
+    input_dtype = inputs_embeds.dtype
    try:
-        # This is equivalent to: inputs_embeds[is_multimodal] = flattened.
+        # For debugging
-        inputs_embeds.masked_scatter_(is_multimodal.unsqueeze(-1),
+        # inputs_embeds[is_multimodal] = mm_embeds_flat.to(dtype=input_dtype)
-                                      flattened.to(dtype=inputs_embeds.dtype))
+        # NOTE: This can avoid D2H sync (#22105), but fails to
+        # raise an error if is_multimodal.sum() < len(mm_embeds_flat)
+        inputs_embeds.masked_scatter_(
+            is_multimodal.unsqueeze(-1), mm_embeds_flat.to(dtype=input_dtype)
+        )
    except RuntimeError as e:
+        num_actual_tokens = len(mm_embeds_flat)
        num_expected_tokens = is_multimodal.sum().item()
-        assert isinstance(num_expected_tokens, int)
-        if flattened.shape[0] != num_expected_tokens:
+        if num_actual_tokens != num_expected_tokens:
            expr = _embedding_count_expression(multimodal_embeddings)
            raise ValueError(
-                f"Attempted to assign {expr} = {flattened.shape[0]} "
+                f"Attempted to assign {expr} = {num_actual_tokens} "
                f"multimodal tokens to {num_expected_tokens} placeholders"
            ) from e
-        else:
-            raise ValueError("Error during masked scatter operation") from e
-    return inputs_embeds
-def embed_multimodal(
-    input_ids: torch.Tensor,
-    multimodal_token_id: int,
-    get_text_embeds: Callable[[torch.Tensor], torch.Tensor],
-    multimodal_embeds: NestedTensors,
-) -> torch.Tensor:
-    """
-    Embed token IDs and multimodal inputs and combine their embeddings.
-    ``multimodal_token_id`` is used to determine whether a token ID should
-    be embedded using ``get_text_embeds`` or ``get_multimodal_embeds``.
-    Compared to ``merge_multimodal_embeddings`, this avoids running
-    ``get_text_embeds`` on ``input_ids[input_ids == multimodal_token_id]``
-    which causes issues when the placeholder token ID exceeds the
-    vocabulary size of the language model.
-    """
-    is_multimodal = input_ids == multimodal_token_id
-    is_text = ~is_multimodal
-    text_embeds = get_text_embeds(input_ids[is_text])
-    merged_embeds = torch.empty(
-        (input_ids.shape[0], text_embeds.shape[1]),
-        dtype=text_embeds.dtype,
-        device=text_embeds.device,
-    )
-    merged_embeds[is_text] = text_embeds
+        raise ValueError("Error during masked scatter operation") from e
-    return _merge_multimodal_embeddings(
+    return inputs_embeds
-        merged_embeds,
-        is_multimodal,
-        multimodal_embeds,
-    )
+@deprecated(
+    "`merge_multimodal_embeddings` has been replaced with "
+    "`SupportsMultiModal.get_input_embeddings` and will be "
+    "removed in v0.12."
+)
 def merge_multimodal_embeddings(
    input_ids: torch.Tensor,
    inputs_embeds: torch.Tensor,
    multimodal_embeddings: NestedTensors,
-    placeholder_token_id: Union[int, list[int]],
+    placeholder_token_id: int | list[int],
 ) -> torch.Tensor:
    """
-    Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
+    Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
-    positions in ``inputs_embeds`` corresponding to placeholder tokens in
+    positions in `inputs_embeds` corresponding to placeholder tokens in
-    ``input_ids``.
+    `input_ids`.
-    ``placeholder_token_id`` can be a list of token ids (e.g, token ids
+    `placeholder_token_id` can be a list of token ids (e.g, token ids
    of img_start, img_break, and img_end tokens) when needed: This means
-    the order of these tokens in the ``input_ids`` MUST MATCH the order of
+    the order of these tokens in the `input_ids` MUST MATCH the order of
-    their embeddings in ``multimodal_embeddings`` since we need to
+    their embeddings in `multimodal_embeddings` since we need to
    slice-merge instead of individually scattering.
    For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where
@@ -491,26 +479,32 @@ def merge_multimodal_embeddings(
    input_ids for a correct embedding merge.
    Note:
-        This updates ``inputs_embeds`` in place.
+        This updates `inputs_embeds` in place.
    """
    if isinstance(placeholder_token_id, list):
-        placeholder_token_id = torch.tensor(
+        is_multimodal = isin_list(input_ids, placeholder_token_id)
-            placeholder_token_id,
+    else:
-            pin_memory=is_pin_memory_available()).to(device=input_ids.device,
+        is_multimodal = input_ids == placeholder_token_id
-                                                     non_blocking=True)
-        return _merge_multimodal_embeddings(
-            inputs_embeds,
-            torch.isin(input_ids, placeholder_token_id),
-            multimodal_embeddings,
-        )
    return _merge_multimodal_embeddings(
        inputs_embeds,
-        (input_ids == placeholder_token_id),
+        multimodal_embeddings=multimodal_embeddings,
-        multimodal_embeddings,
+        is_multimodal=is_multimodal,
    )
+def isin_list(
+    elements: torch.Tensor,
+    test_elements_list: list[int],
+) -> torch.Tensor:
+    test_elements = torch.tensor(
+        test_elements_list,
+        pin_memory=is_pin_memory_available(),
+    ).to(device=elements.device, non_blocking=True)
+    return torch.isin(elements, test_elements)
 class LayerFn(Protocol):
    def __call__(self, prefix: str) -> torch.nn.Module:

--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -402,3 +402,39 @@ def run_dp_sharded_mrope_vision_model(
    assert len(out_embeddings) == len(
        original_order_embeddings), "Found unassigned embeddings"
    return out_embeddings
+def get_llm_pos_ids_for_vision(
+    start_idx: int,
+    vision_idx: int,
+    spatial_merge_size: int,
+    t_index: list[int],
+    grid_hs: torch.Tensor,
+    grid_ws: torch.Tensor,
+) -> torch.Tensor:
+    llm_pos_ids_list = []
+    llm_grid_h = grid_hs[vision_idx] // spatial_merge_size
+    llm_grid_w = grid_ws[vision_idx] // spatial_merge_size
+    h_index = (
+        torch.arange(llm_grid_h)
+        .view(1, -1, 1)
+        .expand(len(t_index), -1, llm_grid_w)
+        .flatten()
+    )
+    w_index = (
+        torch.arange(llm_grid_w)
+        .view(1, 1, -1)
+        .expand(len(t_index), llm_grid_h, -1)
+        .flatten()
+    )
+    t_index_tensor = (
+        torch.Tensor(t_index)
+        .to(llm_grid_h.device)
+        .view(-1, 1)
+        .expand(-1, llm_grid_h * llm_grid_w)
+        .long()
+        .flatten()
+    )
+    _llm_pos_ids = torch.stack([t_index_tensor, h_index, w_index])
+    llm_pos_ids_list.append(_llm_pos_ids + start_idx)
+    llm_pos_ids = torch.cat(llm_pos_ids_list, dim=1)
+    return llm_pos_ids
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -368,6 +368,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                                                         dtype=torch.int32)
        self.num_accepted_tokens = self._make_buffer(self.max_num_reqs,
                                                     dtype=torch.int64)
+        # Only relevant for multimodal models
+        if self.supports_mm_inputs:
+            self.is_mm_embed = self._make_buffer(self.max_num_tokens, dtype=torch.bool)
        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
        if self.uses_mrope:
@@ -1612,17 +1615,23 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        self,
        scheduler_output: "SchedulerOutput",
        shift_computed_tokens: int = 0,
-    ) -> list[torch.Tensor]:
+    ) -> tuple[list[torch.Tensor], torch.Tensor]:
+        total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
+        mm_embeds = list[torch.Tensor]()
+        is_mm_embed = self.is_mm_embed.cpu
+        is_mm_embed[:total_num_scheduled_tokens] = False
+        req_start_idx = 0
        should_sync_mrope_positions = False
-        mm_embeds: list[torch.Tensor] = []
        for req_id in self.input_batch.req_ids:
            mm_embeds_req: list[torch.Tensor] = []
-            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
+            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
-                req_id]
            req_state = self.requests[req_id]
-            num_computed_tokens = \
+            num_computed_tokens = req_state.num_computed_tokens + shift_computed_tokens
-                req_state.num_computed_tokens + shift_computed_tokens
            for mm_feature in req_state.mm_features:
                pos_info = mm_feature.mm_position
                start_pos = pos_info.offset
@@ -1649,12 +1658,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                mm_hash = mm_feature.identifier
                encoder_output = self.encoder_cache.get(mm_hash, None)
-                assert encoder_output is not None,\
+                assert encoder_output is not None, f"Encoder cache miss for {mm_hash}."
-                    f"Encoder cache miss for {mm_hash}."
                if (is_embed := pos_info.is_embed) is not None:
                    is_embed = is_embed[start_idx:end_idx]
+                req_start_pos = req_start_idx + start_pos - num_computed_tokens
+                is_mm_embed[req_start_pos + start_idx : req_start_pos + end_idx] = (
+                    True if is_embed is None else is_embed
+                )
                mm_embeds_item = gather_mm_placeholders(
                    encoder_output[start_idx:end_idx],
                    is_embed=is_embed,
@@ -1662,6 +1675,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                mm_embeds_req.append(mm_embeds_item)
            if self.is_multimodal_pruning_enabled and self.uses_mrope:
+                assert req_state.mrope_positions is not None
                should_sync_mrope_positions = True
                mm_embeds_req, new_mrope_positions, new_delta = (
                    self.model.recompute_mrope_positions(
@@ -1669,19 +1683,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                        multimodal_embeddings=mm_embeds_req,
                        mrope_positions=req_state.mrope_positions,
                        num_computed_tokens=req_state.num_computed_tokens,
-                    ))
+                    )
-                assert req_state.mrope_positions is not None
+                )
                req_state.mrope_positions.copy_(new_mrope_positions)
                req_state.mrope_position_delta = new_delta
            mm_embeds.extend(mm_embeds_req)
+            req_start_idx += num_scheduled_tokens
+        is_mm_embed = self.is_mm_embed.copy_to_gpu(total_num_scheduled_tokens)
        if should_sync_mrope_positions:
            self._calc_mrope_positions(scheduler_output)
-            self.mrope_positions.copy_to_gpu(
+            self.mrope_positions.copy_to_gpu(total_num_scheduled_tokens)
-                scheduler_output.total_num_scheduled_tokens)
-        return mm_embeds
+        return mm_embeds, is_mm_embed
    def _extract_encoder_inputs(
        self,
@@ -1975,7 +1991,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                and not self.model_config.is_encoder_decoder):
            # Run the multimodal encoder if any.
            self._execute_mm_encoder(scheduler_output)
-            mm_embeds = self._gather_mm_embeddings(scheduler_output)
+            mm_embeds, is_mm_embed = self._gather_mm_embeddings(scheduler_output)
            # NOTE(woosuk): To unify token ids and soft tokens (vision
            # embeddings), we always use embeddings (rather than token ids)
@@ -1983,6 +1999,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            inputs_embeds_scheduled = self.model.get_input_embeddings(
                input_ids=self.input_ids.gpu[:num_scheduled_tokens],
                multimodal_embeddings=mm_embeds or None,
+                is_multimodal=is_mm_embed,
            )
            # TODO(woosuk): Avoid the copy. Optimize.

--- a/vllm/version.py
+++ b/vllm/version.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 try:
-    from ._version import __version__, __version_tuple__
+    __version__ = "0.11.0"
+    __version_tuple__ = (0, 11, 0)
+    __hcu_version__ = f'0.11.0+das.opt1.alpha.6c015e7.dtk25041' 
+    from vllm.version import __version__, __version_tuple__, __hcu_version__
 except Exception as e:
    import warnings
-    warnings.warn(f"Failed to read commit hash:\n{e}",
+    warnings.warn(f"Failed to read commit hash:\n + str(e)",
                  RuntimeWarning,
                  stacklevel=2)
    __version__ = "dev"
    __version_tuple__ = (0, 0, __version__)
 def _prev_minor_version_was(version_str):
-    """Check whether a given version matches the previous minor version.
+    '''Check whether a given version matches the previous minor version.
    Return True if version_str matches the previous minor version.
@@ -23,19 +24,19 @@ def _prev_minor_version_was(version_str):
    supplied version_str is '0.6'.
    Used for --show-hidden-metrics-for-version.
-    """
+    '''
    # Match anything if this is a dev tree
    if __version_tuple__[0:2] == (0, 0):
        return True
    # Note - this won't do the right thing when we release 1.0!
-    assert __version_tuple__[0] == 0
+    # assert __version_tuple__[0] == 0
    assert isinstance(__version_tuple__[1], int)
    return version_str == f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"
 def _prev_minor_version():
-    """For the purpose of testing, return a previous minor version number."""
+    '''For the purpose of testing, return a previous minor version number.'''
    # In dev tree, this will return "0.-1", but that will work fine"
    assert isinstance(__version_tuple__[1], int)
    return f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"