Merge tag 'v0.8.4' into v0.8.4-dev

31330101 · zhuwenwen · e8933c34 · dc1b4a6f · 31330101 · 31330101
Commit 31330101 authored Apr 16, 2025 by zhuwenwen
20 changed files
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -52,16 +52,17 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, maybe_remap_kv_scale_name)
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalEncDecInputs,
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs,
                                    MultiModalFieldConfig, MultiModalKwargs)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
-                                   MultiModalDataDict, MultiModalDataItems)
+                                   MultiModalDataItems)
 from vllm.multimodal.processing import (BaseProcessingInfo,
                                        EncDecMultiModalProcessor,
                                        PromptReplacement, PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder

 from .clip import CLIPMLP
 from .interfaces import SupportsMultiModal, SupportsV0Only
@@ -106,16 +107,6 @@ class MllamaProcessingInfo(BaseProcessingInfo):
        image_size = self.get_hf_config().vision_config.image_size
        return calc_token_per_chunk(image_size)

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        vision_config = self.get_hf_config().vision_config
-        token_per_chunk = self.get_token_per_chunk_from_config()
-        mm_max_tokens = vision_config.max_num_tiles * token_per_chunk
-        return {"image": mm_max_tokens}
-
    def get_num_tiles_per_image(self, image_height: int,
                                image_width: int) -> int:
        vision_config = self.get_hf_config().vision_config
@@ -141,31 +132,31 @@ class MllamaProcessingInfo(BaseProcessingInfo):

 class MllamaDummyInputsBuilder(BaseDummyInputsBuilder[MllamaProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token = processor.image_token
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        target_width, target_height = \
            self.info.get_image_size_with_most_features()

-        mm_data = {
+        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }

-        hf_processor = self.info.get_hf_processor()
-        image_token: str = hf_processor.image_token
-
-        return ProcessorInputs(
-            prompt_text=image_token * num_images,
-            mm_data=mm_data,
-        )
-

 class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
                                ):
@@ -211,6 +202,9 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
        # }

        if mm_data:
+            hf_processor = self.info.get_hf_processor()
+            image_token: str = hf_processor.image_token
+
            # Since only the last group of consecutive images
            # are attended by the decoded tokens, we only need to
            # get the number of tokens for those images.
@@ -227,7 +221,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
            num_tokens = decode_tiles * token_per_chunk
            mm_inputs["encoder_prompt_token_ids"] = [image_token_id
                                                     ] * num_tokens
-            mm_inputs["encoder_prompt"] = "<|image|>" * num_tokens
+            mm_inputs["encoder_prompt"] = image_token * num_tokens

        return mm_inputs

@@ -1188,6 +1182,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
        super().__init__()
        config: MllamaConfig = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
+        self.config = config
        self.quant_config = quant_config
        self.vocab_size = config.text_config.vocab_size
        self.hidden_size = config.text_config.hidden_size
@@ -1306,6 +1301,31 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,

        raise AssertionError("This line should be unreachable.")

+    def _get_and_validate_encoder_lens(
+        self,
+        encoder_seq_lens: List[int],
+        num_tiles: List[List[int]],
+        num_tokens_per_tile: int,
+    ) -> List[int]:
+        # Get the actual number of encoder tokens for each sample.
+        # Because attn_metadata.encoder_seq_lens only counts the last
+        # group of images for each sample, which is used to cheat the
+        # block manager to allocate blocks for those images only.
+        # See MllamaMultiModalProcessor for more details.
+        actual_encoder_seq_lens = [
+            sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles
+        ]
+
+        # remove 0 encoder len entries for text-only requests for these
+        # assertions
+        attn_metadata_lens = [x for x in encoder_seq_lens if x > 0]
+        assert len(actual_encoder_seq_lens) == len(attn_metadata_lens)
+        for actual_len, last_group_len in zip(actual_encoder_seq_lens,
+                                              attn_metadata_lens):
+            assert actual_len >= last_group_len
+
+        return actual_encoder_seq_lens
+
    def flat_encoder_result(self, cross_attention_states: torch.Tensor,
                            attn_metadata: AttentionMetadata,
                            actual_encoder_seq_lens: List[int]):
@@ -1325,6 +1345,9 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
        cross_attention_states = cross_attention_states_flat
        return cross_attention_states

+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
    def get_cross_attention_states(
        self,
        image_inputs: MllamaImagePixelInputs,
@@ -1430,20 +1453,14 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
        else:
            skip_cross_attention = False

-            # Get the actual number of encoder tokens for each sample.
-            # Because attn_metadata.encoder_seq_lens only counts the last
-            # group of images for each sample, which is used to cheat the
-            # block manager to allocate blocks for those images only.
-            # See MllamaMultiModalProcessor for more details.
-            num_tiles_tensor = kwargs.pop("num_tiles")
-            num_tiles = [t.tolist() for t in num_tiles_tensor]
+            num_tiles = [t.tolist() for t in kwargs.pop("num_tiles")]
            num_tokens_per_tile = calc_token_per_chunk(self.image_size)
-            actual_encoder_seq_lens = [
-                sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles
-            ]
-            for actual_len, last_group_len in zip(
-                    actual_encoder_seq_lens, attn_metadata.encoder_seq_lens):
-                assert actual_len >= last_group_len
+
+            actual_encoder_seq_lens = self._get_and_validate_encoder_lens(
+                attn_metadata.encoder_seq_lens,
+                num_tiles,
+                num_tokens_per_tile,
+            )

            cross_attention_states = self.get_cross_attention_states(
                image_inputs, attn_metadata, actual_encoder_seq_lens)
@@ -1521,6 +1538,15 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
                updated_params.add(name)
        return updated_params

+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="language_model",
+            connector="multi_modal_projector",
+            tower_model="vision_model")
+

 def skip_attention_mask(sparse_mask: List[List[int]]) -> bool:
    for mask in sparse_mask:

--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -17,6 +17,7 @@
 # limitations under the License.
 import math
 from collections.abc import Iterable, Mapping
+from functools import cached_property
 from itertools import tee
 from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union

@@ -24,7 +25,6 @@ import torch
 from torch import nn
 from transformers import BatchFeature, Llama4Config, Llama4VisionConfig
 from transformers.image_utils import SizeDict
-from transformers.modeling_outputs import BaseModelOutput
 from transformers.models.llama4 import Llama4Processor
 from transformers.models.llama4.image_processing_llama4_fast import (
    find_supported_resolutions, get_best_fit)
@@ -33,33 +33,30 @@ from vllm.attention.layer import MultiHeadAttention
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.inputs import InputProcessingContext
-from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
+from vllm.model_executor.model_loader.loader import _initialize_model
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
-                                    NestedTensors)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                   MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config

-from .interfaces import MultiModalEmbeddings, SupportsMultiModal
-from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
-                    maybe_prefix, merge_multimodal_embeddings)
-from .vision import scatter_patch_features, select_patch_features
-
-logger = init_logger(__name__)
+from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .llama4 import Llama4ForCausalLM
+from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix,
+                    merge_multimodal_embeddings)


 class Llama4ImagePatchInputs(TypedDict):
@@ -76,11 +73,7 @@ class Llama4ImagePatchInputs(TypedDict):
    This is used to split the embeddings which has the first two dimensions
    flattened just like `flat_data`.
    """
-    embed_is_patch: Union[torch.Tensor, list[torch.Tensor]]
-    """
-    A boolean mask indicating which image embeddings correspond
-    to patch tokens.
-    """
+
    aspect_ratios: Union[torch.Tensor, list[torch.Tensor]]
    """
    A list of aspect ratios corresponding to the number of tiles
@@ -345,7 +338,7 @@ class Llama4VisionEncoder(nn.Module):
    def forward(
        self,
        hidden_states: torch.Tensor,
-    ) -> BaseModelOutput:
+    ) -> torch.Tensor:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape
@@ -361,7 +354,7 @@ class Llama4VisionEncoder(nn.Module):
            layer_outputs = encoder_layer(hidden_states)
            hidden_states = layer_outputs[0]

-        return BaseModelOutput(last_hidden_state=hidden_states, )
+        return hidden_states


 class Llama4UnfoldConvolution(nn.Module):
@@ -433,7 +426,7 @@ class Llama4VisionModel(nn.Module):
    def forward(
        self,
        images_flattened: torch.Tensor,
-    ) -> BaseModelOutput:
+    ) -> torch.Tensor:
        # Patch embedding
        hidden_state = self.patch_embedding(images_flattened)
        num_tiles, num_patches, hidden_dim = hidden_state.shape
@@ -458,8 +451,7 @@ class Llama4VisionModel(nn.Module):
        hidden_state = hidden_state.view(num_tiles, -1, hidden_dim)

        # Apply encoder
-        output = self.model(hidden_state)
-        hidden_state = output.last_hidden_state
+        hidden_state = self.model(hidden_state)
        hidden_state = self.layernorm_post(hidden_state)

        # Remove CLS token output
@@ -468,10 +460,7 @@ class Llama4VisionModel(nn.Module):
        # now, we use Llama4VisionPixelShuffle + mlp to project embeddings
        hidden_state = self.vision_adapter(hidden_state)

-        return BaseModelOutput(
-            last_hidden_state=hidden_state,
-            attentions=None,
-        )
+        return hidden_state


 class Mllama4ProcessingInfo(BaseProcessingInfo):
@@ -488,7 +477,9 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
                                         **kwargs)

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": 10}
+        # Although vLLM can support more images from an infra capability
+        # perspective, we do not recommend using >10 images in practice.
+        return {"image": None}

    @staticmethod
    def get_patch_per_chunk(vision_config: Llama4VisionConfig) -> int:
@@ -507,18 +498,6 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
        image_processor = self.get_hf_processor().image_processor
        return image_processor.max_patches

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        vision_config = self.get_hf_config().vision_config
-        # image_start + local tiles * (patches + 1 x separator) +
-        # 1 global tile * (image x 1 + patches) + image_end
-        token_per_chunk = self.get_patch_per_chunk(vision_config) + 1
-        mm_max_tokens = (self.get_max_num_tiles() + 1) * token_per_chunk + 2
-        return {"image": mm_max_tokens}
-
    def get_image_size_with_most_features(self) -> ImageSize:
        vision_config = self.get_hf_config().vision_config
        image_size = vision_config.image_size
@@ -581,33 +560,9 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
                for (r_h, r_w) in aspect_ratios
            ]

-            # embed_is_patch should have one feature per image-related token:
-            #   <|image_start|>, <|tile_*_separator|>, <|image|>, <|image_end|>
-            #             -> False
-            #   <|patch|> -> True
-            # embed_is_patch has no entries corresponding to non-image-related
-            # tokens.
-            patch_id = tokenizer.get_vocab()[processor.img_patch_token]
-            num_patches_per_chunk = self.info.get_patch_per_chunk(
-                vision_config)
-            expanded_image_tokens_list = [
-                processor._prompt_split_image(aspect_ratio,
-                                              num_patches_per_chunk)
-                for aspect_ratio in aspect_ratios
-            ]
-            expanded_image_token_ids = [
-                tokenizer.encode(image_tokens, add_special_tokens=False)
-                for image_tokens in expanded_image_tokens_list
-            ]
-            embed_is_patch = [
-                torch.tensor(tokens) == patch_id
-                for tokens in expanded_image_token_ids
-            ]
-
            processed_outputs["aspect_ratios"] = aspect_ratios
            processed_outputs["patches_per_image"] = torch.tensor(
                patches_per_image)
-            processed_outputs["embed_is_patch"] = embed_is_patch

        return processed_outputs

@@ -622,7 +577,6 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
                "image", patches_per_image),
            patches_per_image=MultiModalFieldConfig.batched("image"),
            aspect_ratios=MultiModalFieldConfig.batched("image"),
-            embed_is_patch=MultiModalFieldConfig.batched("image"),
        )

    def _get_prompt_updates(
@@ -642,12 +596,17 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
        num_patches_per_chunk = self.info.get_patch_per_chunk(vision_config)
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        image_token = hf_processor.image_token
+        img_patch_token = hf_processor.img_patch_token

        def get_replacement(item_idx: int):
            aspect_ratio = out_mm_kwargs["aspect_ratios"][item_idx]
-            return hf_processor._prompt_split_image(
+
+            repl = hf_processor._prompt_split_image(
                aspect_ratio=aspect_ratio,
-                num_patches_per_chunk=num_patches_per_chunk)
+                num_patches_per_chunk=num_patches_per_chunk,
+            )
+
+            return PromptUpdateDetails.select_text(repl, img_patch_token)

        return [
            PromptReplacement(
@@ -660,36 +619,39 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]

 class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token = processor.fake_image_token
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        (target_width,
         target_height) = self.info.get_image_size_with_most_features()

-        mm_data = {
+        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }

-        image_token = self.info.get_hf_processor().fake_image_token
-        return ProcessorInputs(
-            prompt_text=image_token * num_images,
-            mm_data=mm_data,
-        )
-

 @MULTIMODAL_REGISTRY.register_processor(
    Mllama4MultiModalProcessor,
    info=Mllama4ProcessingInfo,
    dummy_inputs=Mllama4DummyInputsBuilder,
 )
-class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal):
+class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                     SupportsPP):
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    }
@@ -710,13 +672,22 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal):
            self.config,
            None,
            prefix=maybe_prefix(prefix, "multi_modal_projector"))
-        self.language_model = init_vllm_registered_model(
-            vllm_config=vllm_config,
-            hf_config=config.text_config,
-            architectures=["Llama4ForCausalLM"],
-            prefix=maybe_prefix(prefix, "language_model"))

-        self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
+        self.language_model = _initialize_model(
+            vllm_config=vllm_config.with_hf_config(config.text_config),
+            prefix=maybe_prefix(prefix, "language_model"),
+            model_class=Llama4ForCausalLM,
+        )
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors)
+
+    @cached_property
+    def sampler(self):
+        if hasattr(self.language_model, "sampler"):
+            return self.language_model.sampler
+
+        return get_sampler()

    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[Llama4ImagePatchInputs]:
@@ -730,11 +701,6 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal):
        flat_pixel_values = flatten_bn(pixel_values, concat=True)
        patches_per_image = flatten_bn(kwargs.pop("patches_per_image"))

-        embed_is_patch = kwargs.pop("embed_is_patch", None)
-        if not isinstance(embed_is_patch, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of embed_is_patch. "
-                             f"Got type: {type(embed_is_patch)}")
-
        aspect_ratios = kwargs.pop("aspect_ratios", None)
        if not isinstance(aspect_ratios, (torch.Tensor, list)):
            raise ValueError("Incorrect type of aspect_ratios. "
@@ -744,7 +710,6 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal):
            type="pixel_values",
            flat_data=flat_pixel_values,
            patches_per_image=patches_per_image,
-            embed_is_patch=embed_is_patch,
            aspect_ratios=aspect_ratios,
        )

@@ -752,8 +717,18 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal):
            self, image_input: Llama4ImagePatchInputs) -> MultiModalEmbeddings:
        flat_data = image_input["flat_data"]
        patches_per_image = image_input["patches_per_image"].tolist()
-        vision_embeddings_flat = self.vision_model(flat_data).last_hidden_state
-        return vision_embeddings_flat.split(patches_per_image, dim=0)
+
+        vision_embeddings_flat = self.vision_model(flat_data)
+        vision_embeddings_flat = self.multi_modal_projector(
+            vision_embeddings_flat)
+
+        return [
+            img.flatten(0, 1)
+            for img in vision_embeddings_flat.split(patches_per_image, dim=0)
+        ]
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model

    def get_multimodal_embeddings(self,
                                  **kwargs) -> Optional[MultiModalEmbeddings]:
@@ -761,20 +736,7 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal):
        if image_input is None:
            return None

-        # num_images x [num_chunks, num_patches, hidden_dim]
-        image_features = self._process_image_input(image_input)
-        # num_images x [num_chunks x num_patches, hidden_dim]
-        image_features_flat = [img.flatten(0, 1) for img in image_features]
-        # num_images x [1, input_len] -> num_images x [input_len]
-        embed_is_patch_flat = [
-            is_patch.flatten(0, 1)
-            for is_patch in image_input["embed_is_patch"]
-        ]
-
-        return scatter_patch_features(
-            image_features_flat,
-            embed_is_patch_flat,
-        )
+        return self._process_image_input(image_input)

    def get_input_embeddings(
        self,
@@ -784,11 +746,12 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal):
        inputs_embeds = self.language_model.get_input_embeddings(input_ids)

        if multimodal_embeddings is not None:
-            multimodal_embeddings = torch.cat(multimodal_embeddings)
-            mm_embeddings = self.multi_modal_projector(multimodal_embeddings)
            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, select_patch_features(mm_embeddings),
-                self.config.image_token_index)
+                input_ids,
+                inputs_embeds,
+                multimodal_embeddings,
+                self.config.image_token_index,
+            )

        return inputs_embeds

@@ -800,9 +763,12 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal):
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> Union[torch.Tensor, IntermediateTensors]:
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        if "pixel_values" in kwargs:
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner,
+        # this condition is for v0 compatibility.
+        elif inputs_embeds is None:
            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
            inputs_embeds = self.get_input_embeddings(input_ids,
                                                      vision_embeddings)
@@ -857,9 +823,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal):

        # language_model is an Llama4ForCausalLM instance. We load it's
        # using llama4's load_weights routine.
-        language_model_prefix = "language_model.model."
        language_model_weights, other_weights = self.separate_weights(
-            weights, prefix=language_model_prefix)
+            weights, prefix="language_model.model.")
        loader = AutoWeightsLoader(self)
        loaded_language_model_params = loader.load_weights(
            language_model_weights)
@@ -883,4 +848,4 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal):

                weight_loader(param, loaded_weight)
                updated_params.add(name)
-        return updated_params
+        return updated_params
\ No newline at end of file
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -41,13 +41,15 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                   MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptIndexTargets,
-                                        PromptInsertion, PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+                                        PromptInsertion, PromptUpdate,
+                                        PromptUpdateDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors

 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
@@ -56,7 +58,6 @@ from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                    is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix, merge_multimodal_embeddings)
-from .vision import scatter_patch_features, select_patch_features

 # TODO: hard-coded for now. Consider making it configurable.
 VIT_LAYERS = [-2, -9]
@@ -84,14 +85,6 @@ class MolmoImageInputs(TypedDict):
    Shape: `(batch_size * num_images, num_crops, num_patch)`
    """

-    embed_is_patch: Union[torch.Tensor, list[torch.Tensor]]
-    """
-    A boolean mask indicating which image embeddings correspond
-    to patch tokens.
-    
-    Shape: `(batch_size * num_images, num_embeds)`
-    """
-
    num_crops: torch.Tensor
    """Shape: `(batch_size * num_images)`"""

@@ -1146,30 +1139,6 @@ class MolmoProcessorWrapper:
        if image_input_idx is not None:
            feat_is_patch = image_input_idx >= 0

-            input_is_embed = torch.isin(
-                input_ids,
-                torch.tensor([
-                    self.image_patch_id,
-                    self.im_col_id,
-                    self.im_start_id,
-                    self.im_end_id,
-                ]),
-            )
-            embed_ids = input_ids[input_is_embed]
-            embed_is_patch = embed_ids == self.image_patch_id
-            assert embed_is_patch.sum() == feat_is_patch.sum()
-
-            # image_tokens = extra_joint + joint
-            # Both `extra_joint` and `joint` have `im_start_id` and `im_end_id`
-            embed_start = torch.nonzero(embed_ids == self.im_start_id)[::2, 0]
-            embed_end = torch.nonzero(embed_ids == self.im_end_id)[1::2, 0]
-            assert len(embed_start) == len(embed_end) == len(images)
-
-            embed_is_patch = [
-                embed_is_patch[start:end + 1]
-                for start, end in zip(embed_start, embed_end)
-            ]
-
            tilings = [
                self.select_tiling(
                    image_width=image.size[0],
@@ -1181,7 +1150,6 @@ class MolmoProcessorWrapper:
            assert num_crops.sum() == len(feat_is_patch)

            outputs["feat_is_patch"] = feat_is_patch
-            outputs["embed_is_patch"] = embed_is_patch
            outputs["num_crops"] = num_crops
            outputs["img_patch_id"] = self.image_patch_id

@@ -1197,13 +1165,6 @@ class MolmoProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
    def get_num_image_tokens(
        self,
        *,
@@ -1220,26 +1181,13 @@ class MolmoProcessingInfo(BaseProcessingInfo):
        )
        pooling_size = processor.pooling_size

-        base_image_input_size = processor.base_image_input_size
-        base_image_input_d = processor.image_patch_size
-
-        crop_patches = base_image_input_size[0] // base_image_input_d
-
-        per_row = ncols // pooling_size + 1
-        joint = per_row * (nrows // pooling_size) + 2
-        image_token_length = (crop_patches + pooling_size - 1) // pooling_size
-        resize = (image_token_length + 1) * image_token_length + 2
+        image_token_length_w = processor.image_token_length_w
+        image_token_length_h = processor.image_token_length_h

-        return resize + joint
+        extra = image_token_length_w * image_token_length_h
+        joint = ((ncols + 1) // pooling_size) * ((nrows + 1) // pooling_size)

-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            processor=None,
-        )
+        return extra + joint

    def get_image_size_with_most_features(self) -> ImageSize:
        processor = self.get_hf_processor()
@@ -1269,27 +1217,25 @@ class MolmoProcessingInfo(BaseProcessingInfo):

 class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+
+    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
        target_width, target_height = \
            self.info.get_image_size_with_most_features()
        num_images = mm_counts.get("image", 0)

-        mm_data = {
+        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }

-        return ProcessorInputs(
-            prompt_text="",
-            mm_data=mm_data,
-        )
-

 class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):

@@ -1328,7 +1274,6 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
                "image", num_crops),
            feat_is_patch=MultiModalFieldConfig.flat_from_sizes(
                "image", num_crops),
-            embed_is_patch=MultiModalFieldConfig.batched("image"),
            num_crops=MultiModalFieldConfig.batched("image"),
            img_patch_id=MultiModalFieldConfig.shared("image", num_images),
        )
@@ -1368,8 +1313,10 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
            joint = ([img_start_id] + joint_row *
                     ((nrows + 1) // pooling_size) + [img_end_id])

-            image_tokens = extra_joint + joint
-            return image_tokens
+            return PromptUpdateDetails.select_token_id(
+                extra_joint + joint,
+                embed_token_id=img_patch_id,
+            )

        return [
            PromptInsertion(
@@ -1475,11 +1422,6 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
            raise ValueError("Incorrect type of feat_is_patch. "
                             f"Got type: {type(feat_is_patch)}")

-        embed_is_patch = kwargs.pop("embed_is_patch", None)
-        if not isinstance(embed_is_patch, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of embed_is_patch. "
-                             f"Got type: {type(embed_is_patch)}")
-
        num_crops = kwargs.pop("num_crops", None)
        if not isinstance(num_crops, (torch.Tensor, list)):
            raise ValueError("Incorrect type of num_crops. "
@@ -1491,14 +1433,12 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
                             f"Got type: {type(img_patch_id)}")
        self.img_patch_id = img_patch_id.flatten().unique().item()

-        embed_is_patch = flatten_bn(embed_is_patch)
        num_crops = flatten_bn(num_crops, concat=True)

        return MolmoImageInputs(
            images=images,
            image_masks=image_masks,
            feat_is_patch=feat_is_patch,
-            embed_is_patch=embed_is_patch,
            num_crops=num_crops,
        )

@@ -1531,18 +1471,16 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
            )
        ]

+    def get_language_model(self) -> torch.nn.Module:
+        return self.model
+
    def get_multimodal_embeddings(
            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return None

-        image_features = self._process_image_input(image_input)
-
-        return scatter_patch_features(
-            image_features,
-            image_input["embed_is_patch"],
-        )
+        return self._process_image_input(image_input)

    def get_input_embeddings(
        self,
@@ -1556,7 +1494,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
            inputs_embeds = merge_multimodal_embeddings(
                input_ids,
                inputs_embeds,
-                select_patch_features(multimodal_embeddings),
+                multimodal_embeddings,
                self.img_patch_id,
            )
        return inputs_embeds

--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -15,12 +15,11 @@ from transformers import PretrainedConfig

 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                   MultiModalDataItems)
 from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
                                        PromptUpdateDetails)
-from vllm.multimodal.profiling import ProcessorInputs

 from .intern_vit import InternVisionModel
 from .internvl import (BaseInternVLProcessingInfo, BaseInternVLProcessor,
@@ -57,7 +56,7 @@ class NVLMProcessor(BaseInternVLProcessor):
        # when trying to find "<tile" as a subsequence of "<Image><tile"
        repl = "<Image>" + features + "</Image>"

-        return PromptUpdateDetails(full=repl, features=repl)
+        return PromptUpdateDetails.select_text(repl, IMG_PAD)


 class NVLMProcessingInfo(BaseInternVLProcessingInfo):
@@ -84,57 +83,32 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo):
            **kwargs,
        )

-    def get_max_image_tokens(self) -> int:
-        hf_processor = self.get_hf_processor()
-        tokenizer = hf_processor.tokenizer

-        max_num_patches = hf_processor.max_dynamic_patch
-        # we need +1 here because max_dynamic_patch in config doesn't
-        # include the thumbnail patch
-        tile_pos_identifiers = [
-            f"<tile_{i+1}>" for i in range(max_num_patches)
-        ]
-        if hf_processor.use_thumbnail and max_num_patches != 1:
-            tile_pos_identifiers += ["<tile_global_thumbnail>"]
+class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]):

-        # "<Image><tile" is tokenized as ["<Image", "><", "tile"]
-        # so we include <tile_1> in the start_str
-        start_str = "<Image>" + tile_pos_identifiers.pop(0)
-        end_str = "</Image>"
-        start_token_len = len(tokenizer.encode(start_str))
-        end_token_len = len(tokenizer.encode(end_str))
-        tile_token_len = sum(
-            len(tokenizer.encode(identifier))
-            for identifier in tile_pos_identifiers)
-        non_image_tokens_num = start_token_len + end_token_len + tile_token_len
-        return super().get_max_image_tokens() + non_image_tokens_num
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)

+        # The newline is necessary to separate ">" of the current item
+        # and "<" of the next item
+        return "<image>\n" * num_images

-class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]):
-
-    def get_dummy_processor_inputs(
+    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
        target_width, target_height = \
            self.info.get_image_size_with_most_features()
        num_images = mm_counts.get("image", 0)

-        mm_data = {
+        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }

-        return ProcessorInputs(
-            # The newline is necessary to separate ">" of the current item
-            # and "<" of the next item
-            prompt_text="<image>\n" * num_images,
-            mm_data=mm_data,
-        )
-

 class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):

@@ -177,10 +151,7 @@ class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):

            repl = hf_processor.get_image_repl(feature_size, num_patches)

-            return PromptUpdateDetails(
-                full=repl.full + "\n",
-                features=repl.features + "\n",
-            )
+            return PromptUpdateDetails.select_text(repl.full + "\n", IMG_PAD)

        # See note in dummy data regarding why we have the extra newline
        return [

--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -324,7 +324,6 @@ class OPTForCausalLM(nn.Module, SupportsPP):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
-        super().__init__()
        self.config = config
        self.quant_config = quant_config
        self.model = OPTModel(vllm_config=vllm_config,

--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -13,12 +13,13 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                    MultiModalInputs, MultiModalKwargs)
-from vllm.multimodal.parse import MultiModalDataItems
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptIndexTargets,
                                        PromptInsertion, PromptUpdate,
                                        PromptUpdateDetails)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -72,44 +73,44 @@ class PaliGemmaProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": 1}

-    def get_mm_max_tokens_per_item(
+    def get_num_image_tokens(
        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_num_image_tokens()}
-
-    def get_num_image_tokens(self) -> int:
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
        vision_encoder_info = self.get_vision_encoder_info()
-        return vision_encoder_info.get_max_image_tokens()
+
+        return vision_encoder_info.get_num_image_tokens(
+            image_width=image_width,
+            image_height=image_height,
+        )


 class PaliGemmaDummyInputsBuilder(
        BaseDummyInputsBuilder[PaliGemmaProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+
+    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
        hf_config = self.info.get_hf_config()
        vision_config = hf_config.vision_config
        max_image_size = vision_config.image_size

        num_images = mm_counts.get("image", 0)

-        mm_data = {
+        return {
            "image":
            self._get_dummy_images(width=max_image_size,
                                   height=max_image_size,
                                   num_images=num_images)
        }

-        return ProcessorInputs(
-            prompt_text="",
-            mm_data=mm_data,
-        )
-

 class PaliGemmaMultiModalProcessor(
        BaseMultiModalProcessor[PaliGemmaProcessingInfo]):
@@ -148,12 +149,30 @@ class PaliGemmaMultiModalProcessor(
        image_token_id = hf_config.image_token_index

        tokenizer = self.info.get_tokenizer()
-        num_image_tokens = self.info.get_num_image_tokens()
-        image_tokens = [image_token_id] * num_image_tokens

        bos_token_id = tokenizer.bos_token_id
        assert isinstance(bos_token_id, int)

+        def get_insertion(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                num_image_tokens = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                num_image_tokens = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                )
+
+            image_tokens = [image_token_id] * num_image_tokens
+
+            return PromptUpdateDetails.select_token_id(
+                image_tokens + [bos_token_id],
+                embed_token_id=image_token_id,
+            )
+
        # Paligemma 1 and 2 have different tokenizer.add_bos_token
        # Insert <image>*n + <bos> after <bos> for Paligemma 1
        # Insert <image>*n + <bos> for Paligemma 2
@@ -162,10 +181,7 @@ class PaliGemmaMultiModalProcessor(
                modality="image",
                target=PromptIndexTargets.prefix(
                    [bos_token_id] if tokenizer.add_bos_token else []),
-                insertion=PromptUpdateDetails(
-                    full=image_tokens + [bos_token_id],
-                    features=image_tokens,
-                ),
+                insertion=get_insertion,
            )
        ]

@@ -323,6 +339,9 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,

        return self.multi_modal_projector(image_features)

+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
    def get_multimodal_embeddings(
            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
        image_input = self._parse_and_validate_image_input(**kwargs)

--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -61,7 +61,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)

@@ -249,6 +249,49 @@ class PhiModel(nn.Module):

        return hidden_states

+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v")
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # pylint: disable=E1136
+
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+

 class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
    packed_modules_mapping = {
@@ -317,43 +360,5 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):

    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v")
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params: Set[str] = set()
-
-        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = param.weight_loader
-                weight_loader(param, loaded_weight, shard_id)
-                break
-            else:
-                # Skip loading extra bias for GPTQ models.
-                if name.endswith(".bias") and name not in params_dict:
-                    continue
-                # pylint: disable=E1136
-
-                if is_pp_missing_parameter(name, self):
-                    continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        loader = AutoWeightsLoader(self)
+        return loader.load_weights(weights)
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -32,7 +32,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
                                   ImageSize, MultiModalDataItems)
 # yapf conflicts with isort for this block
@@ -40,10 +41,9 @@ from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, BoundPromptUpdate,
                                        PlaceholderFeaturesInfo,
-                                        PromptReplacement, PromptUpdate,
-                                        PromptUpdateDetails)
+                                        PromptReplacement, PromptUpdate)
 # yapf: enable
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of

@@ -322,21 +322,6 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        max_image_tokens = self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-            processor=None,
-        )
-
-        return {"image": max_image_tokens}
-
    def get_num_image_tokens(
        self,
        *,
@@ -359,31 +344,31 @@ class Phi3VProcessingInfo(BaseProcessingInfo):

 class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        hf_processor = self.info.get_hf_processor()
+        image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
+
+        return "".join(image_tokens[:num_images])
+
+    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        target_width, target_height = \
            self.info.get_image_size_with_most_features()

-        mm_data = {
+        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }

-        hf_processor = self.info.get_hf_processor()
-        image_tokens: list[str] = hf_processor.img_tokens  # type: ignore
-
-        return ProcessorInputs(
-            prompt_text="".join(image_tokens[:num_images]),
-            mm_data=mm_data,
-        )
-

 class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):

@@ -443,12 +428,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
                    processor=hf_processor,
                )

-            image_tokens = [_IMAGE_TOKEN_ID] * num_image_tokens
-
-            return PromptUpdateDetails(
-                full=image_tokens,
-                features=image_tokens,
-            )
+            return [_IMAGE_TOKEN_ID] * num_image_tokens

        num_images = mm_items.get_count("image", strict=False)

@@ -517,6 +497,7 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
                        item_idx=p.item_idx,
                        start_idx=p.start_idx - 1,
                        tokens=p.tokens,
+                        is_embed=p.is_embed,
                    ) for p in ps
                ]
                for modality, ps in placeholders.items()
@@ -679,6 +660,9 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,

        return image_embeds

+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
    def get_multimodal_embeddings(
            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
        image_input = self._parse_and_validate_image_input(**kwargs)

--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1802,3 +1802,6 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal,
            connector=["audio_projection_for_vision", "audio_projection"],
            tower_model=["vision_encoder", "embed_tokens_extend"],
        )
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.model
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -49,7 +49,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

 from .interfaces import SupportsLoRA, SupportsPP
-from .utils import (is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)

@@ -448,6 +448,8 @@ class PhiMoEModel(nn.Module):
                       (lora_config.max_loras or 1)) if lora_config else 0)
        self.vocab_size = config.vocab_size + lora_vocab
        self.org_vocab_size = config.vocab_size
+        self.config = config
+        self.quant_config = quant_config

        self.embed_tokens = VocabParallelEmbedding(
            self.vocab_size,
@@ -504,85 +506,6 @@ class PhiMoEModel(nn.Module):
        hidden_states = self.norm(hidden_states)
        return hidden_states

-
-class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
-    fall_back_to_pt_during_load = False
-
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-    }
-
-    # LoRA specific attributes
-    embedding_modules = {
-        "embed_tokens": "input_embeddings",
-        "lm_head": "output_embeddings",
-    }
-    embedding_padding_modules = ["lm_head"]
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        lora_config = vllm_config.lora_config
-        self.config = config
-        self.lora_config = lora_config
-        self.quant_config = vllm_config.quant_config
-
-        self.model = PhiMoEModel(vllm_config=vllm_config,
-                                 prefix=maybe_prefix(prefix, "model"))
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
-        self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
-            config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=(
-                DEFAULT_VOCAB_PADDING_SIZE
-                # We need bigger padding if using lora for kernel
-                # compatibility
-                if not lora_config else lora_config.lora_vocab_padding_size),
-            quant_config=None,
-            bias=True,
-        )
-        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
-                                                config.vocab_size)
-        self.sampler = get_sampler()
-
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
-        return hidden_states
-
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
-        return logits
-
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        stacked_params_mapping = [
@@ -601,9 +524,6 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        params_dict = dict(self.named_parameters())
        loaded_params: Set[str] = set()
        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
-
            if (self.quant_config is not None and
                (scale_name := self.quant_config.get_cache_scale(name))):
                # Loading kv cache quantization scales
@@ -667,3 +587,90 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                    weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params
+
+
+class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    fall_back_to_pt_during_load = False
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        lora_config = vllm_config.lora_config
+        self.config = config
+        self.lora_config = lora_config
+        self.quant_config = vllm_config.quant_config
+
+        self.model = PhiMoEModel(vllm_config=vllm_config,
+                                 prefix=maybe_prefix(prefix, "model"))
+        self.unpadded_vocab_size = config.vocab_size
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        self.lm_head = ParallelLMHead(
+            self.unpadded_vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=(
+                DEFAULT_VOCAB_PADDING_SIZE
+                # We need bigger padding if using lora for kernel
+                # compatibility
+                if not lora_config else lora_config.lora_vocab_padding_size),
+            quant_config=None,
+            bias=True,
+        )
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size)
+        self.sampler = get_sampler()
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
+                                   inputs_embeds)
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: Optional[torch.Tensor],
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["rotary_emb.inv_freq"]),
+        )
+        return loader.load_weights(weights)
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -32,13 +32,14 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    NestedTensors)
 from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
                                   MultiModalDataItems)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import (MistralTokenizer,
                                               cached_tokenizer_from_config)
@@ -46,8 +47,7 @@ from vllm.transformers_utils.tokenizer import (MistralTokenizer,
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import (flatten_bn, init_vllm_registered_model, maybe_prefix,
                    merge_multimodal_embeddings)
-from .vision import (VisionEncoderInfo, resolve_visual_encoder_outputs,
-                     scatter_patch_features, select_patch_features)
+from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs

 try:
    from xformers import ops as xops
@@ -68,14 +68,6 @@ class PixtralImagePixelInputs(TypedDict):
    The result of stacking :attr:`ImageEncoding.tokens` from each prompt.
    """

-    embed_is_patch: Union[torch.Tensor, list[torch.Tensor]]
-    """
-    A boolean mask indicating which image embeddings correspond
-    to patch tokens.
-
-    Shape: `(batch_size * num_images, num_embeds)`
-    """
-

 class PixtralProcessorAdapter:
    """
@@ -144,11 +136,8 @@ class PixtralProcessorAdapter:
                "For more info, see: "
                "https://github.com/vllm-project/vllm/issues/8411.")

-        image_token_id = self.image_token_id
-
        images_processed = list[torch.Tensor]()
        images_tokens = list[torch.Tensor]()
-        images_embed_is_patch = list[torch.Tensor]()

        for image in images:
            image_inputs = self.image_processor(ImageChunk(image=image))
@@ -157,12 +146,10 @@ class PixtralProcessorAdapter:

            images_processed.append(image_processed)
            images_tokens.append(image_tokens)
-            images_embed_is_patch.append(image_tokens == image_token_id)

        return {
            "input_ids": torch.cat(images_tokens)[None].expand(len(text), -1),
            "images": images_processed,
-            "embed_is_patch": images_embed_is_patch,
        }


@@ -181,13 +168,6 @@ class PixtralProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_max_image_tokens()}
-
    def get_vision_config(
        self,
        processor: Optional[PixtralProcessorAdapter] = None,
@@ -213,7 +193,7 @@ class PixtralProcessingInfo(BaseProcessingInfo):
        ncols, nrows = processor.image_processor._image_to_num_tokens(
            Image.new("RGB", (image_width, image_height)))

-        return (ncols + 1) * nrows
+        return ncols * nrows

    def get_image_size_with_most_features(self) -> ImageSize:
        image_processor = self.get_hf_processor().image_processor
@@ -221,39 +201,29 @@ class PixtralProcessingInfo(BaseProcessingInfo):

        return ImageSize(width=max_image_size, height=max_image_size)

-    def get_max_image_tokens(self) -> int:
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        return self.get_num_image_tokens(
-            image_width=target_width,
-            image_height=target_height,
-        )
-

 class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+
+    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)

        target_width, target_height = \
            self.info.get_image_size_with_most_features()

-        mm_data = {
+        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }

-        return ProcessorInputs(
-            prompt_text="",
-            mm_data=mm_data,
-        )
-

 class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
                                 ):
@@ -263,10 +233,7 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
        hf_inputs: Mapping[str, NestedTensors],
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(
-            images=MultiModalFieldConfig.batched("image"),
-            embed_is_patch=MultiModalFieldConfig.batched("image"),
-        )
+        return dict(images=MultiModalFieldConfig.batched("image"))

    def _get_prompt_updates(
        self,
@@ -290,7 +257,7 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
            tokens = ([image_token_id] * ncols + [image_break_id]) * nrows
            tokens[-1] = image_end_id

-            return tokens
+            return PromptUpdateDetails.select_token_id(tokens, image_token_id)

        return [
            PromptReplacement(
@@ -381,17 +348,9 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
            raise ValueError("Incorrect type of images. "
                             f"Got type: {type(images)}")

-        embed_is_patch = kwargs.pop("embed_is_patch")
-        if not isinstance(embed_is_patch, (torch.Tensor, list)):
-            raise ValueError("Incorrect type of embed_is_patch. "
-                             f"Got type: {type(embed_is_patch)}")
-
-        embed_is_patch = flatten_bn(embed_is_patch)
-
        return PixtralImagePixelInputs(
            type="pixel_values",
            images=flatten_bn(images),
-            embed_is_patch=embed_is_patch,
        )

    def _process_image_input(
@@ -421,18 +380,16 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
        image_embeds = torch.split(image_embeds, feature_sizes)
        return image_embeds

+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
    def get_multimodal_embeddings(
            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return None

-        image_features = self._process_image_input(image_input)
-
-        return scatter_patch_features(
-            image_features,
-            image_input["embed_is_patch"],
-        )
+        return self._process_image_input(image_input)

    def get_input_embeddings(
        self,
@@ -444,7 +401,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
            inputs_embeds = merge_multimodal_embeddings(
                input_ids,
                inputs_embeds,
-                select_patch_features(multimodal_embeddings),
+                multimodal_embeddings,
                self.vision_args.image_token_id,
            )
        return inputs_embeds
@@ -963,24 +920,15 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]):
            image_width=image_width,
            image_height=image_height,
        )
-
-        # Consider the image_break_token
-        return (ncols + 1) * nrows
-
-    def get_max_image_tokens(self) -> int:
-        image_size = self.get_image_size()
-
-        return self.get_num_image_tokens(
-            image_width=image_size,
-            image_height=image_size,
-        )
+        return ncols * nrows

    def get_image_size(self) -> int:
        return self.vision_config.image_size

    def get_patch_size(self) -> int:
-        return (self.vision_config.patch_size *
-                self.vision_config.spatial_merge_size)
+        spatial_merge_size = getattr(self.vision_config, "spatial_merge_size",
+                                     1)
+        return (self.vision_config.patch_size * spatial_merge_size)

    def get_patch_grid_length(self) -> int:
        image_size, patch_size = self.get_image_size(), self.get_patch_size()

--- a/vllm/model_executor/models/prithvi_geospatial_mae.py
+++ b/vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -35,7 +35,7 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import (IntermediateTensors, PoolerOutput,
                           PoolingSequenceGroupOutput)

@@ -45,27 +45,25 @@ class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
-        return {"image": 0}
-

 class PrithviGeoSpatialMAEInputBuilder(
        BaseDummyInputsBuilder[PrithviGeoSpatialMAEProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+
+    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        return ProcessorInputs(
-            prompt_text="",
-            # This model input is fixed and is in the form of a torch Tensor.
-            # The size of pixel_values might change in the cases where we resize
-            # the input but never exceeds the dimensions below.
-            mm_data={
-                "pixel_values": torch.full((1, 6, 512, 512), 1.0),
-                "location_coords": torch.full((1, 2), 1.0)
-            })
+    ) -> MultiModalDataDict:
+        # This model input is fixed and is in the form of a torch Tensor.
+        # The size of pixel_values might change in the cases where we resize
+        # the input but never exceeds the dimensions below.
+        return {
+            "pixel_values": torch.full((1, 6, 512, 512), 1.0),
+            "location_coords": torch.full((1, 2), 1.0),
+        }


 class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):

--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -278,7 +278,11 @@ class Qwen2DecoderLayer(nn.Module):
    })
 class Qwen2Model(nn.Module):

-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = "",
+                 decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer):
        super().__init__()

        config = vllm_config.model_config.hf_config
@@ -312,12 +316,14 @@ class Qwen2Model(nn.Module):
        else:
            self.embed_tokens = PPMissingLayer()

+        # Use the provided decoder layer type or default to Qwen2DecoderLayer
+        decoder_layer_type = decoder_layer_type or Qwen2DecoderLayer
        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers,
-            lambda prefix: Qwen2DecoderLayer(config=config,
-                                             cache_config=cache_config,
-                                             quant_config=quant_config,
-                                             prefix=prefix),
+            lambda prefix: decoder_layer_type(config=config,
+                                              cache_config=cache_config,
+                                              quant_config=quant_config,
+                                              prefix=prefix),
            prefix=f"{prefix}.layers",
        )


--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1026,6 +1026,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                    **kwargs)
        return modalities

+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
    def get_multimodal_embeddings(
            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:


--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -37,13 +37,14 @@ from vllm.config import VllmConfig
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
 from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
                                   MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptReplacement,
                                        PromptUpdate, PromptUpdateDetails)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -109,42 +110,34 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"audio": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        hf_config = self.get_hf_config()
-        max_source_positions = hf_config.audio_config.max_source_positions
-        max_output_lengths = (max_source_positions - 2) // 2 + 1
-
-        return {"audio": max_output_lengths}
-

 class Qwen2AudioDummyInputsBuilder(
        BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_audios = mm_counts.get("audio", 0)
+
+        hf_processor = self.info.get_hf_processor()
+        audio_token = hf_processor.audio_token
+
+        return audio_token * num_audios
+
+    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
        feature_extractor = self.info.get_feature_extractor()

        sampling_rate = feature_extractor.sampling_rate
        audio_len = feature_extractor.chunk_length * sampling_rate
        num_audios = mm_counts.get("audio", 0)

-        mm_data = {
+        return {
            "audio":
            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
        }

-        return ProcessorInputs(
-            prompt_text="<|AUDIO|>" * num_audios,
-            mm_data=mm_data,
-        )
-

 class Qwen2AudioMultiModalProcessor(
        BaseMultiModalProcessor[Qwen2AudioProcessingInfo]):
@@ -229,9 +222,9 @@ class Qwen2AudioMultiModalProcessor(

            audio_tokens = [audio_token_id] * num_features

-            return PromptUpdateDetails(
-                full=[audio_bos_id] + audio_tokens + [audio_eos_id],
-                features=audio_tokens,
+            return PromptUpdateDetails.select_token_id(
+                [audio_bos_id] + audio_tokens + [audio_eos_id],
+                embed_token_id=audio_token_id,
            )

        return [
@@ -355,6 +348,9 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
        return torch.split(masked_audio_features,
                           audio_output_lengths.flatten().tolist())

+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
    def get_multimodal_embeddings(
            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
        audio_input = self._parse_and_validate_audio_input(**kwargs)

--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -55,7 +55,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

 from .interfaces import SupportsPP
-from .utils import (extract_layer_index, is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, extract_layer_index,
+                    is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)
 import os
@@ -333,6 +334,7 @@ class Qwen2MoeModel(nn.Module):
        quant_config = vllm_config.quant_config

        self.vocab_size = config.vocab_size
+        self.config = config

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
@@ -350,6 +352,16 @@ class Qwen2MoeModel(nn.Module):
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], config.hidden_size))
+        
+        self.quant_method = None
+        if quant_config is not None:
+            self.quant_method=quant_config.get_name()
+            self.quant_config=quant_config
+               
+        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
+        self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
+        self.use_fa_pad = os.environ.get('FA_PAD') == '1'
+        self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)
@@ -382,70 +394,6 @@ class Qwen2MoeModel(nn.Module):
        return hidden_states


-class Qwen2MoeForCausalLM(nn.Module, SupportsPP):
-
-    fall_back_to_pt_during_load = False
-
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        self.config = config
-        self.quant_config = quant_config
-        self.model = Qwen2MoeModel(vllm_config=vllm_config,
-                                   prefix=maybe_prefix(prefix, "model"))
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config)
-        if self.config.tie_word_embeddings:
-            self.lm_head.weight = self.model.embed_tokens.weight
-        self.logits_processor = LogitsProcessor(config.vocab_size)
-
-        self.sampler = get_sampler()
-        self.make_empty_intermediate_tensors = (
-            self.model.make_empty_intermediate_tensors)
-        
-        self.quant_method = None
-        if quant_config is not None:
-            self.quant_method=quant_config.get_name()
-            self.quant_config=quant_config
-               
-        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
-        self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
-        self.use_fa_pad = os.environ.get('FA_PAD') == '1'
-        self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
-
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds)
-        return hidden_states
-
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata)
-        return logits
-
-    def sample(
-        self,
-        logits: Optional[torch.Tensor],
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        stacked_params_mapping = [
@@ -468,8 +416,6 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP):
        params_dict = dict(self.named_parameters())
        loaded_params: Set[str] = set()
        for name, loaded_weight in weights:
-            if "rotary_emb.inv_freq" in name:
-                continue
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                # Skip non-stacked layers and experts (experts handled below).
                if weight_name not in name:
@@ -586,3 +532,65 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP):
                    weight.data=weight.data.reshape(ori_shape[1],-1)
            
        return loaded_params
+
+
+class Qwen2MoeForCausalLM(nn.Module, SupportsPP):
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen2MoeModel(vllm_config=vllm_config,
+                                   prefix=maybe_prefix(prefix, "model"))
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
+        if self.config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = get_sampler()
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
+                                   inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: Optional[torch.Tensor],
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["rotary_emb.inv_freq"]),
+        )
+        return loader.load_weights(weights)
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -56,15 +56,15 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (ImageItem, ModalityData,
-                                    MultiModalFieldConfig, MultiModalKwargs,
-                                    VideoItem)
+                                    MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs, VideoItem)
 from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize,
                                   ModalityDataItems, MultiModalDataItems,
                                   MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptReplacement,
                                        PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
@@ -781,7 +781,7 @@ class Qwen2VLMultiModalDataParser(MultiModalDataParser):
    def _parse_image_data(
        self,
        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
-    ) -> ModalityDataItems[Any, Any]:
+    ) -> Optional[ModalityDataItems[Any, Any]]:
        if isinstance(data, dict):
            return DictEmbeddingItems(
                data,
@@ -795,7 +795,7 @@ class Qwen2VLMultiModalDataParser(MultiModalDataParser):
    def _parse_video_data(
        self,
        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
-    ) -> ModalityDataItems[Any, Any]:
+    ) -> Optional[ModalityDataItems[Any, Any]]:
        if isinstance(data, dict):
            return DictEmbeddingItems(
                data,
@@ -879,16 +879,6 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None, "video": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {
-            "image": self.get_max_image_tokens(),
-            "video": self.get_max_video_tokens(seq_len, mm_counts),
-        }
-
    def _get_vision_info(
        self,
        *,
@@ -1036,11 +1026,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):

 class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):

-    def get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_images = mm_counts.get("image", 0)
        num_videos = mm_counts.get("video", 0)

@@ -1048,12 +1034,22 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
        image_token: str = hf_processor.image_token
        video_token: str = hf_processor.video_token

+        return image_token * num_images + video_token * num_videos
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+        num_videos = mm_counts.get("video", 0)
+
        target_width, target_height = \
            self.info.get_image_size_with_most_features()
        target_num_frames = \
            self.info.get_num_frames_with_most_features(seq_len, mm_counts)

-        mm_data = {
+        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
@@ -1067,11 +1063,6 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
            )
        }

-        return ProcessorInputs(
-            prompt_text=image_token * num_images + video_token * num_videos,
-            mm_data=mm_data,
-        )
-

 class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
                                 ):
@@ -1338,6 +1329,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,

        return modalities

+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
    def get_multimodal_embeddings(
            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:


--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen3 model compatible with HuggingFace weights."""
+from typing import Iterable, Optional, Set, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import Qwen3Config
+
+from vllm.attention import Attention, AttentionType
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import SupportsLoRA, SupportsPP
+from .qwen2 import Qwen2MLP as Qwen3MLP
+from .qwen2 import Qwen2Model
+from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix
+
+logger = init_logger(__name__)
+
+
+class Qwen3Attention(nn.Module):
+
+    def __init__(self,
+                 hidden_size: int,
+                 num_heads: int,
+                 num_kv_heads: int,
+                 max_position: int = 4096 * 32,
+                 head_dim: Optional[int] = None,
+                 rms_norm_eps: float = 1e-06,
+                 qkv_bias: bool = False,
+                 rope_theta: float = 10000,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 rope_scaling: Optional[Tuple] = None,
+                 prefix: str = "",
+                 attn_type: str = AttentionType.DECODER) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=self.rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = Attention(self.num_heads,
+                              self.head_dim,
+                              self.scaling,
+                              num_kv_heads=self.num_kv_heads,
+                              cache_config=cache_config,
+                              quant_config=quant_config,
+                              prefix=f"{prefix}.attn",
+                              attn_type=attn_type)
+        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        # Add qk-norm
+        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim,
+                           self.head_dim)
+        q_by_head = self.q_norm.forward_native(q_by_head)
+        q = q_by_head.view(q.shape)
+        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
+                           self.head_dim)
+        k_by_head = self.k_norm.forward_native(k_by_head)
+        k = k_by_head.view(k.shape)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Qwen3DecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: Qwen3Config,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # Requires transformers > 4.32.0
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+
+        # By default, Qwen3 uses causal attention as it is a decoder-only model.
+        # You can override the HF config with `is_causal=False` to enable
+        # bidirectional attention, which is used in some embedding models
+        # (e.g. Alibaba-NLP/gte-Qwen3-7B-instruct)
+        if getattr(config, "is_causal", True):
+            attn_type = AttentionType.DECODER
+        else:
+            attn_type = AttentionType.ENCODER_ONLY
+
+        self.self_attn = Qwen3Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=config.max_position_embeddings,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rms_norm_eps=config.rms_norm_eps,
+            qkv_bias=getattr(config, 'attention_bias', False),
+            head_dim=getattr(config, 'head_dim', None),
+            cache_config=cache_config,
+            quant_config=quant_config,
+            rope_scaling=rope_scaling,
+            prefix=f"{prefix}.self_attn",
+            attn_type=attn_type,
+        )
+        self.mlp = Qwen3MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+ALL_DECODER_LAYER_TYPES = {
+    "attention": Qwen3DecoderLayer,
+}
+
+
+@support_torch_compile(
+    dynamic_arg_dims={
+        "input_ids": 0,
+        # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
+        # otherwise (seq_len, ).
+        "positions": -1,
+        "intermediate_tensors": 0,
+        "inputs_embeds": 0,
+    })
+class Qwen3Model(Qwen2Model):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config,
+                         prefix=prefix,
+                         decoder_layer_type=Qwen3DecoderLayer)
+
+
+class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.config = config
+        self.lora_config = lora_config
+
+        self.quant_config = quant_config
+        self.model = Qwen3Model(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
+
+        if get_pp_group().is_last_rank:
+            if config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(config.vocab_size,
+                                              config.hidden_size,
+                                              quant_config=quant_config,
+                                              prefix=maybe_prefix(
+                                                  prefix, "lm_head"))
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = get_sampler()
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
+                                   inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."]
+                           if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen3MoE model compatible with HuggingFace weights."""
+from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.attention import Attention
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import (get_pp_group,
+                              get_tensor_model_parallel_world_size,
+                              tensor_model_parallel_all_reduce)
+from vllm.logger import init_logger
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               ReplicatedLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import SupportsPP
+from .utils import (AutoWeightsLoader, extract_layer_index,
+                    is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
+
+logger = init_logger(__name__)
+
+
+class Qwen3MoeMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size, [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj")
+        self.down_proj = RowParallelLinear(intermediate_size,
+                                           hidden_size,
+                                           bias=False,
+                                           quant_config=quant_config,
+                                           reduce_results=reduce_results,
+                                           prefix=f"{prefix}.down_proj")
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Qwen3MoeSparseMoeBlock(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}.")
+
+        self.experts = FusedMoE(num_experts=config.num_experts,
+                                top_k=config.num_experts_per_tok,
+                                hidden_size=config.hidden_size,
+                                intermediate_size=config.moe_intermediate_size,
+                                reduce_results=False,
+                                renormalize=config.norm_topk_prob,
+                                quant_config=quant_config,
+                                prefix=f"{prefix}.experts")
+
+        self.gate = ReplicatedLinear(config.hidden_size,
+                                     config.num_experts,
+                                     bias=False,
+                                     quant_config=None,
+                                     prefix=f"{prefix}.gate")
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        final_hidden_states = self.experts(hidden_states=hidden_states,
+                                           router_logits=router_logits)
+        final_hidden_states = final_hidden_states
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
+
+        return final_hidden_states.view(orig_shape)
+
+
+class Qwen3MoeAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        head_dim: Optional[int] = None,
+        rms_norm_eps: float = 1e-06,
+        qkv_bias: bool = False,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim or (hidden_size // self.total_num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(hidden_size,
+                                          self.head_dim,
+                                          self.total_num_heads,
+                                          self.total_num_kv_heads,
+                                          bias=qkv_bias,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.qkv_proj")
+
+        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
+                                        hidden_size,
+                                        bias=False,
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.o_proj")
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = Attention(self.num_heads,
+                              self.head_dim,
+                              self.scaling,
+                              num_kv_heads=self.num_kv_heads,
+                              cache_config=cache_config,
+                              quant_config=quant_config,
+                              prefix=f"{prefix}.attn")
+
+        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        # Add qk-norm
+        q_by_head = q.view(*q.shape[:-1], q.shape[-1] // self.head_dim,
+                           self.head_dim)
+        q_by_head = self.q_norm.forward_native(q_by_head)
+        q = q_by_head.view(q.shape)
+
+        k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
+                           self.head_dim)
+        k_by_head = self.k_norm.forward_native(k_by_head)
+        k = k_by_head.view(k.shape)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Qwen3MoeDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings",
+                                          8192)
+        self.self_attn = Qwen3MoeAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            rms_norm_eps=config.rms_norm_eps,
+            qkv_bias=getattr(config, 'attention_bias', False),
+            head_dim=getattr(config, 'head_dim', None),
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        # `mlp_only_layers` in the config.
+        layer_idx = extract_layer_index(prefix)
+        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
+                           config.mlp_only_layers)
+        if (layer_idx not in mlp_only_layers) and (
+                config.num_experts > 0 and
+            (layer_idx + 1) % config.decoder_sparse_step == 0):
+            self.mlp = Qwen3MoeSparseMoeBlock(config=config,
+                                              quant_config=quant_config,
+                                              prefix=f"{prefix}.mlp")
+        else:
+            self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
+                                   intermediate_size=config.intermediate_size,
+                                   hidden_act=config.hidden_act,
+                                   quant_config=quant_config,
+                                   prefix=f"{prefix}.mlp")
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+@support_torch_compile
+class Qwen3MoeModel(nn.Module):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.config = config
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=f"{prefix}.embed_tokens")
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: Qwen3MoeDecoderLayer(config=config,
+                                                cache_config=cache_config,
+                                                quant_config=quant_config,
+                                                prefix=prefix),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size))
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states, residual = layer(positions, hidden_states, residual)
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({
+                "hidden_states": hidden_states,
+                "residual": residual
+            })
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts)
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if ((name.endswith(".bias") or name.endswith("_bias"))
+                        and name not in params_dict):
+                    continue
+                # Skip layers on other devices.
+                if is_pp_missing_parameter(name, self):
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip layers on other devices.
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    # Skip loading extra bias for GPTQ models.
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param,
+                                  loaded_weight,
+                                  name,
+                                  shard_id=shard_id,
+                                  expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
+                        continue
+                    # Skip layers on other devices.
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    # Remapping the name of FP8 kv-scale.
+                    if name.endswith("kv_scale"):
+                        remapped_kv_scale_name = name.replace(
+                            ".kv_scale", ".attn.kv_scale")
+                        if remapped_kv_scale_name not in params_dict:
+                            logger.warning_once(
+                                "Found kv scale in the checkpoint "
+                                f"(e.g. {name}), but not found the expected "
+                                f"name in the model "
+                                f"(e.g. {remapped_kv_scale_name}). "
+                                "kv-scale is not loaded.")
+                            continue
+                        else:
+                            name = remapped_kv_scale_name
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class Qwen3MoeForCausalLM(nn.Module, SupportsPP):
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen3MoeModel(vllm_config=vllm_config,
+                                   prefix=maybe_prefix(prefix, "model"))
+        self.lm_head = ParallelLMHead(config.vocab_size,
+                                      config.hidden_size,
+                                      quant_config=quant_config)
+        if self.config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = get_sampler()
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
+                                   inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: Optional[torch.Tensor],
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["rotary_emb.inv_freq"]),
+        )
+        return loader.load_weights(weights)
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -32,12 +32,13 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptReplacement,
                                        PromptUpdate, PromptUpdateDetails)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors

 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
@@ -530,13 +531,6 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> Mapping[str, int]:
-        return {"image": self.get_num_image_tokens()}
-
    def get_num_image_tokens(self) -> int:
        hf_config = self.get_hf_config()
        vision_config = hf_config.visual
@@ -549,34 +543,34 @@ class QwenVLProcessingInfo(BaseProcessingInfo):

 class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):

-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        hf_processor = self.info.get_hf_processor()
+        img_start = hf_processor.image_start_tag
+        img_end = hf_processor.image_end_tag
+
+        return "".join(f"Picture {i}: {img_start}{img_end}\n"
+                       for i in range(1, num_images + 1))
+
+    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
        hf_config = self.info.get_hf_config()
        vision_config = hf_config.visual

-        processor = self.info.get_hf_processor()
-        img_start = processor.image_start_tag
-        img_end = processor.image_end_tag
-
        target_width = target_height = vision_config["image_size"]
        num_images = mm_counts.get("image", 0)

-        mm_data = {
+        return {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }

-        return ProcessorInputs(
-            prompt_text="".join(f"Picture {i}: {img_start}{img_end}\n"
-                                for i in range(1, num_images + 1)),
-            mm_data=mm_data,
-        )
-

 class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):

@@ -647,9 +641,9 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
            PromptReplacement(
                modality="image",
                target=[img_start_id, img_end_id],
-                replacement=PromptUpdateDetails(
-                    full=[img_start_id] + image_tokens + [img_end_id],
-                    features=image_tokens,
+                replacement=PromptUpdateDetails.select_token_id(
+                    [img_start_id] + image_tokens + [img_end_id],
+                    embed_token_id=img_pad_id,
                ),
            )
        ]
@@ -740,6 +734,9 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,

        return self.transformer.visual(image_input["data"])

+    def get_language_model(self) -> torch.nn.Module:
+        return self.transformer
+
    def get_multimodal_embeddings(
            self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
        image_input = self._parse_and_validate_image_input(**kwargs)