[2/N] Initialize MM components in context managers (E-H) (#32641)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[2/N] Initialize MM components in context managers (E-H) (#32641)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
e1a34c3a · Cyrus Leung · GitHub · 148117ea · e1a34c3a · e1a34c3a
Unverified commit e1a34c3a, authored Jan 20, 2026 by Cyrus Leung; committed by GitHub on Jan 20, 2026.
12 changed files
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -590,8 +590,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
    def _process_image_input(
        self, image_input: AriaImagePixelInputs
    ) -> tuple[torch.Tensor, torch.Tensor]:
-        assert self.vision_tower is not None
-
        pixel_values = image_input["pixel_values"]
        pixel_mask = image_input["pixel_mask"]


--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -382,7 +382,6 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
    def _process_image_input(
        self, image_input: AyaVisionImagePixelInputs, **kwargs
    ) -> list[torch.Tensor]:
-        assert self.vision_tower is not None
        pixel_values = image_input["pixel_values"]
        num_patches = image_input["num_patches"]
        image_features = self._image_pixels_to_features(

--- a/vllm/model_executor/models/cohere2_vision.py
+++ b/vllm/model_executor/models/cohere2_vision.py
@@ -391,8 +391,6 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, Suppo
        Returns:
            List of flattened image embeddings, one per image
        """
-        assert self.vision_tower is not None, "Vision tower is required"
-
        pixel_values = image_input["pixel_values"]
        num_patches = image_input["num_patches"]


--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -1303,6 +1303,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
        self.config = config
        self.multimodal_config = multimodal_config

+        with self._mark_tower_model(vllm_config, {"image", "video"}):
            self.vision_model = Ernie4_5_VisionTransformer(
                config.vision_config,
                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
@@ -1310,12 +1311,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(
                multimodal_config=multimodal_config,
                prefix=maybe_prefix(prefix, "vision_model"),
            )
-
-        self.language_model = Ernie4_5_VLMoeForCausalLM(
-            vllm_config=vllm_config,
-            prefix=maybe_prefix(prefix, "language_model"),
-        )
-
            self.resampler_model = VariableResolutionResamplerModel(
                self.config.pixel_hidden_size,
                self.config.hidden_size,
@@ -1325,6 +1320,12 @@ class Ernie4_5_VLMoeForConditionalGeneration(
                prefix=maybe_prefix(prefix, "resampler_model"),
            )

+        with self._mark_language_model(vllm_config):
+            self.language_model = Ernie4_5_VLMoeForCausalLM(
+                vllm_config=vllm_config,
+                prefix=maybe_prefix(prefix, "language_model"),
+            )
+
        self.visual_token_mask = None
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors
@@ -1522,9 +1523,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(
        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
        return llm_positions, mrope_position_delta

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def _parse_and_validate_image_input(
        self, **kwargs: object
    ) -> Ernie4_5_VLImageInputs | None:

--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -287,16 +287,20 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
        self.image_token_id = _IMAGE_TOKEN_ID
        self.image_feature_size = config.patch_size**2 * config.num_channels

+        with self._mark_tower_model(vllm_config, "image"):
            self.vision_embed_tokens = ColumnParallelLinear(
                self.image_feature_size,
                config.hidden_size,
                quant_config=quant_config,
                gather_output=True,
            )
+
+        with self._mark_language_model(vllm_config):
            self.language_model = PersimmonForCausalLM(
                vllm_config=vllm_config.with_hf_config(config.text_config),
                prefix=maybe_prefix(prefix, "language_model"),
            )
+
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors
        )
@@ -323,14 +327,10 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
        image_patches_flat = image_input["image_patches_flat"]
        patches_per_image = image_input["patches_per_image"]

-        assert self.vision_embed_tokens is not None
        vision_embeddings_flat, _ = self.vision_embed_tokens(image_patches_flat)

        return vision_embeddings_flat.split(patches_per_image.tolist(), dim=0)

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
@@ -361,10 +361,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | None:
-        logits = self.language_model.logits_processor(
-            self.language_model.lm_head, hidden_states
-        )
-        return logits
+        return self.language_model.compute_logits(hidden_states)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(self)

--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -522,6 +522,7 @@ class Gemma3ForConditionalGeneration(
        self.quant_config = quant_config
        self.multimodal_config = multimodal_config

+        with self._mark_tower_model(vllm_config, "image"):
            self.vision_tower = SiglipVisionModel(
                config.vision_config,
                quant_config,
@@ -529,14 +530,15 @@ class Gemma3ForConditionalGeneration(
            )
            self.multi_modal_projector = Gemma3MultiModalProjector(config)

+        with self._mark_language_model(vllm_config):
            self.language_model = init_vllm_registered_model(
                vllm_config=vllm_config,
                hf_config=config.text_config,
                prefix=maybe_prefix(prefix, "language_model"),
                architectures=["Gemma3ForCausalLM"],
            )
-        logit_scale = getattr(config, "logit_scale", 1.0)

+            logit_scale = getattr(config, "logit_scale", 1.0)
            if hasattr(self.language_model, "logits_processor"):
                # The logits processor can be unset if we're using
                # automatic conversion to pooling model.
@@ -579,8 +581,6 @@ class Gemma3ForConditionalGeneration(
        self,
        image_input: Gemma3ImageInputs,
    ) -> list[torch.Tensor]:
-        assert self.vision_tower is not None
-
        pixel_values = image_input["pixel_values"]
        num_patches = image_input["num_patches"]

@@ -592,9 +592,6 @@ class Gemma3ForConditionalGeneration(

        return [e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist())]

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:

--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -503,22 +503,26 @@ class Gemma3nForConditionalGeneration(
        self.multimodal_config = multimodal_config
        self.vocab_size = config.text_config.vocab_size

+        with self._mark_tower_model(vllm_config, "image"):
            self.vision_tower = AutoModel.from_config(config=config.vision_config)
-        self.audio_tower = AutoModel.from_config(config=config.audio_config)
            self.embed_vision = Gemma3nMultimodalEmbedder(
                config.vision_config, config.text_config
            )
+
+        with self._mark_tower_model(vllm_config, "audio"):
+            self.audio_tower = AutoModel.from_config(config=config.audio_config)
            self.embed_audio = Gemma3nMultimodalEmbedder(
                config.audio_config, config.text_config
            )

-        self.language_model: nn.Module = init_vllm_registered_model(
+        with self._mark_language_model(vllm_config):
+            self.language_model: Gemma3nForCausalLM = init_vllm_registered_model(
                vllm_config=vllm_config,
                hf_config=config.text_config,
                prefix=maybe_prefix(prefix, "language_model"),
                architectures=["Gemma3nForCausalLM"],
            )
-        self.language_model = cast(Gemma3nForCausalLM, self.language_model)
+
            # NOTE (NickLucche) In order to be compatible with cudagraph, the
            # buffer needs to be consistent, so we pre-allocate here.
            self.per_layer_embeddings = torch.zeros(
@@ -583,8 +587,6 @@ class Gemma3nForConditionalGeneration(
        self,
        image_input: Gemma3nImageInputs,
    ) -> list[torch.Tensor]:
-        assert self.vision_tower is not None
-
        pixel_values = image_input["pixel_values"]
        vision_outputs = self.vision_tower(
            pixel_values=pixel_values, do_pooling=False, return_dict=True
@@ -609,7 +611,6 @@ class Gemma3nForConditionalGeneration(
        self,
        audio_input: Gemma3nAudioInputs,
    ) -> list[torch.Tensor]:
-        assert self.audio_tower is not None
        # Run on padded features to enable batching
        input_features = audio_input["input_features_padded"].squeeze(1)
        input_features_mask = audio_input["input_features_mask"].squeeze(1)
@@ -651,9 +652,6 @@ class Gemma3nForConditionalGeneration(
        # Return a list of embeddings instead of a batched tensor
        return audio_features.unbind(0)

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
        if mm_input_by_modality is None:

--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -1434,6 +1434,7 @@ class Glm4vForConditionalGeneration(
        self.multimodal_config = multimodal_config
        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"

+        with self._mark_tower_model(vllm_config, {"image", "video"}):
            self.visual = Glm4vVisionTransformer(
                config.vision_config,
                norm_eps=getattr(config, "rms_norm_eps", 1e-5),
@@ -1449,6 +1450,7 @@ class Glm4vForConditionalGeneration(
        else:
            architectures = None

+        with self._mark_language_model(vllm_config):
            self.language_model = init_vllm_registered_model(
                vllm_config=vllm_config,
                hf_config=config.text_config,
@@ -1578,9 +1580,6 @@ class Glm4vForConditionalGeneration(
                )
        return mm_input_by_modality

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
        if not mm_input_by_modality:

--- a/vllm/model_executor/models/glmasr.py
+++ b/vllm/model_executor/models/glmasr.py
@@ -944,8 +944,9 @@ class GlmAsrForConditionalGeneration(
        multimodal_config = vllm_config.model_config.multimodal_config
        self.config = config
        self.multimodal_config = multimodal_config
+        self.quant_config = quant_config

-        # Use optimized vLLM native encoder
+        with self._mark_tower_model(vllm_config, "audio"):
            self.audio_tower = GlmAsrEncoder(
                config.audio_config,
                quant_config=quant_config,
@@ -956,8 +957,8 @@ class GlmAsrForConditionalGeneration(
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "multi_modal_projector"),
            )
-        self.quant_config = quant_config

+        with self._mark_language_model(vllm_config):
            self.language_model = init_vllm_registered_model(
                vllm_config=vllm_config,
                hf_config=config.text_config,
@@ -1063,9 +1064,6 @@ class GlmAsrForConditionalGeneration(
        )
        return _group_audio_embeddings(chunk_embeddings, chunk_counts)

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
        audio_input = self._parse_and_validate_audio_input(**kwargs)
        if audio_input is None:

--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -597,6 +597,7 @@ class GraniteSpeechForConditionalGeneration(
        self.quant_config = quant_config
        self.cache_config = cache_config

+        with self._mark_language_model(vllm_config):
            # The language model is typically a Granite LLM
            self.language_model = init_vllm_registered_model(
                vllm_config=vllm_config,
@@ -604,6 +605,7 @@ class GraniteSpeechForConditionalGeneration(
                prefix=maybe_prefix(prefix, "language_model"),
            )

+        with self._mark_tower_model(vllm_config, "audio"):
            # Conformer encoder
            self.encoder = GraniteSpeechCTCEncoder(
                config=config.encoder_config,
@@ -770,9 +772,6 @@ class GraniteSpeechForConditionalGeneration(
        # Split variable length features into a tuple
        return torch.split(masked_embeds, audio_input["audio_embed_sizes"])

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def embed_multimodal(
        self,
        **kwargs: object,

--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -877,7 +877,7 @@ class HunYuanVLForConditionalGeneration(
        self.config = config
        self.multimodal_config = multimodal_config

-        if multimodal_config.get_limit_per_prompt("image"):
+        with self._mark_tower_model(vllm_config, {"image"}):
            attn_backend_override = (
                multimodal_config.mm_encoder_attn_backend
                if multimodal_config is not None
@@ -890,9 +890,8 @@ class HunYuanVLForConditionalGeneration(
                multimodal_config=multimodal_config,
                attn_backend_override=attn_backend_override,
            )
-        else:
-            self.visual = None

+        with self._mark_language_model(vllm_config):
            self.language_model = init_vllm_registered_model(
                vllm_config=vllm_config,
                prefix=maybe_prefix(prefix, "language_model.model"),
@@ -970,9 +969,6 @@ class HunYuanVLForConditionalGeneration(
                )
        return mm_input_by_modality

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
        if not mm_input_by_modality:

--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -15,7 +15,6 @@ from einops import rearrange
 from timm.layers import LayerNorm, LayerNorm2d
 from timm.models.regnet import RegStage
 from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig
-from transformers.modeling_utils import no_init_weights

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
@@ -625,8 +624,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
            config, vision_config
        )

-        # init models & parameters
-        with no_init_weights():  # weight will be loaded in from_pretrained
+        with self._mark_tower_model(vllm_config, {"image", "video"}):
            self.vision_model = init_vision_tower_for_hcxvision(
                vision_config,
                quant_config=quant_config,
@@ -635,22 +633,22 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
                require_post_norm=False,
                prefix=maybe_prefix(prefix, "vision_model"),
            )
-        self.mm_projector = self._init_mm_projector(config, text_config, vision_config)
+            self.mm_projector = self._init_mm_projector(
+                config, text_config, vision_config
+            )

-        self.lm_head_vocab_size = getattr(
-            text_config, "padded_vocab_size", text_config.vocab_size
+            if config.anyres:
+                self.image_newline = nn.Parameter(
+                    torch.empty(text_config.hidden_size, dtype=self.dtype)
                )
+
+        with self._mark_language_model(vllm_config):
            self.language_model = init_vllm_registered_model(
                vllm_config=vllm_config,
                hf_config=text_config,
                prefix=maybe_prefix(prefix, "language_model"),
            )

-        if config.anyres:
-            self.image_newline = nn.Parameter(
-                torch.empty(text_config.hidden_size, dtype=self.dtype)
-            )
-
        self.config = config
        self.vision_config = vision_config
        self.text_config = text_config
@@ -726,9 +724,6 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):

        return modalities

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def embed_multimodal(
        self,
        **kwargs: object,