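[Model] Apply #32631 for recent models (#33785)

Wrap sub-module construction in the `_mark_tower_model(...)` and
`_mark_language_model(...)` context managers for Eagle2.5-VL ("image"),
OpenPangu-VL ({"image", "video"}) and Qwen3-ASR ("audio"), and remove the
per-model `get_language_model()` overrides from those three models and from
FunAudioChat. In Eagle2.5-VL the `mlp1` projection is now created inside the
tower block, next to the vision encoder; in Qwen3-ASR `self.quant_config` is
assigned before the audio tower is built.

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>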

[Model] Apply #32631 for recent models (#33785)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
e57ef99b · Cyrus Leung · GitHub · f8516a1a · e57ef99b · e57ef99b
Unverified commit e57ef99b, authored Feb 04, 2026 by Cyrus Leung; committed by GitHub on Feb 04, 2026
4 changed files
--- a/vllm/model_executor/models/eagle2_5_vl.py
+++ b/vllm/model_executor/models/eagle2_5_vl.py
@@ -222,6 +222,7 @@ class Eagle2_5_VLForConditionalGeneration(

        self.select_layer = getattr(config, "select_layer", -1)

+        with self._mark_tower_model(vllm_config, "image"):
            # Vision encoder (SigLIP)
            self.vision_model = self._init_vision_model(
                config,
@@ -229,6 +230,10 @@ class Eagle2_5_VLForConditionalGeneration(
                prefix=maybe_prefix(prefix, "vision_model"),
            )

+            # MLP projection
+            self.mlp1 = self._init_mlp1(config)
+
+        with self._mark_language_model(vllm_config):
            # Language model (Qwen2)
            self.language_model = init_vllm_registered_model(
                vllm_config=vllm_config,
@@ -236,9 +241,6 @@ class Eagle2_5_VLForConditionalGeneration(
                prefix=maybe_prefix(prefix, "language_model"),
            )

-        # MLP projection
-        self.mlp1 = self._init_mlp1(config)
-
        self.img_context_token_id = None

        self.make_empty_intermediate_tensors = (
@@ -399,9 +401,6 @@ class Eagle2_5_VLForConditionalGeneration(
        ]
        return image_embeds.split(image_feature_sizes)

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
        """Embed multimodal inputs."""
        image_input = self._parse_and_validate_image_input(**kwargs)

--- a/vllm/model_executor/models/funaudiochat.py
+++ b/vllm/model_executor/models/funaudiochat.py
@@ -820,9 +820,6 @@ class FunAudioChatForConditionalGeneration(nn.Module, SupportsMultiModal, Suppor
            self.language_model.make_empty_intermediate_tensors
        )

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def _get_continuous_audio_features(
        self,
        input_features: torch.Tensor,

--- a/vllm/model_executor/models/openpangu_vl.py
+++ b/vllm/model_executor/models/openpangu_vl.py
@@ -843,6 +843,8 @@ class OpenPanguVLForConditionalGeneration(
        self.config = config
        self.vllm_config = vllm_config
        quant_config = vllm_config.quant_config
+
+        with self._mark_tower_model(vllm_config, {"image", "video"}):
            self.visual = OpenPanguVisionTransformer(
                vision_config=config.vision_config,
                out_hidden_size=config.vision_config.out_hidden_size,
@@ -852,11 +854,13 @@ class OpenPanguVLForConditionalGeneration(
                prefix=maybe_prefix(prefix, "visual"),
            )

+        with self._mark_language_model(vllm_config):
            self.language_model = init_vllm_registered_model(
                vllm_config=vllm_config,
                prefix=maybe_prefix("openpangu", "language_model"),
                architectures=["PanguEmbeddedForCausalLM"],
            )
+
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors
        )
@@ -1008,9 +1012,6 @@ class OpenPanguVLForConditionalGeneration(
                )
        return mm_input_by_modality

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
        if not mm_input_by_modality:

--- a/vllm/model_executor/models/qwen3_asr.py
+++ b/vllm/model_executor/models/qwen3_asr.py
@@ -296,13 +296,15 @@ class Qwen3ASRForConditionalGeneration(
        multimodal_config = vllm_config.model_config.multimodal_config
        self.config = thinker_config
        self.multimodal_config = multimodal_config
+        self.quant_config = quant_config

+        with self._mark_tower_model(vllm_config, "audio"):
            self.audio_tower = Qwen3OmniMoeAudioEncoder(
                thinker_config.audio_config,
                prefix=maybe_prefix(prefix, "audio_tower"),
            )
-        self.quant_config = quant_config

+        with self._mark_language_model(vllm_config):
            self.language_model = Qwen3ForCausalLM(
                vllm_config=vllm_config.with_hf_config(
                    thinker_config.text_config, architectures=["Qwen3ForCausalLM"]
@@ -363,9 +365,6 @@ class Qwen3ASRForConditionalGeneration(
        )
        return audio_features.split(audio_output_lengths.tolist())

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
        if not mm_input_by_modality: