Better weight tying check for multimodal models (#38035)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Better weight tying check for multimodal models (#38035)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
1e673a43 · Harry Mellor · GitHub · 04417ecd · 1e673a43
Unverified Commit 1e673a43 authored Mar 25, 2026 by Harry Mellor Committed by GitHub Mar 25, 2026
Show whitespace changes
Inline Side-by-side

Showing with 30 additions and 17 deletions

vllm/config/vllm.py vllm/config/vllm.py +30 -17

No files found.
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -13,10 +13,12 @@ from dataclasses import is_dataclass
 from datetime import datetime
 from enum import IntEnum
 from functools import lru_cache
+from importlib.metadata import version
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal, TypeVar, get_args

 import torch
+from packaging.version import Version
 from pydantic import ConfigDict, Field, model_validator

 import vllm.envs as envs
@@ -550,26 +552,37 @@ class VllmConfig:  # type: ignore[misc]

        model_config = copy.deepcopy(self.model_config)

-        if (
-            model_config.is_multimodal_model
-            and hasattr(model_config.hf_config, "tie_word_embeddings")
-            and not hasattr(hf_config.get_text_config(), "tie_word_embeddings")
-        ):
        # In Transformers v5, tie_word_embeddings belongs to the config of the class
        # that can see both layers to be tied. For example:
        #
        # SomeVLModel:
-            #   self.language_model = SomeLanguageModel()
-            #   self.vision_model = SomeVisionModel()
+        #   self.language_model = SomeLanguageModel(SomeVLTextConfig)
+        #   self.vision_model = SomeVisionModel(SomeVLVisionConfig)
        #
        # SomeVLModelForMultimodalLM:
-            #   self.model = SomeVLModel()
+        #   self.model = SomeVLModel(SomeVLConfig)
+        #   self.lm_head = nn.Linear()
+        #
+        # Therefore, tie_word_embeddings is defined in SomeVLConfig and is not present
+        # in SomeVLTextConfig*. In vLLM, the lm_head belongs to the language_model, so
+        # we must ensure that tie_word_embeddings is set in the language_model's config.
+        #
+        # *For some models, SomeVLTextConfig may also have a tie_word_embeddings field.
+        # This is only the case if SomeVLTextConfig is also used for a text-only version
+        # of the same model. For example:
+        #
+        # SomeVLModelForCausalLM:
+        #   self.model = SomeLanguageModel(SomeVLTextConfig)
        #   self.lm_head = nn.Linear()
        #
-            # Therefore, tie_word_embeddings is defined in SomeVLModelForMultimodalLM's
-            # config and is not present in SomeVLModel's config. In vLLM, the lm_head
-            # belongs to the language_model, so we must ensure that tie_word_embeddings
-            # is set in the language_model's config.
+        # Therefore, the presence of tie_word_embeddings in SomeVLTextConfig cannot
+        # be used as a signal for whether tie_word_embeddings should be copied from
+        # hf_config to the language_model config.
+        if (
+            Version(version("transformers")) >= Version("5.0.0")
+            and model_config.is_multimodal_model
+            and hasattr(model_config.hf_config, "tie_word_embeddings")
+        ):
            tie_word_embeddings = model_config.hf_config.tie_word_embeddings
            hf_config.get_text_config().tie_word_embeddings = tie_word_embeddings