[Bugfix][VLM] Fix transformers backend embed_multimodal for Qwen2.5-VL profiling (#32969)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>

[Bugfix][VLM] Fix transformers backend embed_multimodal for Qwen2.5-VL profiling (#32969)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
22aeb430 · Andreas Karatzas · GitHub · a698e8e7 · 22aeb430
Unverified commit 22aeb430, authored Jan 25, 2026 by Andreas Karatzas; committed by GitHub on Jan 26, 2026.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 38 additions and 13 deletions

vllm/model_executor/models/transformers/multimodal.py vllm/model_executor/models/transformers/multimodal.py +38 -13

No files found.
--- a/vllm/model_executor/models/transformers/multimodal.py
+++ b/vllm/model_executor/models/transformers/multimodal.py
@@ -386,19 +386,44 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
                vision_embeddings = vision_embeddings.pooler_output
            if isinstance(vision_embeddings, torch.Tensor):
-                if vision_embeddings.ndim == 2:
+                split_sizes = num_image_patches.flatten().tolist()
-                    vision_embeddings = vision_embeddings.unsqueeze(0)
+                total_patches = sum(split_sizes)
-                # Embeddings have to be 2D tensors of length `num_images`
+                # Flatten to 2D: [total_tokens, hidden_dim]
-                # but transformers returns concat tensors if each patch
+                if vision_embeddings.ndim == 3:
-                # is of different size. We split it back to make vLLM happy
+                    vision_embeddings = vision_embeddings.view(
-                vision_embeddings = torch.split(
+                        -1, vision_embeddings.shape[-1]
-                    vision_embeddings, num_image_patches.flatten().tolist()
+                    )
-                )
-                vision_embeddings = [
+                total_tokens = vision_embeddings.shape[0]
-                    embed.flatten(start_dim=0, end_dim=-2)
+                if total_tokens == total_patches:
-                    for embed in vision_embeddings
+                    # Direct match: num_image_patches are actual token counts
-                ]
+                    # (e.g., Qwen2.5-VL style)
+                    token_split_sizes = split_sizes
+                elif total_patches > 0 and total_tokens % total_patches == 0:
+                    # Uniform expansion: each patch expands to N tokens
+                    # (e.g., Idefics3 style)
+                    tokens_per_patch = total_tokens // total_patches
+                    token_split_sizes = [s * tokens_per_patch for s in split_sizes]
+                elif total_patches > 0:
+                    # Mismatch (profiling with dummy data) - pad/truncate
+                    if total_tokens == 0:
+                        raise ValueError(
+                            "Vision encoder returned empty embeddings. "
+                            f"Expected {total_patches} patches from "
+                            f"num_image_patches={split_sizes}"
+                        )
+                    if total_tokens < total_patches:
+                        repeat_factor = (
+                            total_patches + total_tokens - 1
+                        ) // total_tokens
+                        vision_embeddings = vision_embeddings.repeat(repeat_factor, 1)
+                    vision_embeddings = vision_embeddings[:total_patches]
+                    token_split_sizes = split_sizes
+                else:
+                    return []
+                return list(torch.split(vision_embeddings, token_split_sizes, dim=0))
            return vision_embeddings
        else: