[Bugfix] Fix encoder cache underestimation for GLM-4V/GLM-OCR single image (#34483)

Signed-off-by: haosdent <haosdent@gmail.com>

[Bugfix] Fix encoder cache underestimation for GLM-4V/GLM-OCR single image (#34483)
Signed-off-by: haosdent <haosdent@gmail.com>
dcf6ee85 · haosdent · GitHub · 372b2e76 · dcf6ee85
Unverified Commit dcf6ee85 authored Feb 13, 2026 by haosdent Committed by GitHub Feb 12, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 22 additions and 2 deletions

vllm/model_executor/models/glm4_1v.py vllm/model_executor/models/glm4_1v.py +22 -2

No files found.
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -869,9 +869,28 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
        return preprocessed_size, num_vision_tokens
+    def _get_image_max_pixels(self) -> int:
+        """Read max_pixels from the HF image processor config.
+        Despite the name, ``longest_edge`` is a pixel **area** (total pixel
+        count), not an edge length.  The HF processor passes it directly to
+        ``smart_resize`` as the ``max_pixels`` argument, which constrains
+        ``t_bar * h_bar * w_bar <= max_pixels``.
+        """
+        return self.get_image_processor().size["longest_edge"]
    def get_image_size_with_most_features(self) -> ImageSize:
+        # Use num_frames=1 for single-image budget estimation.
+        # _get_vision_info defaults to num_frames=16 (video), which
+        # makes smart_resize constrain 16*H*W <= max_pixels, vastly
+        # underestimating the spatial budget for a single image and
+        # causing encoder cache overflow for large images
+        # (see https://github.com/vllm-project/vllm/issues/34040).
        max_image_size, _ = self._get_vision_info(
-            image_width=9999999, image_height=9999999
+            image_width=9999999,
+            image_height=9999999,
+            num_frames=1,
+            max_image_pixels=self._get_image_max_pixels(),
        )
        return max_image_size
@@ -884,7 +903,8 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
        _, num_image_tokens = self._get_vision_info(
            image_width=image_width,
            image_height=image_height,
-            max_image_pixels=28 * 28 * 2 * 6144,
+            num_frames=1,
+            max_image_pixels=self._get_image_max_pixels(),
        )
        return num_image_tokens