Unverified Commit dcf6ee85 authored by haosdent's avatar haosdent Committed by GitHub
Browse files

[Bugfix] Fix encoder cache underestimation for GLM-4V/GLM-OCR single image (#34483)


Signed-off-by: default avatarhaosdent <haosdent@gmail.com>
parent 372b2e76
...@@ -869,9 +869,28 @@ class Glm4vProcessingInfo(BaseProcessingInfo): ...@@ -869,9 +869,28 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
return preprocessed_size, num_vision_tokens return preprocessed_size, num_vision_tokens
def _get_image_max_pixels(self) -> int:
"""Read max_pixels from the HF image processor config.
Despite the name, ``longest_edge`` is a pixel **area** (total pixel
count), not an edge length. The HF processor passes it directly to
``smart_resize`` as the ``max_pixels`` argument, which constrains
``t_bar * h_bar * w_bar <= max_pixels``.
"""
return self.get_image_processor().size["longest_edge"]
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
# Use num_frames=1 for single-image budget estimation.
# _get_vision_info defaults to num_frames=16 (video), which
# makes smart_resize constrain 16*H*W <= max_pixels, vastly
# underestimating the spatial budget for a single image and
# causing encoder cache overflow for large images
# (see https://github.com/vllm-project/vllm/issues/34040).
max_image_size, _ = self._get_vision_info( max_image_size, _ = self._get_vision_info(
image_width=9999999, image_height=9999999 image_width=9999999,
image_height=9999999,
num_frames=1,
max_image_pixels=self._get_image_max_pixels(),
) )
return max_image_size return max_image_size
...@@ -884,7 +903,8 @@ class Glm4vProcessingInfo(BaseProcessingInfo): ...@@ -884,7 +903,8 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
_, num_image_tokens = self._get_vision_info( _, num_image_tokens = self._get_vision_info(
image_width=image_width, image_width=image_width,
image_height=image_height, image_height=image_height,
max_image_pixels=28 * 28 * 2 * 6144, num_frames=1,
max_image_pixels=self._get_image_max_pixels(),
) )
return num_image_tokens return num_image_tokens
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment