Unverified commit b6e636c1, authored by zhang-prog and committed by GitHub
Browse files

[Fix] handle PaddleOCR-VL image processor max_pixels across Transformers v4/v5 (#38629)


Signed-off-by: zhangyue66 <zhangyue66@baidu.com>
parent f1ff50c8
......@@ -200,7 +200,13 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
merge_size = hf_config.vision_config.spatial_merge_size
patch_size = hf_config.vision_config.patch_size
factor = merge_size * patch_size
max_num_tokens = image_processor.max_pixels // (factor**2)
if self.ctx.model_config.trust_remote_code:
# Defined in HF Hub repo
max_pixels = image_processor.max_pixels
else:
# Defined in Transformers library (requires v5.0 or above)
max_pixels = image_processor.size.longest_edge
max_num_tokens = max_pixels // (factor**2)
# Find factors of max_num_tokens close to its square root
# to create a dummy image with a reasonable aspect ratio.
h_patches = int(math.sqrt(max_num_tokens))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment