Unverified Commit 143a7fe0 authored by achervyakov, committed by GitHub

Adding resize images support (#2958)



* first version of image resizing

* fixed bug

* clean up `resize_image`

---------
Co-authored-by: Artem Safin <artemsafin67@gmail.com>
Co-authored-by: Baber <baber@hey.com>
parent 2cfdd0a2
@@ -17,6 +17,7 @@ from lm_eval.models.utils import (
handle_stop_sequences,
pad_and_concat,
replace_placeholders,
resize_image,
stop_sequences_criteria,
)
@@ -45,10 +46,23 @@ class HFMultimodalLM(HFLM):
# TODO: handle whitespace in image placeholder (replacement)
max_images: Optional[int] = 999,
convert_img_format=False,
# For image resizing
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
image_width: Optional[int] = None,
image_height: Optional[int] = None,
image_max_side: Optional[int] = None,
**kwargs,
):
self.image_width = image_width
self.image_height = image_height
self.image_max_side = image_max_side
if self.image_max_side and (self.image_width or self.image_height):
raise ValueError(
"Ambiguous config for image resize: you can not specify both "
"image_max_side and (image_width or image_height)"
)
# init pixels before calling tokenizer creation to avoid errors
self.pixels = ({"min_pixels": min_pixels} if min_pixels else {}) | (
{"max_pixels": max_pixels} if max_pixels else {}
@@ -646,7 +660,15 @@ class HFMultimodalLM(HFLM):
for chunk in chunks:
contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
visuals = [arg["visual"] for arg in aux_arguments]
visuals = [
[
resize_image(
img, self.image_width, self.image_height, self.image_max_side
)
for img in arg["visual"]
]
for arg in aux_arguments
]
if not isinstance(contexts, list):
contexts = list(
......
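Taken together, the new constructor options give two mutually exclusive ways to bound input images: exact targets (`image_width`/`image_height`, fit within the box while keeping aspect ratio) or a cap on the longer side (`image_max_side`); combining the two raises the `ValueError` shown above. A minimal usage sketch, assuming the backend is registered as `hf-multimodal`; the checkpoint and task names are illustrative placeholders, not taken from this PR:

```python
from lm_eval import simple_evaluate

# Cap the longer side of every input image at 1024 px (aspect ratio preserved).
results = simple_evaluate(
    model="hf-multimodal",
    model_args=(
        "pretrained=Qwen/Qwen2-VL-2B-Instruct,"  # placeholder checkpoint
        "image_max_side=1024"
    ),
    tasks=["mmmu_val"],  # placeholder multimodal task
)
```

Passing `image_width=...,image_height=...` instead fits each image within that exact box; passing both styles at once is rejected at construction time.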
@@ -28,6 +28,7 @@ eval_logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from PIL import Image
from transformers import PreTrainedTokenizerBase
from transformers.configuration_utils import PretrainedConfig
@@ -729,3 +730,103 @@ def handle_stop_sequences(
if eos is not None and eos not in until:
until.append(eos)
return until
def resize_image(
image: "Image.Image",
width: Optional[int] = None,
height: Optional[int] = None,
max_dimension: Optional[int] = None,
keep_aspect_ratio: bool = True,
resample_filter: int = 3,  # 3 == Image.BICUBIC in PIL
min_width: int = 1,
min_height: int = 1,
) -> "Image.Image":
"""
Resizes a PIL Image object with flexible options.
Args:
image: The PIL Image object to resize.
width: Target width in pixels.
height: Target height in pixels.
max_dimension: Maximum size for the longer dimension of the image.
keep_aspect_ratio: If True (default) and both width and height are provided,
the image is resized to fit within these dimensions while
maintaining its aspect ratio. If False, the image is stretched
to the exact width and height.
resample_filter: The resampling filter to use for resizing.
Defaults to 3 (PIL's Image.BICUBIC).
min_width: Minimum width for the resized image. Defaults to 1.
min_height: Minimum height for the resized image. Defaults to 1.
Returns:
The resized PIL Image object. If no resize parameters are provided
or if the image already meets the criteria, the original image is returned.
Order of precedence for resizing:
1. If width AND height are provided:
- If keep_aspect_ratio is True: Fits image within bounds, preserving aspect ratio.
- If keep_aspect_ratio is False: Resizes to exact dimensions (may distort).
2. Else if only width is provided: Calculates height proportionally.
3. Else if only height is provided: Calculates width proportionally.
4. Else if max_dimension is provided: Resizes the longest side to max_dimension
and scales the other side proportionally.
5. If none of the above are provided, returns the original image.
"""
original_width, original_height = image.size
# If no arguments are provided, return the original image
if width is None and height is None and max_dimension is None:
return image
new_width = original_width
new_height = original_height
if width is not None and height is not None:
# No resize needed if image is already smaller than target dimensions
if original_width <= width and original_height <= height:
return image
if keep_aspect_ratio:
# Calculate the ratio to fit within the target dimensions
ratio = min(width / original_width, height / original_height)
new_width = int(original_width * ratio)
new_height = int(original_height * ratio)
else:
# Stretch to exact dimensions
new_width = width
new_height = height
elif width is not None:
# No resize needed if width is already smaller
if original_width <= width:
return image
# Calculate height proportionally
new_width = width
new_height = int((original_height / original_width) * new_width)
elif height is not None:
# No resize needed if height is already smaller
if original_height <= height:
return image
# Calculate width proportionally
new_height = height
new_width = int((original_width / original_height) * new_height)
elif max_dimension is not None:
# No resize needed if both dimensions are smaller than max_dimension
if max(original_height, original_width) <= max_dimension:
return image
if original_width > original_height:
# Width is the longer side
new_width = max_dimension
new_height = int((original_height / original_width) * new_width)
else:
# Height is the longer side or sides are equal
new_height = max_dimension
new_width = int((original_width / original_height) * new_height)
# Ensure dimensions are at least minimum values
new_width = max(min_width, new_width)
new_height = max(min_height, new_height)
# Perform the resize operation with the calculated dimensions
return image.resize((new_width, new_height), resample_filter)
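For reference, a small usage sketch of `resize_image`, exercising the precedence rules documented above (expected sizes in the comments; assumes Pillow is installed):

```python
from PIL import Image

from lm_eval.models.utils import resize_image

img = Image.new("RGB", (1920, 1080))

# width + height (keep_aspect_ratio=True): fit inside an 800x800 box
print(resize_image(img, width=800, height=800).size)   # (800, 450)

# width only: height scales proportionally
print(resize_image(img, width=960).size)                # (960, 540)

# max_dimension only: the longer side is capped
print(resize_image(img, max_dimension=640).size)        # (640, 360)

# images already within bounds are returned unchanged
small = Image.new("RGB", (100, 50))
print(resize_image(small, max_dimension=640) is small)  # True
```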
@@ -12,6 +12,7 @@ from lm_eval.models.utils import (
Collator,
handle_stop_sequences,
replace_placeholders,
resize_image,
undistribute,
)
from lm_eval.models.vllm_causallms import VLLM
@@ -44,8 +45,20 @@ class VLLM_VLM(VLLM):
interleave: bool = True,
# TODO<baber>: handle max_images and limit_mm_per_prompt better
max_images: int = 999,
image_width: Optional[int] = None,
image_height: Optional[int] = None,
image_max_side: Optional[int] = None,
**kwargs,
):
self.image_width = image_width
self.image_height = image_height
self.image_max_side = image_max_side
if self.image_max_side and (self.image_width or self.image_height):
raise ValueError(
"Ambiguous config for image resize: you can not specify both "
"image_max_side and (image_width or image_height)"
)
if max_images != 999:
kwargs["limit_mm_per_prompt"] = {"image": max_images}
eval_logger.info(f"Setting limit_mm_per_prompt[image] to {max_images}")
@@ -239,7 +252,15 @@ class VLLM_VLM(VLLM):
for chunk in chunks:
contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
visuals = [arg["visual"] for arg in aux_arguments]
visuals = [
[
resize_image(
img, self.image_width, self.image_height, self.image_max_side
)
for img in arg["visual"]
]
for arg in aux_arguments
]
if not isinstance(contexts, list):
contexts = list(
......
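Both backends apply the same per-request mapping shown in the two generate-loop hunks above. A standalone sketch with dummy data; the `aux_arguments` structure is simplified to just the `"visual"` key, and the sizes are illustrative:

```python
from PIL import Image

from lm_eval.models.utils import resize_image

# Each request carries a list of PIL images under "visual".
aux_arguments = [
    {"visual": [Image.new("RGB", (1920, 1080)), Image.new("RGB", (640, 480))]},
    {"visual": [Image.new("RGB", (3000, 2000))]},
]

image_width = image_height = None
image_max_side = 1024  # e.g. configured via the constructor options above

# Same shape as the change in both generate loops: resize every image of
# every request while preserving the request/visuals nesting.
visuals = [
    [
        resize_image(img, image_width, image_height, image_max_side)
        for img in arg["visual"]
    ]
    for arg in aux_arguments
]

print([[im.size for im in imgs] for imgs in visuals])
# [[(1024, 576), (640, 480)], [(1024, 682)]]
```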