Unverified Commit 143a7fe0 authored by achervyakov, committed by GitHub

Adding resize images support (#2958)



* first version of image resizing

* fixed bug

* clean up `resize_image`

---------
Co-authored-by: Artem Safin <artemsafin67@gmail.com>
Co-authored-by: Baber <baber@hey.com>
parent 2cfdd0a2
@@ -17,6 +17,7 @@ from lm_eval.models.utils import (
handle_stop_sequences,
pad_and_concat,
replace_placeholders,
resize_image,
stop_sequences_criteria,
)
@@ -45,10 +46,23 @@ class HFMultimodalLM(HFLM):
# TODO: handle whitespace in image placeholder (replacement)
max_images: Optional[int] = 999,
convert_img_format=False,
# For image resizing
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
image_width: Optional[int] = None,
image_height: Optional[int] = None,
image_max_side: Optional[int] = None,
**kwargs,
):
self.image_width = image_width
self.image_height = image_height
self.image_max_side = image_max_side
if self.image_max_side and (self.image_width or self.image_height):
raise ValueError(
"Ambiguous config for image resize: you can not specify both "
"image_max_side and (image_width or image_height)"
)
# init pixels before calling tokenizer creation to avoid errors
self.pixels = ({"min_pixels": min_pixels} if min_pixels else {}) | (
{"max_pixels": max_pixels} if max_pixels else {}
@@ -646,7 +660,15 @@ class HFMultimodalLM(HFLM):
for chunk in chunks:
contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
visuals = [arg["visual"] for arg in aux_arguments]
visuals = [
[
resize_image(
img, self.image_width, self.image_height, self.image_max_side
)
for img in arg["visual"]
]
for arg in aux_arguments
]
if not isinstance(contexts, list):
contexts = list(
......
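Taken together, the new constructor options give two mutually exclusive ways to bound input images: exact targets (`image_width`/`image_height`, fit within the box while keeping aspect ratio) or a cap on the longer side (`image_max_side`); combining the two raises the `ValueError` shown above. A minimal usage sketch, assuming the backend is registered as `hf-multimodal`; the checkpoint and task names are illustrative placeholders, not taken from this PR:

```python
from lm_eval import simple_evaluate

# Cap the longer side of every input image at 1024 px (aspect ratio preserved).
results = simple_evaluate(
    model="hf-multimodal",
    model_args=(
        "pretrained=Qwen/Qwen2-VL-2B-Instruct,"  # placeholder checkpoint
        "image_max_side=1024"
    ),
    tasks=["mmmu_val"],  # placeholder multimodal task
)
```

Passing `image_width=...,image_height=...` instead fits each image within that exact box; passing both styles at once is rejected at construction time.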
@@ -28,6 +28,7 @@ eval_logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from PIL import Image
from transformers import PreTrainedTokenizerBase
from transformers.configuration_utils import PretrainedConfig
@@ -729,3 +730,103 @@ def handle_stop_sequences(
if eos is not None and eos not in until:
until.append(eos)
return until
def resize_image(
image: "Image.Image",
width: Optional[int] = None,
height: Optional[int] = None,
max_dimension: Optional[int] = None,
keep_aspect_ratio: bool = True,
resample_filter: int = 3,  # 3 == Image.BICUBIC in PIL
min_width: int = 1,
min_height: int = 1,
) -> "Image.Image":
"""
Resizes a PIL Image object with flexible options.
Args:
image: The PIL Image object to resize.
width: Target width in pixels.
height: Target height in pixels.
max_dimension: Maximum size for the longer dimension of the image.
keep_aspect_ratio: If True (default) and both width and height are provided,
the image is resized to fit within these dimensions while
maintaining its aspect ratio. If False, the image is stretched
to the exact width and height.
resample_filter: The resampling filter to use for resizing.
Defaults to 3 (PIL's Image.BICUBIC).
min_width: Minimum width for the resized image. Defaults to 1.
min_height: Minimum height for the resized image. Defaults to 1.
Returns:
The resized PIL Image object. If no resize parameters are provided
or if the image already meets the criteria, the original image is returned.
Order of precedence for resizing:
1. If width AND height are provided:
- If keep_aspect_ratio is True: Fits image within bounds, preserving aspect ratio.
- If keep_aspect_ratio is False: Resizes to exact dimensions (may distort).
2. Else if only width is provided: Calculates height proportionally.
3. Else if only height is provided: Calculates width proportionally.
4. Else if max_dimension is provided: Resizes the longest side to max_dimension
and scales the other side proportionally.
5. If none of the above are provided, returns the original image.
"""
original_width, original_height = image.size
# If no arguments are provided, return the original image
if width is None and height is None and max_dimension is None:
return image
new_width = original_width
new_height = original_height
if width is not None and height is not None:
# No resize needed if image is already smaller than target dimensions
if original_width <= width and original_height <= height:
return image
if keep_aspect_ratio:
# Calculate the ratio to fit within the target dimensions
ratio = min(width / original_width, height / original_height)
new_width = int(original_width * ratio)
new_height = int(original_height * ratio)
else:
# Stretch to exact dimensions
new_width = width
new_height = height
elif width is not None:
# No resize needed if width is already smaller
if original_width <= width:
return image
# Calculate height proportionally
new_width = width
new_height = int((original_height / original_width) * new_width)
elif height is not None:
# No resize needed if height is already smaller
if original_height <= height:
return image
# Calculate width proportionally
new_height = height
new_width = int((original_width / original_height) * new_height)
elif max_dimension is not None:
# No resize needed if both dimensions are smaller than max_dimension
if max(original_height, original_width) <= max_dimension:
return image
if original_width > original_height:
# Width is the longer side
new_width = max_dimension
new_height = int((original_height / original_width) * new_width)
else:
# Height is the longer side or sides are equal
new_height = max_dimension
new_width = int((original_width / original_height) * new_height)
# Ensure dimensions are at least minimum values
new_width = max(min_width, new_width)
new_height = max(min_height, new_height)
# Perform the resize operation with the calculated dimensions
return image.resize((new_width, new_height), resample_filter)
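For reference, a small usage sketch of `resize_image`, exercising the precedence rules documented above (expected sizes in the comments; assumes Pillow is installed):

```python
from PIL import Image

from lm_eval.models.utils import resize_image

img = Image.new("RGB", (1920, 1080))

# width + height (keep_aspect_ratio=True): fit inside an 800x800 box
print(resize_image(img, width=800, height=800).size)   # (800, 450)

# width only: height scales proportionally
print(resize_image(img, width=960).size)                # (960, 540)

# max_dimension only: the longer side is capped
print(resize_image(img, max_dimension=640).size)        # (640, 360)

# images already within bounds are returned unchanged
small = Image.new("RGB", (100, 50))
print(resize_image(small, max_dimension=640) is small)  # True
```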
@@ -12,6 +12,7 @@ from lm_eval.models.utils import (
Collator,
handle_stop_sequences,
replace_placeholders,
resize_image,
undistribute,
)
from lm_eval.models.vllm_causallms import VLLM
@@ -44,8 +45,20 @@ class VLLM_VLM(VLLM):
interleave: bool = True,
# TODO<baber>: handle max_images and limit_mm_per_prompt better
max_images: int = 999,
image_width: Optional[int] = None,
image_height: Optional[int] = None,
image_max_side: Optional[int] = None,
**kwargs,
):
self.image_width = image_width
self.image_height = image_height
self.image_max_side = image_max_side
if self.image_max_side and (self.image_width or self.image_height):
raise ValueError(
"Ambiguous config for image resize: you can not specify both "
"image_max_side and (image_width or image_height)"
)
if max_images != 999:
kwargs["limit_mm_per_prompt"] = {"image": max_images}
eval_logger.info(f"Setting limit_mm_per_prompt[image] to {max_images}")
@@ -239,7 +252,15 @@ class VLLM_VLM(VLLM):
for chunk in chunks:
contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
visuals = [arg["visual"] for arg in aux_arguments]
visuals = [
[
resize_image(
img, self.image_width, self.image_height, self.image_max_side
)
for img in arg["visual"]
]
for arg in aux_arguments
]
if not isinstance(contexts, list):
contexts = list(
......
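Both backends apply the same per-request mapping shown in the two generate-loop hunks above. A standalone sketch with dummy data; the `aux_arguments` structure is simplified to just the `"visual"` key, and the sizes are illustrative:

```python
from PIL import Image

from lm_eval.models.utils import resize_image

# Each request carries a list of PIL images under "visual".
aux_arguments = [
    {"visual": [Image.new("RGB", (1920, 1080)), Image.new("RGB", (640, 480))]},
    {"visual": [Image.new("RGB", (3000, 2000))]},
]

image_width = image_height = None
image_max_side = 1024  # e.g. configured via the constructor options above

# Same shape as the change in both generate loops: resize every image of
# every request while preserving the request/visuals nesting.
visuals = [
    [
        resize_image(img, image_width, image_height, image_max_side)
        for img in arg["visual"]
    ]
    for arg in aux_arguments
]

print([[im.size for im in imgs] for imgs in visuals])
# [[(1024, 576), (640, 480)], [(1024, 682)]]
```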