pad images if exception

1c242485 · Baber · 6dc55fb3 · 1c242485 · 1c242485
Commit 1c242485 authored Sep 19, 2024 by Baber
Hide whitespace changes
Inline Side-by-side

Showing with 60 additions and 10 deletions

lm_eval/models/hf_vlms.py lm_eval/models/hf_vlms.py +23 -10

lm_eval/utils.py lm_eval/utils.py +37 -0

No files found.
--- a/lm_eval/models/hf_vlms.py
+++ b/lm_eval/models/hf_vlms.py
@@ -17,6 +17,7 @@ from lm_eval.models.utils import (
    replace_placeholders,
    stop_sequences_criteria,
 )
+from lm_eval.utils import add_padding_if_needed
 DEFAULT_IMAGE_PLACEHOLDER = "<image>"
@@ -266,7 +267,9 @@ class HFMultimodalLM(HFLM):
    def tok_batch_multimodal_encode(
        self,
        strings: List[str],  # note that input signature of this fn is different
-        images: List[List],  # TODO: images are pil.Image at the moment, update typehint
+        images: List[
+            List["PIL.Image.Image"]  # noqa: F821
+        ],  # TODO: images are pil.Image at the moment, update typehint
        padding_side: str = "left",
        left_truncate_len: int = None,
        truncation: bool = False,
@@ -292,15 +295,25 @@ class HFMultimodalLM(HFLM):
        images = [img[: self.max_images] for img in images]
        if self.rgb:
            images = [[img.convert("RGB") for img in sublist] for sublist in images]
+        try:
-        encoding = self.processor(
+            encoding = self.processor(
-            images=images,
+                images=images,
-            text=strings,
+                text=strings,
-            truncation=truncation,
+                truncation=truncation,
-            padding="longest",
+                padding="longest",
-            return_tensors="pt",
+                return_tensors="pt",
-            # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
+                # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
-        )
+            )
+            # Qwen processor errors out if a dimension is too small (defaults to do_resize=True, and that requires a min dimension)
+        except Exception:
+            encoding = self.processor(
+                images=[add_padding_if_needed(image) for image in images],
+                text=strings,
+                truncation=truncation,
+                padding="longest",
+                return_tensors="pt",
+                # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
+            )
        encoding.to(  # TODO: our other tokenization methods in HFLM don't typically move to device. this breaks convention
            self.device, self.model.dtype

--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -499,3 +499,40 @@ def weighted_f1_score(items):
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
+def add_padding_if_needed(
+    images: List["PIL.Image.Image"],  # noqa: F821
+    min_width: int = 50,
+    min_height: int = 50,
+    color=(255, 255, 255),
+) -> List["PIL.Image.Image"]:  # noqa: F821
+    """Pad images so each one is at least ``min_width`` x ``min_height``.
+
+    Images that already satisfy both minimums are passed through untouched
+    (same object, original mode). Smaller images are converted to RGB and
+    surrounded by a ``color`` border; the missing pixels are split between
+    the two sides of each axis, with any odd pixel going to the right/bottom.
+
+    Args:
+        images: Images to check; each must expose ``.size`` as (width, height).
+        min_width: Smallest acceptable width in pixels.
+        min_height: Smallest acceptable height in pixels.
+        color: Fill used for the added border (white by default).
+
+    Returns:
+        A new list with one entry per input image, in the same order.
+    """
+    res = []
+    for image in images:
+        width, height = image.size
+        if width >= min_width and height >= min_height:
+            # Keep the image and move on. Returning here would hand back a
+            # single image instead of the list and drop the remaining ones.
+            res.append(image)
+            continue
+        # Imported here so PIL is only required when padding really happens.
+        from PIL import ImageOps
+
+        delta_width = max(width, min_width) - width
+        delta_height = max(height, min_height) - height
+        padding_left = delta_width // 2
+        padding_right = delta_width - padding_left
+        padding_top = delta_height // 2
+        padding_bottom = delta_height - padding_top
+        res.append(
+            ImageOps.expand(
+                image.convert("RGB"),
+                (padding_left, padding_top, padding_right, padding_bottom),
+                fill=color,
+            )
+        )
+    return res