[`LlavaNext`] Fix llava next unsafe imports (#29773)

* path llava-next * styling * styling

[`LlavaNext`] Fix llava next unsafe imports (#29773)
* path llava-next * styling * styling
73a73b41 · Arthur · GitHub · 2ddceef9 · 73a73b41 · 73a73b41
Unverified Commit 73a73b41 authored Mar 21, 2024 by Arthur Committed by GitHub Mar 21, 2024
4 changed files
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -748,6 +748,44 @@ def get_size_dict(
    return size_dict


+def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
+    """
+    Selects the best resolution from a list of possible resolutions based on the original size.
+
+    This is done by calculating the effective and wasted resolution for each possible resolution.
+
+    The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
+
+    Args:
+        original_size (tuple):
+            The original size of the image in the format (height, width).
+        possible_resolutions (list):
+            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
+
+    Returns:
+        tuple: The best fit resolution in the format (height, width).
+    """
+    original_height, original_width = original_size
+    best_fit = None
+    max_effective_resolution = 0
+    min_wasted_resolution = float("inf")
+
+    for height, width in possible_resolutions:
+        scale = min(width / original_width, height / original_height)
+        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
+        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
+        wasted_resolution = (width * height) - effective_resolution
+
+        if effective_resolution > max_effective_resolution or (
+            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
+        ):
+            max_effective_resolution = effective_resolution
+            min_wasted_resolution = wasted_resolution
+            best_fit = (height, width)
+
+    return best_fit
+
+
 ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
 if ImageProcessingMixin.push_to_hub.__doc__ is not None:
    ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(

--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -77,7 +77,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
        ("layoutlmv3", "LayoutLMv3ImageProcessor"),
        ("levit", "LevitImageProcessor"),
        ("llava", "CLIPImageProcessor"),
-        ("llava_next", "CLIPImageProcessor"),
+        ("llava_next", "LlavaNextImageProcessor"),
        ("mask2former", "Mask2FormerImageProcessor"),
        ("maskformer", "MaskFormerImageProcessor"),
        ("mgp-str", "ViTImageProcessor"),

--- a/src/transformers/models/llava_next/image_processing_llava_next.py
+++ b/src/transformers/models/llava_next/image_processing_llava_next.py
@@ -19,7 +19,7 @@ from typing import Dict, List, Optional, Union

 import numpy as np

-from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution
 from ...image_transforms import (
    convert_to_rgb,
    get_resize_output_image_size,
@@ -51,44 +51,6 @@ if is_vision_available():
    from PIL import Image


-def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
-    """
-    Selects the best resolution from a list of possible resolutions based on the original size.
-
-    This is done by calculating the effective and wasted resolution for each possible resolution.
-
-    The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
-
-    Args:
-        original_size (tuple):
-            The original size of the image in the format (height, width).
-        possible_resolutions (list):
-            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
-
-    Returns:
-        tuple: The best fit resolution in the format (height, width).
-    """
-    original_height, original_width = original_size
-    best_fit = None
-    max_effective_resolution = 0
-    min_wasted_resolution = float("inf")
-
-    for height, width in possible_resolutions:
-        scale = min(width / original_width, height / original_height)
-        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
-        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
-        wasted_resolution = (width * height) - effective_resolution
-
-        if effective_resolution > max_effective_resolution or (
-            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
-        ):
-            max_effective_resolution = effective_resolution
-            min_wasted_resolution = wasted_resolution
-            best_fit = (height, width)
-
-    return best_fit
-
-
 def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
    """
    Divides an image into patches of a specified size.

--- a/src/transformers/models/llava_next/modeling_llava_next.py
+++ b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -24,6 +24,7 @@ from torch import nn
 from ... import PreTrainedModel
 from ...activations import ACT2FN
 from ...cache_utils import Cache
+from ...image_processing_utils import select_best_resolution
 from ...modeling_outputs import ModelOutput
 from ...utils import (
    add_start_docstrings,
@@ -33,7 +34,6 @@ from ...utils import (
 )
 from ..auto import AutoModel, AutoModelForCausalLM
 from .configuration_llava_next import LlavaNextConfig
-from .image_processing_llava_next import select_best_resolution


 logger = logging.get_logger(__name__)