Unverified commit 73a73b41, authored by Arthur and committed by GitHub
Browse files

[`LlavaNext`] Fix llava next unsafe imports (#29773)

* patch llava-next

* styling

* styling
parent 2ddceef9
......@@ -748,6 +748,44 @@ def get_size_dict(
return size_dict
def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
    """
    Selects the best resolution from a list of possible resolutions based on the original size.

    For every candidate, the original image is scaled (aspect ratio preserved) so that it fits inside the
    candidate. The "effective" resolution is the pixel area of that scaled image, capped at the original pixel
    area; the "wasted" resolution is the candidate area left over after subtracting the effective resolution.
    The best fit maximizes the effective resolution and, among ties, minimizes the wasted resolution. When
    several candidates are fully tied, the first one encountered wins.

    Args:
        original_size (tuple):
            The original size of the image in the format (height, width).
        possible_resolutions (list):
            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].

    Returns:
        tuple: The best fit resolution in the format (height, width).

    Raises:
        ValueError: If `possible_resolutions` contains no candidates.
    """
    original_height, original_width = original_size
    best_fit = None
    max_effective_resolution = 0
    min_wasted_resolution = float("inf")

    for height, width in possible_resolutions:
        # Largest uniform scale at which the original still fits inside the candidate.
        scale = min(width / original_width, height / original_height)
        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
        # Upscaling adds no information, so the effective area is capped at the original area.
        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
        wasted_resolution = (width * height) - effective_resolution

        if effective_resolution > max_effective_resolution or (
            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
        ):
            max_effective_resolution = effective_resolution
            min_wasted_resolution = wasted_resolution
            best_fit = (height, width)

    # Checked after the loop (rather than with `len`) so that any iterable of candidates is supported.
    if best_fit is None:
        raise ValueError("`possible_resolutions` must contain at least one (height, width) candidate.")

    return best_fit
ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
if ImageProcessingMixin.push_to_hub.__doc__ is not None:
ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
......
......@@ -77,7 +77,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
("layoutlmv3", "LayoutLMv3ImageProcessor"),
("levit", "LevitImageProcessor"),
("llava", "CLIPImageProcessor"),
("llava_next", "CLIPImageProcessor"),
("llava_next", "LlavaNextImageProcessor"),
("mask2former", "Mask2FormerImageProcessor"),
("maskformer", "MaskFormerImageProcessor"),
("mgp-str", "ViTImageProcessor"),
......
......@@ -19,7 +19,7 @@ from typing import Dict, List, Optional, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution
from ...image_transforms import (
convert_to_rgb,
get_resize_output_image_size,
......@@ -51,44 +51,6 @@ if is_vision_available():
from PIL import Image
def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
    """
    Pick, from `possible_resolutions`, the candidate that best accommodates an image of `original_size`.

    Each candidate is scored by two quantities: the effective resolution (area of the original after an
    aspect-preserving fit into the candidate, capped at the original area) and the wasted resolution
    (candidate area minus the effective resolution). A candidate replaces the current choice when its
    effective resolution is strictly larger, or equal with strictly less waste.

    Args:
        original_size (tuple):
            The original size of the image in the format (height, width).
        possible_resolutions (list):
            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].

    Returns:
        tuple: The best fit resolution in the format (height, width), or `None` if no candidate was given.
    """
    source_height, source_width = original_size

    chosen = None
    top_effective = 0
    least_wasted = float("inf")

    for candidate_height, candidate_width in possible_resolutions:
        # Uniform factor that makes the source fit inside the candidate on both axes.
        ratio = min(candidate_width / source_width, candidate_height / source_height)
        fitted_width = int(source_width * ratio)
        fitted_height = int(source_height * ratio)

        effective = min(fitted_width * fitted_height, source_width * source_height)
        wasted = (candidate_width * candidate_height) - effective

        is_strictly_better = effective > top_effective
        is_tie_with_less_waste = effective == top_effective and wasted < least_wasted
        if not (is_strictly_better or is_tie_with_less_waste):
            continue

        chosen = (candidate_height, candidate_width)
        top_effective, least_wasted = effective, wasted

    return chosen
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
"""
Divides an image into patches of a specified size.
......
......@@ -24,6 +24,7 @@ from torch import nn
from ... import PreTrainedModel
from ...activations import ACT2FN
from ...cache_utils import Cache
from ...image_processing_utils import select_best_resolution
from ...modeling_outputs import ModelOutput
from ...utils import (
add_start_docstrings,
......@@ -33,7 +34,6 @@ from ...utils import (
)
from ..auto import AutoModel, AutoModelForCausalLM
from .configuration_llava_next import LlavaNextConfig
from .image_processing_llava_next import select_best_resolution
logger = logging.get_logger(__name__)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment