smart_img_process.py 3.15 KB
Newer Older
raojy's avatar
fix  
raojy committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""Smart image resizing with aspect-ratio preservation and factor alignment."""

import math
from typing import List, Tuple
from PIL import Image


def smart_resize(
    height: int,
    width: int,
    min_pixels: int,
    max_pixels: int,
    factor: int = 32,
) -> Tuple[int, int]:
    """
    Compute factor-aligned target dimensions in the Qwen2.5-VL style.

    Both sides are first snapped to the nearest multiple of ``factor`` (never
    below one ``factor``). If the snapped area is above ``max_pixels`` the
    original size is shrunk by a uniform scale and rounded *down* to the
    factor grid; if it is below ``min_pixels`` the original size is enlarged
    by a uniform scale and rounded *up* to the factor grid.

    Args:
        height: Source image height in pixels.
        width: Source image width in pixels.
        min_pixels: Lower bound on the target area (height * width).
        max_pixels: Upper bound on the target area (height * width).
        factor: Grid size both returned dimensions are multiples of.

    Returns:
        ``(target_height, target_width)``.
    """
    # Nearest grid multiple per side, clamped to at least one grid cell.
    snapped_h = max(factor, factor * round(height / factor))
    snapped_w = max(factor, factor * round(width / factor))
    snapped_area = snapped_h * snapped_w

    if snapped_area > max_pixels:
        # Too large: the uniform scale is derived from the *original* area,
        # then each side is floored to the grid so rounding never grows it.
        shrink = math.sqrt(max_pixels / (height * width))
        return (
            max(factor, factor * math.floor(height * shrink / factor)),
            max(factor, factor * math.floor(width * shrink / factor)),
        )

    if snapped_area < min_pixels:
        # Too small: scale up and ceil to the grid so rounding never
        # shrinks a side.
        grow = math.sqrt(min_pixels / (height * width))
        return (
            factor * math.ceil(height * grow / factor),
            factor * math.ceil(width * grow / factor),
        )

    return snapped_h, snapped_w


def resize_and_center_crop(
    img: Image.Image,
    target_h: int,
    target_w: int,
    factor: int = 32,
) -> Image.Image:
    """
    Scale an image to cover the target box, then crop it around the center.

    The image is resized with bicubic resampling using a single scale factor
    (aspect ratio preserved) chosen as the larger of the two per-axis ratios.
    The crop size on each axis is the resized length rounded down to a
    multiple of ``factor``, but never less than the requested target length.

    Args:
        img: Source image.
        target_h: Requested minimum output height.
        target_w: Requested minimum output width.
        factor: Grid size used when rounding the crop dimensions down.

    Returns:
        The resized and center-cropped image.
    """
    src_w, src_h = img.size

    # The larger per-axis ratio is the one that makes both sides reach
    # their target length.
    cover = max(target_h / src_h, target_w / src_w)
    scaled_h = int(round(src_h * cover))
    scaled_w = int(round(src_w * cover))
    resized = img.resize((scaled_w, scaled_h), resample=Image.BICUBIC)

    # Round each side down to the factor grid, but keep at least the target.
    out_h = max((scaled_h // factor) * factor, target_h)
    out_w = max((scaled_w // factor) * factor, target_w)

    # Split the surplus evenly so the crop window is centered.
    y0 = (scaled_h - out_h) // 2
    x0 = (scaled_w - out_w) // 2
    return resized.crop((x0, y0, x0 + out_w, y0 + out_h))


def smart_resize_images(
    image_paths: List[str],
    patch_size: int = 16,
    merge_size: int = 2,
    single_min_pixels: int = 128 * 128,
    single_max_pixels: int = 800 * 800,
    multi_min_pixels: int = 128 * 128,
    multi_max_pixels: int = 448 * 448,
) -> List[Image.Image]:
    """
    Smart-resize a list of images for model input.

    Uses larger resolution limits for single-image inputs and smaller limits
    for multi-image inputs to control total token count.

    Args:
        image_paths: Paths of the images to load. ``None`` entries are passed
            through unchanged as ``None`` in the result; note that they still
            count toward the list length used to choose between the single-
            and multi-image limits.
        patch_size: Multiplied by ``merge_size`` to obtain the alignment
            factor for output dimensions.
        merge_size: See ``patch_size``.
        single_min_pixels: Minimum pixel area when the list has one entry.
        single_max_pixels: Maximum pixel area when the list has one entry.
        multi_min_pixels: Minimum pixel area when the list has several entries.
        multi_max_pixels: Maximum pixel area when the list has several entries.

    Returns:
        One entry per input path, in order: an RGB image resized and
        center-cropped to factor-aligned dimensions, or ``None`` where the
        input path was ``None``. An empty input yields an empty list.
    """
    num_images = len(image_paths)
    if num_images == 0:
        return []

    factor = patch_size * merge_size  # 32 with the default arguments

    if num_images == 1:
        min_pixels = single_min_pixels
        max_pixels = single_max_pixels
    else:
        min_pixels = multi_min_pixels
        max_pixels = multi_max_pixels

    images = []

    for path in image_paths:
        if path is None:
            # Keep the placeholder so output positions match input positions.
            images.append(None)
            continue

        # Open inside a context manager so the underlying file handle is
        # closed deterministically; convert() returns an independent image
        # that stays valid after the source is closed.
        with Image.open(path) as src:
            img = src.convert("RGB")
        width, height = img.size

        target_h, target_w = smart_resize(height, width, min_pixels, max_pixels, factor)

        img = resize_and_center_crop(img, target_h, target_w, factor)
        images.append(img)

    return images