# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# ruff: noqa: E501
# coding=utf-8
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
25
from functools import cached_property
26
27
28
29
30

import PIL
import torch
from transformers import AutoProcessor, BatchFeature
from transformers.image_utils import ImageInput
31
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
32
33
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

34
35
from vllm.multimodal.image import convert_image_mode

36
# Names exported by `from <module> import *`.
__all__ = ["OvisProcessor"]

# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably the label id ignored by the training loss — confirm before removing.
IGNORE_ID = -100

39
40

class OvisProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    """Keyword-argument schema for `OvisProcessor.__call__`.

    `_defaults` holds the per-modality defaults that `OvisProcessor.__call__`
    merges with caller-supplied kwargs through `_merge_kwargs`.
    """

    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "images_kwargs": {
            # Both keys are forwarded verbatim to `OvisProcessor.preprocess_image`.
            "do_convert_rgb": True,
            "return_tensors": "pt",
        },
    }


class OvisProcessor(ProcessorMixin):
    r"""
    Constructs an Ovis processor which wraps an image processor and a tokenizer into a single processor.

    The processor tokenizes text around `<image>` markers, partitions every image into a grid of square crops
    and expands each marker into the run of placeholder token ids that matches the image's grid. See
    [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.

    Args:
        image_processor (*optional*):
            The image processor (loaded through `AutoImageProcessor`). Needed for any image preprocessing.
        tokenizer (*optional*):
            The tokenizer (loaded through `AutoTokenizer`). Needed for any text processing.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
        image_pad_token (`str`, *optional*):
            Token string looked up in the tokenizer vocabulary; its id fills every image placeholder position.
        image_segment_len (`int`, *optional*, defaults to 255):
            Number of additional pad ids emitted after each image atom placeholder.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        image_pad_token=None,
        image_segment_len=255,
        **kwargs,
    ):
        # NOTE: extra **kwargs are accepted for call compatibility but are not
        # forwarded to ProcessorMixin.
        self.image_token = "<image>"
        self.image_pad_token = image_pad_token
        self.image_segment_len = image_segment_len
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    @cached_property
    def extra_special_tokens(self):
        """
        Mapping from symbolic token names to the ids used for image handling.

        All entries are fixed negative sentinel ids except `"image_pad"`, which is the tokenizer
        vocabulary id of `self.image_pad_token`. The lookup is computed once and cached.

        Raises:
            KeyError: If `self.image_pad_token` is not present in the tokenizer vocabulary.
        """
        image_pad_token_id = self.tokenizer.get_vocab()[self.image_pad_token]
        extra_special_tokens = {
            "image_token": -200,
            "image_atom": -300,
            "image_start": -301,
            "image_prefix": -302,
            "image_col_sep": -303,
            "image_row_sep": -304,
            "image_end": -305,
            "image_pad": image_pad_token_id,
        }
        return extra_special_tokens

    def __call__(
        self,
        images: ImageInput = None,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        **kwargs: Unpack[OvisProcessorKwargs],
    ) -> BatchFeature:
        """
        Prepare model inputs from text containing `<image>` markers and the matching image(s).

        Every image is partitioned into crops by [`~OvisProcessor.preprocess_image`]. The text is tokenized
        around each `<image>` marker, and each marker is then replaced, in order, by the placeholder ids of
        the next image.

            Args:
                images (`PIL.Image.Image` or `list[PIL.Image.Image]`, *optional*):
                    The image or list of images to prepare. A single image is treated as a list of one.
                text (`str` or `list[str]`, *optional*):
                    The prompt or batch of prompts. Images are consumed in order across the whole batch.
                    The resulting id sequences are stacked into one tensor, so all prompts of a batch must
                    produce the same number of ids.
                max_partition (`int`, *optional*, defaults to 9):
                    Maximum number of crops a single image may be partitioned into.
                covering_threshold (`float`, *optional*, defaults to 0.9):
                    Covering ratio a grid has to exceed to be considered good enough.
                **kwargs:
                    Merged with the `OvisProcessorKwargs` defaults; the resulting `images_kwargs` are
                    forwarded to [`~OvisProcessor.preprocess_image`].
            Returns:
                [`BatchFeature`]: When `text` is given, a [`BatchFeature`] with the following fields:
                - **input_ids** -- Stacked token ids with every `<image>` marker expanded.
                - **pixel_values** -- List with one crop tensor per image. Present when at least one image was processed.
                - **grids** -- List with one `(rows, cols)` tensor per image. Present when at least one image was processed.

                When `text` is `None`, the [`BatchFeature`] only holds **image_placeholders** (one list of
                placeholder ids per image) if at least one image was processed, and is empty otherwise.
            Raises:
                RuntimeError: If images were provided but the text contains more `<image>` markers than images.
        """

        max_partition = kwargs.pop("max_partition", 9)
        covering_threshold = kwargs.pop("covering_threshold", 0.9)

        output_kwargs = self._merge_kwargs(
            OvisProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        # Process all images first
        image_features = {}
        processed_images = []
        image_placeholders_list = []
        grids = []
        if images is not None:
            for image in images if isinstance(images, list) else [images]:
                pixel_values, image_placeholders, grid = self.preprocess_image(
                    image=image,
                    max_partition=max_partition,
                    covering_threshold=covering_threshold,
                    **output_kwargs["images_kwargs"],
                )
                processed_images.append(pixel_values)
                image_placeholders_list.append(image_placeholders)
                grids.append(grid)

            # An empty image list leaves `image_features` empty on purpose.
            if processed_images:
                image_features["image_placeholders"] = image_placeholders_list

        # If only images were provided
        if text is None:
            return BatchFeature(data=image_features)

        # Process text input
        if not isinstance(text, list):
            text = [text]

        tokenized_batched_text = self._tokenize_with_image_symbol(text)
        image_token_id = self.get_token_value("image_token")
        replaced_ids_list = []
        idx = 0  # index of the next unused image, shared across the whole batch
        for ids_tensor in tokenized_batched_text:
            # Without images the markers are left untouched in the ids.
            if (
                image_token_id in ids_tensor
                and "image_placeholders" in image_features
            ):
                new_ids = []
                for token_id in ids_tensor.tolist():
                    if token_id != image_token_id:
                        new_ids.append(token_id)
                        continue
                    # Check per marker, not per sequence, so that a prompt with
                    # several markers cannot run past the available images.
                    if idx >= len(image_placeholders_list):
                        raise RuntimeError(
                            "Mismatch between the images you provided and the number of placeholder present in the text"
                        )
                    new_ids.extend(image_placeholders_list[idx])
                    idx += 1

                # Converts back to tensors
                ids_tensor = torch.tensor(new_ids, dtype=torch.long)

            replaced_ids_list.append(ids_tensor)

        if replaced_ids_list:
            replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
        else:
            replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)

        # Create the output with text features
        output = BatchFeature(
            data={
                "input_ids": replaced_and_tokenized_ids,
            }
        )

        # Add image features if present
        if image_features:
            output["pixel_values"] = processed_images
            output["grids"] = grids

        return output

    def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
        """
        Tokenize each text around its `<image>` markers.

        The text is split on `self.image_token`; every chunk is tokenized without special tokens and the
        `"image_token"` sentinel id is inserted between consecutive chunks. The per-text id lists are packed
        into a single tensor, so they all need to have the same length.
        """
        batch_token_ids = []
        for text in text_list:
            text_chunks = [
                self.tokenizer(chunk, add_special_tokens=False).input_ids
                for chunk in text.split(self.image_token)
            ]
            token_ids = []
            num_chunks = len(text_chunks)
            for i, chunk in enumerate(text_chunks):
                token_ids.extend(chunk)
                # One sentinel between every pair of neighbouring chunks.
                if i < num_chunks - 1:
                    token_ids.append(self.get_token_value("image_token"))
            batch_token_ids.append(token_ids)
        return torch.tensor(batch_token_ids, dtype=torch.long)

    def get_image_size(self):
        """
        Return `(height, width)` read from `self.image_processor.size`.

        A `"shortest_edge"` entry is used for both dimensions; otherwise `"height"` and `"width"` are read.

        Raises:
            ValueError: If neither form is present in the size config.
        """
        size = self.image_processor.size
        if "shortest_edge" in size:
            width = height = size["shortest_edge"]
        elif "height" in size and "width" in size:
            width = size["width"]
            height = size["height"]
        else:
            raise ValueError("Can't parse image size from image_processor config.")
        return height, width

    def get_token_value(self, tok):
        """Return the id registered under `tok` in `extra_special_tokens`."""
        return self.extra_special_tokens[tok]

    def construct_image_indicators(self, grid):
        """
        Build the indicator id sequence describing an image split into `grid = (rows, cols)`.

        The sequence is `image_start, image_atom, image_prefix`, followed (only when the grid has more than
        one cell) by one `image_atom` per cell with `image_col_sep` between cells of a row and
        `image_row_sep` between rows, and finally `image_end`.
        """
        image_placeholders = [
            self.get_token_value("image_start"),
            self.get_token_value("image_atom"),
            self.get_token_value("image_prefix"),
        ]
        if grid[0] * grid[1] > 1:
            for r in range(grid[0]):
                for c in range(grid[1]):
                    image_placeholders.append(self.get_token_value("image_atom"))
                    if c < grid[1] - 1:
                        image_placeholders.append(self.get_token_value("image_col_sep"))
                if r < grid[0] - 1:
                    image_placeholders.append(self.get_token_value("image_row_sep"))
        image_placeholders.append(self.get_token_value("image_end"))
        return image_placeholders

    def construct_image_placeholders(self, grid):
        """
        Return the pad-id sequence that stands in for an image with the given `grid`.

        Every indicator from [`~OvisProcessor.construct_image_indicators`] contributes one `image_pad` id and
        every `image_atom` indicator contributes `self.image_segment_len` further `image_pad` ids, so the
        result consists of pad ids only.
        """
        image_placeholders = self.construct_image_indicators(grid)

        image_atom_token_id = self.get_token_value("image_atom")
        image_padding_token_id = self.get_token_value("image_pad")

        padded_placeholder_tokens = []
        for token in image_placeholders:
            # The indicator itself is replaced by a pad id ...
            padded_placeholder_tokens.append(image_padding_token_id)
            # ... and an atom additionally reserves a whole segment of pads.
            if token == image_atom_token_id:
                padded_placeholder_tokens.extend(
                    [image_padding_token_id] * self.image_segment_len
                )
        return padded_placeholder_tokens

    def preprocess_image(
        self,
        image: PIL.Image.Image,
        max_partition,
        covering_threshold,
        do_convert_rgb,
        return_tensors,
    ):
        """
        Partition one image into square crops and build its placeholder ids.

        Args:
            image (`PIL.Image.Image`): The image to process.
            max_partition (`int`): Maximum number of grid cells to split the image into.
            covering_threshold (`float`): Covering ratio a grid has to exceed to count as good.
            do_convert_rgb (`bool`): Whether to convert the image to RGB first.
            return_tensors: Forwarded to the image processor. The padding step relies on torch tensors.

        Returns:
            `tuple`: `(pixel_values, image_placeholders, grid)` where `pixel_values` is a tensor of shape
            `(num_crops, 3, side, side)` (the full image comes first when the grid has more than one cell),
            `image_placeholders` is the list from [`~OvisProcessor.construct_image_placeholders`] and `grid`
            is a tensor holding `(rows, cols)`.

        Raises:
            ValueError: If the image processor's configured size is not square.
        """

        def _preprocess(img: PIL.Image.Image, side):
            # first resize and preprocess: the longer edge becomes `side`
            w, h = img.size
            if w == h:
                new_width = new_height = side
            elif w > h:
                new_width = side
                # Clamp so extreme aspect ratios never request a 0-pixel edge.
                new_height = max(1, int(h / w * new_width))
            else:
                new_height = side
                new_width = max(1, int(w / h * new_height))
            new_size = dict(height=new_height, width=new_width)
            pixel_values = self.image_processor.preprocess(
                img, size=new_size, return_tensors=return_tensors
            )["pixel_values"]

            # then pad to square, centring the content along the shorter axis
            square_values = torch.zeros(
                [1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
            )
            new_height, new_width = pixel_values.shape[2:]
            if new_height == new_width:
                square_values[:, :, :, :] = pixel_values
            elif new_height > new_width:
                from_index = (side - new_width) // 2
                square_values[:, :, :, from_index : from_index + new_width] = (
                    pixel_values
                )
            else:
                from_index = (side - new_height) // 2
                square_values[:, :, from_index : from_index + new_height, :] = (
                    pixel_values
                )

            return square_values

        def _partition(img, grid) -> list[tuple[int, int, int, int]]:
            # Crop boxes (left, upper, right, lower) in row-major order; the last
            # row/column absorbs the remainder of the integer division.
            w, h = img.size
            row_height = h // grid[0]
            col_width = w // grid[1]

            partition = []
            for row in range(grid[0]):
                for col in range(grid[1]):
                    left = col * col_width
                    upper = row * row_height
                    right = w if col == grid[1] - 1 else (col + 1) * col_width
                    lower = h if row == grid[0] - 1 else (row + 1) * row_height
                    partition.append((left, upper, right, lower))

            return partition

        def _covering_area(left, upper, right, lower, side):
            # Area of the box after its longer edge is capped at `side`
            # (aspect ratio preserved).
            w = right - left
            h = lower - upper
            w, h = max(w, h), min(w, h)
            if w > side:
                h = h / w * side
                w = side
            return w * h

        def _get_best_grid(img, side):
            img_area = img.size[0] * img.size[1]

            candidate_grids = [
                (i, j)
                for i in range(1, max_partition + 1)
                for j in range(1, max_partition + 1)
                if i * j <= max_partition
            ]

            all_grids = []
            good_grids = []
            for grid in candidate_grids:
                partition = _partition(img, grid)
                covering_ratio = (
                    sum(_covering_area(*p, side) for p in partition) / img_area
                )
                assert covering_ratio <= 1.0
                all_grids.append((grid, covering_ratio))
                if covering_ratio > covering_threshold:
                    good_grids.append((grid, covering_ratio))

            if len(good_grids) > 0:
                # pick the good partition with minimum #sub_images and break the tie using covering_ratio
                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
            # pick the partition with maximum covering_ratio and break the tie using #sub_images
            return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]

        if do_convert_rgb:
            image = convert_image_mode(image, "RGB")

        sides = self.get_image_size()
        if sides[0] != sides[1]:
            raise ValueError("get_image_size() returns non-square size")
        side = sides[0]
        grid = _get_best_grid(image, side)
        partition = _partition(image, grid)
        crops = [image.crop(p) for p in partition]
        # With more than one crop the full image is prepended as an overview.
        if len(crops) > 1:
            crops.insert(0, image)
        # `torch.cat` already returns a fresh tensor; re-wrapping it in
        # `torch.tensor(...)` would only copy it again and emit a UserWarning.
        pixel_values = torch.cat([_preprocess(crop, side) for crop in crops], dim=0)
        image_placeholders = self.construct_image_placeholders(grid)
        return pixel_values, image_placeholders, torch.tensor(grid)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(self, generated_outputs):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
        Returns:
            `list[str]`: The decoded text, with special tokens skipped and tokenization spaces left as they are.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    @property
    def model_input_names(self):
        """
        Tokenizer and image processor input names, de-duplicated in first-seen order, followed by
        `"second_per_grid_ts"`.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        names_from_processor = list(
            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
        )
        return names_from_processor + ["second_per_grid_ts"]


458
# Register the class with transformers' AutoProcessor under the name "OvisProcessor".
AutoProcessor.register("OvisProcessor", OvisProcessor)