ovis.py 19 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24

# ruff: noqa: E501
# coding=utf-8
# adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
25
from functools import cached_property
26
27
28
29
30

import PIL
import torch
from transformers import AutoProcessor, BatchFeature
from transformers.image_utils import ImageInput
31
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
32
33
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

34
35
from vllm.multimodal.image import convert_image_mode

36
# Public API of this module: only the processor class is exported.
__all__ = ["OvisProcessor"]
37
38
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably the conventional "ignore this position" label id — confirm
# against callers before relying on it.
IGNORE_ID = -100

39
40

class OvisProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
    """Keyword-argument defaults used by `OvisProcessor.__call__`.

    `OvisProcessor.__call__` passes this class to `self._merge_kwargs` and then
    unpacks the resulting `images_kwargs` entry into
    `OvisProcessor.preprocess_image`, so the keys under `images_kwargs` have to
    match that method's keyword parameters.
    """

    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "images_kwargs": {
            # Upper bound on rows * cols when choosing the crop grid.
            "max_partition": 9,
            # A grid counts as "good" when its covering ratio exceeds this value.
            "covering_threshold": 0.9,
            # When true, the image is converted to RGB mode before tiling.
            "convert_to_rgb": True,
            # Forwarded to `image_processor.preprocess`.
            "return_tensors": "pt",
        },
    }


class OvisProcessor(ProcessorMixin):
    r"""
    Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.

    The processor tokenizes text that may contain `<image>` markers, tiles every image into a grid of square
    crops, and substitutes each marker with a run of image pad tokens whose length depends on that image's grid.
    See [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.

    Args:
        image_processor (*optional*):
            The wrapped image processor. It defaults to `None` in the signature, but image preprocessing reads
            `image_processor.size` and calls `image_processor.preprocess`.
        tokenizer (*optional*):
            The wrapped tokenizer. It defaults to `None` in the signature, but text processing calls it and
            reads `tokenizer.get_vocab()`.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
        image_pad_token (`str`, *optional*):
            Token whose id in the tokenizer vocabulary is used to fill image placeholders.
        image_segment_len (`int`, *optional*, defaults to 255):
            Number of additional pad tokens emitted for every image atom indicator.
    """

    # Sub-components managed by `ProcessorMixin`.
    attributes = ["image_processor", "tokenizer"]
    # Extra constructor keyword arguments accepted besides the attributes above.
    valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"
73

74
75
76
77
78
79
80
81
82
    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        image_pad_token=None,
        image_segment_len=255,
        **kwargs,
    ):
        """Store the image-token settings and initialise the wrapped components.

        Args:
            image_processor: Image processor handed to `ProcessorMixin`.
            tokenizer: Tokenizer handed to `ProcessorMixin`.
            chat_template: Chat template handed to `ProcessorMixin`.
            image_pad_token: Token string later looked up in the tokenizer
                vocabulary to obtain the image pad id.
            image_segment_len: Number of extra pad tokens emitted per image atom.
            **kwargs: Accepted and ignored.
        """
        # Literal marker that users place in the text where an image belongs.
        self.image_token = "<image>"
        self.image_segment_len = image_segment_len
        self.image_pad_token = image_pad_token
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

88
89
90
91
    @cached_property
    def extra_special_tokens(self):
        image_pad_token_id = self.tokenizer.get_vocab()[self.image_pad_token]
        extra_special_tokens = {
92
93
94
95
96
97
98
            "image_token": -200,
            "image_atom": -300,
            "image_start": -301,
            "image_prefix": -302,
            "image_col_sep": -303,
            "image_row_sep": -304,
            "image_end": -305,
99
            "image_pad": image_pad_token_id,
100
        }
101
        return extra_special_tokens
102
103
104
105

    def __call__(
        self,
        images: ImageInput = None,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        **kwargs: Unpack[OvisProcessorKwargs],
    ) -> BatchFeature:
        """
        Prepare text and images for the model.

        Every image is tiled and preprocessed with [`~OvisProcessor.preprocess_image`]. Every text is tokenized
        with [`~OvisProcessor._tokenize_with_image_symbol`]; afterwards each image token in the resulting ids is
        replaced by the placeholder ids of the next unused image. Images are consumed in order across the whole
        batch of texts.

            Args:
                images (single image or `list` of images, *optional*):
                    The image or images to prepare. A non-list value is treated as a single image.
                text (`str` or `list[str]`, *optional*):
                    The text or texts to encode. A non-list value is treated as a single text. The id rows of
                    all texts are stacked, so they must end up with the same length.
                **kwargs:
                    Merged with the defaults of `OvisProcessorKwargs`; the resulting `images_kwargs` are
                    forwarded to [`~OvisProcessor.preprocess_image`].
            Returns:
                [`BatchFeature`]: When `text` is given, a [`BatchFeature`] with the following fields:
                - **input_ids** -- Stacked token ids with image tokens replaced by placeholder ids.
                - **pixel_values** -- List with one pixel tensor per image. Present only if images were processed.
                - **grids** -- List with one grid tensor per image. Present only if images were processed.

                When `text` is `None`, the [`BatchFeature`] only holds **image_placeholders** (one list of
                placeholder ids per image), or is empty when no image was processed.
            Raises:
                RuntimeError: If images were provided and the texts contain more image tokens than there are
                    images. Unused surplus images are not reported.
        """
        output_kwargs = self._merge_kwargs(
            OvisProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        # Process all images first
        image_features = {}
        processed_images = []
        grids = []
        if images is not None:
            image_placeholders_list = []
            for image in images if isinstance(images, list) else [images]:
                pixel_values, image_placeholders, grid = self.preprocess_image(
                    image=image, **output_kwargs["images_kwargs"]
                )
                processed_images.append(pixel_values)
                image_placeholders_list.append(image_placeholders)
                grids.append(grid)

            # An empty image list leaves `image_features` empty on purpose.
            if processed_images:
                image_features["image_placeholders"] = image_placeholders_list

        # If only images were provided
        if text is None:
            return BatchFeature(data=image_features)

        if not isinstance(text, list):
            text = [text]

        tokenized_batched_text = self._tokenize_with_image_symbol(text)
        image_token_id = self.get_token_value("image_token")
        placeholders = image_features.get("image_placeholders")

        replaced_ids_list = []
        idx = 0  # index of the next unused image, shared by all texts
        for ids_tensor in tokenized_batched_text:
            if placeholders is not None and image_token_id in ids_tensor:
                new_ids = []
                for token_id in ids_tensor.tolist():
                    if token_id != image_token_id:
                        new_ids.append(token_id)
                        continue
                    # Checked for every image token, so running out of
                    # images in the middle of a text is reported as well.
                    if idx >= len(placeholders):
                        raise RuntimeError(
                            "Mismatch between the images you provided and the number of placeholder present in the text"
                        )
                    new_ids.extend(placeholders[idx])
                    idx += 1
                ids_tensor = torch.tensor(new_ids, dtype=torch.long)

            replaced_ids_list.append(ids_tensor)

        if replaced_ids_list:
            replaced_and_tokenized_ids = torch.stack(replaced_ids_list)
        else:
            replaced_and_tokenized_ids = torch.tensor([], dtype=torch.long)

        # Create the output with text features
        output = BatchFeature(
            data={
                "input_ids": replaced_and_tokenized_ids,
            }
        )

        # Add image features if present
        if image_features:
            output["pixel_values"] = processed_images
            output["grids"] = grids

        return output

234
235
236
    def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
        batch_token_ids = []
        for text in text_list:
237
238
239
240
            text_chunks = [
                self.tokenizer(chunk, add_special_tokens=False).input_ids
                for chunk in text.split(self.image_token)
            ]
241
242
243
244
245
246
247
248
            token_ids = []
            num_chuck = len(text_chunks)
            for i, chunk in enumerate(text_chunks):
                token_ids.extend(chunk)
                if i < num_chuck - 1:
                    token_ids.append(self.get_token_value("image_token"))
            batch_token_ids.append(token_ids)
        return torch.tensor(batch_token_ids, dtype=torch.long)
249
250

    def get_image_size(self):
251
        size = self.image_processor.size
252
253
        if "shortest_edge" in size:
            width = height = size["shortest_edge"]
254
        elif "height" in size and "width" in size:
255
256
            width = size["width"]
            height = size["height"]
257
        else:
258
            raise ValueError("Can't parse image size from image_processor config.")
259
260
261
        return height, width

    def get_token_value(self, tok):
262
        return self.extra_special_tokens[tok]
263

264
    def construct_image_indicators(self, grid):
265
266
267
268
269
        image_placeholders = [
            self.get_token_value("image_start"),
            self.get_token_value("image_atom"),
            self.get_token_value("image_prefix"),
        ]
270
271
272
        if grid[0] * grid[1] > 1:
            for r in range(grid[0]):
                for c in range(grid[1]):
273
                    image_placeholders.append(self.get_token_value("image_atom"))
274
                    if c < grid[1] - 1:
275
                        image_placeholders.append(self.get_token_value("image_col_sep"))
276
                if r < grid[0] - 1:
277
278
                    image_placeholders.append(self.get_token_value("image_row_sep"))
        image_placeholders.append(self.get_token_value("image_end"))
279
280
281
282
        return image_placeholders

    def construct_image_placeholders(self, grid):
        image_placeholders = self.construct_image_indicators(grid)
283

284
        image_atom_token_id = self.get_token_value("image_atom")
285
        # Extract the padding token ID from tokenizer
286
        image_padding_token_id = self.get_token_value("image_pad")
287
288
289
290

        # Create a new list with padding tokens inserted
        padded_placeholder_tokens = []
        for token in image_placeholders:
291
            padded_placeholder_tokens.append(image_padding_token_id)
292
            if token == image_atom_token_id:
293
294
295
                padded_placeholder_tokens.extend(
                    [image_padding_token_id] * self.image_segment_len
                )
296
297
        return padded_placeholder_tokens

298
299
300
301
302
303
304
305
    def preprocess_image(
        self,
        image: PIL.Image.Image,
        max_partition,
        covering_threshold,
        convert_to_rgb,
        return_tensors,
    ):
306
307
308
309
310
311
312
313
314
315
316
317
        def _preprocess(img: PIL.Image.Image, side):
            # first resize and preprocess
            w, h = img.size
            if w == h:
                new_width = new_height = side
            elif w > h:
                new_width = side
                new_height = int(h / w * new_width)
            else:
                new_height = side
                new_width = int(w / h * new_height)
            new_size = dict(height=new_height, width=new_width)
318
319
320
            pixel_values = self.image_processor.preprocess(
                img, size=new_size, return_tensors=return_tensors
            )["pixel_values"]
321
322

            # then pad to square
323
324
325
            square_values = torch.zeros(
                [1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
            )
326
327
328
329
330
            new_height, new_width = pixel_values.shape[2:]
            if new_height == new_width:
                square_values[:, :, :, :] = pixel_values
            elif new_height > new_width:
                from_index = (side - new_width) // 2
331
332
333
                square_values[:, :, :, from_index : from_index + new_width] = (
                    pixel_values
                )
334
335
            else:
                from_index = (side - new_height) // 2
336
337
338
                square_values[:, :, from_index : from_index + new_height, :] = (
                    pixel_values
                )
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379

            return square_values

        def _partition(img, grid) -> list[tuple[int, int, int, int]]:
            w, h = img.size
            row_height = h // grid[0]
            col_width = w // grid[1]

            partition = []
            for row in range(grid[0]):
                for col in range(grid[1]):
                    left = col * col_width
                    upper = row * row_height
                    right = w if col == grid[1] - 1 else (col + 1) * col_width
                    lower = h if row == grid[0] - 1 else (row + 1) * row_height
                    partition.append((left, upper, right, lower))

            return partition

        def _covering_area(left, upper, right, lower, side):
            w = right - left
            h = lower - upper
            w, h = max(w, h), min(w, h)
            if w > side:
                h = h / w * side
                w = side
            return w * h

        def _get_best_grid(img, side):
            img_area = img.size[0] * img.size[1]

            candidate_grids = []
            for i in range(1, max_partition + 1):
                for j in range(1, max_partition + 1):
                    if i * j <= max_partition:
                        candidate_grids.append((i, j))

            all_grids = []
            good_grids = []
            for grid in candidate_grids:
                partition = _partition(img, grid)
380
381
382
                covering_ratio = (
                    sum([_covering_area(*p, side) for p in partition]) / img_area
                )
383
384
385
386
387
388
389
                assert covering_ratio <= 1.0
                all_grids.append((grid, covering_ratio))
                if covering_ratio > covering_threshold:
                    good_grids.append((grid, covering_ratio))

            if len(good_grids) > 0:
                # pick the good partition with minimum #sub_images and break the tie using covering_ratio
390
391
392
                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
                    0
                ]
393
394
395
396
            else:
                # pick the partition with maximum covering_ratio and break the tie using #sub_images
                return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]

397
        if convert_to_rgb:
398
            image = convert_image_mode(image, "RGB")
399
400
401

        sides = self.get_image_size()
        if sides[0] != sides[1]:
402
            raise ValueError("get_image_size() returns non-square size")
403
404
405
406
407
408
409
410
        side = sides[0]
        grid = _get_best_grid(image, side)
        partition = _partition(image, grid)
        crops = [image.crop(p) for p in partition]
        if len(crops) > 1:
            crops.insert(0, image)
        pixel_values = torch.cat([_preprocess(crop, side) for crop in crops], dim=0)
        image_placeholders = self.construct_image_placeholders(grid)
411
        return torch.tensor(pixel_values), image_placeholders, torch.tensor(grid)
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(self, generated_outputs):
        """
        Post-process the output of the model to decode the text.
        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
        Returns:
435
            `list[str]`: The decoded text.
436
437
        """
        return self.tokenizer.batch_decode(
438
439
440
            generated_outputs,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
441
442
443
444
445
446
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
447
448
449
        names_from_processor = list(
            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
        )
450
451
452
        return names_from_processor + ["second_per_grid_ts"]


453
# Import-time side effect: register the processor class with `AutoProcessor`
# under the key "OvisProcessor".
# NOTE(review): the first argument is a plain string here — confirm this is
# the key type `AutoProcessor.register` expects.
AutoProcessor.register("OvisProcessor", OvisProcessor)