Add OWLv2, bis (#26668)

* First draft * Update conversion script * Update copied from statements * Fix style * Add copied from to config * Add copied from to processor * Run make fixup * Add docstring * Update docstrings * Add method * Improve docstrings * Fix docstrings * Improve docstrings * Remove onnx * Add flag * Address comments * Add copied from to model tests * Add flag to conversion script * Add code snippet * Address more comments * Address comment * Improve conversion script * More improvements * Add expected objectness logits * Skip test * Improve conversion script * Extend conversion script * Convert large checkpoint * Fix doc tests * Convert all checkpoints, update integration tests * Add checkpoint_path arg * Fix repo_id

Add OWLv2, bis (#26668)
* First draft * Update conversion script * Update copied from statements * Fix style * Add copied from to config * Add copied from to processor * Run make fixup * Add docstring * Update docstrings * Add method * Improve docstrings * Fix docstrings * Improve docstrings * Remove onnx * Add flag * Address comments * Add copied from to model tests * Add flag to conversion script * Add code snippet * Address more comments * Address comment * Improve conversion script * More improvements * Add expected objectness logits * Skip test * Improve conversion script * Extend conversion script * Convert large checkpoint * Fix doc tests * Convert all checkpoints, update integration tests * Add checkpoint_path arg * Fix repo_id
762af3e3 · NielsRogge · GitHub · bdb391e9 · 762af3e3 · 762af3e3
Unverified Commit 762af3e3 authored Oct 13, 2023 by NielsRogge Committed by GitHub Oct 13, 2023
12 changed files
--- a/src/transformers/models/owlv2/image_processing_owlv2.py
+++ b/src/transformers/models/owlv2/image_processing_owlv2.py
--- a/src/transformers/models/owlv2/modeling_owlv2.py
+++ b/src/transformers/models/owlv2/modeling_owlv2.py
--- a/src/transformers/models/owlv2/processing_owlv2.py
+++ b/src/transformers/models/owlv2/processing_owlv2.py
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Image/Text processor class for OWLv2
+"""
+from typing import List
+import numpy as np
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding
+from ...utils import is_flax_available, is_tf_available, is_torch_available
+class Owlv2Processor(ProcessorMixin):
+    r"""
+    Constructs an Owlv2 processor which wraps [`Owlv2ImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] into
+    a single processor that interits both the image processor and tokenizer functionalities. See the
+    [`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information.
+    Args:
+        image_processor ([`Owlv2ImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
+            The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "Owlv2ImageProcessor"
+    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
+    def __init__(self, image_processor, tokenizer, **kwargs):
+        super().__init__(image_processor, tokenizer)
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OWLViT->OWLv2
+    def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
+        """
+        Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
+        `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode:
+        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
+            `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+            query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The query image to be prepared, one query image is expected per target image to be queried. Each image
+                can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image
+                should be of shape (C, H, W), where C is a number of channels, H and W are image height and width.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        if text is None and query_images is None and images is None:
+            raise ValueError(
+                "You have to specify at least one text or query image or image. All three cannot be none."
+            )
+        if text is not None:
+            if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)):
+                encodings = [self.tokenizer(text, padding=padding, return_tensors=return_tensors, **kwargs)]
+            elif isinstance(text, List) and isinstance(text[0], List):
+                encodings = []
+                # Maximum number of queries across batch
+                max_num_queries = max([len(t) for t in text])
+                # Pad all batch samples to max number of text queries
+                for t in text:
+                    if len(t) != max_num_queries:
+                        t = t + [" "] * (max_num_queries - len(t))
+                    encoding = self.tokenizer(t, padding=padding, return_tensors=return_tensors, **kwargs)
+                    encodings.append(encoding)
+            else:
+                raise TypeError("Input text should be a string, a list of strings or a nested list of strings")
+            if return_tensors == "np":
+                input_ids = np.concatenate([encoding["input_ids"] for encoding in encodings], axis=0)
+                attention_mask = np.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0)
+            elif return_tensors == "jax" and is_flax_available():
+                import jax.numpy as jnp
+                input_ids = jnp.concatenate([encoding["input_ids"] for encoding in encodings], axis=0)
+                attention_mask = jnp.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0)
+            elif return_tensors == "pt" and is_torch_available():
+                import torch
+                input_ids = torch.cat([encoding["input_ids"] for encoding in encodings], dim=0)
+                attention_mask = torch.cat([encoding["attention_mask"] for encoding in encodings], dim=0)
+            elif return_tensors == "tf" and is_tf_available():
+                import tensorflow as tf
+                input_ids = tf.stack([encoding["input_ids"] for encoding in encodings], axis=0)
+                attention_mask = tf.stack([encoding["attention_mask"] for encoding in encodings], axis=0)
+            else:
+                raise ValueError("Target return tensor type could not be returned")
+            encoding = BatchEncoding()
+            encoding["input_ids"] = input_ids
+            encoding["attention_mask"] = attention_mask
+        if query_images is not None:
+            encoding = BatchEncoding()
+            query_pixel_values = self.image_processor(
+                query_images, return_tensors=return_tensors, **kwargs
+            ).pixel_values
+            encoding["query_pixel_values"] = query_pixel_values
+        if images is not None:
+            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
+        if text is not None and images is not None:
+            encoding["pixel_values"] = image_features.pixel_values
+            return encoding
+        elif query_images is not None and images is not None:
+            encoding["pixel_values"] = image_features.pixel_values
+            return encoding
+        elif text is not None or query_images is not None:
+            return encoding
+        else:
+            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2
+    def post_process_object_detection(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer
+        to the docstring of this method for more information.
+        """
+        return self.image_processor.post_process_object_detection(*args, **kwargs)
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OWLViT->OWLv2
+    def post_process_image_guided_detection(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_one_shot_object_detection`].
+        Please refer to the docstring of this method for more information.
+        """
+        return self.image_processor.post_process_image_guided_detection(*args, **kwargs)
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.batch_decode
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.decode
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
--- a/src/transformers/models/owlvit/configuration_owlvit.py
+++ b/src/transformers/models/owlvit/configuration_owlvit.py
@@ -66,15 +66,21 @@ class OwlViTTextConfig(PretrainedConfig):
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (`float`, *optional*, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token in the input sequences.
+        bos_token_id (`int`, *optional*, defaults to 49406):
+            The id of the beginning-of-sequence token in the input sequences.
+        eos_token_id (`int`, *optional*, defaults to 49407):
+            The id of the end-of-sequence token in the input sequences.
    Example:
@@ -268,6 +274,8 @@ class OwlViTConfig(PretrainedConfig):
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The inital value of the *logit_scale* parameter. Default is used as per the original OWL-ViT
            implementation.
+        return_dict (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return a dictionary. If `False`, returns a tuple.
        kwargs (*optional*):
            Dictionary of keyword arguments.
    """

--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -582,11 +582,16 @@ class OwlViTPreTrainedModel(PreTrainedModel):
 OWLVIT_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
    Parameters:
-    This model is a PyTorch [torch.nn.Module](https:
-        //pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
-    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
-    behavior.
        config ([`OwlViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1250,14 +1255,14 @@ class OwlViTModel(OwlViTPreTrainedModel):
 class OwlViTBoxPredictionHead(nn.Module):
-    def __init__(self, config: OwlViTConfig):
+    def __init__(self, config: OwlViTConfig, out_dim: int = 4):
        super().__init__()
        width = config.vision_config.hidden_size
        self.dense0 = nn.Linear(width, width)
        self.dense1 = nn.Linear(width, width)
        self.gelu = nn.GELU()
-        self.dense2 = nn.Linear(width, 4)
+        self.dense2 = nn.Linear(width, out_dim)
    def forward(self, image_features: torch.Tensor) -> torch.FloatTensor:
        output = self.dense0(image_features)

--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -5860,6 +5860,44 @@ class OPTPreTrainedModel(metaclass=DummyObject):
        requires_backends(self, ["torch"])
+OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST = None
+class Owlv2ForObjectDetection(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Owlv2Model(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Owlv2PreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Owlv2TextModel(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Owlv2VisionModel(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
 OWLVIT_PRETRAINED_MODEL_ARCHIVE_LIST = None

--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -373,6 +373,13 @@ class OneFormerImageProcessor(metaclass=DummyObject):
        requires_backends(self, ["vision"])
+class Owlv2ImageProcessor(metaclass=DummyObject):
+    _backends = ["vision"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
 class OwlViTFeatureExtractor(metaclass=DummyObject):
    _backends = ["vision"]

--- a/tests/models/owlv2/__init__.py
+++ b/tests/models/owlv2/__init__.py
--- a/tests/models/owlv2/test_image_processor_owlv2.py
+++ b/tests/models/owlv2/test_image_processor_owlv2.py
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from transformers.testing_utils import require_torch, require_vision, slow
+from transformers.utils import is_vision_available
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+if is_vision_available():
+    from PIL import Image
+    from transformers import Owlv2ImageProcessor
+class Owlv2ImageProcessingTester(unittest.TestCase):
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_channels=3,
+        image_size=18,
+        min_resolution=30,
+        max_resolution=400,
+        do_resize=True,
+        size=None,
+        do_normalize=True,
+        image_mean=[0.48145466, 0.4578275, 0.40821073],
+        image_std=[0.26862954, 0.26130258, 0.27577711],
+        do_convert_rgb=True,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size if size is not None else {"height": 18, "width": 18}
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+    def prepare_image_processor_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+        }
+    def expected_output_image_shape(self, images):
+        return self.num_channels, self.size["height"], self.size["width"]
+    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+        return prepare_image_inputs(
+            batch_size=self.batch_size,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            numpify=numpify,
+            torchify=torchify,
+        )
+@require_torch
+@require_vision
+class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+    image_processing_class = Owlv2ImageProcessor if is_vision_available() else None
+    def setUp(self):
+        self.image_processor_tester = Owlv2ImageProcessingTester(self)
+    @property
+    def image_processor_dict(self):
+        return self.image_processor_tester.prepare_image_processor_dict()
+    def test_image_processor_properties(self):
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        self.assertTrue(hasattr(image_processing, "do_resize"))
+        self.assertTrue(hasattr(image_processing, "size"))
+        self.assertTrue(hasattr(image_processing, "do_normalize"))
+        self.assertTrue(hasattr(image_processing, "image_mean"))
+        self.assertTrue(hasattr(image_processing, "image_std"))
+    def test_image_processor_from_dict_with_kwargs(self):
+        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
+        self.assertEqual(image_processor.size, {"height": 18, "width": 18})
+        image_processor = self.image_processing_class.from_dict(
+            self.image_processor_dict, size={"height": 42, "width": 42}
+        )
+        self.assertEqual(image_processor.size, {"height": 42, "width": 42})
+    @slow
+    def test_image_processor_integration_test(self):
+        processor = Owlv2ImageProcessor()
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        pixel_values = processor(image, return_tensors="pt").pixel_values
+        mean_value = round(pixel_values.mean().item(), 4)
+        self.assertEqual(mean_value, 0.2353)
+    @unittest.skip("OWLv2 doesn't treat 4 channel PIL and numpy consistently yet")  # FIXME Amy
+    def test_call_numpy_4_channels(self):
+        pass
--- a/tests/models/owlv2/test_modeling_owlv2.py
+++ b/tests/models/owlv2/test_modeling_owlv2.py
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -417,9 +417,6 @@ OBJECTS_TO_IGNORE = [
    "OneFormerProcessor",
    "OpenAIGPTTokenizerFast",
    "OpenLlamaConfig",
-    "OwlViTConfig",
-    "OwlViTModel",
-    "OwlViTTextConfig",
    "PLBartConfig",
    "PegasusConfig",
    "PegasusTokenizer",

--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -234,6 +234,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "OpenAIGPTDoubleHeadsModel",
    "OwlViTTextModel",
    "OwlViTVisionModel",
+    "Owlv2TextModel",
+    "Owlv2VisionModel",
    "OwlViTForObjectDetection",
    "RagModel",
    "RagSequenceForGeneration",