Unverified Commit 762af3e3 authored by NielsRogge, committed by GitHub

Add OWLv2, bis (#26668)

* First draft

* Update conversion script

* Update copied from statements

* Fix style

* Add copied from to config

* Add copied from to processor

* Run make fixup

* Add docstring

* Update docstrings

* Add method

* Improve docstrings

* Fix docstrings

* Improve docstrings

* Remove onnx

* Add flag

* Address comments

* Add copied from to model tests

* Add flag to conversion script

* Add code snippet

* Address more comments

* Address comment

* Improve conversion script

* More improvements

* Add expected objectness logits

* Skip test

* Improve conversion script

* Extend conversion script

* Convert large checkpoint

* Fix doc tests

* Convert all checkpoints, update integration tests

* Add checkpoint_path arg

* Fix repo_id
parent bdb391e9
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for OWLv2."""
import warnings
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import (
center_to_corners_format,
pad,
to_channel_dimension_format,
)
from ...image_utils import (
OPENAI_CLIP_MEAN,
OPENAI_CLIP_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
)
from ...utils import (
TensorType,
is_scipy_available,
is_torch_available,
is_vision_available,
logging,
requires_backends,
)
if is_torch_available():
import torch
if is_vision_available():
import PIL
if is_scipy_available():
from scipy import ndimage as ndi
logger = logging.get_logger(__name__)
# Copied from transformers.models.owlvit.image_processing_owlvit._upcast
def _upcast(t):
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
# Copied from transformers.models.owlvit.image_processing_owlvit.box_area
def box_area(boxes):
"""
Computes the area of a set of bounding boxes, which are specified by their (x1, y1, x2, y2) coordinates.
Args:
boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
< x2` and `0 <= y1 < y2`.
Returns:
`torch.FloatTensor`: a tensor containing the area for each box.
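Example (an illustrative sketch; the box values below are made up):
```python
>>> import torch

>>> boxes = torch.tensor([[0.0, 0.0, 2.0, 2.0], [1.0, 1.0, 4.0, 3.0]])
>>> box_area(boxes)
tensor([4., 6.])
```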
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# Copied from transformers.models.owlvit.image_processing_owlvit.box_iou
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2]
inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def _preprocess_resize_output_shape(image, output_shape):
"""Validate resize output shape according to input image.
Args:
image (`np.ndarray`):
Image to be resized.
output_shape (`iterable`):
Size of the generated output image `(rows, cols[, ...][, dim])`. If `dim` is not provided, the number of
channels is preserved.
Returns:
image (`np.ndarray`):
The input image, but with additional singleton dimensions appended in the case where `len(output_shape) >
input.ndim`.
output_shape (`Tuple`):
The output shape converted to tuple.
Raises:
ValueError:
If the length of `output_shape` is smaller than the number of dimensions of the image.
Notes:
The input image is reshaped if its number of dimensions is not equal to the length of `output_shape`.
"""
output_shape = tuple(output_shape)
output_ndim = len(output_shape)
input_shape = image.shape
if output_ndim > image.ndim:
# append dimensions to input_shape
input_shape += (1,) * (output_ndim - image.ndim)
image = np.reshape(image, input_shape)
elif output_ndim == image.ndim - 1:
# multichannel case: append shape of last axis
output_shape = output_shape + (image.shape[-1],)
elif output_ndim < image.ndim:
raise ValueError("output_shape length cannot be smaller than the " "image number of dimensions")
return image, output_shape
def _clip_warp_output(input_image, output_image):
"""Clip output image to range of values of input image.
Note that this function does not modify *output_image* in place; it returns a new array clipped to the value range of *input_image*.
Taken from:
https://github.com/scikit-image/scikit-image/blob/b4b521d6f0a105aabeaa31699949f78453ca3511/skimage/transform/_warps.py#L640.
Args:
input_image : ndarray
Input image.
output_image : ndarray
Output image to be clipped to the value range of the input image.
"""
min_val = np.min(input_image)
if np.isnan(min_val):
# NaNs detected, use NaN-safe min/max
min_func = np.nanmin
max_func = np.nanmax
min_val = min_func(input_image)
else:
min_func = np.min
max_func = np.max
max_val = max_func(input_image)
output_image = np.clip(output_image, min_val, max_val)
return output_image
class Owlv2ImageProcessor(BaseImageProcessor):
r"""
Constructs an OWLv2 image processor.
Args:
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
method.
do_pad (`bool`, *optional*, defaults to `True`):
Whether to pad the image to a square with gray pixels on the bottom and the right. Can be overridden by
`do_pad` in the `preprocess` method.
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden
by `do_resize` in the `preprocess` method.
size (`Dict[str, int]`, *optional*, defaults to `{"height": 960, "width": 960}`):
Size to resize the image to. Can be overridden by `size` in the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
Resampling method to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
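Example (a minimal sketch using the defaults documented above; the random image below stands in for real input, and resizing requires `scipy`):

```python
>>> import numpy as np
>>> from transformers import Owlv2ImageProcessor

>>> image_processor = Owlv2ImageProcessor()
>>> image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
>>> inputs = image_processor(images=image, return_tensors="pt")
>>> list(inputs.pixel_values.shape)  # padded to a square, then resized to 960 x 960
[1, 3, 960, 960]
```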
"""
model_input_names = ["pixel_values"]
def __init__(
self,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_pad: bool = True,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_pad = do_pad
self.do_resize = do_resize
self.size = size if size is not None else {"height": 960, "width": 960}
self.resample = resample
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
def pad(
self,
image: np.ndarray,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
"""
Pad an image to a square with gray pixels on the bottom and the right, as per the original OWLv2
implementation.
Args:
image (`np.ndarray`):
Image to pad.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input
image.
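Example (an illustrative sketch; the array shape is arbitrary):

```python
>>> import numpy as np
>>> from transformers import Owlv2ImageProcessor

>>> image_processor = Owlv2ImageProcessor()
>>> image = np.zeros((3, 480, 640), dtype=np.float32)
>>> image_processor.pad(image).shape  # padded to a square of side max(height, width)
(3, 640, 640)
```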
"""
height, width = get_image_size(image)
size = max(height, width)
image = pad(
image=image,
padding=((0, size - height), (0, size - width)),
constant_values=0.5,
data_format=data_format,
input_data_format=input_data_format,
)
return image
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
anti_aliasing: bool = True,
anti_aliasing_sigma=None,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> np.ndarray:
"""
Resize an image as per the original implementation.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Dictionary containing the height and width to resize the image to.
anti_aliasing (`bool`, *optional*, defaults to `True`):
Whether to apply anti-aliasing when downsampling the image.
anti_aliasing_sigma (`float`, *optional*, defaults to `None`):
Standard deviation for Gaussian kernel when downsampling the image. If `None`, it will be calculated
automatically.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred from the input
image.
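Example (an illustrative sketch; requires `scipy`, and an explicit `input_data_format` is passed here because the method is called directly rather than through `preprocess`):

```python
>>> import numpy as np
>>> from transformers import Owlv2ImageProcessor

>>> image_processor = Owlv2ImageProcessor()
>>> image = np.zeros((3, 640, 640), dtype=np.float32)
>>> resized = image_processor.resize(image, size={"height": 960, "width": 960}, input_data_format="channels_first")
>>> resized.shape
(3, 960, 960)
```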
"""
requires_backends(self, "scipy")
output_shape = (size["height"], size["width"])
image = to_channel_dimension_format(image, ChannelDimension.LAST)
image, output_shape = _preprocess_resize_output_shape(image, output_shape)
input_shape = image.shape
factors = np.divide(input_shape, output_shape)
# Translate modes used by np.pad to those used by scipy.ndimage
ndi_mode = "mirror"
cval = 0
order = 1
if anti_aliasing:
if anti_aliasing_sigma is None:
anti_aliasing_sigma = np.maximum(0, (factors - 1) / 2)
else:
anti_aliasing_sigma = np.atleast_1d(anti_aliasing_sigma) * np.ones_like(factors)
if np.any(anti_aliasing_sigma < 0):
raise ValueError("Anti-aliasing standard deviation must be " "greater than or equal to zero")
elif np.any((anti_aliasing_sigma > 0) & (factors <= 1)):
warnings.warn(
"Anti-aliasing standard deviation greater than zero but " "not down-sampling along all axes"
)
filtered = ndi.gaussian_filter(image, anti_aliasing_sigma, cval=cval, mode=ndi_mode)
else:
filtered = image
zoom_factors = [1 / f for f in factors]
out = ndi.zoom(filtered, zoom_factors, order=order, mode=ndi_mode, cval=cval, grid_mode=True)
image = _clip_warp_output(image, out)
image = to_channel_dimension_format(image, input_data_format, ChannelDimension.LAST)
image = (
to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
)
return image
def preprocess(
self,
images: ImageInput,
do_pad: bool = None,
do_resize: bool = None,
size: Dict[str, int] = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: ChannelDimension = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
) -> BatchFeature:
"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
do_pad (`bool`, *optional*, defaults to `self.do_pad`):
Whether to pad the image to a square with gray pixels on the bottom and the right.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Size to resize the image to.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image values to the [0, 1] range.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
"""
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
do_pad = do_pad if do_pad is not None else self.do_pad
do_resize = do_resize if do_resize is not None else self.do_resize
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
image_mean = image_mean if image_mean is not None else self.image_mean
image_std = image_std if image_std is not None else self.image_std
size = size if size is not None else self.size
images = make_list_of_images(images)
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
if do_resize and size is None:
raise ValueError("Size must be specified if do_resize is True.")
if do_rescale and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize and (image_mean is None or image_std is None):
raise ValueError("Image mean and std must be specified if do_normalize is True.")
# All transformations expect numpy arrays.
images = [to_numpy_array(image) for image in images]
if is_scaled_image(images[0]) and do_rescale:
logger.warning_once(
"It looks like you are trying to rescale already rescaled images. If the input"
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
)
if input_data_format is None:
# We assume that all images have the same channel dimension format.
input_data_format = infer_channel_dimension_format(images[0])
if do_rescale:
images = [
self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
for image in images
]
if do_pad:
images = [self.pad(image=image, input_data_format=input_data_format) for image in images]
if do_resize:
images = [
self.resize(
image=image,
size=size,
input_data_format=input_data_format,
)
for image in images
]
if do_normalize:
images = [
self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
for image in images
]
images = [
to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
]
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
# Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->Owlv2
def post_process_object_detection(
self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
):
"""
Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format.
Args:
outputs ([`Owlv2ObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*, defaults to 0.1):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
`(height, width)` of each image in the batch. If unset, predictions will not be resized.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
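Example (an illustrative end-to-end sketch; the sample image URL and the threshold value are arbitrary choices, not part of this method):

```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from transformers import Owlv2Processor, Owlv2ForObjectDetection

>>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> texts = [["a photo of a cat", "a photo of a dog"]]
>>> inputs = processor(text=texts, images=image, return_tensors="pt")
>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> target_sizes = torch.tensor([image.size[::-1]])  # (height, width) of the original image
>>> results = processor.image_processor.post_process_object_detection(
...     outputs, threshold=0.2, target_sizes=target_sizes
... )
>>> boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
```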
"""
# TODO: (amy) add support for other frameworks
logits, boxes = outputs.logits, outputs.pred_boxes
if target_sizes is not None:
if len(logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
probs = torch.max(logits, dim=-1)
scores = torch.sigmoid(probs.values)
labels = probs.indices
# Convert to [x0, y0, x1, y1] format
boxes = center_to_corners_format(boxes)
# Convert from relative [0, 1] to absolute [0, height] coordinates
if target_sizes is not None:
if isinstance(target_sizes, List):
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
boxes = boxes * scale_fct[:, None, :]
results = []
for s, l, b in zip(scores, labels, boxes):
score = s[s > threshold]
label = l[s > threshold]
box = b[s > threshold]
results.append({"scores": score, "labels": label, "boxes": box})
return results
# Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_image_guided_detection with OwlViT->Owlv2
def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None):
"""
Converts the output of [`Owlv2ForObjectDetection.image_guided_detection`] into the format expected by the COCO
API.
Args:
outputs ([`Owlv2ImageGuidedObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*, defaults to 0.0):
Minimum confidence threshold to use to filter out predicted boxes.
nms_threshold (`float`, *optional*, defaults to 0.3):
IoU threshold for non-maximum suppression of overlapping boxes.
target_sizes (`torch.Tensor`, *optional*):
Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
None, predictions will not be unnormalized.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model. All labels are set to None as
[`Owlv2ForObjectDetection.image_guided_detection`] performs one-shot object detection.
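Example (an illustrative sketch; the sample image URL, the query crop coordinates and the thresholds are arbitrary choices):

```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from transformers import Owlv2Processor, Owlv2ForObjectDetection

>>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> query_image = image.crop((0, 0, 320, 320))  # a crop used as the one-shot query
>>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")
>>> with torch.no_grad():
...     outputs = model.image_guided_detection(**inputs)

>>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = processor.image_processor.post_process_image_guided_detection(
...     outputs, threshold=0.6, nms_threshold=0.3, target_sizes=target_sizes
... )
```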
"""
logits, target_boxes = outputs.logits, outputs.target_pred_boxes
if len(logits) != len(target_sizes):
raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
if target_sizes.shape[1] != 2:
raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
probs = torch.max(logits, dim=-1)
scores = torch.sigmoid(probs.values)
# Convert to [x0, y0, x1, y1] format
target_boxes = center_to_corners_format(target_boxes)
# Apply non-maximum suppression (NMS)
if nms_threshold < 1.0:
for idx in range(target_boxes.shape[0]):
for i in torch.argsort(-scores[idx]):
if not scores[idx][i]:
continue
ious = box_iou(target_boxes[idx][i, :].unsqueeze(0), target_boxes[idx])[0][0]
ious[i] = -1.0 # Mask self-IoU.
scores[idx][ious > nms_threshold] = 0.0
# Convert from relative [0, 1] to absolute [0, height] coordinates
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
target_boxes = target_boxes * scale_fct[:, None, :]
# Compute box display alphas based on prediction scores
results = []
alphas = torch.zeros_like(scores)
for idx in range(target_boxes.shape[0]):
# Select scores for boxes matching the current query:
query_scores = scores[idx]
if not query_scores.nonzero().numel():
continue
# Apply threshold on scores before scaling
query_scores[query_scores < threshold] = 0.0
# Scale box alpha such that the best box for each query has alpha 1.0 and the worst box has alpha 0.1.
# All other boxes will either belong to a different query, or will not be shown.
max_score = torch.max(query_scores) + 1e-6
query_alphas = (query_scores - (max_score * 0.1)) / (max_score * 0.9)
query_alphas = torch.clip(query_alphas, 0.0, 1.0)
alphas[idx] = query_alphas
mask = alphas[idx] > 0
box_scores = alphas[idx][mask]
boxes = target_boxes[idx][mask]
results.append({"scores": box_scores, "labels": None, "boxes": boxes})
return results
# coding=utf-8
# Copyright 2023 Google AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch OWLv2 model."""
import warnings
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_vision_available,
logging,
replace_return_docstrings,
)
from .configuration_owlv2 import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig
if is_vision_available():
from transformers.image_transforms import center_to_corners_format
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "google/owlv2-base-patch16-ensemble"
# See all Owlv2 models at https://huggingface.co/models?filter=owlv2
OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/owlv2-base-patch16-ensemble",
# See all OWLv2 models at https://huggingface.co/models?filter=owlv2
]
# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
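Shape example (illustrative; kept positions become 0.0, masked positions a large negative value):
```python
>>> import torch

>>> mask = torch.tensor([[1, 1, 0]])
>>> _expand_mask(mask, torch.float32).shape
torch.Size([1, 1, 3, 3])
```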
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
# Copied from transformers.models.clip.modeling_clip.contrastive_loss with clip->owlv2
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->owlv2
def owlv2_loss(similarity: torch.Tensor) -> torch.Tensor:
caption_loss = contrastive_loss(similarity)
image_loss = contrastive_loss(similarity.t())
return (caption_loss + image_loss) / 2.0
@dataclass
class Owlv2Output(ModelOutput):
"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
The image embeddings obtained by applying the projection layer to the pooled output of
[`Owlv2VisionModel`].
text_model_output (Tuple[`BaseModelOutputWithPooling`]):
The output of the [`Owlv2TextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
The output of the [`Owlv2VisionModel`].
"""
loss: Optional[torch.FloatTensor] = None
logits_per_image: torch.FloatTensor = None
logits_per_text: torch.FloatTensor = None
text_embeds: torch.FloatTensor = None
image_embeds: torch.FloatTensor = None
text_model_output: BaseModelOutputWithPooling = None
vision_model_output: BaseModelOutputWithPooling = None
def to_tuple(self) -> Tuple[Any]:
return tuple(
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
# Copied from transformers.models.detr.modeling_detr._upcast
def _upcast(t: Tensor) -> Tensor:
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
# Copied from transformers.models.detr.modeling_detr.box_area
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, which are specified by their (x1, y1, x2, y2) coordinates.
Args:
boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
< x2` and `0 <= y1 < y2`.
Returns:
`torch.FloatTensor`: a tensor containing the area for each box.
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# Copied from transformers.models.detr.modeling_detr.box_iou
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2]
inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
def generalized_box_iou(boxes1, boxes2):
"""
Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.
Returns:
`torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
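Example (an illustrative sketch with made-up boxes; the printed value is approximate):
```python
>>> import torch

>>> boxes1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
>>> boxes2 = torch.tensor([[1.0, 1.0, 3.0, 3.0]])
>>> round(generalized_box_iou(boxes1, boxes2).item(), 4)  # IoU of 1/7 minus enclosing-area penalty of 2/9
-0.0794
```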
"""
# degenerate boxes gives inf / nan results
# so do an early check
if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
iou, union = box_iou(boxes1, boxes2)
top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2]
area = width_height[:, :, 0] * width_height[:, :, 1]
return iou - (area - union) / area
@dataclass
class Owlv2ObjectDetectionOutput(ModelOutput):
"""
Output type of [`Owlv2ForObjectDetection`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
Classification logits (including no-object) for all queries.
objectness_logits (`torch.FloatTensor` of shape `(batch_size, num_patches, 1)`):
The objectness logits of all image patches. OWLv2 represents images as a set of image patches where the
total number of patches is (image_size / patch_size)**2.
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to retrieve the
unnormalized bounding boxes.
text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim)`):
The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes image
embeddings for each patch.
class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
number of patches is (image_size / patch_size)**2.
text_model_output (Tuple[`BaseModelOutputWithPooling`]):
The output of the [`Owlv2TextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
The output of the [`Owlv2VisionModel`].
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
objectness_logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
text_embeds: torch.FloatTensor = None
image_embeds: torch.FloatTensor = None
class_embeds: torch.FloatTensor = None
text_model_output: BaseModelOutputWithPooling = None
vision_model_output: BaseModelOutputWithPooling = None
def to_tuple(self) -> Tuple[Any]:
return tuple(
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
@dataclass
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTImageGuidedObjectDetectionOutput with OwlViT->Owlv2,OWL-ViT->OWLv2
class Owlv2ImageGuidedObjectDetectionOutput(ModelOutput):
"""
Output type of [`Owlv2ForObjectDetection.image_guided_detection`].
Args:
logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
Classification logits (including no-object) for all queries.
target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual target image in the batch
(disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
retrieve the unnormalized bounding boxes.
query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual query image in the batch
(disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
retrieve the unnormalized bounding boxes.
image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes image
embeddings for each patch.
query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes image
embeddings for each patch.
class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
number of patches is (image_size / patch_size)**2.
text_model_output (Tuple[`BaseModelOutputWithPooling`]):
The output of the [`Owlv2TextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
The output of the [`Owlv2VisionModel`].
"""
logits: torch.FloatTensor = None
image_embeds: torch.FloatTensor = None
query_image_embeds: torch.FloatTensor = None
target_pred_boxes: torch.FloatTensor = None
query_pred_boxes: torch.FloatTensor = None
class_embeds: torch.FloatTensor = None
text_model_output: BaseModelOutputWithPooling = None
vision_model_output: BaseModelOutputWithPooling = None
def to_tuple(self) -> Tuple[Any]:
return tuple(
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
for k in self.keys()
)
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTVisionEmbeddings with OwlViT->Owlv2
class Owlv2VisionEmbeddings(nn.Module):
def __init__(self, config: Owlv2VisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))
self.patch_embedding = nn.Conv2d(
in_channels=config.num_channels,
out_channels=self.embed_dim,
kernel_size=config.patch_size,
stride=config.patch_size,
bias=False,
)
self.num_patches = (config.image_size // config.patch_size) ** 2
self.num_positions = self.num_patches + 1
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
batch_size = pixel_values.shape[0]
patch_embeds = self.patch_embedding(pixel_values) # shape = [batch_size, num_channels, height, width]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTTextEmbeddings with OwlViT->Owlv2
class Owlv2TextEmbeddings(nn.Module):
def __init__(self, config: Owlv2TextConfig):
super().__init__()
self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
) -> torch.Tensor:
seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if inputs_embeds is None:
inputs_embeds = self.token_embedding(input_ids)
position_embeddings = self.position_embedding(position_ids)
embeddings = inputs_embeds + position_embeddings
return embeddings
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTAttention with OwlViT->Owlv2
class Owlv2Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
bsz, tgt_len, embed_dim = hidden_states.size()
# get query proj
query_states = self.q_proj(hidden_states) * self.scale
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
proj_shape = (bsz * self.num_heads, -1, self.head_dim)
query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
key_states = key_states.view(*proj_shape)
value_states = value_states.view(*proj_shape)
src_len = key_states.size(1)
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
raise ValueError(
f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
f" {attn_weights.size()}"
)
# apply the causal_attention_mask first
if causal_attention_mask is not None:
if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
f" {causal_attention_mask.size()}"
)
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
if output_attentions:
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights has to be reshaped
# twice and has to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
else:
attn_weights_reshaped = None
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
# For int8 compatibility, sometimes the `attn_probs` are in `fp32`
attn_probs = attn_probs.to(value_states.dtype)
attn_output = torch.bmm(attn_probs, value_states)
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights_reshaped
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Owlv2
class Owlv2MLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = self.fc2(hidden_states)
return hidden_states
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Owlv2
class Owlv2EncoderLayer(nn.Module):
def __init__(self, config: Owlv2Config):
super().__init__()
self.embed_dim = config.hidden_size
self.self_attn = Owlv2Attention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = Owlv2MLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.layer_norm2(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTPreTrainedModel with OwlViT->Owlv2,owlvit->owlv2
class Owlv2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = Owlv2Config
base_model_prefix = "owlv2"
supports_gradient_checkpointing = True
_no_split_modules = ["Owlv2EncoderLayer"]
def _init_weights(self, module):
"""Initialize the weights"""
factor = self.config.initializer_factor
if isinstance(module, Owlv2TextEmbeddings):
module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
elif isinstance(module, Owlv2VisionEmbeddings):
factor = self.config.initializer_factor
nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
elif isinstance(module, Owlv2Attention):
factor = self.config.initializer_factor
in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
out_proj_std = (module.embed_dim**-0.5) * factor
nn.init.normal_(module.q_proj.weight, std=in_proj_std)
nn.init.normal_(module.k_proj.weight, std=in_proj_std)
nn.init.normal_(module.v_proj.weight, std=in_proj_std)
nn.init.normal_(module.out_proj.weight, std=out_proj_std)
elif isinstance(module, Owlv2MLP):
factor = self.config.initializer_factor
in_proj_std = (
(module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
)
fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
nn.init.normal_(module.fc1.weight, std=fc_std)
nn.init.normal_(module.fc2.weight, std=in_proj_std)
elif isinstance(module, Owlv2Model):
nn.init.normal_(
module.text_projection.weight,
std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
)
nn.init.normal_(
module.visual_projection.weight,
std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
)
if isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, Owlv2Encoder):
module.gradient_checkpointing = value
OWLV2_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`Owlv2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
OWLV2_TEXT_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
OWLV2_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
OWLV2_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
IDs?](../glossary#input-ids)
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
return_loss (`bool`, *optional*):
Whether or not to return the contrastive loss.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_base_image_embeds (`bool`, *optional*):
Whether or not to return the base image embeddings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
OWLV2_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values.
input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
IDs?](../glossary#input-ids).
attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_hidden_states (`bool`, *optional*):
Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
`vision_model_last_hidden_state` under returned tensors for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
OWLV2_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values.
query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values of query image(s) to be detected. Pass in one query image per target image.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTEncoder with OwlViT->Owlv2
class Owlv2Encoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Owlv2EncoderLayer`].
Args:
config: Owlv2Config
"""
def __init__(self, config: Owlv2Config):
super().__init__()
self.layers = nn.ModuleList([Owlv2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
def forward(
self,
inputs_embeds,
attention_mask: Optional[torch.Tensor] = None,
causal_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`).
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Causal mask for the text model. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
hidden_states = inputs_embeds
for encoder_layer in self.layers:
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs, output_attentions)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(encoder_layer),
hidden_states,
attention_mask,
causal_attention_mask,
)
else:
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
causal_attention_mask,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(
input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
"""
Make the causal mask used for uni-directional (causal) self-attention.
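Shape example (illustrative; ones below mark positions that may be attended to):
```python
>>> import torch

>>> mask = _make_causal_mask(torch.Size([1, 3]), torch.float32, device=torch.device("cpu"))
>>> mask.shape
torch.Size([1, 1, 3, 3])
>>> (mask[0, 0] == 0).long()
tensor([[1, 0, 0],
        [1, 1, 0],
        [1, 1, 1]])
```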
"""
bsz, tgt_len = input_ids_shape
mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
mask_cond = torch.arange(mask.size(-1), device=device)
mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
mask = mask.to(dtype)
if past_key_values_length > 0:
mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTTextTransformer with OWLVIT->OWLV2,OwlViT->Owlv2
class Owlv2TextTransformer(nn.Module):
def __init__(self, config: Owlv2TextConfig):
super().__init__()
self.config = config
embed_dim = config.hidden_size
self.embeddings = Owlv2TextEmbeddings(config)
self.encoder = Owlv2Encoder(config)
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(OWLV2_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2TextConfig)
def forward(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
# num_samples, seq_len = input_shape where num_samples = batch_size * num_max_text_queries
# OWLV2's text model uses causal mask, prepare it here.
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
# expand attention_mask
if attention_mask is not None:
# [num_samples, seq_len] -> [num_samples, 1, tgt_seq_len, src_seq_len]
attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
last_hidden_state = self.final_layer_norm(last_hidden_state)
# take features from the end-of-text (EOT) token embedding (the EOT token has the highest id in each sequence)
# casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
pooled_output = last_hidden_state[
torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
input_ids.to(torch.int).argmax(dim=-1).to(last_hidden_state.device),
]
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
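The pooling step above selects, for every sequence, the hidden state at the end-of-text token, located via `argmax` because that token has the largest id the CLIP tokenizer produces. A minimal sketch with made-up token ids (only the bos/eos/pad ids match the defaults documented later):

```python
import torch

# Toy ids: <bos>, two dummy tokens, <eos>, then padding (49406/49407 are CLIP's bos/eos ids).
input_ids = torch.tensor([[49406, 320, 1125, 49407, 0, 0]])
last_hidden_state = torch.randn(1, 6, 8)  # (batch, seq_len, hidden_size), dummy activations
eos_positions = input_ids.to(torch.int).argmax(dim=-1)  # tensor([3])
pooled_output = last_hidden_state[torch.arange(input_ids.shape[0]), eos_positions]
print(pooled_output.shape)  # torch.Size([1, 8])
```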
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTTextModel with google/owlvit-base-patch32->google/owlv2-base-patch16, OWLVIT->OWLV2,OwlViT->Owlv2
class Owlv2TextModel(Owlv2PreTrainedModel):
config_class = Owlv2TextConfig
def __init__(self, config: Owlv2TextConfig):
super().__init__(config)
self.text_model = Owlv2TextTransformer(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.text_model.embeddings.token_embedding
def set_input_embeddings(self, value):
self.text_model.embeddings.token_embedding = value
@add_start_docstrings_to_model_forward(OWLV2_TEXT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2TextConfig)
def forward(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
Examples:
```python
>>> from transformers import AutoProcessor, Owlv2TextModel
>>> model = Owlv2TextModel.from_pretrained("google/owlv2-base-patch16")
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
>>> inputs = processor(
... text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
... )
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled (EOS token) states
```"""
# Get embeddings for all text queries in all batch samples
return self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTVisionTransformer with OWLVIT->OWLV2,OwlViT->Owlv2
class Owlv2VisionTransformer(nn.Module):
def __init__(self, config: Owlv2VisionConfig):
super().__init__()
self.config = config
self.embeddings = Owlv2VisionEmbeddings(config)
self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.encoder = Owlv2Encoder(config)
self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
@add_start_docstrings_to_model_forward(OWLV2_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2VisionConfig)
def forward(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Cast the input to the expected `dtype`
expected_input_dtype = self.embeddings.patch_embedding.weight.dtype
pixel_values = pixel_values.to(expected_input_dtype)
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layernorm(hidden_states)
encoder_outputs = self.encoder(
inputs_embeds=hidden_states,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
last_hidden_state = encoder_outputs[0]
pooled_output = last_hidden_state[:, 0, :]
pooled_output = self.post_layernorm(pooled_output)
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTVisionModel with OWLVIT->OWLV2,OwlViT->Owlv2,google/owlvit-base-patch32->google/owlv2-base-patch16
class Owlv2VisionModel(Owlv2PreTrainedModel):
config_class = Owlv2VisionConfig
main_input_name = "pixel_values"
def __init__(self, config: Owlv2VisionConfig):
super().__init__(config)
self.vision_model = Owlv2VisionTransformer(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.vision_model.embeddings.patch_embedding
@add_start_docstrings_to_model_forward(OWLV2_VISION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2VisionConfig)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Owlv2VisionModel
>>> model = Owlv2VisionModel.from_pretrained("google/owlv2-base-patch16")
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled CLS states
```"""
return self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
@add_start_docstrings(OWLV2_START_DOCSTRING)
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTModel with google/owlvit-base-patch32->google/owlv2-base-patch16-ensemble, OWLVIT->OWLV2,OwlViT->Owlv2,owlvit->owlv2,OWL-ViT->OWLv2
class Owlv2Model(Owlv2PreTrainedModel):
config_class = Owlv2Config
def __init__(self, config: Owlv2Config):
super().__init__(config)
if not isinstance(config.text_config, Owlv2TextConfig):
raise ValueError(
"config.text_config is expected to be of type Owlv2TextConfig but is of type"
f" {type(config.text_config)}."
)
if not isinstance(config.vision_config, Owlv2VisionConfig):
raise ValueError(
"config.vision_config is expected to be of type Owlv2VisionConfig but is of type"
f" {type(config.vision_config)}."
)
text_config = config.text_config
vision_config = config.vision_config
self.projection_dim = config.projection_dim
self.text_embed_dim = text_config.hidden_size
self.vision_embed_dim = vision_config.hidden_size
self.text_model = Owlv2TextTransformer(text_config)
self.vision_model = Owlv2VisionTransformer(vision_config)
self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(OWLV2_TEXT_INPUTS_DOCSTRING)
def get_text_features(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
applying the projection layer to the pooled output of [`Owlv2TextModel`].
Examples:
```python
>>> from transformers import AutoProcessor, Owlv2Model
>>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> inputs = processor(
... text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
... )
>>> text_features = model.get_text_features(**inputs)
```"""
# Use OWLv2 model's config for some fields (if specified) instead of those of vision & text components.
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Get embeddings for all text queries in all batch samples
text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=return_dict)
pooled_output = text_output[1]
text_features = self.text_projection(pooled_output)
return text_features
@add_start_docstrings_to_model_forward(OWLV2_VISION_INPUTS_DOCSTRING)
def get_image_features(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
Returns:
image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
applying the projection layer to the pooled output of [`Owlv2VisionModel`].
Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Owlv2Model
>>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> image_features = model.get_image_features(**inputs)
```"""
# Use OWLv2 model's config for some fields (if specified) instead of those of vision & text components.
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = vision_outputs[1]
image_features = self.visual_projection(pooled_output)
return image_features
@add_start_docstrings_to_model_forward(OWLV2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Owlv2Output, config_class=Owlv2Config)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
pixel_values: Optional[torch.FloatTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_base_image_embeds: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Owlv2Output]:
r"""
Returns:
Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Owlv2Model
>>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
```"""
# Use OWLv2 model's config for some fields (if specified) instead of those of vision & text components.
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
vision_outputs = self.vision_model(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Get embeddings for all text queries in all batch samples
text_outputs = self.text_model(
input_ids=input_ids,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
text_embeds = text_outputs[1]
text_embeds = self.text_projection(text_embeds)
image_embeds = vision_outputs[1]
image_embeds = self.visual_projection(image_embeds)
# normalized features
image_embeds = image_embeds / torch.linalg.norm(image_embeds, ord=2, dim=-1, keepdim=True)
text_embeds_norm = text_embeds / torch.linalg.norm(text_embeds, ord=2, dim=-1, keepdim=True)
# cosine similarity as logits and set it on the correct device
logit_scale = self.logit_scale.exp().to(image_embeds.device)
logits_per_text = torch.matmul(text_embeds_norm, image_embeds.t()) * logit_scale
logits_per_image = logits_per_text.t()
loss = None
if return_loss:
loss = owlv2_loss(logits_per_text)
if return_base_image_embeds:
warnings.warn(
"`return_base_image_embeds` is deprecated and will be removed in v4.27 of Transformers, one can"
" obtain the base (unprojected) image embeddings from outputs.vision_model_output.",
FutureWarning,
)
last_hidden_state = vision_outputs[0]
image_embeds = self.vision_model.post_layernorm(last_hidden_state)
else:
text_embeds = text_embeds_norm
if not return_dict:
output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
return ((loss,) + output) if loss is not None else output
return Owlv2Output(
loss=loss,
logits_per_image=logits_per_image,
logits_per_text=logits_per_text,
text_embeds=text_embeds,
image_embeds=image_embeds,
text_model_output=text_outputs,
vision_model_output=vision_outputs,
)
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTBoxPredictionHead with OwlViT->Owlv2
class Owlv2BoxPredictionHead(nn.Module):
def __init__(self, config: Owlv2Config, out_dim: int = 4):
super().__init__()
width = config.vision_config.hidden_size
self.dense0 = nn.Linear(width, width)
self.dense1 = nn.Linear(width, width)
self.gelu = nn.GELU()
self.dense2 = nn.Linear(width, out_dim)
def forward(self, image_features: torch.Tensor) -> torch.FloatTensor:
output = self.dense0(image_features)
output = self.gelu(output)
output = self.dense1(output)
output = self.gelu(output)
output = self.dense2(output)
return output
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTClassPredictionHead with OwlViT->Owlv2
class Owlv2ClassPredictionHead(nn.Module):
def __init__(self, config: Owlv2Config):
super().__init__()
out_dim = config.text_config.hidden_size
self.query_dim = config.vision_config.hidden_size
self.dense0 = nn.Linear(self.query_dim, out_dim)
self.logit_shift = nn.Linear(self.query_dim, 1)
self.logit_scale = nn.Linear(self.query_dim, 1)
self.elu = nn.ELU()
def forward(
self,
image_embeds: torch.FloatTensor,
query_embeds: Optional[torch.FloatTensor],
query_mask: Optional[torch.Tensor],
) -> Tuple[torch.FloatTensor]:
image_class_embeds = self.dense0(image_embeds)
if query_embeds is None:
device = image_class_embeds.device
batch_size, num_patches = image_class_embeds.shape[:2]
pred_logits = torch.zeros((batch_size, num_patches, self.query_dim)).to(device)
return (pred_logits, image_class_embeds)
# Normalize image and text features
image_class_embeds = image_class_embeds / (torch.linalg.norm(image_class_embeds, dim=-1, keepdim=True) + 1e-6)
query_embeds = query_embeds / (torch.linalg.norm(query_embeds, dim=-1, keepdim=True) + 1e-6)
# Get class predictions
pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds)
# Apply a learnable shift and scale to logits
logit_shift = self.logit_shift(image_embeds)
logit_scale = self.logit_scale(image_embeds)
logit_scale = self.elu(logit_scale) + 1
pred_logits = (pred_logits + logit_shift) * logit_scale
if query_mask is not None:
if query_mask.ndim > 1:
query_mask = torch.unsqueeze(query_mask, dim=-2)
pred_logits = pred_logits.to(torch.float64)
pred_logits = torch.where(query_mask == 0, -1e6, pred_logits)
pred_logits = pred_logits.to(torch.float32)
return (pred_logits, image_class_embeds)
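The class head therefore produces one logit per (image patch, text query) pair. A shape-only sketch with toy dimensions (not taken from any real config):

```python
import torch

batch_size, num_patches, num_queries, shared_dim = 2, 9, 3, 12
image_class_embeds = torch.randn(batch_size, num_patches, shared_dim)  # output of self.dense0
query_embeds = torch.randn(batch_size, num_queries, shared_dim)        # projected text queries
pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds)
print(pred_logits.shape)  # torch.Size([2, 9, 3]) -> one logit per (patch, query) pair
```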
class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
config_class = Owlv2Config
def __init__(self, config: Owlv2Config):
super().__init__(config)
self.owlv2 = Owlv2Model(config)
self.class_head = Owlv2ClassPredictionHead(config)
self.box_head = Owlv2BoxPredictionHead(config)
self.objectness_head = Owlv2BoxPredictionHead(config, out_dim=1)
self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
self.sigmoid = nn.Sigmoid()
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.normalize_grid_corner_coordinates
def normalize_grid_corner_coordinates(self, feature_map: torch.FloatTensor):
# Computes normalized xy corner coordinates from feature_map.
if not feature_map.ndim == 4:
raise ValueError("Expected input shape is [batch_size, num_patches, num_patches, hidden_dim]")
device = feature_map.device
num_patches = feature_map.shape[1]
box_coordinates = np.stack(
np.meshgrid(np.arange(1, num_patches + 1), np.arange(1, num_patches + 1)), axis=-1
).astype(np.float32)
box_coordinates /= np.array([num_patches, num_patches], np.float32)
# Flatten (h, w, 2) -> (h*w, 2)
box_coordinates = box_coordinates.reshape(
box_coordinates.shape[0] * box_coordinates.shape[1], box_coordinates.shape[2]
)
box_coordinates = torch.from_numpy(box_coordinates).to(device)
return box_coordinates
def objectness_predictor(self, image_features: torch.FloatTensor) -> torch.FloatTensor:
"""Predicts the probability that each image feature token is an object.
Args:
image_features (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_dim)`):
Features extracted from the image.
Returns:
Objectness logits (apply a sigmoid to obtain per-patch objectness probabilities).
"""
image_features = image_features.detach()
objectness_logits = self.objectness_head(image_features)
objectness_logits = objectness_logits[..., 0]
return objectness_logits
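Because the head returns raw logits, downstream code can apply a sigmoid to rank patches by how object-like they are, independently of any text query. A small sketch of this assumed usage pattern (the patch count is arbitrary):

```python
import torch

objectness_logits = torch.randn(1, 3600)        # (batch_size, num_patches), toy values
objectness = torch.sigmoid(objectness_logits)   # per-patch probability of containing an object
top_scores, top_patches = objectness.topk(k=5, dim=-1)
print(top_scores.shape, top_patches.shape)      # torch.Size([1, 5]) torch.Size([1, 5])
```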
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.compute_box_bias
def compute_box_bias(self, feature_map: torch.FloatTensor) -> torch.FloatTensor:
# The box center is biased to its position on the feature grid
box_coordinates = self.normalize_grid_corner_coordinates(feature_map)
box_coordinates = torch.clip(box_coordinates, 0.0, 1.0)
# Unnormalize xy
box_coord_bias = torch.log(box_coordinates + 1e-4) - torch.log1p(-box_coordinates + 1e-4)
# The box size is biased to the patch size
box_size = torch.full_like(box_coord_bias, 1.0 / feature_map.shape[-2])
box_size_bias = torch.log(box_size + 1e-4) - torch.log1p(-box_size + 1e-4)
# Compute box bias
box_bias = torch.cat([box_coord_bias, box_size_bias], dim=-1)
return box_bias
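The bias is the logit (inverse sigmoid) of the normalized patch coordinates, so when the box head outputs zeros, `sigmoid(bias)` recovers roughly the patch centers and a patch-sized box. A quick numerical check with arbitrarily chosen coordinates:

```python
import torch

x = torch.tensor([0.25, 0.50, 0.75])                  # normalized grid coordinates
bias = torch.log(x + 1e-4) - torch.log1p(-x + 1e-4)   # same transform as compute_box_bias
print(torch.sigmoid(bias))                             # ~tensor([0.2500, 0.5000, 0.7500])
```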
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.box_predictor
def box_predictor(
self,
image_feats: torch.FloatTensor,
feature_map: torch.FloatTensor,
) -> torch.FloatTensor:
"""
Args:
image_feats:
Features extracted from the image, returned by the `image_text_embedder` method.
feature_map:
A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
Returns:
pred_boxes:
Predicted boxes in (center_x, center_y, width, height) format, normalized to [0, 1].
"""
# Bounding box detection head [batch_size, num_boxes, 4].
pred_boxes = self.box_head(image_feats)
# Compute the location of each token on the grid and use it to compute a bias for the bbox prediction
pred_boxes += self.compute_box_bias(feature_map)
pred_boxes = self.sigmoid(pred_boxes)
return pred_boxes
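Boxes are predicted in normalized `(center_x, center_y, width, height)` format; the post-processing utilities convert them to corners and rescale to the image size. A hand-rolled conversion of one toy box, just to make the format concrete:

```python
import torch

pred_box = torch.tensor([0.5, 0.5, 0.2, 0.4])  # (center_x, center_y, width, height), all in [0, 1]
cx, cy, w, h = pred_box
corners = torch.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2])
print(corners)  # tensor([0.4000, 0.3000, 0.6000, 0.7000])
```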
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.class_predictor
def class_predictor(
self,
image_feats: torch.FloatTensor,
query_embeds: Optional[torch.FloatTensor] = None,
query_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.FloatTensor]:
"""
Args:
image_feats:
Features extracted from the `image_text_embedder`.
query_embeds:
Text query embeddings.
query_mask:
Must be provided with query_embeddings. A mask indicating which query embeddings are valid.
"""
(pred_logits, image_class_embeds) = self.class_head(image_feats, query_embeds, query_mask)
return (pred_logits, image_class_embeds)
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.image_text_embedder with owlvit->owlv2
def image_text_embedder(
self,
input_ids: torch.Tensor,
pixel_values: torch.FloatTensor,
attention_mask: torch.Tensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
) -> Tuple[torch.FloatTensor]:
# Encode text and image
outputs = self.owlv2(
pixel_values=pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=True,
)
# Get image embeddings
last_hidden_state = outputs.vision_model_output[0]
image_embeds = self.owlv2.vision_model.post_layernorm(last_hidden_state)
# Resize class token
new_size = tuple(np.array(image_embeds.shape) - np.array((0, 1, 0)))
class_token_out = torch.broadcast_to(image_embeds[:, :1, :], new_size)
# Merge image embedding with class tokens
image_embeds = image_embeds[:, 1:, :] * class_token_out
image_embeds = self.layer_norm(image_embeds)
# Resize to [batch_size, num_patches, num_patches, hidden_size]
new_size = (
image_embeds.shape[0],
int(np.sqrt(image_embeds.shape[1])),
int(np.sqrt(image_embeds.shape[1])),
image_embeds.shape[-1],
)
image_embeds = image_embeds.reshape(new_size)
text_embeds = outputs[-4]
return (text_embeds, image_embeds, outputs)
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.image_embedder with owlvit->owlv2, OwlViTModel->Owlv2Model
def image_embedder(
self,
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
) -> Tuple[torch.FloatTensor]:
# Get Owlv2Model vision embeddings (same as CLIP)
vision_outputs = self.owlv2.vision_model(pixel_values=pixel_values, return_dict=True)
# Apply post_layernorm to last_hidden_state, return non-projected output
last_hidden_state = vision_outputs[0]
image_embeds = self.owlv2.vision_model.post_layernorm(last_hidden_state)
# Resize class token
new_size = tuple(np.array(image_embeds.shape) - np.array((0, 1, 0)))
class_token_out = torch.broadcast_to(image_embeds[:, :1, :], new_size)
# Merge image embedding with class tokens
image_embeds = image_embeds[:, 1:, :] * class_token_out
image_embeds = self.layer_norm(image_embeds)
# Resize to [batch_size, num_patches, num_patches, hidden_size]
new_size = (
image_embeds.shape[0],
int(np.sqrt(image_embeds.shape[1])),
int(np.sqrt(image_embeds.shape[1])),
image_embeds.shape[-1],
)
image_embeds = image_embeds.reshape(new_size)
return (image_embeds, vision_outputs)
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.embed_image_query
def embed_image_query(
self, query_image_features: torch.FloatTensor, query_feature_map: torch.FloatTensor
) -> torch.FloatTensor:
_, class_embeds = self.class_predictor(query_image_features)
pred_boxes = self.box_predictor(query_image_features, query_feature_map)
pred_boxes_as_corners = center_to_corners_format(pred_boxes)
# Loop over query images
best_class_embeds = []
best_box_indices = []
pred_boxes_device = pred_boxes_as_corners.device
for i in range(query_image_features.shape[0]):
each_query_box = torch.tensor([[0, 0, 1, 1]], device=pred_boxes_device)
each_query_pred_boxes = pred_boxes_as_corners[i]
ious, _ = box_iou(each_query_box, each_query_pred_boxes)
# If there are no overlapping boxes, fall back to generalized IoU
if torch.all(ious[0] == 0.0):
ious = generalized_box_iou(each_query_box, each_query_pred_boxes)
# Use an adaptive threshold to include all boxes within 80% of the best IoU
iou_threshold = torch.max(ious) * 0.8
selected_inds = (ious[0] >= iou_threshold).nonzero()
if selected_inds.numel():
selected_embeddings = class_embeds[i][selected_inds.squeeze(1)]
mean_embeds = torch.mean(class_embeds[i], axis=0)
mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings)
best_box_ind = selected_inds[torch.argmin(mean_sim)]
best_class_embeds.append(class_embeds[i][best_box_ind])
best_box_indices.append(best_box_ind)
if best_class_embeds:
query_embeds = torch.stack(best_class_embeds)
box_indices = torch.stack(best_box_indices)
else:
query_embeds, box_indices = None, None
return query_embeds, box_indices, pred_boxes
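The selection above keeps every box whose IoU with the whole query image is within 80% of the best IoU, then picks the candidate least similar to the mean class embedding. The thresholding step, shown on toy numbers:

```python
import torch

ious = torch.tensor([[0.10, 0.45, 0.50, 0.05]])        # toy IoUs of 4 predicted boxes vs. [0, 0, 1, 1]
iou_threshold = torch.max(ious) * 0.8                  # 0.40
selected_inds = (ious[0] >= iou_threshold).nonzero()   # boxes 1 and 2 survive
print(selected_inds.squeeze(1))                        # tensor([1, 2])
```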
@add_start_docstrings_to_model_forward(OWLV2_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Owlv2ImageGuidedObjectDetectionOutput, config_class=Owlv2Config)
def image_guided_detection(
self,
pixel_values: torch.FloatTensor,
query_pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Owlv2ImageGuidedObjectDetectionOutput:
r"""
Returns:
Examples:
```python
>>> import requests
>>> from PIL import Image
>>> import torch
>>> from transformers import AutoProcessor, Owlv2ForObjectDetection
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg"
>>> query_image = Image.open(requests.get(query_url, stream=True).raw)
>>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")
>>> with torch.no_grad():
... outputs = model.image_guided_detection(**inputs)
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
>>> target_sizes = torch.Tensor([image.size[::-1]])
>>> # Convert outputs (bounding boxes and class logits) to COCO API
>>> results = processor.post_process_image_guided_detection(
... outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes
... )
>>> i = 0 # Retrieve predictions for the first image
>>> boxes, scores = results[i]["boxes"], results[i]["scores"]
>>> for box, score in zip(boxes, scores):
... box = [round(i, 2) for i in box.tolist()]
... print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
Detected similar object with confidence 0.938 at location [327.31, 54.94, 547.39, 268.06]
Detected similar object with confidence 0.959 at location [5.78, 360.65, 619.12, 366.39]
Detected similar object with confidence 0.902 at location [2.85, 360.01, 627.63, 380.79]
Detected similar object with confidence 0.985 at location [176.97, -29.45, 672.69, 182.83]
Detected similar object with confidence 1.0 at location [6.53, 14.35, 624.87, 470.82]
Detected similar object with confidence 0.998 at location [579.98, 29.14, 615.49, 489.05]
Detected similar object with confidence 0.985 at location [206.15, 10.53, 247.74, 466.01]
Detected similar object with confidence 0.947 at location [18.62, 429.72, 646.5, 457.72]
Detected similar object with confidence 0.996 at location [523.88, 20.69, 586.84, 483.18]
Detected similar object with confidence 0.998 at location [3.39, 360.59, 617.29, 499.21]
Detected similar object with confidence 0.969 at location [4.47, 449.05, 614.5, 474.76]
Detected similar object with confidence 0.966 at location [31.44, 463.65, 654.66, 471.07]
Detected similar object with confidence 0.924 at location [30.93, 468.07, 635.35, 475.39]
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.return_dict
# Compute feature maps for the input and query images
query_feature_map = self.image_embedder(pixel_values=query_pixel_values)[0]
feature_map, vision_outputs = self.image_embedder(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
batch_size, num_patches, num_patches, hidden_dim = feature_map.shape
image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim))
batch_size, num_patches, num_patches, hidden_dim = query_feature_map.shape
query_image_feats = torch.reshape(query_feature_map, (batch_size, num_patches * num_patches, hidden_dim))
# Get top class embedding and best box index for each query image in batch
query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query(query_image_feats, query_feature_map)
# Predict object classes [batch_size, num_patches, num_queries+1]
(pred_logits, class_embeds) = self.class_predictor(image_feats=image_feats, query_embeds=query_embeds)
# Predict object boxes
target_pred_boxes = self.box_predictor(image_feats, feature_map)
if not return_dict:
output = (
feature_map,
query_feature_map,
target_pred_boxes,
query_pred_boxes,
pred_logits,
class_embeds,
vision_outputs.to_tuple(),
)
output = tuple(x for x in output if x is not None)
return output
return Owlv2ImageGuidedObjectDetectionOutput(
image_embeds=feature_map,
query_image_embeds=query_feature_map,
target_pred_boxes=target_pred_boxes,
query_pred_boxes=query_pred_boxes,
logits=pred_logits,
class_embeds=class_embeds,
text_model_output=None,
vision_model_output=vision_outputs,
)
@add_start_docstrings_to_model_forward(OWLV2_OBJECT_DETECTION_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Owlv2ObjectDetectionOutput, config_class=Owlv2Config)
def forward(
self,
input_ids: torch.Tensor,
pixel_values: torch.FloatTensor,
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Owlv2ObjectDetectionOutput:
r"""
Returns:
Examples:
```python
>>> import requests
>>> from PIL import Image
>>> import torch
>>> from transformers import AutoProcessor, Owlv2ForObjectDetection
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> texts = [["a photo of a cat", "a photo of a dog"]]
>>> inputs = processor(text=texts, images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
>>> target_sizes = torch.Tensor([image.size[::-1]])
>>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
>>> results = processor.post_process_object_detection(
... outputs=outputs, threshold=0.2, target_sizes=target_sizes
... )
>>> i = 0 # Retrieve predictions for the first image for the corresponding text queries
>>> text = texts[i]
>>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
>>> for box, score, label in zip(boxes, scores, labels):
... box = [round(i, 2) for i in box.tolist()]
... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
Detected a photo of a cat with confidence 0.614 at location [341.67, 17.54, 642.32, 278.51]
Detected a photo of a cat with confidence 0.665 at location [6.75, 38.97, 326.62, 354.85]
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.return_dict
# Embed images and text queries
query_embeds, feature_map, outputs = self.image_text_embedder(
input_ids=input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
# Text and vision model outputs
text_outputs = outputs.text_model_output
vision_outputs = outputs.vision_model_output
batch_size, num_patches, num_patches, hidden_dim = feature_map.shape
image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim))
# Reshape from [batch_size * max_text_queries, hidden_dim] -> [batch_size, max_text_queries, hidden_dim]
max_text_queries = input_ids.shape[0] // batch_size
query_embeds = query_embeds.reshape(batch_size, max_text_queries, query_embeds.shape[-1])
# If first token is 0, then this is a padded query [batch_size, num_queries].
input_ids = input_ids.reshape(batch_size, max_text_queries, input_ids.shape[-1])
query_mask = input_ids[..., 0] > 0
# Predict object classes [batch_size, num_patches, num_queries+1]
(pred_logits, class_embeds) = self.class_predictor(image_feats, query_embeds, query_mask)
# Predict objectness
objectness_logits = self.objectness_predictor(image_feats)
# Predict object boxes
pred_boxes = self.box_predictor(image_feats, feature_map)
if not return_dict:
output = (
pred_logits,
objectness_logits,
pred_boxes,
query_embeds,
feature_map,
class_embeds,
text_outputs.to_tuple(),
vision_outputs.to_tuple(),
)
output = tuple(x for x in output if x is not None)
return output
return Owlv2ObjectDetectionOutput(
image_embeds=feature_map,
text_embeds=query_embeds,
pred_boxes=pred_boxes,
logits=pred_logits,
objectness_logits=objectness_logits,
class_embeds=class_embeds,
text_model_output=text_outputs,
vision_model_output=vision_outputs,
)
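One detail of `forward` worth making explicit is how padded text queries are masked: the flattened ids are reshaped back to `(batch_size, max_text_queries, seq_len)` and any query whose first token id is 0 is ignored. A sketch with made-up ids following that convention:

```python
import torch

batch_size, max_text_queries = 2, 3
# Flattened (batch_size * max_text_queries, seq_len) ids; rows of zeros stand in for padded
# queries, matching the "first token is 0" convention noted in `forward` above.
input_ids = torch.tensor(
    [[49406, 320, 49407], [49406, 518, 49407], [0, 0, 0],
     [49406, 320, 49407], [0, 0, 0], [0, 0, 0]]
)
input_ids = input_ids.reshape(batch_size, max_text_queries, -1)
query_mask = input_ids[..., 0] > 0
print(query_mask)  # tensor([[ True,  True, False], [ True, False, False]])
```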
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Image/Text processor class for OWLv2
"""
from typing import List
import numpy as np
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from ...utils import is_flax_available, is_tf_available, is_torch_available
class Owlv2Processor(ProcessorMixin):
r"""
Constructs an Owlv2 processor which wraps [`Owlv2ImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] into
a single processor that inherits both the image processor and tokenizer functionalities. See the
[`~Owlv2Processor.__call__`] and [`~Owlv2Processor.decode`] for more information.
Args:
image_processor ([`Owlv2ImageProcessor`]):
The image processor is a required input.
tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`]):
The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "Owlv2ImageProcessor"
tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
def __init__(self, image_processor, tokenizer, **kwargs):
super().__init__(image_processor, tokenizer)
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OWLViT->OWLv2
def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs):
"""
Main method to prepare one or several text(s) and image(s) for the model. This method forwards the `text` and
`kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
Owlv2ImageProcessor's [`~Owlv2ImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
Args:
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`,
`List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The query image(s) to be prepared; one query image is expected per target image to be queried. Each image
can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image
should be of shape (C, H, W), where C is a number of channels, H and W are image height and width.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and query_images is None and images is None:
raise ValueError(
"You have to specify at least one text or query image or image. All three cannot be none."
)
if text is not None:
if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)):
encodings = [self.tokenizer(text, padding=padding, return_tensors=return_tensors, **kwargs)]
elif isinstance(text, List) and isinstance(text[0], List):
encodings = []
# Maximum number of queries across batch
max_num_queries = max([len(t) for t in text])
# Pad all batch samples to max number of text queries
for t in text:
if len(t) != max_num_queries:
t = t + [" "] * (max_num_queries - len(t))
encoding = self.tokenizer(t, padding=padding, return_tensors=return_tensors, **kwargs)
encodings.append(encoding)
else:
raise TypeError("Input text should be a string, a list of strings or a nested list of strings")
if return_tensors == "np":
input_ids = np.concatenate([encoding["input_ids"] for encoding in encodings], axis=0)
attention_mask = np.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0)
elif return_tensors == "jax" and is_flax_available():
import jax.numpy as jnp
input_ids = jnp.concatenate([encoding["input_ids"] for encoding in encodings], axis=0)
attention_mask = jnp.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0)
elif return_tensors == "pt" and is_torch_available():
import torch
input_ids = torch.cat([encoding["input_ids"] for encoding in encodings], dim=0)
attention_mask = torch.cat([encoding["attention_mask"] for encoding in encodings], dim=0)
elif return_tensors == "tf" and is_tf_available():
import tensorflow as tf
input_ids = tf.stack([encoding["input_ids"] for encoding in encodings], axis=0)
attention_mask = tf.stack([encoding["attention_mask"] for encoding in encodings], axis=0)
else:
raise ValueError("Target return tensor type could not be returned")
encoding = BatchEncoding()
encoding["input_ids"] = input_ids
encoding["attention_mask"] = attention_mask
if query_images is not None:
encoding = BatchEncoding()
query_pixel_values = self.image_processor(
query_images, return_tensors=return_tensors, **kwargs
).pixel_values
encoding["query_pixel_values"] = query_pixel_values
if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif query_images is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None or query_images is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
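As a usage sketch of the padding behaviour described above (same checkpoint as in the model docstrings; this assumes the checkpoint is available locally or can be downloaded):

```python
from transformers import Owlv2Processor

processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
inputs = processor(
    text=[["a photo of a cat", "a photo of a dog"], ["a photo of an astronaut"]],
    return_tensors="pt",
)
# The second sample is padded to two queries, so input_ids has batch_size * max_text_queries rows.
print(inputs["input_ids"].shape, inputs["attention_mask"].shape)
```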
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2
def post_process_object_detection(self, *args, **kwargs):
"""
This method forwards all its arguments to [`Owlv2ImageProcessor.post_process_object_detection`]. Please refer
to the docstring of this method for more information.
"""
return self.image_processor.post_process_object_detection(*args, **kwargs)
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_image_guided_detection with OWLViT->OWLv2
def post_process_image_guided_detection(self, *args, **kwargs):
"""
This method forwards all its arguments to [`Owlv2ImageProcessor.post_process_image_guided_detection`].
Please refer to the docstring of this method for more information.
"""
return self.image_processor.post_process_image_guided_detection(*args, **kwargs)
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.batch_decode
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
# Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.decode
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@@ -66,15 +66,21 @@ class OwlViTTextConfig(PretrainedConfig):
     hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
         The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
         `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
-    layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+    layer_norm_eps (`float`, *optional*, defaults to 1e-05):
         The epsilon used by the layer normalization layers.
     attention_dropout (`float`, *optional*, defaults to 0.0):
         The dropout ratio for the attention probabilities.
     initializer_range (`float`, *optional*, defaults to 0.02):
         The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-    initializer_factor (`float`, *optional*, defaults to 1):
+    initializer_factor (`float`, *optional*, defaults to 1.0):
         A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
         testing).
+    pad_token_id (`int`, *optional*, defaults to 0):
+        The id of the padding token in the input sequences.
+    bos_token_id (`int`, *optional*, defaults to 49406):
+        The id of the beginning-of-sequence token in the input sequences.
+    eos_token_id (`int`, *optional*, defaults to 49407):
+        The id of the end-of-sequence token in the input sequences.
     Example:
@@ -268,6 +274,8 @@ class OwlViTConfig(PretrainedConfig):
     logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
         The inital value of the *logit_scale* parameter. Default is used as per the original OWL-ViT
         implementation.
+    return_dict (`bool`, *optional*, defaults to `True`):
+        Whether or not the model should return a dictionary. If `False`, returns a tuple.
     kwargs (*optional*):
         Dictionary of keyword arguments.
     """
@@ -582,11 +582,16 @@ class OwlViTPreTrainedModel(PreTrainedModel):
 OWLVIT_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
     Parameters:
-    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
-    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
-    behavior.
         config ([`OwlViTConfig`]): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1250,14 +1255,14 @@ class OwlViTModel(OwlViTPreTrainedModel):
 class OwlViTBoxPredictionHead(nn.Module):
-    def __init__(self, config: OwlViTConfig):
+    def __init__(self, config: OwlViTConfig, out_dim: int = 4):
         super().__init__()
         width = config.vision_config.hidden_size
         self.dense0 = nn.Linear(width, width)
         self.dense1 = nn.Linear(width, width)
         self.gelu = nn.GELU()
-        self.dense2 = nn.Linear(width, 4)
+        self.dense2 = nn.Linear(width, out_dim)
     def forward(self, image_features: torch.Tensor) -> torch.FloatTensor:
         output = self.dense0(image_features)
@@ -5860,6 +5860,44 @@ class OPTPreTrainedModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])
+OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST = None
+class Owlv2ForObjectDetection(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Owlv2Model(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Owlv2PreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Owlv2TextModel(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class Owlv2VisionModel(metaclass=DummyObject):
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
 OWLVIT_PRETRAINED_MODEL_ARCHIVE_LIST = None
@@ -373,6 +373,13 @@ class OneFormerImageProcessor(metaclass=DummyObject):
         requires_backends(self, ["vision"])
+class Owlv2ImageProcessor(metaclass=DummyObject):
+    _backends = ["vision"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
 class OwlViTFeatureExtractor(metaclass=DummyObject):
     _backends = ["vision"]
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.testing_utils import require_torch, require_vision, slow
from transformers.utils import is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_vision_available():
from PIL import Image
from transformers import Owlv2ImageProcessor
class Owlv2ImageProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
image_size=18,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_normalize=True,
image_mean=[0.48145466, 0.4578275, 0.40821073],
image_std=[0.26862954, 0.26130258, 0.27577711],
do_convert_rgb=True,
):
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.image_size = image_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size if size is not None else {"height": 18, "width": 18}
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
}
def expected_output_image_shape(self, images):
return self.num_channels, self.size["height"], self.size["width"]
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
@require_torch
@require_vision
class Owlv2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
image_processing_class = Owlv2ImageProcessor if is_vision_available() else None
def setUp(self):
self.image_processor_tester = Owlv2ImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "size"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"height": 18, "width": 18})
image_processor = self.image_processing_class.from_dict(
self.image_processor_dict, size={"height": 42, "width": 42}
)
self.assertEqual(image_processor.size, {"height": 42, "width": 42})
@slow
def test_image_processor_integration_test(self):
processor = Owlv2ImageProcessor()
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
pixel_values = processor(image, return_tensors="pt").pixel_values
mean_value = round(pixel_values.mean().item(), 4)
self.assertEqual(mean_value, 0.2353)
@unittest.skip("OWLv2 doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
def test_call_numpy_4_channels(self):
pass
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch Owlv2 model. """
import inspect
import os
import tempfile
import unittest
import numpy as np
import requests
from transformers import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig
from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
_config_zero_init,
floats_tensor,
ids_tensor,
random_attention_mask,
)
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from torch import nn
from transformers import Owlv2ForObjectDetection, Owlv2Model, Owlv2TextModel, Owlv2VisionModel
from transformers.models.owlv2.modeling_owlv2 import OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from PIL import Image
from transformers import OwlViTProcessor
# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTVisionModelTester with OwlViT->Owlv2
class Owlv2VisionModelTester:
def __init__(
self,
parent,
batch_size=12,
image_size=32,
patch_size=2,
num_channels=3,
is_training=True,
hidden_size=32,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
dropout=0.1,
attention_dropout=0.1,
initializer_range=0.02,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.is_training = is_training
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.scope = scope
# in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
num_patches = (image_size // patch_size) ** 2
self.seq_length = num_patches + 1
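# e.g. with the defaults above: (32 // 2) ** 2 = 256 patches, so seq_length = 257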
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
config = self.get_config()
return config, pixel_values
def get_config(self):
return Owlv2VisionConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
initializer_range=self.initializer_range,
)
def create_and_check_model(self, config, pixel_values):
model = Owlv2VisionModel(config=config).to(torch_device)
model.eval()
pixel_values = pixel_values.to(torch.float32)
with torch.no_grad():
result = model(pixel_values)
# expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
num_patches = (self.image_size // self.patch_size) ** 2
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTVisionModelTest with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2
class Owlv2VisionModelTest(ModelTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as OWLV2 does not use input_ids,
inputs_embeds, attention_mask and seq_length.
"""
all_model_classes = (Owlv2VisionModel,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
def setUp(self):
self.model_tester = Owlv2VisionModelTester(self)
self.config_tester = ConfigTester(
self, config_class=Owlv2VisionConfig, has_text_modality=False, hidden_size=37
)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="OWLV2 does not use inputs_embeds")
def test_inputs_embeds(self):
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="OwlV2 does not support training yet")
def test_training(self):
pass
@unittest.skip(reason="OwlV2 does not support training yet")
def test_training_gradient_checkpointing(self):
pass
@unittest.skip(reason="Owlv2VisionModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_from_base(self):
pass
@unittest.skip(reason="Owlv2VisionModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_to_base(self):
pass
@slow
def test_model_from_pretrained(self):
for model_name in OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = Owlv2VisionModel.from_pretrained(model_name)
self.assertIsNotNone(model)
# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTTextModelTester with OwlViT->Owlv2
class Owlv2TextModelTester:
def __init__(
self,
parent,
batch_size=12,
num_queries=4,
seq_length=16,
is_training=True,
use_input_mask=True,
use_labels=True,
vocab_size=99,
hidden_size=64,
num_hidden_layers=12,
num_attention_heads=4,
intermediate_size=37,
dropout=0.1,
attention_dropout=0.1,
max_position_embeddings=16,
initializer_range=0.02,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.num_queries = num_queries
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.dropout = dropout
self.attention_dropout = attention_dropout
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size * self.num_queries, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size * self.num_queries, self.seq_length])
if input_mask is not None:
num_text, seq_length = input_mask.shape
rnd_start_indices = np.random.randint(1, seq_length - 1, size=(num_text,))
for idx, start_index in enumerate(rnd_start_indices):
input_mask[idx, :start_index] = 1
input_mask[idx, start_index:] = 0
config = self.get_config()
return config, input_ids, input_mask
def get_config(self):
return Owlv2TextConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
dropout=self.dropout,
attention_dropout=self.attention_dropout,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range,
)
def create_and_check_model(self, config, input_ids, input_mask):
model = Owlv2TextModel(config=config).to(torch_device)
model.eval()
with torch.no_grad():
result = model(input_ids=input_ids, attention_mask=input_mask)
self.parent.assertEqual(
result.last_hidden_state.shape, (self.batch_size * self.num_queries, self.seq_length, self.hidden_size)
)
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size * self.num_queries, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, input_mask = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTTextModelTest with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2
class Owlv2TextModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (Owlv2TextModel,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_head_masking = False
def setUp(self):
self.model_tester = Owlv2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=Owlv2TextConfig, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="OwlV2 does not support training yet")
def test_training(self):
pass
@unittest.skip(reason="OwlV2 does not support training yet")
def test_training_gradient_checkpointing(self):
pass
@unittest.skip(reason="OWLV2 does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="Owlv2TextModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_from_base(self):
pass
@unittest.skip(reason="Owlv2TextModel has no base class and is not available in MODEL_MAPPING")
def test_save_load_fast_init_to_base(self):
pass
@slow
def test_model_from_pretrained(self):
for model_name in OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = Owlv2TextModel.from_pretrained(model_name)
self.assertIsNotNone(model)
class Owlv2ModelTester:
def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
if text_kwargs is None:
text_kwargs = {}
if vision_kwargs is None:
vision_kwargs = {}
self.parent = parent
self.text_model_tester = Owlv2TextModelTester(parent, **text_kwargs)
self.vision_model_tester = Owlv2VisionModelTester(parent, **vision_kwargs)
self.is_training = is_training
self.text_config = self.text_model_tester.get_config().to_dict()
self.vision_config = self.vision_model_tester.get_config().to_dict()
def prepare_config_and_inputs(self):
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
config = self.get_config()
return config, input_ids, attention_mask, pixel_values
def get_config(self):
return Owlv2Config.from_text_vision_configs(self.text_config, self.vision_config, projection_dim=64)
def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
model = Owlv2Model(config).to(torch_device).eval()
with torch.no_grad():
result = model(
input_ids=input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
)
image_logits_size = (
self.vision_model_tester.batch_size,
self.text_model_tester.batch_size * self.text_model_tester.num_queries,
)
text_logits_size = (
self.text_model_tester.batch_size * self.text_model_tester.num_queries,
self.vision_model_tester.batch_size,
)
self.parent.assertEqual(result.logits_per_image.shape, image_logits_size)
self.parent.assertEqual(result.logits_per_text.shape, text_logits_size)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, attention_mask, pixel_values = config_and_inputs
inputs_dict = {
"pixel_values": pixel_values,
"input_ids": input_ids,
"attention_mask": attention_mask,
"return_loss": False,
}
return config, inputs_dict
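# The composite config used by this tester is assembled from the two sub-config dicts. A small
# sketch of the same pattern outside the tester (the hyperparameter values and projection_dim=64
# are just the values chosen in these tests, not model defaults):
#
#   from transformers import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig
#
#   text_config = Owlv2TextConfig(vocab_size=99, hidden_size=64)
#   vision_config = Owlv2VisionConfig(image_size=32, patch_size=2, hidden_size=32)
#   config = Owlv2Config.from_text_vision_configs(
#       text_config.to_dict(), vision_config.to_dict(), projection_dim=64
#   )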
@require_torch
# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTModelTest with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2
class Owlv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Owlv2Model,) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": Owlv2Model, "zero-shot-object-detection": Owlv2ForObjectDetection}
if is_torch_available()
else {}
)
fx_compatible = False
test_head_masking = False
test_pruning = False
test_resize_embeddings = False
test_attention_outputs = False
def setUp(self):
self.model_tester = Owlv2ModelTester(self)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="Hidden_states is tested in individual model tests")
def test_hidden_states_output(self):
pass
@unittest.skip(reason="Inputs_embeds is tested in individual model tests")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="Retain_grad is tested in individual model tests")
def test_retain_grad_hidden_states_attentions(self):
pass
@unittest.skip(reason="Owlv2Model does not have input/output embeddings")
def test_model_common_attributes(self):
pass
# override as the `logit_scale` parameter initialization is different for OWLV2
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
# check if `logit_scale` is initialized as per the original implementation
if name == "logit_scale":
self.assertAlmostEqual(
param.data.item(),
np.log(1 / 0.07),
delta=1e-3,
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
else:
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
def _create_and_check_torchscript(self, config, inputs_dict):
if not self.test_torchscript:
return
configs_no_init = _config_zero_init(config) # To be sure we have no Nan
configs_no_init.torchscript = True
configs_no_init.return_dict = False
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init).to(torch_device)
model.eval()
try:
input_ids = inputs_dict["input_ids"]
pixel_values = inputs_dict["pixel_values"] # OWLV2 needs pixel_values
traced_model = torch.jit.trace(model, (input_ids, pixel_values))
except RuntimeError:
self.fail("Couldn't trace module.")
with tempfile.TemporaryDirectory() as tmp_dir_name:
pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
try:
torch.jit.save(traced_model, pt_file_name)
except Exception:
self.fail("Couldn't save module.")
try:
loaded_model = torch.jit.load(pt_file_name)
except Exception:
self.fail("Couldn't load module.")
loaded_model = loaded_model.to(torch_device)
loaded_model.eval()
model_state_dict = model.state_dict()
loaded_model_state_dict = loaded_model.state_dict()
non_persistent_buffers = {}
for key in loaded_model_state_dict.keys():
if key not in model_state_dict.keys():
non_persistent_buffers[key] = loaded_model_state_dict[key]
loaded_model_state_dict = {
key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
}
self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
model_buffers = list(model.buffers())
for non_persistent_buffer in non_persistent_buffers.values():
found_buffer = False
for i, model_buffer in enumerate(model_buffers):
if torch.equal(non_persistent_buffer, model_buffer):
found_buffer = True
break
self.assertTrue(found_buffer)
model_buffers.pop(i)
models_equal = True
for layer_name, p1 in model_state_dict.items():
p2 = loaded_model_state_dict[layer_name]
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
self.assertTrue(models_equal)
def test_load_vision_text_config(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# Save Owlv2Config and check if we can load Owlv2VisionConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
vision_config = Owlv2VisionConfig.from_pretrained(tmp_dir_name)
self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
# Save Owlv2Config and check if we can load Owlv2TextConfig from it
with tempfile.TemporaryDirectory() as tmp_dir_name:
config.save_pretrained(tmp_dir_name)
text_config = Owlv2TextConfig.from_pretrained(tmp_dir_name)
self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict())
@slow
def test_model_from_pretrained(self):
for model_name in OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = Owlv2Model.from_pretrained(model_name)
self.assertIsNotNone(model)
# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTForObjectDetectionTester with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2
class Owlv2ForObjectDetectionTester:
def __init__(self, parent, is_training=True):
self.parent = parent
self.text_model_tester = Owlv2TextModelTester(parent)
self.vision_model_tester = Owlv2VisionModelTester(parent)
self.is_training = is_training
self.text_config = self.text_model_tester.get_config().to_dict()
self.vision_config = self.vision_model_tester.get_config().to_dict()
def prepare_config_and_inputs(self):
text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
config = self.get_config()
return config, pixel_values, input_ids, attention_mask
def get_config(self):
return Owlv2Config.from_text_vision_configs(self.text_config, self.vision_config, projection_dim=64)
def create_and_check_model(self, config, pixel_values, input_ids, attention_mask):
model = Owlv2ForObjectDetection(config).to(torch_device).eval()
with torch.no_grad():
result = model(
pixel_values=pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
return_dict=True,
)
pred_boxes_size = (
self.vision_model_tester.batch_size,
(self.vision_model_tester.image_size // self.vision_model_tester.patch_size) ** 2,
4,
)
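# In the OWL-style detection head the class logits are (batch_size, num_patches, num_queries);
# with 12 * 4 text sequences for 12 images the last dimension is 4 here, which only
# coincidentally matches the 4 box coordinates above.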
pred_logits_size = (
self.vision_model_tester.batch_size,
(self.vision_model_tester.image_size // self.vision_model_tester.patch_size) ** 2,
4,
)
pred_class_embeds_size = (
self.vision_model_tester.batch_size,
(self.vision_model_tester.image_size // self.vision_model_tester.patch_size) ** 2,
self.text_model_tester.hidden_size,
)
self.parent.assertEqual(result.pred_boxes.shape, pred_boxes_size)
self.parent.assertEqual(result.logits.shape, pred_logits_size)
self.parent.assertEqual(result.class_embeds.shape, pred_class_embeds_size)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values, input_ids, attention_mask = config_and_inputs
inputs_dict = {
"pixel_values": pixel_values,
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
@require_torch
# Copied from tests.models.owlvit.test_modeling_owlvit.OwlViTForObjectDetectionTest with OwlViT->Owlv2, OWL-ViT->OwlV2, OWLVIT->OWLV2
class Owlv2ForObjectDetectionTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (Owlv2ForObjectDetection,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
test_resize_embeddings = False
test_attention_outputs = False
def setUp(self):
self.model_tester = Owlv2ForObjectDetectionTester(self)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="Hidden_states is tested in individual model tests")
def test_hidden_states_output(self):
pass
@unittest.skip(reason="Inputs_embeds is tested in individual model tests")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="Retain_grad is tested in individual model tests")
def test_retain_grad_hidden_states_attentions(self):
pass
@unittest.skip(reason="Owlv2Model does not have input/output embeddings")
def test_model_common_attributes(self):
pass
@unittest.skip(reason="Test_initialization is tested in individual model tests")
def test_initialization(self):
pass
@unittest.skip(reason="Test_forward_signature is tested in individual model tests")
def test_forward_signature(self):
pass
@unittest.skip(reason="Test_save_load_fast_init_from_base is tested in individual model tests")
def test_save_load_fast_init_from_base(self):
pass
@unittest.skip(reason="OwlV2 does not support training yet")
def test_training(self):
pass
@unittest.skip(reason="OwlV2 does not support training yet")
def test_training_gradient_checkpointing(self):
pass
def _create_and_check_torchscript(self, config, inputs_dict):
if not self.test_torchscript:
return
configs_no_init = _config_zero_init(config) # To be sure we have no Nan
configs_no_init.torchscript = True
configs_no_init.return_dict = False
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init).to(torch_device)
model.eval()
try:
input_ids = inputs_dict["input_ids"]
pixel_values = inputs_dict["pixel_values"] # OWLV2 needs pixel_values
traced_model = torch.jit.trace(model, (input_ids, pixel_values))
except RuntimeError:
self.fail("Couldn't trace module.")
with tempfile.TemporaryDirectory() as tmp_dir_name:
pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
try:
torch.jit.save(traced_model, pt_file_name)
except Exception:
self.fail("Couldn't save module.")
try:
loaded_model = torch.jit.load(pt_file_name)
except Exception:
self.fail("Couldn't load module.")
loaded_model = loaded_model.to(torch_device)
loaded_model.eval()
model_state_dict = model.state_dict()
loaded_model_state_dict = loaded_model.state_dict()
non_persistent_buffers = {}
for key in loaded_model_state_dict.keys():
if key not in model_state_dict.keys():
non_persistent_buffers[key] = loaded_model_state_dict[key]
loaded_model_state_dict = {
key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
}
self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
model_buffers = list(model.buffers())
for non_persistent_buffer in non_persistent_buffers.values():
found_buffer = False
for i, model_buffer in enumerate(model_buffers):
if torch.equal(non_persistent_buffer, model_buffer):
found_buffer = True
break
self.assertTrue(found_buffer)
model_buffers.pop(i)
models_equal = True
for layer_name, p1 in model_state_dict.items():
p2 = loaded_model_state_dict[layer_name]
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
self.assertTrue(models_equal)
@slow
def test_model_from_pretrained(self):
for model_name in OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = Owlv2ForObjectDetection.from_pretrained(model_name)
self.assertIsNotNone(model)
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@require_vision
@require_torch
class Owlv2ModelIntegrationTest(unittest.TestCase):
@slow
def test_inference(self):
model_name = "google/owlv2-base-patch16"
model = Owlv2Model.from_pretrained(model_name).to(torch_device)
processor = OwlViTProcessor.from_pretrained(model_name)
image = prepare_img()
inputs = processor(
text=[["a photo of a cat", "a photo of a dog"]],
images=image,
max_length=16,
padding="max_length",
return_tensors="pt",
).to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
self.assertEqual(
outputs.logits_per_image.shape,
torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
)
self.assertEqual(
outputs.logits_per_text.shape,
torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
)
expected_logits = torch.tensor([[-6.2229, -8.2601]], device=torch_device)
self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
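# The image-text logits checked above follow the CLIP convention, so per-image label
# probabilities can be read off with a softmax over the text axis; a small sketch reusing the
# names from this test:
#
#   probs = outputs.logits_per_image.softmax(dim=1)
#   # probs[0] holds the scores for ["a photo of a cat", "a photo of a dog"] on the cats image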
@slow
def test_inference_object_detection(self):
model_name = "google/owlv2-base-patch16"
model = Owlv2ForObjectDetection.from_pretrained(model_name).to(torch_device)
processor = OwlViTProcessor.from_pretrained(model_name)
image = prepare_img()
inputs = processor(
text=[["a photo of a cat", "a photo of a dog"]],
images=image,
max_length=16,
padding="max_length",
return_tensors="pt",
).to(torch_device)
with torch.no_grad():
outputs = model(**inputs)
num_queries = int((model.config.vision_config.image_size / model.config.vision_config.patch_size) ** 2)
self.assertEqual(outputs.pred_boxes.shape, torch.Size((1, num_queries, 4)))
expected_slice_logits = torch.tensor([[-21.4139, -21.6130], [-19.0084, -19.5491], [-20.9592, -21.3830]])
self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4))
expected_slice_boxes = torch.tensor(
[[0.2413, 0.0519, 0.4533], [0.1395, 0.0457, 0.2507], [0.2330, 0.0505, 0.4277]],
).to(torch_device)
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
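# To turn the raw detection outputs above into boxes in pixel space, the processor's
# post-processing can be used; a minimal sketch (the threshold value is illustrative, and since
# OWLv2 pads images to a square, the padded size may be the more appropriate target size for
# exact pixel coordinates):
#
#   target_sizes = torch.tensor([image.size[::-1]])
#   results = processor.post_process_object_detection(outputs, threshold=0.2, target_sizes=target_sizes)
#   boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]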
@slow
def test_inference_one_shot_object_detection(self):
model_name = "google/owlv2-base-patch16"
model = Owlv2ForObjectDetection.from_pretrained(model_name).to(torch_device)
processor = OwlViTProcessor.from_pretrained(model_name)
image = prepare_img()
query_image = prepare_img()
inputs = processor(
images=image,
query_images=query_image,
max_length=16,
padding="max_length",
return_tensors="pt",
).to(torch_device)
with torch.no_grad():
outputs = model.image_guided_detection(**inputs)
num_queries = int((model.config.vision_config.image_size / model.config.vision_config.patch_size) ** 2)
self.assertEqual(outputs.target_pred_boxes.shape, torch.Size((1, num_queries, 4)))
expected_slice_boxes = torch.tensor(
[[0.2413, 0.0519, 0.4533], [0.1395, 0.0457, 0.2507], [0.2330, 0.0505, 0.4277]],
).to(torch_device)
self.assertTrue(torch.allclose(outputs.target_pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
@slow
@require_torch_gpu
def test_inference_one_shot_object_detection_fp16(self):
model_name = "google/owlv2-base-patch16"
model = Owlv2ForObjectDetection.from_pretrained(model_name, torch_dtype=torch.float16).to(torch_device)
processor = OwlViTProcessor.from_pretrained(model_name)
image = prepare_img()
query_image = prepare_img()
inputs = processor(
images=image,
query_images=query_image,
max_length=16,
padding="max_length",
return_tensors="pt",
).to(torch_device)
with torch.no_grad():
outputs = model.image_guided_detection(**inputs)
# No need to check the logits, we just check inference runs fine.
num_queries = int((model.config.vision_config.image_size / model.config.vision_config.patch_size) ** 2)
self.assertEqual(outputs.target_pred_boxes.shape, torch.Size((1, num_queries, 4)))
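# End to end, the image-guided path tested above is typically finished with the processor's
# post-processing; a hedged sketch reusing the names from the last test (the threshold and NMS
# values are illustrative only):
#
#   results = processor.post_process_image_guided_detection(
#       outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=torch.tensor([image.size[::-1]])
#   )
#   boxes, scores = results[0]["boxes"], results[0]["scores"]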
@@ -417,9 +417,6 @@ OBJECTS_TO_IGNORE = [
     "OneFormerProcessor",
     "OpenAIGPTTokenizerFast",
     "OpenLlamaConfig",
-    "OwlViTConfig",
-    "OwlViTModel",
-    "OwlViTTextConfig",
     "PLBartConfig",
     "PegasusConfig",
     "PegasusTokenizer",
@@ -234,6 +234,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
     "OpenAIGPTDoubleHeadsModel",
     "OwlViTTextModel",
     "OwlViTVisionModel",
+    "Owlv2TextModel",
+    "Owlv2VisionModel",
     "OwlViTForObjectDetection",
     "RagModel",
     "RagSequenceForGeneration",