Unverified Commit 5451f889 authored by NielsRogge's avatar NielsRogge Committed by GitHub
Browse files

Add DETA (#20983)

* First draft

* Add initial draft of conversion script

* Convert all weights

* Fix config

* Add image processor

* Fix DetaImageProcessor

* Run make fix copies

* Remove timm dependency

* Fix dummy objects

* Improve loss function

* Remove conv_encoder attribute

* Update conversion scripts

* Improve postprocessing + docs

* Fix copied from statements

* Add tests

* Improve postprocessing

* Improve postprocessing

* Update READMEs

* More improvements

* Fix rebase

* Add is_torchvision_available

* Add torchvision dependency

* Fix typo and README

* Fix bug

* Add copied from

* Fix style

* Apply suggestions

* Fix thanks to @ydshieh

* Fix another dependency check

* Simplify image processor

* Add scipy

* Improve code

* Add threshold argument

* Fix bug

* Set default threshold

* Improve integration test

* Add another integration test

* Update setup.py

* Address review

* Improve deformable attention function

* Improve copied from

* Use relative imports

* Address review

* Replace assertions

* Address review

* Update dummies

* Remove dummies

* Address comments, update READMEs

* Remove custom kernel code

* Add image processor tests

* Add requires_backends

* Add minor comment

* Update scripts

* Update organization name

* Fix defaults, add doc tests

* Add id2label for object 365

* Fix tests

* Update task guide
parent 98d88b23
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_deta": ["DETA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetaConfig"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_deta"] = ["DetaImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_deta"] = [
"DETA_PRETRAINED_MODEL_ARCHIVE_LIST",
"DetaForObjectDetection",
"DetaModel",
"DetaPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_deta import DETA_PRETRAINED_CONFIG_ARCHIVE_MAP, DetaConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_deta import DetaImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_deta import (
DETA_PRETRAINED_MODEL_ARCHIVE_LIST,
DetaForObjectDetection,
DetaModel,
DetaPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
# coding=utf-8
# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" DETA model configuration"""
import copy
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
DETA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"ut/deta": "https://huggingface.co/ut/deta/resolve/main/config.json",
}
class DetaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DetaModel`]. It is used to instantiate a DETA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DETA
[SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
The configuration of the backbone model.
num_queries (`int`, *optional*, defaults to 900):
Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can
detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead.
d_model (`int`, *optional*, defaults to 256):
Dimension of the layers.
encoder_layers (`int`, *optional*, defaults to 6):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 6):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
init_xavier_std (`float`, *optional*, defaults to 1):
The scaling factor used for the Xavier initialization gain in the HM Attention map module.
encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
auxiliary_loss (`bool`, *optional*, defaults to `False`):
Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
position_embedding_type (`str`, *optional*, defaults to `"sine"`):
Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
class_cost (`float`, *optional*, defaults to 1):
Relative weight of the classification error in the Hungarian matching cost.
bbox_cost (`float`, *optional*, defaults to 5):
Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
giou_cost (`float`, *optional*, defaults to 2):
Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
mask_loss_coefficient (`float`, *optional*, defaults to 1):
Relative weight of the Focal loss in the panoptic segmentation loss.
dice_loss_coefficient (`float`, *optional*, defaults to 1):
Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
bbox_loss_coefficient (`float`, *optional*, defaults to 5):
Relative weight of the L1 bounding box loss in the object detection loss.
giou_loss_coefficient (`float`, *optional*, defaults to 2):
Relative weight of the generalized IoU loss in the object detection loss.
eos_coefficient (`float`, *optional*, defaults to 0.1):
Relative classification weight of the 'no-object' class in the object detection loss.
num_feature_levels (`int`, *optional*, defaults to 5):
The number of input feature levels.
encoder_n_points (`int`, *optional*, defaults to 4):
The number of sampled keys in each feature level for each attention head in the encoder.
decoder_n_points (`int`, *optional*, defaults to 4):
The number of sampled keys in each feature level for each attention head in the decoder.
two_stage (`bool`, *optional*, defaults to `True`):
Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of
DETA, which are further fed into the decoder for iterative bounding box refinement.
two_stage_num_proposals (`int`, *optional*, defaults to 300):
The number of region proposals to be generated, in case `two_stage` is set to `True`.
with_box_refine (`bool`, *optional*, defaults to `True`):
Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes
based on the predictions from the previous layer.
focal_alpha (`float`, *optional*, defaults to 0.25):
Alpha parameter in the focal loss.
Examples:
```python
>>> from transformers import DetaConfig, DetaModel
>>> # Initializing a DETA SenseTime/deformable-detr style configuration
>>> configuration = DetaConfig()
>>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration
>>> model = DetaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "deta"
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "encoder_attention_heads",
}
def __init__(
self,
backbone_config=None,
num_queries=900,
max_position_embeddings=2048,
encoder_layers=6,
encoder_ffn_dim=2048,
encoder_attention_heads=8,
decoder_layers=6,
decoder_ffn_dim=1024,
decoder_attention_heads=8,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
is_encoder_decoder=True,
activation_function="relu",
d_model=256,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
return_intermediate=True,
auxiliary_loss=False,
position_embedding_type="sine",
num_feature_levels=5,
encoder_n_points=4,
decoder_n_points=4,
two_stage=True,
two_stage_num_proposals=300,
with_box_refine=True,
assign_first_stage=True,
class_cost=1,
bbox_cost=5,
giou_cost=2,
mask_loss_coefficient=1,
dice_loss_coefficient=1,
bbox_loss_coefficient=5,
giou_loss_coefficient=2,
eos_coefficient=0.1,
focal_alpha=0.25,
**kwargs
):
if backbone_config is None:
logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"])
else:
if isinstance(backbone_config, dict):
backbone_model_type = backbone_config.pop("model_type")
config_class = CONFIG_MAPPING[backbone_model_type]
backbone_config = config_class.from_dict(backbone_config)
self.backbone_config = backbone_config
self.num_queries = num_queries
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.init_xavier_std = init_xavier_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.auxiliary_loss = auxiliary_loss
self.position_embedding_type = position_embedding_type
# deformable attributes
self.num_feature_levels = num_feature_levels
self.encoder_n_points = encoder_n_points
self.decoder_n_points = decoder_n_points
self.two_stage = two_stage
self.two_stage_num_proposals = two_stage_num_proposals
self.with_box_refine = with_box_refine
self.assign_first_stage = assign_first_stage
if two_stage is True and with_box_refine is False:
raise ValueError("If two_stage is True, with_box_refine must be True.")
# Hungarian matcher
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
# Loss coefficients
self.mask_loss_coefficient = mask_loss_coefficient
self.dice_loss_coefficient = dice_loss_coefficient
self.bbox_loss_coefficient = bbox_loss_coefficient
self.giou_loss_coefficient = giou_loss_coefficient
self.eos_coefficient = eos_coefficient
self.focal_alpha = focal_alpha
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
def to_dict(self):
"""
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns:
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
"""
output = copy.deepcopy(self.__dict__)
output["backbone_config"] = self.backbone_config.to_dict()
output["model_type"] = self.__class__.model_type
return output
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert DETA checkpoints from the original repository.
URL: https://github.com/jozhang97/DETA/tree/master"""
import argparse
import json
from pathlib import Path
import torch
from PIL import Image
import requests
from huggingface_hub import cached_download, hf_hub_download, hf_hub_url
from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_deta_config():
config = DetaConfig(
num_queries=900,
encoder_ffn_dim=2048,
decoder_ffn_dim=2048,
num_feature_levels=5,
assign_first_stage=True,
with_box_refine=True,
two_stage=True,
)
# set labels
config.num_labels = 91
repo_id = "huggingface/label-files"
filename = "coco-detection-id2label.json"
id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
# here we list all keys to be renamed (original name on the left, our name on the right)
def create_rename_keys(config):
rename_keys = []
# stem
# fmt: off
rename_keys.append(("backbone.0.body.conv1.weight", "model.backbone.model.embedder.embedder.convolution.weight"))
rename_keys.append(("backbone.0.body.bn1.weight", "model.backbone.model.embedder.embedder.normalization.weight"))
rename_keys.append(("backbone.0.body.bn1.bias", "model.backbone.model.embedder.embedder.normalization.bias"))
rename_keys.append(("backbone.0.body.bn1.running_mean", "model.backbone.model.embedder.embedder.normalization.running_mean"))
rename_keys.append(("backbone.0.body.bn1.running_var", "model.backbone.model.embedder.embedder.normalization.running_var"))
# stages
for stage_idx in range(len(config.backbone_config.depths)):
for layer_idx in range(config.backbone_config.depths[stage_idx]):
# shortcut
if layer_idx == 0:
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var",
)
)
# 3 convs
for i in range(3):
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var",
)
)
# transformer encoder
for i in range(config.encoder_layers):
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias"))
# transformer decoder
for i in range(config.decoder_layers):
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias"))
# fmt: on
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def read_in_decoder_q_k_v(state_dict, config):
# transformer decoder self-attention layers
hidden_size = config.d_model
for i in range(config.decoder_layers):
# read in weights + bias of input projection layer of self-attention
in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias")
# next, add query, keys and values (in that order) to the state dict
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :]
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[
hidden_size : hidden_size * 2, :
]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:]
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
"""
Copy/paste/tweak model's weights to our DETA structure.
"""
# load config
config = get_deta_config()
# load original state dict
if model_name == "deta-resnet-50":
filename = "adet_checkpoint0011.pth"
elif model_name == "deta-resnet-50-24-epochs":
filename = "adet_2x_checkpoint0023.pth"
else:
raise ValueError(f"Model name {model_name} not supported")
checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename=filename)
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
# rename keys
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_decoder_q_k_v(state_dict, config)
# fix some prefixes
for key in state_dict.copy().keys():
if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer.decoder", "model.decoder")] = val
if "input_proj" in key:
val = state_dict.pop(key)
state_dict["model." + key] = val
if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer", "model")] = val
# finally, create HuggingFace model and load state dict
model = DetaForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# load image processor
processor = DetaImageProcessor(format="coco_detection")
# verify our conversion on image
img = prepare_img()
encoding = processor(images=img, return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values.to(device))
# verify logits
if model_name == "deta-resnet-50":
expected_logits = torch.tensor(
[[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]]
)
expected_boxes = torch.tensor([[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]])
elif model_name == "deta-resnet-50-24-epochs":
expected_logits = torch.tensor(
[[-7.1688, -2.4857, -4.8669], [-7.8630, -3.8154, -4.2674], [-7.2730, -4.1865, -5.5323]]
)
expected_boxes = torch.tensor([[0.5021, 0.4971, 0.9994], [0.2546, 0.5486, 0.4731], [0.1686, 0.1986, 0.2142]])
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
print("Everything ok!")
if pytorch_dump_folder_path:
# Save model and processor
logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
# Push to hub
if push_to_hub:
print("Pushing model and processor to hub...")
model.push_to_hub(f"jozhang97/{model_name}")
processor.push_to_hub(f"jozhang97/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
type=str,
default="deta-resnet-50",
choices=["deta-resnet-50", "deta-resnet-50-24-epochs"],
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert DETA checkpoints from the original repository.
URL: https://github.com/jozhang97/DETA/tree/master"""
import argparse
import json
from pathlib import Path
import torch
from PIL import Image
import requests
from huggingface_hub import cached_download, hf_hub_download, hf_hub_url
from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor, SwinConfig
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_deta_config(model_name):
backbone_config = SwinConfig(
embed_dim=192,
depths=(2, 2, 18, 2),
num_heads=(6, 12, 24, 48),
window_size=12,
out_features=["stage2", "stage3", "stage4"],
)
config = DetaConfig(
backbone_config=backbone_config,
num_queries=900,
encoder_ffn_dim=2048,
decoder_ffn_dim=2048,
num_feature_levels=5,
assign_first_stage=True,
with_box_refine=True,
two_stage=True,
)
# set labels
repo_id = "huggingface/label-files"
if "o365" in model_name:
num_labels = 366
filename = "object365-id2label.json"
else:
num_labels = 91
filename = "coco-detection-id2label.json"
config.num_labels = num_labels
id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
# here we list all keys to be renamed (original name on the left, our name on the right)
def create_rename_keys(config):
rename_keys = []
# stem
# fmt: off
rename_keys.append(("backbone.0.body.patch_embed.proj.weight", "model.backbone.model.embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("backbone.0.body.patch_embed.proj.bias", "model.backbone.model.embeddings.patch_embeddings.projection.bias"))
rename_keys.append(("backbone.0.body.patch_embed.norm.weight", "model.backbone.model.embeddings.norm.weight"))
rename_keys.append(("backbone.0.body.patch_embed.norm.bias", "model.backbone.model.embeddings.norm.bias"))
# stages
for i in range(len(config.backbone_config.depths)):
for j in range(config.backbone_config.depths[i]):
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_bias_table", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_index", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.bias"))
if i < 3:
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.reduction.weight", f"model.backbone.model.encoder.layers.{i}.downsample.reduction.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.weight", f"model.backbone.model.encoder.layers.{i}.downsample.norm.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.bias", f"model.backbone.model.encoder.layers.{i}.downsample.norm.bias"))
rename_keys.append(("backbone.0.body.norm1.weight", "model.backbone.model.hidden_states_norms.stage2.weight"))
rename_keys.append(("backbone.0.body.norm1.bias", "model.backbone.model.hidden_states_norms.stage2.bias"))
rename_keys.append(("backbone.0.body.norm2.weight", "model.backbone.model.hidden_states_norms.stage3.weight"))
rename_keys.append(("backbone.0.body.norm2.bias", "model.backbone.model.hidden_states_norms.stage3.bias"))
rename_keys.append(("backbone.0.body.norm3.weight", "model.backbone.model.hidden_states_norms.stage4.weight"))
rename_keys.append(("backbone.0.body.norm3.bias", "model.backbone.model.hidden_states_norms.stage4.bias"))
# transformer encoder
for i in range(config.encoder_layers):
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias"))
# transformer decoder
for i in range(config.decoder_layers):
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias"))
# fmt: on
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
# we split up the matrix of each encoder layer into queries, keys and values
def read_in_swin_q_k_v(state_dict, backbone_config):
num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))]
for i in range(len(backbone_config.depths)):
dim = num_features[i]
for j in range(backbone_config.depths[i]):
# fmt: off
# read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias)
in_proj_weight = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.bias")
# next, add query, keys and values (in that order) to the state dict
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[: dim]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[
dim : dim * 2, :
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[
dim : dim * 2
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[
-dim :, :
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim :]
# fmt: on
def read_in_decoder_q_k_v(state_dict, config):
# transformer decoder self-attention layers
hidden_size = config.d_model
for i in range(config.decoder_layers):
# read in weights + bias of input projection layer of self-attention
in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias")
# next, add query, keys and values (in that order) to the state dict
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :]
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[
hidden_size : hidden_size * 2, :
]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:]
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
"""
Copy/paste/tweak model's weights to our DETA structure.
"""
# load config
config = get_deta_config(model_name)
# load original state dict
if model_name == "deta-swin-large":
checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename="adet_swin_ft.pth")
elif model_name == "deta-swin-large-o365":
checkpoint_path = hf_hub_download(repo_id="jozhang97/deta-swin-l-o365", filename="deta_swin_pt_o365.pth")
else:
raise ValueError(f"Model name {model_name} not supported")
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
# original state dict
for name, param in state_dict.items():
print(name, param.shape)
# rename keys
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_swin_q_k_v(state_dict, config.backbone_config)
read_in_decoder_q_k_v(state_dict, config)
# fix some prefixes
for key in state_dict.copy().keys():
if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer.decoder", "model.decoder")] = val
if "input_proj" in key:
val = state_dict.pop(key)
state_dict["model." + key] = val
if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer", "model")] = val
# finally, create HuggingFace model and load state dict
model = DetaForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# load image processor
processor = DetaImageProcessor(format="coco_detection")
# verify our conversion on image
img = prepare_img()
encoding = processor(images=img, return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values.to(device))
# verify logits
print("Logits:", outputs.logits[0, :3, :3])
print("Boxes:", outputs.pred_boxes[0, :3, :3])
if model_name == "deta-swin-large":
expected_logits = torch.tensor(
[[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]]
)
expected_boxes = torch.tensor([[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]])
elif model_name == "deta-swin-large-o365":
expected_logits = torch.tensor(
[[-8.0122, -3.5720, -4.9717], [-8.1547, -3.6886, -4.6389], [-7.6610, -3.6194, -5.0134]]
)
expected_boxes = torch.tensor([[0.2523, 0.5549, 0.4881], [0.7715, 0.4149, 0.4601], [0.5503, 0.2753, 0.0575]])
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
print("Everything ok!")
if pytorch_dump_folder_path:
# Save model and processor
logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
# Push to hub
if push_to_hub:
print("Pushing model and processor to hub...")
model.push_to_hub(f"jozhang97/{model_name}")
processor.push_to_hub(f"jozhang97/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
type=str,
default="deta-swin-large",
choices=["deta-swin-large", "deta-swin-large-o365"],
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Deformable DETR."""
import pathlib
import warnings
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import numpy as np
from ...feature_extraction_utils import BatchFeature
from ...image_processing_utils import BaseImageProcessor, get_size_dict
from ...image_transforms import (
PaddingMode,
center_to_corners_format,
corners_to_center_format,
normalize,
pad,
rescale,
resize,
rgb_to_id,
to_channel_dimension_format,
)
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_batched,
to_numpy_array,
valid_coco_detection_annotations,
valid_coco_panoptic_annotations,
valid_images,
)
from ...utils import (
is_flax_available,
is_jax_tensor,
is_tf_available,
is_tf_tensor,
is_torch_available,
is_torch_tensor,
is_torchvision_available,
is_vision_available,
)
from ...utils.generic import ExplicitEnum, TensorType
if is_torch_available():
import torch
if is_torchvision_available():
from torchvision.ops.boxes import batched_nms
if is_vision_available():
import PIL
class AnnotionFormat(ExplicitEnum):
COCO_DETECTION = "coco_detection"
COCO_PANOPTIC = "coco_panoptic"
SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)
# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size.
Args:
image_size (`Tuple[int, int]`):
The input image size.
size (`int`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
"""
height, width = image_size
if max_size is not None:
min_original_size = float(min((height, width)))
max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
if (height <= width and height == size) or (width <= height and width == size):
return height, width
if width < height:
ow = size
oh = int(size * height / width)
else:
oh = size
ow = int(size * width / height)
return (oh, ow)
# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
def get_resize_output_image_size(
input_image: np.ndarray, size: Union[int, Tuple[int, int], List[int]], max_size: Optional[int] = None
) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size. If the desired output size
is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
image size is computed by keeping the aspect ratio of the input image size.
Args:
image_size (`Tuple[int, int]`):
The input image size.
size (`int`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
"""
image_size = get_image_size(input_image)
if isinstance(size, (list, tuple)):
return size
return get_size_with_aspect_ratio(image_size, size, max_size)
# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn
def get_numpy_to_framework_fn(arr) -> Callable:
"""
Returns a function that converts a numpy array to the framework of the input array.
Args:
arr (`np.ndarray`): The array to convert.
"""
if isinstance(arr, np.ndarray):
return np.array
if is_tf_available() and is_tf_tensor(arr):
import tensorflow as tf
return tf.convert_to_tensor
if is_torch_available() and is_torch_tensor(arr):
import torch
return torch.tensor
if is_flax_available() and is_jax_tensor(arr):
import jax.numpy as jnp
return jnp.array
raise ValueError(f"Cannot convert arrays of type {type(arr)}")
# Copied from transformers.models.detr.image_processing_detr.safe_squeeze
def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
"""
Squeezes an array, but only if the axis specified has dim 1.
"""
if axis is None:
return arr.squeeze()
try:
return arr.squeeze(axis=axis)
except ValueError:
return arr
# Copied from transformers.models.detr.image_processing_detr.normalize_annotation
def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict:
image_height, image_width = image_size
norm_annotation = {}
for key, value in annotation.items():
if key == "boxes":
boxes = value
boxes = corners_to_center_format(boxes)
boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
norm_annotation[key] = boxes
else:
norm_annotation[key] = value
return norm_annotation
# Copied from transformers.models.detr.image_processing_detr.max_across_indices
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
Return the maximum value across all indices of an iterable of values.
"""
return [max(values_i) for values_i in zip(*values)]
# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
def get_max_height_width(images: List[np.ndarray]) -> List[int]:
"""
Get the maximum height and width across all images in a batch.
"""
input_channel_dimension = infer_channel_dimension_format(images[0])
if input_channel_dimension == ChannelDimension.FIRST:
_, max_height, max_width = max_across_indices([img.shape for img in images])
elif input_channel_dimension == ChannelDimension.LAST:
max_height, max_width, _ = max_across_indices([img.shape for img in images])
else:
raise ValueError(f"Invalid channel dimension format: {input_channel_dimension}")
return (max_height, max_width)
# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
def make_pixel_mask(image: np.ndarray, output_size: Tuple[int, int]) -> np.ndarray:
"""
Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
Args:
image (`np.ndarray`):
Image to make the pixel mask for.
output_size (`Tuple[int, int]`):
Output size of the mask.
"""
input_height, input_width = get_image_size(image)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask
def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
"""
Convert a COCO polygon annotation to a mask.
Args:
segmentations (`List[List[float]]`):
List of polygons, each polygon represented by a list of x-y coordinates.
height (`int`):
Height of the mask.
width (`int`):
Width of the mask.
"""
try:
from pycocotools import mask as coco_mask
except ImportError:
raise ImportError("Pycocotools is not installed in your environment.")
masks = []
for polygons in segmentations:
rles = coco_mask.frPyObjects(polygons, height, width)
mask = coco_mask.decode(rles)
if len(mask.shape) < 3:
mask = mask[..., None]
mask = np.asarray(mask, dtype=np.uint8)
mask = np.any(mask, axis=2)
masks.append(mask)
if masks:
masks = np.stack(masks, axis=0)
else:
masks = np.zeros((0, height, width), dtype=np.uint8)
return masks
# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->DETA
def prepare_coco_detection_annotation(image, target, return_segmentation_masks: bool = False):
"""
Convert the target in COCO format into the format expected by DETA.
"""
image_height, image_width = get_image_size(image)
image_id = target["image_id"]
image_id = np.asarray([image_id], dtype=np.int64)
# Get all COCO annotations for the given image.
annotations = target["annotations"]
annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
classes = [obj["category_id"] for obj in annotations]
classes = np.asarray(classes, dtype=np.int64)
# for conversion to coco api
area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)
boxes = [obj["bbox"] for obj in annotations]
# guard against no boxes via resizing
boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2]
boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
new_target = {}
new_target["image_id"] = image_id
new_target["class_labels"] = classes[keep]
new_target["boxes"] = boxes[keep]
new_target["area"] = area[keep]
new_target["iscrowd"] = iscrowd[keep]
new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
if annotations and "keypoints" in annotations[0]:
keypoints = [obj["keypoints"] for obj in annotations]
keypoints = np.asarray(keypoints, dtype=np.float32)
num_keypoints = keypoints.shape[0]
keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
new_target["keypoints"] = keypoints[keep]
if return_segmentation_masks:
segmentation_masks = [obj["segmentation"] for obj in annotations]
masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
new_target["masks"] = masks[keep]
return new_target
# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes
def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
"""
Compute the bounding boxes around the provided panoptic segmentation masks.
Args:
masks: masks in format `[number_masks, height, width]` where N is the number of masks
Returns:
boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
"""
if masks.size == 0:
return np.zeros((0, 4))
h, w = masks.shape[-2:]
y = np.arange(0, h, dtype=np.float32)
x = np.arange(0, w, dtype=np.float32)
# see https://github.com/pytorch/pytorch/issues/50276
y, x = np.meshgrid(y, x, indexing="ij")
x_mask = masks * np.expand_dims(x, axis=0)
x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
x_min = x.filled(fill_value=1e8)
x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
y_mask = masks * np.expand_dims(y, axis=0)
y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
y_min = y.filled(fill_value=1e8)
y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
return np.stack([x_min, y_min, x_max, y_max], 1)
# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->DETA
def prepare_coco_panoptic_annotation(
image: np.ndarray, target: Dict, masks_path: Union[str, pathlib.Path], return_masks: bool = True
) -> Dict:
"""
Prepare a coco panoptic annotation for DETA.
"""
image_height, image_width = get_image_size(image)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
new_target = {}
new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
if "segments_info" in target:
masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
masks = rgb_to_id(masks)
ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
masks = masks == ids[:, None, None]
masks = masks.astype(np.uint8)
if return_masks:
new_target["masks"] = masks
new_target["boxes"] = masks_to_boxes(masks)
new_target["class_labels"] = np.array(
[segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["iscrowd"] = np.asarray(
[segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
)
new_target["area"] = np.asarray(
[segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
)
return new_target
# Copied from transformers.models.detr.image_processing_detr.resize_annotation
def resize_annotation(
annotation: Dict[str, Any],
orig_size: Tuple[int, int],
target_size: Tuple[int, int],
threshold: float = 0.5,
resample: PILImageResampling = PILImageResampling.NEAREST,
):
"""
Resizes an annotation to a target size.
Args:
annotation (`Dict[str, Any]`):
The annotation dictionary.
orig_size (`Tuple[int, int]`):
The original size of the input image.
target_size (`Tuple[int, int]`):
The target size of the image, as returned by the preprocessing `resize` step.
threshold (`float`, *optional*, defaults to 0.5):
The threshold used to binarize the segmentation masks.
resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
The resampling filter to use when resizing the masks.
"""
ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
ratio_height, ratio_width = ratios
new_annotation = {}
new_annotation["size"] = target_size
for key, value in annotation.items():
if key == "boxes":
boxes = value
scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
new_annotation["boxes"] = scaled_boxes
elif key == "area":
area = value
scaled_area = area * (ratio_width * ratio_height)
new_annotation["area"] = scaled_area
elif key == "masks":
masks = value[:, None]
masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
masks = masks.astype(np.float32)
masks = masks[:, 0] > threshold
new_annotation["masks"] = masks
elif key == "size":
new_annotation["size"] = target_size
else:
new_annotation[key] = value
return new_annotation
class DetaImageProcessor(BaseImageProcessor):
r"""
Constructs a Deformable DETR image processor.
Args:
format (`str`, *optional*, defaults to `"coco_detection"`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
the `preprocess` method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
Resampling filter to use if resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
do_normalize:
Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
`preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
overridden by the `do_pad` parameter in the `preprocess` method.
"""
model_input_names = ["pixel_values", "pixel_mask"]
def __init__(
self,
format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION,
do_resize: bool = True,
size: Dict[str, int] = None,
resample: PILImageResampling = PILImageResampling.BILINEAR,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_normalize: bool = True,
image_mean: Union[float, List[float]] = None,
image_std: Union[float, List[float]] = None,
do_pad: bool = True,
**kwargs
) -> None:
if "pad_and_return_pixel_mask" in kwargs:
do_pad = kwargs.pop("pad_and_return_pixel_mask")
size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
size = get_size_dict(size, default_to_square=False)
super().__init__(**kwargs)
self.format = format
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
self.do_pad = do_pad
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DETA
def prepare_annotation(
self,
image: np.ndarray,
target: Dict,
format: Optional[AnnotionFormat] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into DETA model.
"""
format = format if format is not None else self.format
if format == AnnotionFormat.COCO_DETECTION:
return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_detection_annotation(image, target, return_segmentation_masks)
elif format == AnnotionFormat.COCO_PANOPTIC:
return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
target = prepare_coco_panoptic_annotation(
image, target, masks_path=masks_path, return_masks=return_segmentation_masks
)
else:
raise ValueError(f"Format {format} is not supported.")
return target
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare
def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
warnings.warn(
"The `prepare` method is deprecated and will be removed in a future version. "
"Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
"does not return the image anymore.",
)
target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format)
return image, target
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask
def convert_coco_poly_to_mask(self, *args, **kwargs):
warnings.warn("The `convert_coco_poly_to_mask` method is deprecated and will be removed in a future version. ")
return convert_coco_poly_to_mask(*args, **kwargs)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection
def prepare_coco_detection(self, *args, **kwargs):
warnings.warn("The `prepare_coco_detection` method is deprecated and will be removed in a future version. ")
return prepare_coco_detection_annotation(*args, **kwargs)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic
def prepare_coco_panoptic(self, *args, **kwargs):
warnings.warn("The `prepare_coco_panoptic` method is deprecated and will be removed in a future version. ")
return prepare_coco_panoptic_annotation(*args, **kwargs)
def resize(
self,
image: np.ndarray,
size: Dict[str, int],
resample: PILImageResampling = PILImageResampling.BILINEAR,
data_format: Optional[ChannelDimension] = None,
**kwargs
) -> np.ndarray:
"""
Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
int, smaller edge of the image will be matched to this number.
"""
size = get_size_dict(size, default_to_square=False)
if "shortest_edge" in size and "longest_edge" in size:
size = get_resize_output_image_size(image, size["shortest_edge"], size["longest_edge"])
elif "height" in size and "width" in size:
size = (size["height"], size["width"])
else:
raise ValueError(
"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
f" {size.keys()}."
)
image = resize(image, size=size, resample=resample, data_format=data_format)
return image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation
def resize_annotation(
self,
annotation,
orig_size,
size,
resample: PILImageResampling = PILImageResampling.NEAREST,
) -> Dict:
"""
Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
to this number.
"""
return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
def rescale(
self, image: np.ndarray, rescale_factor: Union[float, int], data_format: Optional[ChannelDimension] = None
) -> np.ndarray:
"""
Rescale the image by the given factor.
"""
return rescale(image, rescale_factor, data_format=data_format)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize
def normalize(
self,
image: np.ndarray,
mean: Union[float, Iterable[float]],
std: Union[float, Iterable[float]],
data_format: Optional[ChannelDimension] = None,
) -> np.ndarray:
"""
Normalize the image with the given mean and standard deviation.
"""
return normalize(image, mean=mean, std=std, data_format=data_format)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
"""
Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
`[center_x, center_y, width, height]` format.
"""
return normalize_annotation(annotation, image_size=image_size)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad_and_create_pixel_mask
def pad_and_create_pixel_mask(
self,
pixel_values_list: List[ImageInput],
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
) -> BatchFeature:
"""
Pads a batch of images with zeros to the size of largest height and width in the batch and returns their
corresponding pixel mask.
Args:
images (`List[np.ndarray]`):
Batch of images to pad.
return_tensors (`str` or `TensorType`, *optional*):
The type of tensors to return. Can be one of:
- Unset: Return a list of `np.ndarray`.
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
warnings.warn(
"This method is deprecated and will be removed in v4.27.0. Please use pad instead.", FutureWarning
)
# pad expects a list of np.ndarray, but the previous feature extractors expected torch tensors
images = [to_numpy_array(image) for image in pixel_values_list]
return self.pad(
images=images,
return_pixel_mask=True,
return_tensors=return_tensors,
data_format=data_format,
)
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image
def _pad_image(
self,
image: np.ndarray,
output_size: Tuple[int, int],
constant_values: Union[float, Iterable[float]] = 0,
data_format: Optional[ChannelDimension] = None,
) -> np.ndarray:
"""
Pad an image with zeros to the given size.
"""
input_height, input_width = get_image_size(image)
output_height, output_width = output_size
pad_bottom = output_height - input_height
pad_right = output_width - input_width
padding = ((0, pad_bottom), (0, pad_right))
padded_image = pad(
image, padding, mode=PaddingMode.CONSTANT, constant_values=constant_values, data_format=data_format
)
return padded_image
# Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad
def pad(
self,
images: List[np.ndarray],
constant_values: Union[float, Iterable[float]] = 0,
return_pixel_mask: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = None,
) -> np.ndarray:
"""
Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
in the batch and optionally returns their corresponding pixel mask.
Args:
image (`np.ndarray`):
Image to pad.
constant_values (`float` or `Iterable[float]`, *optional*):
The value to use for the padding if `mode` is `"constant"`.
return_pixel_mask (`bool`, *optional*, defaults to `True`):
Whether to return a pixel mask.
input_channel_dimension (`ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be inferred from the input image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
pad_size = get_max_height_width(images)
padded_images = [
self._pad_image(image, pad_size, constant_values=constant_values, data_format=data_format)
for image in images
]
data = {"pixel_values": padded_images}
if return_pixel_mask:
masks = [make_pixel_mask(image=image, output_size=pad_size) for image in images]
data["pixel_mask"] = masks
return BatchFeature(data=data, tensor_type=return_tensors)
def preprocess(
self,
images: ImageInput,
annotations: Optional[Union[List[Dict], List[List[Dict]]]] = None,
return_segmentation_masks: bool = None,
masks_path: Optional[Union[str, pathlib.Path]] = None,
do_resize: Optional[bool] = None,
size: Optional[Dict[str, int]] = None,
resample=None, # PILImageResampling
do_rescale: Optional[bool] = None,
rescale_factor: Optional[Union[int, float]] = None,
do_normalize: Optional[bool] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_pad: Optional[bool] = None,
format: Optional[Union[str, AnnotionFormat]] = None,
return_tensors: Optional[Union[TensorType, str]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
**kwargs
) -> BatchFeature:
"""
Preprocess an image or a batch of images so that it can be used by the model.
Args:
images (`ImageInput`):
Image or batch of images to preprocess.
annotations (`List[Dict]` or `List[List[Dict]]`, *optional*):
List of annotations associated with the image or batch of images. If annotionation is for object
detection, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
dictionary. An image can have no annotations, in which case the list should be empty.
If annotionation is for segmentation, the annotations should be a dictionary with the following keys:
- "image_id" (`int`): The image id.
- "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
An image can have no segments, in which case the list should be empty.
- "file_name" (`str`): The file name of the image.
return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks):
Whether to return segmentation masks.
masks_path (`str` or `pathlib.Path`, *optional*):
Path to the directory containing the segmentation masks.
do_resize (`bool`, *optional*, defaults to self.do_resize):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to self.size):
Size of the image after resizing.
resample (`PILImageResampling`, *optional*, defaults to self.resample):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to self.do_rescale):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
Rescale factor to use when rescaling the image.
do_normalize (`bool`, *optional*, defaults to self.do_normalize):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
Mean to use when normalizing the image.
image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
Standard deviation to use when normalizing the image.
do_pad (`bool`, *optional*, defaults to self.do_pad):
Whether to pad the image.
format (`str` or `AnnotionFormat`, *optional*, defaults to self.format):
Format of the annotations.
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
Type of tensors to return. If `None`, will return the list of images.
data_format (`str` or `ChannelDimension`, *optional*, defaults to self.data_format):
The channel dimension format of the image. If not provided, it will be the same as the input image.
"""
if "pad_and_return_pixel_mask" in kwargs:
warnings.warn(
"The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
"use `do_pad` instead.",
FutureWarning,
)
do_pad = kwargs.pop("pad_and_return_pixel_mask")
do_resize = self.do_resize if do_resize is None else do_resize
size = self.size if size is None else size
size = get_size_dict(size=size, default_to_square=False)
resample = self.resample if resample is None else resample
do_rescale = self.do_rescale if do_rescale is None else do_rescale
rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
do_normalize = self.do_normalize if do_normalize is None else do_normalize
image_mean = self.image_mean if image_mean is None else image_mean
image_std = self.image_std if image_std is None else image_std
do_pad = self.do_pad if do_pad is None else do_pad
format = self.format if format is None else format
if do_resize is not None and size is None:
raise ValueError("Size and max_size must be specified if do_resize is True.")
if do_rescale is not None and rescale_factor is None:
raise ValueError("Rescale factor must be specified if do_rescale is True.")
if do_normalize is not None and (image_mean is None or image_std is None):
raise ValueError("Image mean and std must be specified if do_normalize is True.")
if not is_batched(images):
images = [images]
annotations = [annotations] if annotations is not None else None
if annotations is not None and len(images) != len(annotations):
raise ValueError(
f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
)
if not valid_images(images):
raise ValueError(
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
"torch.Tensor, tf.Tensor or jax.ndarray."
)
format = AnnotionFormat(format)
if annotations is not None:
if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations):
raise ValueError(
"Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts"
"(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
"being a list of annotations in the COCO format."
)
elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations):
raise ValueError(
"Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts "
"(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
"the latter being a list of annotations in the COCO format."
)
elif format not in SUPPORTED_ANNOTATION_FORMATS:
raise ValueError(
f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}"
)
if (
masks_path is not None
and format == AnnotionFormat.COCO_PANOPTIC
and not isinstance(masks_path, (pathlib.Path, str))
):
raise ValueError(
"The path to the directory containing the mask PNG files should be provided as a"
f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
)
# All transformations expect numpy arrays
images = [to_numpy_array(image) for image in images]
# prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
if annotations is not None:
prepared_images = []
prepared_annotations = []
for image, target in zip(images, annotations):
target = self.prepare_annotation(
image, target, format, return_segmentation_masks=return_segmentation_masks, masks_path=masks_path
)
prepared_images.append(image)
prepared_annotations.append(target)
images = prepared_images
annotations = prepared_annotations
del prepared_images, prepared_annotations
# transformations
if do_resize:
if annotations is not None:
resized_images, resized_annotations = [], []
for image, target in zip(images, annotations):
orig_size = get_image_size(image)
resized_image = self.resize(image, size=size, resample=resample)
resized_annotation = self.resize_annotation(target, orig_size, get_image_size(resized_image))
resized_images.append(resized_image)
resized_annotations.append(resized_annotation)
images = resized_images
annotations = resized_annotations
del resized_images, resized_annotations
else:
images = [self.resize(image, size=size, resample=resample) for image in images]
if do_rescale:
images = [self.rescale(image, rescale_factor) for image in images]
if do_normalize:
images = [self.normalize(image, image_mean, image_std) for image in images]
if annotations is not None:
annotations = [
self.normalize_annotation(annotation, get_image_size(image))
for annotation, image in zip(annotations, images)
]
if do_pad:
# Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
data = self.pad(images, return_pixel_mask=True, data_format=data_format)
else:
images = [to_channel_dimension_format(image, data_format) for image in images]
data = {"pixel_values": images}
encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
if annotations is not None:
encoded_inputs["labels"] = [
BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
]
return encoded_inputs
def post_process_object_detection(
self,
outputs,
threshold: float = 0.5,
target_sizes: Union[TensorType, List[Tuple]] = None,
nms_threshold: float = 0.7,
):
"""
Converts the output of [`DetaForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
bottom_right_x, bottom_right_y) format. Only supports PyTorch.
Args:
outputs ([`DetrObjectDetectionOutput`]):
Raw outputs of the model.
threshold (`float`, *optional*, defaults to 0.5):
Score threshold to keep object detection predictions.
target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
(height, width) of each image in the batch. If left to None, predictions will not be resized.
nms_threshold (`float`, *optional*, defaults to 0.7):
NMS threshold.
Returns:
`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
in the batch as predicted by the model.
"""
out_logits, out_bbox = outputs.logits, outputs.pred_boxes
batch_size, num_queries, num_labels = out_logits.shape
if target_sizes is not None:
if len(out_logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
prob = out_logits.sigmoid()
all_scores = prob.view(batch_size, num_queries * num_labels).to(out_logits.device)
all_indexes = torch.arange(num_queries * num_labels)[None].repeat(batch_size, 1).to(out_logits.device)
all_boxes = all_indexes // out_logits.shape[2]
all_labels = all_indexes % out_logits.shape[2]
boxes = center_to_corners_format(out_bbox)
boxes = torch.gather(boxes, 1, all_boxes.unsqueeze(-1).repeat(1, 1, 4))
# and from relative [0, 1] to absolute [0, height] coordinates
if target_sizes is not None:
if isinstance(target_sizes, List):
img_h = torch.Tensor([i[0] for i in target_sizes])
img_w = torch.Tensor([i[1] for i in target_sizes])
else:
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]
results = []
for b in range(batch_size):
box = boxes[b]
score = all_scores[b]
lbls = all_labels[b]
pre_topk = score.topk(min(10000, len(score))).indices
box = box[pre_topk]
score = score[pre_topk]
lbls = lbls[pre_topk]
# apply NMS
keep_inds = batched_nms(box, score, lbls, nms_threshold)[:100]
score = score[keep_inds]
lbls = lbls[keep_inds]
box = box[keep_inds]
results.append(
{
"scores": score[score > threshold],
"labels": lbls[score > threshold],
"boxes": box[score > threshold],
}
)
return results
# coding=utf-8
# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DETA model."""
import copy
import math
import warnings
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import torch
import torch.nn.functional as F
from torch import Tensor, nn
from ...activations import ACT2FN
from ...file_utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_scipy_available,
is_vision_available,
replace_return_docstrings,
)
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import meshgrid
from ...utils import is_torchvision_available, logging, requires_backends
from ..auto import AutoBackbone
from .configuration_deta import DetaConfig
logger = logging.get_logger(__name__)
if is_vision_available():
from transformers.image_transforms import center_to_corners_format
if is_torchvision_available():
from torchvision.ops.boxes import batched_nms
if is_scipy_available():
from scipy.optimize import linear_sum_assignment
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "DetaConfig"
_CHECKPOINT_FOR_DOC = "jozhang97/deta-swin-large-o365"
DETA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"jozhang97/deta-swin-large-o365",
# See all DETA models at https://huggingface.co/models?filter=deta
]
@dataclass
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->Deta
class DetaDecoderOutput(ModelOutput):
"""
Base class for outputs of the DetaDecoder. This class adds two attributes to BaseModelOutputWithCrossAttentions,
namely:
- a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
- a stacked tensor of intermediate reference points.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
used to compute the weighted average in the cross-attention heads.
"""
last_hidden_state: torch.FloatTensor = None
intermediate_hidden_states: torch.FloatTensor = None
intermediate_reference_points: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->Deta,Deformable DETR->DETA
class DetaModelOutput(ModelOutput):
"""
Base class for outputs of the Deformable DETR encoder-decoder model.
Args:
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Initial reference points sent through the Transformer decoder.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
foreground and background).
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
Logits of predicted bounding boxes coordinates in the first stage.
"""
init_reference_points: torch.FloatTensor = None
last_hidden_state: torch.FloatTensor = None
intermediate_hidden_states: torch.FloatTensor = None
intermediate_reference_points: torch.FloatTensor = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
enc_outputs_class: Optional[torch.FloatTensor] = None
enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
@dataclass
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->Deta
class DetaObjectDetectionOutput(ModelOutput):
"""
Output type of [`DetaForObjectDetection`].
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
scale-invariant IoU loss.
loss_dict (`Dict`, *optional*):
A dictionary containing the individual losses. Useful for logging.
logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
Classification logits (including no-object) for all queries.
pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
possible padding). You can use [`~DetaProcessor.post_process_object_detection`] to retrieve the
unnormalized bounding boxes.
auxiliary_outputs (`list[Dict]`, *optional*):
Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
`pred_boxes`) for each decoder layer.
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
plus the initial embedding outputs.
decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
average in the self-attention heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
weighted average in the cross-attention heads.
encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
layer plus the initial embedding outputs.
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4,
4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average
in the self-attention heads.
intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
Stacked intermediate hidden states (output of each layer of the decoder).
intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
Stacked intermediate reference points (reference points of each layer of the decoder).
init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
Initial reference points sent through the Transformer decoder.
enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
foreground and background).
enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
Logits of predicted bounding boxes coordinates in the first stage.
"""
loss: Optional[torch.FloatTensor] = None
loss_dict: Optional[Dict] = None
logits: torch.FloatTensor = None
pred_boxes: torch.FloatTensor = None
auxiliary_outputs: Optional[List[Dict]] = None
init_reference_points: Optional[torch.FloatTensor] = None
last_hidden_state: Optional[torch.FloatTensor] = None
intermediate_hidden_states: Optional[torch.FloatTensor] = None
intermediate_reference_points: Optional[torch.FloatTensor] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
encoder_last_hidden_state: Optional[torch.FloatTensor] = None
encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
enc_outputs_class: Optional = None
enc_outputs_coord_logits: Optional = None
def _get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
def inverse_sigmoid(x, eps=1e-5):
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1 / x2)
# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->Deta
class DetaFrozenBatchNorm2d(nn.Module):
"""
BatchNorm2d where the batch statistics and the affine parameters are fixed.
Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
torchvision.models.resnet[18,34,50,101] produce nans.
"""
def __init__(self, n):
super().__init__()
self.register_buffer("weight", torch.ones(n))
self.register_buffer("bias", torch.zeros(n))
self.register_buffer("running_mean", torch.zeros(n))
self.register_buffer("running_var", torch.ones(n))
def _load_from_state_dict(
self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
num_batches_tracked_key = prefix + "num_batches_tracked"
if num_batches_tracked_key in state_dict:
del state_dict[num_batches_tracked_key]
super()._load_from_state_dict(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
)
def forward(self, x):
# move reshapes to the beginning
# to make it user-friendly
weight = self.weight.reshape(1, -1, 1, 1)
bias = self.bias.reshape(1, -1, 1, 1)
running_var = self.running_var.reshape(1, -1, 1, 1)
running_mean = self.running_mean.reshape(1, -1, 1, 1)
epsilon = 1e-5
scale = weight * (running_var + epsilon).rsqrt()
bias = bias - running_mean * scale
return x * scale + bias
# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->Deta
def replace_batch_norm(m, name=""):
for attr_str in dir(m):
target_attr = getattr(m, attr_str)
if isinstance(target_attr, nn.BatchNorm2d):
frozen = DetaFrozenBatchNorm2d(target_attr.num_features)
bn = getattr(m, attr_str)
frozen.weight.data.copy_(bn.weight)
frozen.bias.data.copy_(bn.bias)
frozen.running_mean.data.copy_(bn.running_mean)
frozen.running_var.data.copy_(bn.running_var)
setattr(m, attr_str, frozen)
for n, ch in m.named_children():
replace_batch_norm(ch, n)
class DetaBackboneWithPositionalEncodings(nn.Module):
"""
Backbone model with positional embeddings.
nn.BatchNorm2d layers are replaced by DetaFrozenBatchNorm2d as defined above.
"""
def __init__(self, config):
super().__init__()
backbone = AutoBackbone.from_config(config.backbone_config)
with torch.no_grad():
replace_batch_norm(backbone)
self.model = backbone
self.intermediate_channel_sizes = self.model.channels
# TODO fix this
if config.backbone_config.model_type == "resnet":
for name, parameter in self.model.named_parameters():
if "stages.1" not in name and "stages.2" not in name and "stages.3" not in name:
parameter.requires_grad_(False)
self.position_embedding = build_position_encoding(config)
def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
"""
Outputs feature maps of latter stages C_3 through C_5 in ResNet if `config.num_feature_levels > 1`, otherwise
outputs feature maps of C_5.
"""
# first, send pixel_values through the backbone to get list of feature maps
features = self.model(pixel_values).feature_maps
# next, create position embeddings
out = []
pos = []
for feature_map in features:
# downsample pixel_mask to match shape of corresponding feature_map
mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
position_embeddings = self.position_embedding(feature_map, mask).to(feature_map.dtype)
out.append((feature_map, mask))
pos.append(position_embeddings)
return out, pos
# Copied from transformers.models.detr.modeling_detr._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, target_len: Optional[int] = None):
"""
Expands attention_mask from `[batch_size, seq_len]` to `[batch_size, 1, target_seq_len, source_seq_len]`.
"""
batch_size, source_len = mask.size()
target_len = target_len if target_len is not None else source_len
expanded_mask = mask[:, None, None, :].expand(batch_size, 1, target_len, source_len).to(dtype)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->Deta
class DetaSinePositionEmbedding(nn.Module):
"""
This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
need paper, generalized to work on images.
"""
def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
super().__init__()
self.embedding_dim = embedding_dim
self.temperature = temperature
self.normalize = normalize
if scale is not None and normalize is False:
raise ValueError("normalize should be True if scale is passed")
if scale is None:
scale = 2 * math.pi
self.scale = scale
def forward(self, pixel_values, pixel_mask):
if pixel_mask is None:
raise ValueError("No pixel mask provided")
y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
if self.normalize:
eps = 1e-6
y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
dim_t = self.temperature ** (2 * (dim_t // 2) / self.embedding_dim)
pos_x = x_embed[:, :, :, None] / dim_t
pos_y = y_embed[:, :, :, None] / dim_t
pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
return pos
# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding
class DetaLearnedPositionEmbedding(nn.Module):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, embedding_dim=256):
super().__init__()
self.row_embeddings = nn.Embedding(50, embedding_dim)
self.column_embeddings = nn.Embedding(50, embedding_dim)
def forward(self, pixel_values, pixel_mask=None):
height, width = pixel_values.shape[-2:]
width_values = torch.arange(width, device=pixel_values.device)
height_values = torch.arange(height, device=pixel_values.device)
x_emb = self.column_embeddings(width_values)
y_emb = self.row_embeddings(height_values)
pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
pos = pos.permute(2, 0, 1)
pos = pos.unsqueeze(0)
pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
return pos
# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->Deta
def build_position_encoding(config):
n_steps = config.d_model // 2
if config.position_embedding_type == "sine":
# TODO find a better way of exposing other arguments
position_embedding = DetaSinePositionEmbedding(n_steps, normalize=True)
elif config.position_embedding_type == "learned":
position_embedding = DetaLearnedPositionEmbedding(n_steps)
else:
raise ValueError(f"Not supported {config.position_embedding_type}")
return position_embedding
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
def multi_scale_deformable_attention(
value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
) -> Tensor:
batch_size, _, num_heads, hidden_dim = value.shape
_, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1)
sampling_grids = 2 * sampling_locations - 1
sampling_value_list = []
for level_id, (height, width) in enumerate(value_spatial_shapes):
# batch_size, height*width, num_heads, hidden_dim
# -> batch_size, height*width, num_heads*hidden_dim
# -> batch_size, num_heads*hidden_dim, height*width
# -> batch_size*num_heads, hidden_dim, height, width
value_l_ = (
value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width)
)
# batch_size, num_queries, num_heads, num_points, 2
# -> batch_size, num_heads, num_queries, num_points, 2
# -> batch_size*num_heads, num_queries, num_points, 2
sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
# batch_size*num_heads, hidden_dim, num_queries, num_points
sampling_value_l_ = nn.functional.grid_sample(
value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
)
sampling_value_list.append(sampling_value_l_)
# (batch_size, num_queries, num_heads, num_levels, num_points)
# -> (batch_size, num_heads, num_queries, num_levels, num_points)
# -> (batch_size, num_heads, 1, num_queries, num_levels*num_points)
attention_weights = attention_weights.transpose(1, 2).reshape(
batch_size * num_heads, 1, num_queries, num_levels * num_points
)
output = (
(torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
.sum(-1)
.view(batch_size, num_heads * hidden_dim, num_queries)
)
return output.transpose(1, 2).contiguous()
class DetaMultiscaleDeformableAttention(nn.Module):
"""
Multiscale deformable attention as proposed in Deformable DETR.
"""
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention.__init__ with DeformableDetr->Deta
def __init__(self, embed_dim: int, num_heads: int, n_levels: int, n_points: int):
super().__init__()
if embed_dim % num_heads != 0:
raise ValueError(
f"embed_dim (d_model) must be divisible by num_heads, but got {embed_dim} and {num_heads}"
)
dim_per_head = embed_dim // num_heads
# check if dim_per_head is power of 2
if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
warnings.warn(
"You'd better set embed_dim (d_model) in DetaMultiscaleDeformableAttention to make the"
" dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
" implementation."
)
self.im2col_step = 64
self.d_model = embed_dim
self.n_levels = n_levels
self.n_heads = num_heads
self.n_points = n_points
self.sampling_offsets = nn.Linear(embed_dim, num_heads * n_levels * n_points * 2)
self.attention_weights = nn.Linear(embed_dim, num_heads * n_levels * n_points)
self.value_proj = nn.Linear(embed_dim, embed_dim)
self.output_proj = nn.Linear(embed_dim, embed_dim)
self._reset_parameters()
def _reset_parameters(self):
nn.init.constant_(self.sampling_offsets.weight.data, 0.0)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (
(grid_init / grid_init.abs().max(-1, keepdim=True)[0])
.view(self.n_heads, 1, 1, 2)
.repeat(1, self.n_levels, self.n_points, 1)
)
for i in range(self.n_points):
grid_init[:, :, i, :] *= i + 1
with torch.no_grad():
self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
nn.init.constant_(self.attention_weights.weight.data, 0.0)
nn.init.constant_(self.attention_weights.bias.data, 0.0)
nn.init.xavier_uniform_(self.value_proj.weight.data)
nn.init.constant_(self.value_proj.bias.data, 0.0)
nn.init.xavier_uniform_(self.output_proj.weight.data)
nn.init.constant_(self.output_proj.bias.data, 0.0)
def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
return tensor if position_embeddings is None else tensor + position_embeddings
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states=None,
encoder_attention_mask=None,
position_embeddings: Optional[torch.Tensor] = None,
reference_points=None,
spatial_shapes=None,
level_start_index=None,
output_attentions: bool = False,
):
# add position embeddings to the hidden states before projecting to queries and keys
if position_embeddings is not None:
hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
batch_size, num_queries, _ = hidden_states.shape
batch_size, sequence_length, _ = encoder_hidden_states.shape
if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
raise ValueError(
"Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
)
value = self.value_proj(encoder_hidden_states)
if attention_mask is not None:
# we invert the attention_mask
value = value.masked_fill(~attention_mask[..., None], float(0))
value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
sampling_offsets = self.sampling_offsets(hidden_states).view(
batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2
)
attention_weights = self.attention_weights(hidden_states).view(
batch_size, num_queries, self.n_heads, self.n_levels * self.n_points
)
attention_weights = F.softmax(attention_weights, -1).view(
batch_size, num_queries, self.n_heads, self.n_levels, self.n_points
)
# batch_size, num_queries, n_heads, n_levels, n_points, 2
if reference_points.shape[-1] == 2:
offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
sampling_locations = (
reference_points[:, :, None, :, None, :]
+ sampling_offsets / offset_normalizer[None, None, None, :, None, :]
)
elif reference_points.shape[-1] == 4:
sampling_locations = (
reference_points[:, :, None, :, None, :2]
+ sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
)
else:
raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
# PyTorch implementation (for now)
output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
output = self.output_proj(output)
return output, attention_weights
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->Deta,Deformable DETR->DETA
class DetaMultiheadAttention(nn.Module):
"""
Multi-headed attention from 'Attention Is All You Need' paper.
Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
bias: bool = True,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
if self.head_dim * num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
return tensor if position_embeddings is None else tensor + position_embeddings
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_embeddings: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
batch_size, target_len, embed_dim = hidden_states.size()
# add position embeddings to the hidden states before projecting to queries and keys
if position_embeddings is not None:
hidden_states_original = hidden_states
hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
# get queries, keys and values
query_states = self.q_proj(hidden_states) * self.scaling
key_states = self._shape(self.k_proj(hidden_states), -1, batch_size)
value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size)
proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape)
key_states = key_states.view(*proj_shape)
value_states = value_states.view(*proj_shape)
source_len = key_states.size(1)
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
raise ValueError(
f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
f" {attn_weights.size()}"
)
# expand attention_mask
if attention_mask is not None:
# [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
if attention_mask is not None:
if attention_mask.size() != (batch_size, 1, target_len, source_len):
raise ValueError(
f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
f" {attention_mask.size()}"
)
attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
if output_attentions:
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
else:
attn_weights_reshaped = None
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
attn_output = torch.bmm(attn_probs, value_states)
if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(batch_size, target_len, embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights_reshaped
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->Deta
class DetaEncoderLayer(nn.Module):
def __init__(self, config: DetaConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = DetaMultiscaleDeformableAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
n_levels=config.num_feature_levels,
n_points=config.encoder_n_points,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
position_embeddings: torch.Tensor = None,
reference_points=None,
spatial_shapes=None,
level_start_index=None,
output_attentions: bool = False,
):
"""
Args:
hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Input to the layer.
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Attention mask.
position_embeddings (`torch.FloatTensor`, *optional*):
Position embeddings, to be added to `hidden_states`.
reference_points (`torch.FloatTensor`, *optional*):
Reference points.
spatial_shapes (`torch.LongTensor`, *optional*):
Spatial shapes of the backbone feature maps.
level_start_index (`torch.LongTensor`, *optional*):
Level start index.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
# Apply Multi-scale Deformable Attention Module on the multi-scale feature maps.
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=hidden_states,
encoder_attention_mask=attention_mask,
position_embeddings=position_embeddings,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
if self.training:
if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->Deta
class DetaDecoderLayer(nn.Module):
def __init__(self, config: DetaConfig):
super().__init__()
self.embed_dim = config.d_model
# self-attention
self.self_attn = DetaMultiheadAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
# cross-attention
self.encoder_attn = DetaMultiscaleDeformableAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
n_levels=config.num_feature_levels,
n_points=config.decoder_n_points,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
# feedforward neural networks
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
position_embeddings: Optional[torch.Tensor] = None,
reference_points=None,
spatial_shapes=None,
level_start_index=None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = False,
):
"""
Args:
hidden_states (`torch.FloatTensor`):
Input to the layer of shape `(seq_len, batch, embed_dim)`.
position_embeddings (`torch.FloatTensor`, *optional*):
Position embeddings that are added to the queries and keys in the self-attention layer.
reference_points (`torch.FloatTensor`, *optional*):
Reference points.
spatial_shapes (`torch.LongTensor`, *optional*):
Spatial shapes.
level_start_index (`torch.LongTensor`, *optional*):
Level start index.
encoder_hidden_states (`torch.FloatTensor`):
cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
`(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
# Self Attention
hidden_states, self_attn_weights = self.self_attn(
hidden_states=hidden_states,
position_embeddings=position_embeddings,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
second_residual = hidden_states
# Cross-Attention
cross_attn_weights = None
hidden_states, cross_attn_weights = self.encoder_attn(
hidden_states=hidden_states,
attention_mask=encoder_attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
position_embeddings=position_embeddings,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = second_residual + hidden_states
hidden_states = self.encoder_attn_layer_norm(hidden_states)
# Fully Connected
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights, cross_attn_weights)
return outputs
# Copied from transformers.models.detr.modeling_detr.DetrClassificationHead
class DetaClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor):
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->Deta
class DetaPreTrainedModel(PreTrainedModel):
config_class = DetaConfig
base_model_prefix = "model"
main_input_name = "pixel_values"
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, DetaLearnedPositionEmbedding):
nn.init.uniform_(module.row_embeddings.weight)
nn.init.uniform_(module.column_embeddings.weight)
elif isinstance(module, DetaMultiscaleDeformableAttention):
module._reset_parameters()
elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
if hasattr(module, "reference_points") and not self.config.two_stage:
nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0)
nn.init.constant_(module.reference_points.bias.data, 0.0)
if hasattr(module, "level_embed"):
nn.init.normal_(module.level_embed)
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, DetaDecoder):
module.gradient_checkpointing = value
DETA_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`DetaConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DETA_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it.
Pixel values can be obtained using [`AutoImageProcessor`]. See [`AutoImageProcessor.__call__`] for details.
pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
- 1 for pixels that are real (i.e. **not masked**),
- 0 for pixels that are padding (i.e. **masked**).
[What are attention masks?](../glossary#attention-mask)
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, num_queries)`, *optional*):
Not used by default. Can be used to mask object queries.
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
can choose to directly pass a flattened representation of an image.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
embedded representation.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetr->Deta
class DetaEncoder(DetaPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
[`DetaEncoderLayer`].
The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.
Args:
config: DetaConfig
"""
def __init__(self, config: DetaConfig):
super().__init__(config)
self.dropout = config.dropout
self.layers = nn.ModuleList([DetaEncoderLayer(config) for _ in range(config.encoder_layers)])
# Initialize weights and apply final processing
self.post_init()
@staticmethod
def get_reference_points(spatial_shapes, valid_ratios, device):
"""
Get reference points for each feature map. Used in decoder.
Args:
spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
Spatial shapes of each feature map.
valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
Valid ratios of each feature map.
device (`torch.device`):
Device on which to create the tensors.
Returns:
`torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
"""
reference_points_list = []
for level, (height, width) in enumerate(spatial_shapes):
ref_y, ref_x = meshgrid(
torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device),
torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device),
indexing="ij",
)
# TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36
ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
ref = torch.stack((ref_x, ref_y), -1)
reference_points_list.append(ref)
reference_points = torch.cat(reference_points_list, 1)
reference_points = reference_points[:, :, None] * valid_ratios[:, None]
return reference_points
def forward(
self,
inputs_embeds=None,
attention_mask=None,
position_embeddings=None,
spatial_shapes=None,
level_start_index=None,
valid_ratios=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
- 1 for pixel features that are real (i.e. **not masked**),
- 0 for pixel features that are padding (i.e. **masked**).
[What are attention masks?](../glossary#attention-mask)
position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Position embeddings that are added to the queries and keys in each self-attention layer.
spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
Spatial shapes of each feature map.
level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
Starting index of each feature map.
valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
Ratio of valid area in each feature level.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
hidden_states = inputs_embeds
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device)
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
for i, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
position_embeddings=position_embeddings,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->Deta,Deformable DETR->DETA
class DetaDecoder(DetaPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetaDecoderLayer`].
The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
Some tweaks for Deformable DETR:
- `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass.
- it also returns a stack of intermediate outputs and reference points from all decoding layers.
Args:
config: DetaConfig
"""
def __init__(self, config: DetaConfig):
super().__init__(config)
self.dropout = config.dropout
self.layers = nn.ModuleList([DetaDecoderLayer(config) for _ in range(config.decoder_layers)])
self.gradient_checkpointing = False
# hack implementation for iterative bounding box refinement and two-stage Deformable DETR
self.bbox_embed = None
self.class_embed = None
# Initialize weights and apply final processing
self.post_init()
def forward(
self,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
position_embeddings=None,
reference_points=None,
spatial_shapes=None,
level_start_index=None,
valid_ratios=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
r"""
Args:
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
The query embeddings that are passed into the decoder.
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
in `[0, 1]`:
- 1 for pixels that are real (i.e. **not masked**),
- 0 for pixels that are padding (i.e. **masked**).
position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
Position embeddings that are added to the queries and keys in each self-attention layer.
reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*):
Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
Spatial shapes of the feature maps.
level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
Indexes for the start of each feature level. In range `[0, sequence_length]`.
valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
Ratio of valid area in each feature level.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if inputs_embeds is not None:
hidden_states = inputs_embeds
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
intermediate = ()
intermediate_reference_points = ()
for idx, decoder_layer in enumerate(self.layers):
if reference_points.shape[-1] == 4:
reference_points_input = (
reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
)
else:
if reference_points.shape[-1] != 2:
raise ValueError("Reference points' last dimension must be of size 2")
reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs, output_attentions)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(decoder_layer),
hidden_states,
encoder_hidden_states,
encoder_attention_mask,
None,
)
else:
layer_outputs = decoder_layer(
hidden_states,
position_embeddings=position_embeddings,
encoder_hidden_states=encoder_hidden_states,
reference_points=reference_points_input,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
# hack implementation for iterative bounding box refinement
if self.bbox_embed is not None:
tmp = self.bbox_embed[idx](hidden_states)
if reference_points.shape[-1] == 4:
new_reference_points = tmp + inverse_sigmoid(reference_points)
new_reference_points = new_reference_points.sigmoid()
else:
if reference_points.shape[-1] != 2:
raise ValueError(
f"Reference points' last dimension must be of size 2, but is {reference_points.shape[-1]}"
)
new_reference_points = tmp
new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points)
new_reference_points = new_reference_points.sigmoid()
reference_points = new_reference_points.detach()
intermediate += (hidden_states,)
intermediate_reference_points += (reference_points,)
if output_attentions:
all_self_attns += (layer_outputs[1],)
if encoder_hidden_states is not None:
all_cross_attentions += (layer_outputs[2],)
# Keep batch_size as first dimension
intermediate = torch.stack(intermediate, dim=1)
intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
intermediate,
intermediate_reference_points,
all_hidden_states,
all_self_attns,
all_cross_attentions,
]
if v is not None
)
return DetaDecoderOutput(
last_hidden_state=hidden_states,
intermediate_hidden_states=intermediate,
intermediate_reference_points=intermediate_reference_points,
hidden_states=all_hidden_states,
attentions=all_self_attns,
cross_attentions=all_cross_attentions,
)
@add_start_docstrings(
"""
The bare DETA Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
any specific head on top.
""",
DETA_START_DOCSTRING,
)
class DetaModel(DetaPreTrainedModel):
def __init__(self, config: DetaConfig):
super().__init__(config)
if config.two_stage:
requires_backends(self, ["torchvision"])
# Create backbone with positional encoding
self.backbone = DetaBackboneWithPositionalEncodings(config)
intermediate_channel_sizes = self.backbone.intermediate_channel_sizes
# Create input projection layers
if config.num_feature_levels > 1:
num_backbone_outs = len(intermediate_channel_sizes)
input_proj_list = []
for _ in range(num_backbone_outs):
in_channels = intermediate_channel_sizes[_]
input_proj_list.append(
nn.Sequential(
nn.Conv2d(in_channels, config.d_model, kernel_size=1),
nn.GroupNorm(32, config.d_model),
)
)
for _ in range(config.num_feature_levels - num_backbone_outs):
input_proj_list.append(
nn.Sequential(
nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1),
nn.GroupNorm(32, config.d_model),
)
)
in_channels = config.d_model
self.input_proj = nn.ModuleList(input_proj_list)
else:
self.input_proj = nn.ModuleList(
[
nn.Sequential(
nn.Conv2d(intermediate_channel_sizes[-1], config.d_model, kernel_size=1),
nn.GroupNorm(32, config.d_model),
)
]
)
if not config.two_stage:
self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2)
self.encoder = DetaEncoder(config)
self.decoder = DetaDecoder(config)
self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model))
if config.two_stage:
self.enc_output = nn.Linear(config.d_model, config.d_model)
self.enc_output_norm = nn.LayerNorm(config.d_model)
self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2)
self.pos_trans_norm = nn.LayerNorm(config.d_model * 2)
self.pix_trans = nn.Linear(config.d_model, config.d_model)
self.pix_trans_norm = nn.LayerNorm(config.d_model)
else:
self.reference_points = nn.Linear(config.d_model, 2)
self.assign_first_stage = config.assign_first_stage
self.two_stage_num_proposals = config.two_stage_num_proposals
self.post_init()
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_encoder
def get_encoder(self):
return self.encoder
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_decoder
def get_decoder(self):
return self.decoder
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.freeze_backbone
def freeze_backbone(self):
for name, param in self.backbone.conv_encoder.model.named_parameters():
param.requires_grad_(False)
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.unfreeze_backbone
def unfreeze_backbone(self):
for name, param in self.backbone.conv_encoder.model.named_parameters():
param.requires_grad_(True)
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_valid_ratio
def get_valid_ratio(self, mask):
"""Get the valid ratio of all feature maps."""
_, height, width = mask.shape
valid_height = torch.sum(mask[:, :, 0], 1)
valid_width = torch.sum(mask[:, 0, :], 1)
valid_ratio_heigth = valid_height.float() / height
valid_ratio_width = valid_width.float() / width
valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1)
return valid_ratio
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_proposal_pos_embed
def get_proposal_pos_embed(self, proposals):
"""Get the position embedding of the proposals."""
num_pos_feats = 128
temperature = 10000
scale = 2 * math.pi
dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
# batch_size, num_queries, 4
proposals = proposals.sigmoid() * scale
# batch_size, num_queries, 4, 128
pos = proposals[:, :, :, None] / dim_t
# batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512
pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
return pos
def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
"""Generate the encoder output proposals from encoded enc_output.
Args:
enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps.
Returns:
`tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
- object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to
directly predict a bounding box. (without the need of a decoder)
- output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
sigmoid.
"""
batch_size = enc_output.shape[0]
proposals = []
_cur = 0
level_ids = []
for level, (height, width) in enumerate(spatial_shapes):
mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1)
valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
grid_y, grid_x = meshgrid(
torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device),
torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device),
indexing="ij",
)
grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level)
proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4)
proposals.append(proposal)
_cur += height * width
level_ids.append(grid.new_ones(height * width, dtype=torch.long) * level)
output_proposals = torch.cat(proposals, 1)
output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid
output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf"))
output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf"))
# assign each pixel as an object query
object_query = enc_output
object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0))
object_query = object_query.masked_fill(~output_proposals_valid, float(0))
object_query = self.enc_output_norm(self.enc_output(object_query))
level_ids = torch.cat(level_ids)
return object_query, output_proposals, level_ids
@add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DetaModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values,
pixel_mask=None,
decoder_attention_mask=None,
encoder_outputs=None,
inputs_embeds=None,
decoder_inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
r"""
Returns:
Examples:
```python
>>> from transformers import AutoImageProcessor, DetaModel
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> image_processor = AutoImageProcessor.from_pretrained("jozhang97/deta-swin-large-o365")
>>> model = DetaModel.from_pretrained("jozhang97/deta-swin-large-o365", two_stage=False)
>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 900, 256]
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
batch_size, num_channels, height, width = pixel_values.shape
device = pixel_values.device
if pixel_mask is None:
pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device)
# Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper)
# First, sent pixel_values + pixel_mask through Backbone to obtain the features
# which is a list of tuples
features, position_embeddings_list = self.backbone(pixel_values, pixel_mask)
# Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
sources = []
masks = []
for level, (source, mask) in enumerate(features):
sources.append(self.input_proj[level](source))
masks.append(mask)
if mask is None:
raise ValueError("No attention mask was provided")
# Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
if self.config.num_feature_levels > len(sources):
_len_sources = len(sources)
for level in range(_len_sources, self.config.num_feature_levels):
if level == _len_sources:
source = self.input_proj[level](features[-1][0])
else:
source = self.input_proj[level](sources[-1])
mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0]
pos_l = self.backbone.position_embedding(source, mask).to(source.dtype)
sources.append(source)
masks.append(mask)
position_embeddings_list.append(pos_l)
# Create queries
query_embeds = None
if not self.config.two_stage:
query_embeds = self.query_position_embeddings.weight
# Prepare encoder inputs (by flattening)
spatial_shapes = [(source.shape[2:]) for source in sources]
source_flatten = [source.flatten(2).transpose(1, 2) for source in sources]
mask_flatten = [mask.flatten(1) for mask in masks]
lvl_pos_embed_flatten = []
for level, pos_embed in enumerate(position_embeddings_list):
pos_embed = pos_embed.flatten(2).transpose(1, 2)
lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1)
lvl_pos_embed_flatten.append(lvl_pos_embed)
source_flatten = torch.cat(source_flatten, 1)
mask_flatten = torch.cat(mask_flatten, 1)
lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device)
level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
valid_ratios = valid_ratios.float()
# Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder
# Also provide spatial_shapes, level_start_index and valid_ratios
if encoder_outputs is None:
encoder_outputs = self.encoder(
inputs_embeds=source_flatten,
attention_mask=mask_flatten,
position_embeddings=lvl_pos_embed_flatten,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
valid_ratios=valid_ratios,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
)
# Fifth, prepare decoder inputs
batch_size, _, num_channels = encoder_outputs[0].shape
enc_outputs_class = None
enc_outputs_coord_logits = None
if self.config.two_stage:
object_query_embedding, output_proposals, level_ids = self.gen_encoder_output_proposals(
encoder_outputs[0], ~mask_flatten, spatial_shapes
)
# hack implementation for two-stage DETA
# apply a detection head to each pixel (A.4 in paper)
# linear projection for bounding box binary classification (i.e. foreground and background)
enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding)
# 3-layer FFN to predict bounding boxes coordinates (bbox regression branch)
delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding)
enc_outputs_coord_logits = delta_bbox + output_proposals
# only keep top scoring `config.two_stage_num_proposals` proposals
topk = self.two_stage_num_proposals
proposal_logit = enc_outputs_class[..., 0]
if self.assign_first_stage:
proposal_boxes = center_to_corners_format(enc_outputs_coord_logits.sigmoid().float()).clamp(0, 1)
topk_proposals = []
for b in range(batch_size):
prop_boxes_b = proposal_boxes[b]
prop_logits_b = proposal_logit[b]
# pre-nms per-level topk
pre_nms_topk = 1000
pre_nms_inds = []
for lvl in range(len(spatial_shapes)):
lvl_mask = level_ids == lvl
pre_nms_inds.append(torch.topk(prop_logits_b.sigmoid() * lvl_mask, pre_nms_topk)[1])
pre_nms_inds = torch.cat(pre_nms_inds)
# nms on topk indices
post_nms_inds = batched_nms(
prop_boxes_b[pre_nms_inds], prop_logits_b[pre_nms_inds], level_ids[pre_nms_inds], 0.9
)
keep_inds = pre_nms_inds[post_nms_inds]
if len(keep_inds) < self.two_stage_num_proposals:
print(
f"[WARNING] nms proposals ({len(keep_inds)}) < {self.two_stage_num_proposals}, running"
" naive topk"
)
keep_inds = torch.topk(proposal_logit[b], topk)[1]
# keep top Q/L indices for L levels
q_per_l = topk // len(spatial_shapes)
is_level_ordered = (
level_ids[keep_inds][None]
== torch.arange(len(spatial_shapes), device=level_ids.device)[:, None]
)
keep_inds_mask = is_level_ordered & (is_level_ordered.cumsum(1) <= q_per_l) # LS
keep_inds_mask = keep_inds_mask.any(0) # S
# pad to Q indices (might let ones filtered from pre-nms sneak by... unlikely because we pick high conf anyways)
if keep_inds_mask.sum() < topk:
num_to_add = topk - keep_inds_mask.sum()
pad_inds = (~keep_inds_mask).nonzero()[:num_to_add]
keep_inds_mask[pad_inds] = True
keep_inds_topk = keep_inds[keep_inds_mask]
topk_proposals.append(keep_inds_topk)
topk_proposals = torch.stack(topk_proposals)
else:
topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
topk_coords_logits = torch.gather(
enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
)
topk_coords_logits = topk_coords_logits.detach()
reference_points = topk_coords_logits.sigmoid()
init_reference_points = reference_points
pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits)))
query_embed, target = torch.split(pos_trans_out, num_channels, dim=2)
else:
query_embed, target = torch.split(query_embeds, num_channels, dim=1)
query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1)
target = target.unsqueeze(0).expand(batch_size, -1, -1)
reference_points = self.reference_points(query_embed).sigmoid()
init_reference_points = reference_points
decoder_outputs = self.decoder(
inputs_embeds=target,
position_embeddings=query_embed,
encoder_hidden_states=encoder_outputs[0],
encoder_attention_mask=mask_flatten,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
valid_ratios=valid_ratios,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None)
tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs
return tuple_outputs
return DetaModelOutput(
init_reference_points=init_reference_points,
last_hidden_state=decoder_outputs.last_hidden_state,
intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
intermediate_reference_points=decoder_outputs.intermediate_reference_points,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
enc_outputs_class=enc_outputs_class,
enc_outputs_coord_logits=enc_outputs_coord_logits,
)
@add_start_docstrings(
"""
DETA Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
such as COCO detection.
""",
DETA_START_DOCSTRING,
)
class DetaForObjectDetection(DetaPreTrainedModel):
# When using clones, all layers > 0 will be clones, but layer 0 *is* required
_keys_to_ignore_on_load_missing = ["bbox_embed\.[1-9]\d*", "class_embed\.[1-9]\d*"]
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection.__init__ with DeformableDetr->Deta
def __init__(self, config: DetaConfig):
super().__init__(config)
# Deformable DETR encoder-decoder model
self.model = DetaModel(config)
# Detection heads on top
self.class_embed = nn.Linear(config.d_model, config.num_labels)
self.bbox_embed = DetaMLPPredictionHead(
input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
)
prior_prob = 0.01
bias_value = -math.log((1 - prior_prob) / prior_prob)
self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value
nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)
# if two-stage, the last class_embed and bbox_embed is for region proposal generation
num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers
if config.with_box_refine:
self.class_embed = _get_clones(self.class_embed, num_pred)
self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
# hack implementation for iterative bounding box refinement
self.model.decoder.bbox_embed = self.bbox_embed
else:
nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)])
self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)])
self.model.decoder.bbox_embed = None
if config.two_stage:
# hack implementation for two-stage
self.model.decoder.class_embed = self.class_embed
for box_embed in self.bbox_embed:
nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)
# Initialize weights and apply final processing
self.post_init()
@torch.jit.unused
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection._set_aux_loss
def _set_aux_loss(self, outputs_class, outputs_coord):
# this is a workaround to make torchscript happy, as torchscript
# doesn't support dictionary with non-homogeneous values, such
# as a dict having both a Tensor and a list.
return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
@add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=DetaObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values,
pixel_mask=None,
decoder_attention_mask=None,
encoder_outputs=None,
inputs_embeds=None,
decoder_inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
r"""
labels (`List[Dict]` of len `(batch_size,)`, *optional*):
Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
Returns:
Examples:
```python
>>> from transformers import AutoImageProcessor, DetaForObjectDetection
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> image_processor = AutoImageProcessor.from_pretrained("jozhang97/deta-swin-large")
>>> model = DetaForObjectDetection.from_pretrained("jozhang97/deta-swin-large")
>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> # convert outputs (bounding boxes and class logits) to COCO API
>>> target_sizes = torch.tensor([image.size[::-1]])
>>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
... 0
... ]
>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
... box = [round(i, 2) for i in box.tolist()]
... print(
... f"Detected {model.config.id2label[label.item()]} with confidence "
... f"{round(score.item(), 3)} at location {box}"
... )
Detected cat with confidence 0.683 at location [345.85, 23.68, 639.86, 372.83]
Detected cat with confidence 0.683 at location [8.8, 52.49, 316.93, 473.45]
Detected remote with confidence 0.568 at location [40.02, 73.75, 175.96, 117.33]
Detected remote with confidence 0.546 at location [333.68, 77.13, 370.12, 187.51]
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# First, sent images through DETR base model to obtain encoder + decoder outputs
outputs = self.model(
pixel_values,
pixel_mask=pixel_mask,
decoder_attention_mask=decoder_attention_mask,
encoder_outputs=encoder_outputs,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2]
init_reference = outputs.init_reference_points if return_dict else outputs[0]
inter_references = outputs.intermediate_reference_points if return_dict else outputs[3]
# class logits + predicted bounding boxes
outputs_classes = []
outputs_coords = []
for level in range(hidden_states.shape[1]):
if level == 0:
reference = init_reference
else:
reference = inter_references[:, level - 1]
reference = inverse_sigmoid(reference)
outputs_class = self.class_embed[level](hidden_states[:, level])
delta_bbox = self.bbox_embed[level](hidden_states[:, level])
if reference.shape[-1] == 4:
outputs_coord_logits = delta_bbox + reference
elif reference.shape[-1] == 2:
delta_bbox[..., :2] += reference
outputs_coord_logits = delta_bbox
else:
raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}")
outputs_coord = outputs_coord_logits.sigmoid()
outputs_classes.append(outputs_class)
outputs_coords.append(outputs_coord)
# Keep batch_size as first dimension
outputs_class = torch.stack(outputs_classes, dim=1)
outputs_coord = torch.stack(outputs_coords, dim=1)
logits = outputs_class[:, -1]
pred_boxes = outputs_coord[:, -1]
loss, loss_dict, auxiliary_outputs = None, None, None
if labels is not None:
# First: create the matcher
matcher = DetaHungarianMatcher(
class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost
)
# Second: create the criterion
losses = ["labels", "boxes", "cardinality"]
criterion = DetaLoss(
matcher=matcher,
num_classes=self.config.num_labels,
focal_alpha=self.config.focal_alpha,
losses=losses,
num_queries=self.config.num_queries,
)
criterion.to(logits.device)
# Third: compute the losses, based on outputs and labels
outputs_loss = {}
outputs_loss["logits"] = logits
outputs_loss["pred_boxes"] = pred_boxes
if self.config.auxiliary_loss:
intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4]
outputs_class = self.class_embed(intermediate)
outputs_coord = self.bbox_embed(intermediate).sigmoid()
auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord)
outputs_loss["auxiliary_outputs"] = auxiliary_outputs
if self.config.two_stage:
enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid()
outputs["enc_outputs"] = {"pred_logits": outputs.enc_outputs_class, "pred_boxes": enc_outputs_coord}
loss_dict = criterion(outputs_loss, labels)
# Fourth: compute total loss, as a weighted sum of the various losses
weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient}
weight_dict["loss_giou"] = self.config.giou_loss_coefficient
if self.config.auxiliary_loss:
aux_weight_dict = {}
for i in range(self.config.decoder_layers - 1):
aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
weight_dict.update(aux_weight_dict)
loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
if not return_dict:
if auxiliary_outputs is not None:
output = (logits, pred_boxes) + auxiliary_outputs + outputs
else:
output = (logits, pred_boxes) + outputs
tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output
return tuple_outputs
dict_outputs = DetaObjectDetectionOutput(
loss=loss,
loss_dict=loss_dict,
logits=logits,
pred_boxes=pred_boxes,
auxiliary_outputs=auxiliary_outputs,
last_hidden_state=outputs.last_hidden_state,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
intermediate_hidden_states=outputs.intermediate_hidden_states,
intermediate_reference_points=outputs.intermediate_reference_points,
init_reference_points=outputs.init_reference_points,
enc_outputs_class=outputs.enc_outputs_class,
enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
)
return dict_outputs
# Copied from transformers.models.detr.modeling_detr.dice_loss
def dice_loss(inputs, targets, num_boxes):
"""
Compute the DICE loss, similar to generalized IOU for masks
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs (0 for the negative class and 1 for the positive
class).
"""
inputs = inputs.sigmoid()
inputs = inputs.flatten(1)
numerator = 2 * (inputs * targets).sum(1)
denominator = inputs.sum(-1) + targets.sum(-1)
loss = 1 - (numerator + 1) / (denominator + 1)
return loss.sum() / num_boxes
# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
"""
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
Args:
inputs (`torch.FloatTensor` of arbitrary shape):
The predictions for each example.
targets (`torch.FloatTensor` with the same shape as `inputs`)
A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
and 1 for the positive class).
alpha (`float`, *optional*, defaults to `0.25`):
Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
gamma (`int`, *optional*, defaults to `2`):
Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.
Returns:
Loss tensor
"""
prob = inputs.sigmoid()
ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
# add modulating factor
p_t = prob * targets + (1 - prob) * (1 - targets)
loss = ce_loss * ((1 - p_t) ** gamma)
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
return loss.mean(1).sum() / num_boxes
class DetaLoss(nn.Module):
"""
This class computes the losses for `DetaForObjectDetection`. The process happens in two steps: 1) we compute
hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
ground-truth / prediction (supervised class and box).
Args:
matcher (`DetaHungarianMatcher`):
Module able to compute a matching between targets and proposals.
num_classes (`int`):
Number of object categories, omitting the special no-object category.
focal_alpha (`float`):
Alpha parameter in focal loss.
losses (`List[str]`):
List of all the losses to be applied. See `get_loss` for a list of all available losses.
"""
def __init__(
self,
matcher,
num_classes,
focal_alpha,
losses,
num_queries,
assign_first_stage=False,
assign_second_stage=False,
):
super().__init__()
self.matcher = matcher
self.num_classes = num_classes
self.focal_alpha = focal_alpha
self.losses = losses
self.assign_first_stage = assign_first_stage
self.assign_second_stage = assign_second_stage
if self.assign_first_stage:
self.stg1_assigner = DetaStage1Assigner()
if self.assign_second_stage:
self.stg2_assigner = DetaStage2Assigner(num_queries)
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_labels
def loss_labels(self, outputs, targets, indices, num_boxes):
"""
Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
of dim [nb_target_boxes]
"""
if "logits" not in outputs:
raise KeyError("No logits were found in the outputs")
source_logits = outputs["logits"]
idx = self._get_source_permutation_idx(indices)
target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
target_classes = torch.full(
source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
)
target_classes[idx] = target_classes_o
target_classes_onehot = torch.zeros(
[source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1],
dtype=source_logits.dtype,
layout=source_logits.layout,
device=source_logits.device,
)
target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
target_classes_onehot = target_classes_onehot[:, :, :-1]
loss_ce = (
sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2)
* source_logits.shape[1]
)
losses = {"loss_ce": loss_ce}
return losses
@torch.no_grad()
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_cardinality
def loss_cardinality(self, outputs, targets, indices, num_boxes):
"""
Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
"""
logits = outputs["logits"]
device = logits.device
target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
# Count the number of predictions that are NOT "no-object" (which is the last class)
card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
losses = {"cardinality_error": card_err}
return losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.loss_boxes
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""
Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
are expected in format (center_x, center_y, w, h), normalized by the image size.
"""
if "pred_boxes" not in outputs:
raise KeyError("No predicted boxes found in outputs")
idx = self._get_source_permutation_idx(indices)
source_boxes = outputs["pred_boxes"][idx]
target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
losses = {}
losses["loss_bbox"] = loss_bbox.sum() / num_boxes
loss_giou = 1 - torch.diag(
generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
)
losses["loss_giou"] = loss_giou.sum() / num_boxes
return losses
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_source_permutation_idx
def _get_source_permutation_idx(self, indices):
# permute predictions following indices
batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
source_idx = torch.cat([source for (source, _) in indices])
return batch_idx, source_idx
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss._get_target_permutation_idx
def _get_target_permutation_idx(self, indices):
# permute targets following indices
batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
target_idx = torch.cat([target for (_, target) in indices])
return batch_idx, target_idx
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss.get_loss
def get_loss(self, loss, outputs, targets, indices, num_boxes):
loss_map = {
"labels": self.loss_labels,
"cardinality": self.loss_cardinality,
"boxes": self.loss_boxes,
}
if loss not in loss_map:
raise ValueError(f"Loss {loss} not supported")
return loss_map[loss](outputs, targets, indices, num_boxes)
def forward(self, outputs, targets):
"""
This performs the loss computation.
Args:
outputs (`dict`, *optional*):
Dictionary of tensors, see the output specification of the model for the format.
targets (`List[dict]`, *optional*):
List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
losses applied, see each loss' doc.
"""
outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs"}
# Retrieve the matching between the outputs of the last layer and the targets
if self.assign_second_stage:
indices = self.stg2_assigner(outputs_without_aux, targets)
else:
indices = self.matcher(outputs_without_aux, targets)
# Compute the average number of target boxes accross all nodes, for normalization purposes
num_boxes = sum(len(t["class_labels"]) for t in targets)
num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
# (Niels): comment out function below, distributed training to be added
# if is_dist_avail_and_initialized():
# torch.distributed.all_reduce(num_boxes)
# (Niels) in original implementation, num_boxes is divided by get_world_size()
num_boxes = torch.clamp(num_boxes, min=1).item()
# Compute all the requested losses
losses = {}
for loss in self.losses:
losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
# In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
if "auxiliary_outputs" in outputs:
for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
if not self.assign_second_stage:
indices = self.matcher(auxiliary_outputs, targets)
for loss in self.losses:
l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
losses.update(l_dict)
if "enc_outputs" in outputs:
enc_outputs = outputs["enc_outputs"]
bin_targets = copy.deepcopy(targets)
for bt in bin_targets:
bt["labels"] = torch.zeros_like(bt["labels"])
if self.assign_first_stage:
indices = self.stg1_assigner(enc_outputs, bin_targets)
else:
indices = self.matcher(enc_outputs, bin_targets)
for loss in self.losses:
kwargs = {}
if loss == "labels":
# Logging is enabled only for the last layer
kwargs["log"] = False
l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes, **kwargs)
l_dict = {k + "_enc": v for k, v in l_dict.items()}
losses.update(l_dict)
return losses
# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead
class DetaMLPPredictionHead(nn.Module):
"""
Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
height and width of a bounding box w.r.t. an image.
Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
for i, layer in enumerate(self.layers):
x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->Deta
class DetaHungarianMatcher(nn.Module):
"""
This class computes an assignment between the targets and the predictions of the network.
For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
un-matched (and thus treated as non-objects).
Args:
class_cost:
The relative weight of the classification error in the matching cost.
bbox_cost:
The relative weight of the L1 error of the bounding box coordinates in the matching cost.
giou_cost:
The relative weight of the giou loss of the bounding box in the matching cost.
"""
def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
super().__init__()
requires_backends(self, ["scipy"])
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
raise ValueError("All costs of the Matcher can't be 0")
@torch.no_grad()
def forward(self, outputs, targets):
"""
Args:
outputs (`dict`):
A dictionary that contains at least these entries:
* "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
* "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
targets (`List[dict]`):
A list of targets (len(targets) = batch_size), where each target is a dict containing:
* "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
ground-truth
objects in the target) containing the class labels
* "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
Returns:
`List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
batch_size, num_queries = outputs["logits"].shape[:2]
# We flatten to compute the cost matrices in a batch
out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes]
out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4]
# Also concat the target labels and boxes
target_ids = torch.cat([v["class_labels"] for v in targets])
target_bbox = torch.cat([v["boxes"] for v in targets])
# Compute the classification cost.
alpha = 0.25
gamma = 2.0
neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids]
# Compute the L1 cost between boxes
bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
# Compute the giou cost between boxes
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
# Final cost matrix
cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
sizes = [len(v["boxes"]) for v in targets]
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
# Copied from transformers.models.detr.modeling_detr._upcast
def _upcast(t: Tensor) -> Tensor:
# Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
if t.is_floating_point():
return t if t.dtype in (torch.float32, torch.float64) else t.float()
else:
return t if t.dtype in (torch.int32, torch.int64) else t.int()
# Copied from transformers.models.detr.modeling_detr.box_area
def box_area(boxes: Tensor) -> Tensor:
"""
Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
Args:
boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
< x2` and `0 <= y1 < y2`.
Returns:
`torch.FloatTensor`: a tensor containing the area for each box.
"""
boxes = _upcast(boxes)
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
# Copied from transformers.models.detr.modeling_detr.box_iou
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2]
inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
def generalized_box_iou(boxes1, boxes2):
"""
Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.
Returns:
`torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
"""
# degenerate boxes gives inf / nan results
# so do an early check
if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
iou, union = box_iou(boxes1, boxes2)
top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2]
area = width_height[:, :, 0] * width_height[:, :, 1]
return iou - (area - union) / area
# from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/layers/wrappers.py#L100
def nonzero_tuple(x):
"""
A 'as_tuple=True' version of torch.nonzero to support torchscript. because of
https://github.com/pytorch/pytorch/issues/38718
"""
if torch.jit.is_scripting():
if x.dim() == 0:
return x.unsqueeze(0).nonzero().unbind(1)
return x.nonzero().unbind(1)
else:
return x.nonzero(as_tuple=True)
# from https://github.com/facebookresearch/detectron2/blob/9921a2caa585d4fa66c4b534b6fab6e74d89b582/detectron2/modeling/matcher.py#L9
class DetaMatcher(object):
"""
This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will
have exactly zero or one matches; each ground-truth element may be matched to zero or more predicted elements.
The matching is determined by the MxN match_quality_matrix, that characterizes how well each (ground-truth,
prediction)-pair match each other. For example, if the elements are boxes, this matrix may contain box
intersection-over-union overlap values.
The matcher returns (a) a vector of length N containing the index of the ground-truth element m in [0, M) that
matches to prediction n in [0, N). (b) a vector of length N containing the labels for each prediction.
"""
def __init__(self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False):
"""
Args:
thresholds (`list[float]`):
A list of thresholds used to stratify predictions into levels.
labels (`list[int`):
A list of values to label predictions belonging at each level. A label can be one of {-1, 0, 1}
signifying {ignore, negative class, positive class}, respectively.
allow_low_quality_matches (`bool`, *optional*, defaults to `False`):
If `True`, produce additional matches for predictions with maximum match quality lower than
high_threshold. See `set_low_quality_matches_` for more details.
For example,
thresholds = [0.3, 0.5] labels = [0, -1, 1] All predictions with iou < 0.3 will be marked with 0 and
thus will be considered as false positives while training. All predictions with 0.3 <= iou < 0.5 will
be marked with -1 and thus will be ignored. All predictions with 0.5 <= iou will be marked with 1 and
thus will be considered as true positives.
"""
# Add -inf and +inf to first and last position in thresholds
thresholds = thresholds[:]
if thresholds[0] < 0:
raise ValueError("Thresholds should be positive")
thresholds.insert(0, -float("inf"))
thresholds.append(float("inf"))
# Currently torchscript does not support all + generator
if not all([low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])]):
raise ValueError("Thresholds should be sorted.")
if not all([l in [-1, 0, 1] for l in labels]):
raise ValueError("All labels should be either -1, 0 or 1")
if len(labels) != len(thresholds) - 1:
raise ValueError("Number of labels should be equal to number of thresholds - 1")
self.thresholds = thresholds
self.labels = labels
self.allow_low_quality_matches = allow_low_quality_matches
def __call__(self, match_quality_matrix):
"""
Args:
match_quality_matrix (Tensor[float]): an MxN tensor, containing the
pairwise quality between M ground-truth elements and N predicted elements. All elements must be >= 0
(due to the us of `torch.nonzero` for selecting indices in `set_low_quality_matches_`).
Returns:
matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
ground-truth index in [0, M)
match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
whether a prediction is a true or false positive or ignored
"""
assert match_quality_matrix.dim() == 2
if match_quality_matrix.numel() == 0:
default_matches = match_quality_matrix.new_full((match_quality_matrix.size(1),), 0, dtype=torch.int64)
# When no gt boxes exist, we define IOU = 0 and therefore set labels
# to `self.labels[0]`, which usually defaults to background class 0
# To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
default_match_labels = match_quality_matrix.new_full(
(match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
)
return default_matches, default_match_labels
assert torch.all(match_quality_matrix >= 0)
# match_quality_matrix is M (gt) x N (predicted)
# Max over gt elements (dim 0) to find best gt candidate for each prediction
matched_vals, matches = match_quality_matrix.max(dim=0)
match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
for l, low, high in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
low_high = (matched_vals >= low) & (matched_vals < high)
match_labels[low_high] = l
if self.allow_low_quality_matches:
self.set_low_quality_matches_(match_labels, match_quality_matrix)
return matches, match_labels
def set_low_quality_matches_(self, match_labels, match_quality_matrix):
"""
Produce additional matches for predictions that have only low-quality matches. Specifically, for each
ground-truth G find the set of predictions that have maximum overlap with it (including ties); for each
prediction in that set, if it is unmatched, then match it to the ground-truth G.
This function implements the RPN assignment case (i) in Sec. 3.1.2 of :paper:`Faster R-CNN`.
"""
# For each gt, find the prediction with which it has highest quality
highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
# Find the highest quality match available, even if it is low, including ties.
# Note that the matches qualities must be positive due to the use of
# `torch.nonzero`.
_, pred_inds_with_highest_quality = nonzero_tuple(match_quality_matrix == highest_quality_foreach_gt[:, None])
# If an anchor was labeled positive only due to a low-quality match
# with gt_A, but it has larger overlap with gt_B, it's matched index will still be gt_B.
# This follows the implementation in Detectron, and is found to have no significant impact.
match_labels[pred_inds_with_highest_quality] = 1
# from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/sampling.py#L9
def subsample_labels(labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int):
"""
Return `num_samples` (or fewer, if not enough found) random samples from `labels` which is a mixture of positives &
negatives. It will try to return as many positives as possible without exceeding `positive_fraction * num_samples`,
and then try to fill the remaining slots with negatives.
Args:
labels (Tensor): (N, ) label vector with values:
* -1: ignore
* bg_label: background ("negative") class
* otherwise: one or more foreground ("positive") classes
num_samples (int): The total number of labels with value >= 0 to return.
Values that are not sampled will be filled with -1 (ignore).
positive_fraction (float): The number of subsampled labels with values > 0
is `min(num_positives, int(positive_fraction * num_samples))`. The number of negatives sampled is
`min(num_negatives, num_samples - num_positives_sampled)`. In order words, if there are not enough
positives, the sample is filled with negatives. If there are also not enough negatives, then as many
elements are sampled as is possible.
bg_label (int): label index of background ("negative") class.
Returns:
pos_idx, neg_idx (Tensor):
1D vector of indices. The total length of both is `num_samples` or fewer.
"""
positive = nonzero_tuple((labels != -1) & (labels != bg_label))[0]
negative = nonzero_tuple(labels == bg_label)[0]
num_pos = int(num_samples * positive_fraction)
# protect against not enough positive examples
num_pos = min(positive.numel(), num_pos)
num_neg = num_samples - num_pos
# protect against not enough negative examples
num_neg = min(negative.numel(), num_neg)
# randomly select positive and negative examples
perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
pos_idx = positive[perm1]
neg_idx = negative[perm2]
return pos_idx, neg_idx
def sample_topk_per_gt(pr_inds, gt_inds, iou, k):
if len(gt_inds) == 0:
return pr_inds, gt_inds
# find topk matches for each gt
gt_inds2, counts = gt_inds.unique(return_counts=True)
scores, pr_inds2 = iou[gt_inds2].topk(k, dim=1)
gt_inds2 = gt_inds2[:, None].repeat(1, k)
# filter to as many matches that gt has
pr_inds3 = torch.cat([pr[:c] for c, pr in zip(counts, pr_inds2)])
gt_inds3 = torch.cat([gt[:c] for c, gt in zip(counts, gt_inds2)])
return pr_inds3, gt_inds3
# modified from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/roi_heads/roi_heads.py#L123
class DetaStage2Assigner(nn.Module):
def __init__(self, num_queries, max_k=4):
super().__init__()
self.positive_fraction = 0.25
self.bg_label = 400 # number > 91 to filter out later
self.batch_size_per_image = num_queries
self.proposal_matcher = DetaMatcher(thresholds=[0.6], labels=[0, 1], allow_low_quality_matches=True)
self.k = max_k
def _sample_proposals(self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor):
"""
Based on the matching between N proposals and M groundtruth, sample the proposals and set their classification
labels.
Args:
matched_idxs (Tensor): a vector of length N, each is the best-matched
gt index in [0, M) for each proposal.
matched_labels (Tensor): a vector of length N, the matcher's label
(one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
gt_classes (Tensor): a vector of length M.
Returns:
Tensor: a vector of indices of sampled proposals. Each is in [0, N). Tensor: a vector of the same length,
the classification label for
each sampled proposal. Each sample is labeled as either a category in [0, num_classes) or the
background (num_classes).
"""
has_gt = gt_classes.numel() > 0
# Get the corresponding GT for each proposal
if has_gt:
gt_classes = gt_classes[matched_idxs]
# Label unmatched proposals (0 label from matcher) as background (label=num_classes)
gt_classes[matched_labels == 0] = self.bg_label
# Label ignore proposals (-1 label)
gt_classes[matched_labels == -1] = -1
else:
gt_classes = torch.zeros_like(matched_idxs) + self.bg_label
sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
gt_classes, self.batch_size_per_image, self.positive_fraction, self.bg_label
)
sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
return sampled_idxs, gt_classes[sampled_idxs]
def forward(self, outputs, targets, return_cost_matrix=False):
# COCO categories are from 1 to 90. They set num_classes=91 and apply sigmoid.
bs = len(targets)
indices = []
ious = []
for b in range(bs):
iou, _ = box_iou(
center_to_corners_format(targets[b]["boxes"]),
center_to_corners_format(outputs["init_reference"][b].detach()),
)
matched_idxs, matched_labels = self.proposal_matcher(
iou
) # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.6, 0 ow]
(
sampled_idxs,
sampled_gt_classes,
) = self._sample_proposals( # list of sampled proposal_ids, sampled_id -> [0, num_classes)+[bg_label]
matched_idxs, matched_labels, targets[b]["labels"]
)
pos_pr_inds = sampled_idxs[sampled_gt_classes != self.bg_label]
pos_gt_inds = matched_idxs[pos_pr_inds]
pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
indices.append((pos_pr_inds, pos_gt_inds))
ious.append(iou)
if return_cost_matrix:
return indices, ious
return indices
def postprocess_indices(self, pr_inds, gt_inds, iou):
return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)
# modified from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/proposal_generator/rpn.py#L181
class DetaStage1Assigner(nn.Module):
def __init__(self, t_low=0.3, t_high=0.7, max_k=4):
super().__init__()
self.positive_fraction = 0.5
self.batch_size_per_image = 256
self.k = max_k
self.t_low = t_low
self.t_high = t_high
self.anchor_matcher = DetaMatcher(
thresholds=[t_low, t_high], labels=[0, -1, 1], allow_low_quality_matches=True
)
def _subsample_labels(self, label):
"""
Randomly sample a subset of positive and negative examples, and overwrite the label vector to the ignore value
(-1) for all elements that are not included in the sample.
Args:
labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned.
"""
pos_idx, neg_idx = subsample_labels(label, self.batch_size_per_image, self.positive_fraction, 0)
# Fill with the ignore label (-1), then set positive and negative labels
label.fill_(-1)
label.scatter_(0, pos_idx, 1)
label.scatter_(0, neg_idx, 0)
return label
def forward(self, outputs, targets):
bs = len(targets)
indices = []
for b in range(bs):
anchors = outputs["anchors"][b]
if len(targets[b]["boxes"]) == 0:
indices.append(
(
torch.tensor([], dtype=torch.long, device=anchors.device),
torch.tensor([], dtype=torch.long, device=anchors.device),
)
)
continue
iou, _ = box_iou(
center_to_corners_format(targets[b]["boxes"]),
center_to_corners_format(anchors),
)
matched_idxs, matched_labels = self.anchor_matcher(
iou
) # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.7, 0 if iou < 0.3, -1 ow]
matched_labels = self._subsample_labels(matched_labels)
all_pr_inds = torch.arange(len(anchors))
pos_pr_inds = all_pr_inds[matched_labels == 1]
pos_gt_inds = matched_idxs[pos_pr_inds]
pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
pos_pr_inds, pos_gt_inds = pos_pr_inds.to(anchors.device), pos_gt_inds.to(anchors.device)
indices.append((pos_pr_inds, pos_gt_inds))
return indices
def postprocess_indices(self, pr_inds, gt_inds, iou):
return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)
...@@ -805,36 +805,42 @@ class Mask2FormerLoss(nn.Module): ...@@ -805,36 +805,42 @@ class Mask2FormerLoss(nn.Module):
return num_masks_pt return num_masks_pt
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
def multi_scale_deformable_attention( def multi_scale_deformable_attention(
value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
): ) -> Tensor:
batch_size, _, num_attn_head, hidden_dim = value.shape batch_size, _, num_heads, hidden_dim = value.shape
_, num_queries, num_attn_head, num_levels, num_points, _ = sampling_locations.shape _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1) value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1)
sampling_grids = 2 * sampling_locations - 1 sampling_grids = 2 * sampling_locations - 1
sampling_value_list = [] sampling_value_list = []
for level_id, (height, width) in enumerate(value_spatial_shapes):
for idx, (height, width) in enumerate(value_spatial_shapes): # batch_size, height*width, num_heads, hidden_dim
# batch_size, height*width, num_attn_head, hidden_dim -> batch_size*num_attn_head, hidden_dim, height, width # -> batch_size, height*width, num_heads*hidden_dim
# -> batch_size, num_heads*hidden_dim, height*width
# -> batch_size*num_heads, hidden_dim, height, width
value_l_ = ( value_l_ = (
value_list[idx].flatten(2).transpose(1, 2).reshape(batch_size * num_attn_head, hidden_dim, height, width) value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width)
) )
# (batch_size, num_queries, num_attn_head, num_points) -> (batch_size * num_attn_head, num_queries, num_points, 2) # batch_size, num_queries, num_heads, num_points, 2
sampling_grid_l_ = sampling_grids[:, :, :, idx].transpose(1, 2).flatten(0, 1) # -> batch_size, num_heads, num_queries, num_points, 2
# batch_size*num_attn_head, D_, num_queries, num_points # -> batch_size*num_heads, num_queries, num_points, 2
sampling_value_l_ = torch.nn.functional.grid_sample( sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
# batch_size*num_heads, hidden_dim, num_queries, num_points
sampling_value_l_ = nn.functional.grid_sample(
value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
) )
sampling_value_list.append(sampling_value_l_) sampling_value_list.append(sampling_value_l_)
# (batch_size, num_queries, num_heads, num_levels, num_points)
# (batch_size, num_queries, num_attn_head, num_levels, num_points) -> (batch_size, num_attn_head, 1, num_queries, num_levels*num_points) # -> (batch_size, num_heads, num_queries, num_levels, num_points)
# -> (batch_size, num_heads, 1, num_queries, num_levels*num_points)
attention_weights = attention_weights.transpose(1, 2).reshape( attention_weights = attention_weights.transpose(1, 2).reshape(
batch_size * num_attn_head, 1, num_queries, num_levels * num_points batch_size * num_heads, 1, num_queries, num_levels * num_points
) )
output = ( output = (
(torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
.sum(-1) .sum(-1)
.view(batch_size, num_attn_head * hidden_dim, num_queries) .view(batch_size, num_heads * hidden_dim, num_queries)
) )
return output.transpose(1, 2).contiguous() return output.transpose(1, 2).contiguous()
......
...@@ -61,7 +61,8 @@ def _get_clones(module, N): ...@@ -61,7 +61,8 @@ def _get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
def multiscale_deform_attn_core_pytorch( # Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
def multi_scale_deformable_attention(
value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
) -> Tensor: ) -> Tensor:
batch_size, _, num_heads, hidden_dim = value.shape batch_size, _, num_heads, hidden_dim = value.shape
...@@ -1044,8 +1045,8 @@ class OneFormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module): ...@@ -1044,8 +1045,8 @@ class OneFormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
) )
else: else:
raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
# CPU # PyTorch implementation
output = multiscale_deform_attn_core_pytorch(value, spatial_shapes, sampling_locations, attention_weights) output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
output = self.output_proj(output) output = self.output_proj(output)
return output, attention_weights return output, attention_weights
......
...@@ -90,6 +90,7 @@ from .utils import ( ...@@ -90,6 +90,7 @@ from .utils import (
is_torch_tpu_available, is_torch_tpu_available,
is_torchaudio_available, is_torchaudio_available,
is_torchdynamo_available, is_torchdynamo_available,
is_torchvision_available,
is_vision_available, is_vision_available,
) )
...@@ -307,6 +308,16 @@ def require_torch(test_case): ...@@ -307,6 +308,16 @@ def require_torch(test_case):
return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case) return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case)
def require_torchvision(test_case):
"""
Decorator marking a test that requires Torchvision.
These tests are skipped when Torchvision isn't installed.
"""
return unittest.skipUnless(is_torchvision_available(), "test requires Torchvision")(test_case)
def require_torch_or_tf(test_case): def require_torch_or_tf(test_case):
""" """
Decorator marking a test that requires PyTorch or TensorFlow. Decorator marking a test that requires PyTorch or TensorFlow.
......
...@@ -162,6 +162,7 @@ from .import_utils import ( ...@@ -162,6 +162,7 @@ from .import_utils import (
is_torchaudio_available, is_torchaudio_available,
is_torchdistx_available, is_torchdistx_available,
is_torchdynamo_available, is_torchdynamo_available,
is_torchvision_available,
is_training_run_on_sagemaker, is_training_run_on_sagemaker,
is_vision_available, is_vision_available,
requires_backends, requires_backends,
......
...@@ -1991,6 +1991,30 @@ class DeiTPreTrainedModel(metaclass=DummyObject): ...@@ -1991,6 +1991,30 @@ class DeiTPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"]) requires_backends(self, ["torch"])
DETA_PRETRAINED_MODEL_ARCHIVE_LIST = None
class DetaForObjectDetection(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class DetaModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class DetaPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
DINAT_PRETRAINED_MODEL_ARCHIVE_LIST = None DINAT_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
...@@ -136,6 +136,13 @@ class DeiTImageProcessor(metaclass=DummyObject): ...@@ -136,6 +136,13 @@ class DeiTImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"]) requires_backends(self, ["vision"])
class DetaImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class DetrFeatureExtractor(metaclass=DummyObject): class DetrFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"] _backends = ["vision"]
......
...@@ -288,6 +288,10 @@ def is_torch_available(): ...@@ -288,6 +288,10 @@ def is_torch_available():
return _torch_available return _torch_available
def is_torchvision_available():
return importlib.util.find_spec("torchvision") is not None
def is_pyctcdecode_available(): def is_pyctcdecode_available():
return _pyctcdecode_available return _pyctcdecode_available
...@@ -803,6 +807,14 @@ installation page: https://pytorch.org/get-started/locally/ and follow the ones ...@@ -803,6 +807,14 @@ installation page: https://pytorch.org/get-started/locally/ and follow the ones
Please note that you may need to restart your runtime after installation. Please note that you may need to restart your runtime after installation.
""" """
# docstyle-ignore
TORCHVISION_IMPORT_ERROR = """
{0} requires the Torchvision library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore # docstyle-ignore
PYTORCH_IMPORT_ERROR_WITH_TF = """ PYTORCH_IMPORT_ERROR_WITH_TF = """
{0} requires the PyTorch library but it was not found in your environment. {0} requires the PyTorch library but it was not found in your environment.
...@@ -1009,6 +1021,7 @@ BACKENDS_MAPPING = OrderedDict( ...@@ -1009,6 +1021,7 @@ BACKENDS_MAPPING = OrderedDict(
("natten", (is_natten_available, NATTEN_IMPORT_ERROR)), ("natten", (is_natten_available, NATTEN_IMPORT_ERROR)),
("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)), ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
("torchvision", (is_torchvision_available, TORCHVISION_IMPORT_ERROR)),
("vision", (is_vision_available, VISION_IMPORT_ERROR)), ("vision", (is_vision_available, VISION_IMPORT_ERROR)),
("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)), ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
("accelerate", (is_accelerate_available, ACCELERATE_IMPORT_ERROR)), ("accelerate", (is_accelerate_available, ACCELERATE_IMPORT_ERROR)),
......
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import pathlib
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision, slow
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingSavingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import DetaImageProcessor
class DetaImageProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
do_rescale=True,
rescale_factor=1 / 255,
do_pad=True,
):
# by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_pad = do_pad
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_rescale": self.do_rescale,
"rescale_factor": self.rescale_factor,
"do_pad": self.do_pad,
}
def get_expected_values(self, image_inputs, batched=False):
"""
This function computes the expected height and width when providing images to DetaImageProcessor,
assuming do_resize is set to True with a scalar size.
"""
if not batched:
image = image_inputs[0]
if isinstance(image, Image.Image):
w, h = image.size
else:
h, w = image.shape[1], image.shape[2]
if w < h:
expected_height = int(self.size["shortest_edge"] * h / w)
expected_width = self.size["shortest_edge"]
elif w > h:
expected_height = self.size["shortest_edge"]
expected_width = int(self.size["shortest_edge"] * w / h)
else:
expected_height = self.size["shortest_edge"]
expected_width = self.size["shortest_edge"]
else:
expected_values = []
for image in image_inputs:
expected_height, expected_width = self.get_expected_values([image])
expected_values.append((expected_height, expected_width))
expected_height = max(expected_values, key=lambda item: item[0])[0]
expected_width = max(expected_values, key=lambda item: item[1])[1]
return expected_height, expected_width
@require_torch
@require_vision
class DetaImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase):
image_processing_class = DetaImageProcessor if is_vision_available() else None
def setUp(self):
self.image_processor_tester = DetaImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "size"))
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
self.assertEqual(image_processor.do_pad, True)
def test_batch_feature(self):
pass
def test_call_pil(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False)
for image in image_inputs:
self.assertIsInstance(image, Image.Image)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs)
self.assertEqual(
encoded_images.shape,
(1, self.image_processor_tester.num_channels, expected_height, expected_width),
)
# Test batched
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs, batched=True)
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
def test_call_numpy(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs)
self.assertEqual(
encoded_images.shape,
(1, self.image_processor_tester.num_channels, expected_height, expected_width),
)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs, batched=True)
self.assertEqual(
encoded_images.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs)
self.assertEqual(
encoded_images.shape,
(1, self.image_processor_tester.num_channels, expected_height, expected_width),
)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs, batched=True)
self.assertEqual(
encoded_images.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
def test_equivalence_pad_and_create_pixel_mask(self):
# Initialize image_processings
image_processing_1 = self.image_processing_class(**self.image_processor_dict)
image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False)
# create random PyTorch tensors
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
# Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors
encoded_images_with_method = image_processing_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt")
encoded_images = image_processing_2(image_inputs, return_tensors="pt")
self.assertTrue(
torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)
)
self.assertTrue(
torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4)
)
@slow
def test_call_pytorch_with_coco_detection_annotations(self):
# prepare image and target
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
target = {"image_id": 39769, "annotations": target}
# encode them
image_processing = DetaImageProcessor()
encoding = image_processing(images=image, annotations=target, return_tensors="pt")
# verify pixel values
expected_shape = torch.Size([1, 3, 800, 1066])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))
# verify area
expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438])
self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
# verify boxes
expected_boxes_shape = torch.Size([6, 4])
self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
# verify image_id
expected_image_id = torch.tensor([39769])
self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
# verify is_crowd
expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
# verify class_labels
expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
# verify orig_size
expected_orig_size = torch.tensor([480, 640])
self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
# verify size
expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
def test_call_pytorch_with_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
# encode them
image_processing = DetaImageProcessor(format="coco_panoptic")
encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
# verify pixel values
expected_shape = torch.Size([1, 3, 800, 1066])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))
# verify area
expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147])
self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
# verify boxes
expected_boxes_shape = torch.Size([6, 4])
self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
# verify image_id
expected_image_id = torch.tensor([39769])
self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
# verify is_crowd
expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
# verify class_labels
expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
# verify masks
expected_masks_sum = 822873
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
# verify orig_size
expected_orig_size = torch.tensor([480, 640])
self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
# verify size
expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch DETA model. """
import inspect
import math
import unittest
from transformers import DetaConfig, is_torch_available, is_torchvision_available, is_vision_available
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torchvision, require_vision, slow, torch_device
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
if is_torch_available():
import torch
if is_torchvision_available():
from transformers import DetaForObjectDetection, DetaModel
if is_vision_available():
from PIL import Image
from transformers import AutoImageProcessor
class DetaModelTester:
def __init__(
self,
parent,
batch_size=8,
is_training=True,
use_labels=True,
hidden_size=256,
num_hidden_layers=2,
num_attention_heads=8,
intermediate_size=4,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
num_queries=12,
num_channels=3,
image_size=196,
n_targets=8,
num_labels=91,
num_feature_levels=4,
encoder_n_points=2,
decoder_n_points=6,
two_stage=False,
):
self.parent = parent
self.batch_size = batch_size
self.is_training = is_training
self.use_labels = use_labels
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.num_queries = num_queries
self.num_channels = num_channels
self.image_size = image_size
self.n_targets = n_targets
self.num_labels = num_labels
self.num_feature_levels = num_feature_levels
self.encoder_n_points = encoder_n_points
self.decoder_n_points = decoder_n_points
self.two_stage = two_stage
# we also set the expected seq length for both encoder and decoder
self.encoder_seq_length = (
math.ceil(self.image_size / 8) ** 2
+ math.ceil(self.image_size / 16) ** 2
+ math.ceil(self.image_size / 32) ** 2
+ math.ceil(self.image_size / 64) ** 2
)
self.decoder_seq_length = self.num_queries
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device)
labels = None
if self.use_labels:
# labels is a list of Dict (each Dict being the labels for a given example in the batch)
labels = []
for i in range(self.batch_size):
target = {}
target["class_labels"] = torch.randint(
high=self.num_labels, size=(self.n_targets,), device=torch_device
)
target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device)
target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device)
labels.append(target)
config = self.get_config()
return config, pixel_values, pixel_mask, labels
def get_config(self):
return DetaConfig(
d_model=self.hidden_size,
encoder_layers=self.num_hidden_layers,
decoder_layers=self.num_hidden_layers,
encoder_attention_heads=self.num_attention_heads,
decoder_attention_heads=self.num_attention_heads,
encoder_ffn_dim=self.intermediate_size,
decoder_ffn_dim=self.intermediate_size,
dropout=self.hidden_dropout_prob,
attention_dropout=self.attention_probs_dropout_prob,
num_queries=self.num_queries,
num_labels=self.num_labels,
num_feature_levels=self.num_feature_levels,
encoder_n_points=self.encoder_n_points,
decoder_n_points=self.decoder_n_points,
two_stage=self.two_stage,
)
def prepare_config_and_inputs_for_common(self):
config, pixel_values, pixel_mask, labels = self.prepare_config_and_inputs()
inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask}
return config, inputs_dict
def create_and_check_deta_model(self, config, pixel_values, pixel_mask, labels):
model = DetaModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
result = model(pixel_values)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size))
def create_and_check_deta_object_detection_head_model(self, config, pixel_values, pixel_mask, labels):
model = DetaForObjectDetection(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
result = model(pixel_values)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels))
self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)
self.parent.assertEqual(result.loss.shape, ())
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, self.num_labels))
self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4))
@require_torchvision
class DetaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (DetaModel, DetaForObjectDetection) if is_torchvision_available() else ()
is_encoder_decoder = True
test_torchscript = False
test_pruning = False
test_head_masking = False
test_missing_keys = False
# special case for head models
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
if return_labels:
if model_class.__name__ == "DetaForObjectDetection":
labels = []
for i in range(self.model_tester.batch_size):
target = {}
target["class_labels"] = torch.ones(
size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long
)
target["boxes"] = torch.ones(
self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float
)
target["masks"] = torch.ones(
self.model_tester.n_targets,
self.model_tester.image_size,
self.model_tester.image_size,
device=torch_device,
dtype=torch.float,
)
labels.append(target)
inputs_dict["labels"] = labels
return inputs_dict
def setUp(self):
self.model_tester = DetaModelTester(self)
self.config_tester = ConfigTester(self, config_class=DetaConfig, has_text_modality=False)
def test_config(self):
# we don't test common_properties and arguments_init as these don't apply for DETA
self.config_tester.create_and_test_config_to_json_string()
self.config_tester.create_and_test_config_to_json_file()
self.config_tester.create_and_test_config_from_and_save_pretrained()
self.config_tester.create_and_test_config_with_num_labels()
self.config_tester.check_config_can_be_init_without_params()
def test_deta_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_deta_model(*config_and_inputs)
def test_deta_object_detection_head_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_deta_object_detection_head_model(*config_and_inputs)
@unittest.skip(reason="DETA does not use inputs_embeds")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="DETA does not have a get_input_embeddings method")
def test_model_common_attributes(self):
pass
@unittest.skip(reason="DETA is not a generative model")
def test_generate_without_input_ids(self):
pass
@unittest.skip(reason="DETA does not use token embeddings")
def test_resize_tokens_embeddings(self):
pass
@unittest.skip(reason="Feed forward chunking is not implemented")
def test_feed_forward_chunking(self):
pass
def test_attention_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
for model_class in self.all_model_classes:
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.return_dict = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.encoder_attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
# check that output_attentions also work using config
del inputs_dict["output_attentions"]
config.output_attentions = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.encoder_attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(attentions[0].shape[-3:]),
[
self.model_tester.num_attention_heads,
self.model_tester.num_feature_levels,
self.model_tester.encoder_n_points,
],
)
out_len = len(outputs)
correct_outlen = 8
# loss is at first position
if "labels" in inputs_dict:
correct_outlen += 1 # loss is added to beginning
# Object Detection model returns pred_logits and pred_boxes
if model_class.__name__ == "DetaForObjectDetection":
correct_outlen += 2
self.assertEqual(out_len, correct_outlen)
# decoder attentions
decoder_attentions = outputs.decoder_attentions
self.assertIsInstance(decoder_attentions, (list, tuple))
self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(decoder_attentions[0].shape[-3:]),
[self.model_tester.num_attention_heads, self.model_tester.num_queries, self.model_tester.num_queries],
)
# cross attentions
cross_attentions = outputs.cross_attentions
self.assertIsInstance(cross_attentions, (list, tuple))
self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(cross_attentions[0].shape[-3:]),
[
self.model_tester.num_attention_heads,
self.model_tester.num_feature_levels,
self.model_tester.decoder_n_points,
],
)
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
if hasattr(self.model_tester, "num_hidden_states_types"):
added_hidden_states = self.model_tester.num_hidden_states_types
elif self.is_encoder_decoder:
added_hidden_states = 2
else:
added_hidden_states = 1
self.assertEqual(out_len + added_hidden_states, len(outputs))
self_attentions = outputs.encoder_attentions
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(self_attentions[0].shape[-3:]),
[
self.model_tester.num_attention_heads,
self.model_tester.num_feature_levels,
self.model_tester.encoder_n_points,
],
)
# removed retain_grad and grad on decoder_hidden_states, as queries don't require grad
def test_retain_grad_hidden_states_attentions(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.output_hidden_states = True
config.output_attentions = True
# no need to test all models as different heads yield the same functionality
model_class = self.all_model_classes[0]
model = model_class(config)
model.to(torch_device)
inputs = self._prepare_for_class(inputs_dict, model_class)
outputs = model(**inputs)
# we take the second output since last_hidden_state is the second item
output = outputs[1]
encoder_hidden_states = outputs.encoder_hidden_states[0]
encoder_attentions = outputs.encoder_attentions[0]
encoder_hidden_states.retain_grad()
encoder_attentions.retain_grad()
decoder_attentions = outputs.decoder_attentions[0]
decoder_attentions.retain_grad()
cross_attentions = outputs.cross_attentions[0]
cross_attentions.retain_grad()
output.flatten()[0].backward(retain_graph=True)
self.assertIsNotNone(encoder_hidden_states.grad)
self.assertIsNotNone(encoder_attentions.grad)
self.assertIsNotNone(decoder_attentions.grad)
self.assertIsNotNone(cross_attentions.grad)
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
if model.config.is_encoder_decoder:
expected_arg_names = ["pixel_values", "pixel_mask"]
expected_arg_names.extend(
["head_mask", "decoder_head_mask", "encoder_outputs"]
if "head_mask" and "decoder_head_mask" in arg_names
else []
)
self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
else:
expected_arg_names = ["pixel_values", "pixel_mask"]
self.assertListEqual(arg_names[:1], expected_arg_names)
@unittest.skip(reason="Model doesn't use tied weights")
def test_tied_model_weights_key_ignore(self):
pass
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
if param.requires_grad:
if (
"level_embed" in name
or "sampling_offsets.bias" in name
or "value_proj" in name
or "output_proj" in name
or "reference_points" in name
):
continue
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
TOLERANCE = 1e-4
# We will verify our results on an image of cute cats
def prepare_img():
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
return image
@require_torchvision
@require_vision
@slow
class DetaModelIntegrationTests(unittest.TestCase):
@cached_property
def default_image_processor(self):
return AutoImageProcessor.from_pretrained("jozhang97/deta-resnet-50") if is_vision_available() else None
def test_inference_object_detection_head(self):
model = DetaForObjectDetection.from_pretrained("jozhang97/deta-resnet-50").to(torch_device)
image_processor = self.default_image_processor
image = prepare_img()
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
with torch.no_grad():
outputs = model(**inputs)
expected_shape_logits = torch.Size((1, 300, model.config.num_labels))
self.assertEqual(outputs.logits.shape, expected_shape_logits)
expected_logits = torch.tensor(
[[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]]
).to(torch_device)
expected_boxes = torch.tensor(
[[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]]
).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4))
expected_shape_boxes = torch.Size((1, 300, 4))
self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4))
# verify postprocessing
results = image_processor.post_process_object_detection(
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
expected_scores = torch.tensor([0.6392, 0.6276, 0.5546, 0.5260, 0.4706], device=torch_device)
expected_labels = [75, 17, 17, 75, 63]
expected_slice_boxes = torch.tensor([40.5866, 73.2107, 176.1421, 117.1751], device=torch_device)
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4))
self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes))
def test_inference_object_detection_head_swin_backbone(self):
model = DetaForObjectDetection.from_pretrained("jozhang97/deta-swin-large").to(torch_device)
image_processor = self.default_image_processor
image = prepare_img()
inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
with torch.no_grad():
outputs = model(**inputs)
expected_shape_logits = torch.Size((1, 300, model.config.num_labels))
self.assertEqual(outputs.logits.shape, expected_shape_logits)
expected_logits = torch.tensor(
[[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]]
).to(torch_device)
expected_boxes = torch.tensor(
[[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]]
).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-4))
expected_shape_boxes = torch.Size((1, 300, 4))
self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4))
# verify postprocessing
results = image_processor.post_process_object_detection(
outputs, threshold=0.3, target_sizes=[image.size[::-1]]
)[0]
expected_scores = torch.tensor([0.6831, 0.6826, 0.5684, 0.5464, 0.4392], device=torch_device)
expected_labels = [17, 17, 75, 75, 63]
expected_slice_boxes = torch.tensor([345.8478, 23.6754, 639.8562, 372.8265], device=torch_device)
self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-4))
self.assertSequenceEqual(results["labels"].tolist(), expected_labels)
self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes))
...@@ -53,6 +53,8 @@ PRIVATE_MODELS = [ ...@@ -53,6 +53,8 @@ PRIVATE_MODELS = [
# Being in this list is an exception and should **not** be the rule. # Being in this list is an exception and should **not** be the rule.
IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
# models to ignore for not tested # models to ignore for not tested
"DetaEncoder", # Building part of bigger (tested) model.
"DetaDecoder", # Building part of bigger (tested) model.
"GraphormerEncoder", # Building part of bigger (tested) model. "GraphormerEncoder", # Building part of bigger (tested) model.
"GraphormerDecoderHead", # Building part of bigger (tested) model. "GraphormerDecoderHead", # Building part of bigger (tested) model.
"CLIPSegDecoder", # Building part of bigger (tested) model. "CLIPSegDecoder", # Building part of bigger (tested) model.
......
...@@ -67,6 +67,8 @@ src/transformers/models/deformable_detr/modeling_deformable_detr.py ...@@ -67,6 +67,8 @@ src/transformers/models/deformable_detr/modeling_deformable_detr.py
src/transformers/models/deit/configuration_deit.py src/transformers/models/deit/configuration_deit.py
src/transformers/models/deit/modeling_deit.py src/transformers/models/deit/modeling_deit.py
src/transformers/models/deit/modeling_tf_deit.py src/transformers/models/deit/modeling_tf_deit.py
src/transformers/models/deta/configuration_deta.py
src/transformers/models/deta/modeling_deta.py
src/transformers/models/detr/configuration_detr.py src/transformers/models/detr/configuration_detr.py
src/transformers/models/detr/modeling_detr.py src/transformers/models/detr/modeling_detr.py
src/transformers/models/dinat/configuration_dinat.py src/transformers/models/dinat/configuration_dinat.py
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment