Unverified Commit 5451f889 authored by NielsRogge's avatar NielsRogge Committed by GitHub
Browse files

Add DETA (#20983)

* First draft

* Add initial draft of conversion script

* Convert all weights

* Fix config

* Add image processor

* Fix DetaImageProcessor

* Run make fix copies

* Remove timm dependency

* Fix dummy objects

* Improve loss function

* Remove conv_encoder attribute

* Update conversion scripts

* Improve postprocessing + docs

* Fix copied from statements

* Add tests

* Improve postprocessing

* Improve postprocessing

* Update READMEs

* More improvements

* Fix rebase

* Add is_torchvision_available

* Add torchvision dependency

* Fix typo and README

* Fix bug

* Add copied from

* Fix style

* Apply suggestions

* Fix thanks to @ydshieh

* Fix another dependency check

* Simplify image processor

* Add scipy

* Improve code

* Add threshold argument

* Fix bug

* Set default threshold

* Improve integration test

* Add another integration test

* Update setup.py

* Address review

* Improve deformable attention function

* Improve copied from

* Use relative imports

* Address review

* Replace assertions

* Address review

* Update dummies

* Remove dummies

* Address comments, update READMEs

* Remove custom kernel code

* Add image processor tests

* Add requires_backends

* Add minor comment

* Update scripts

* Update organization name

* Fix defaults, add doc tests

* Add id2label for object 365

* Fix tests

* Update task guide
parent 98d88b23
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_deta": ["DETA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetaConfig"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_deta"] = ["DetaImageProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_deta"] = [
"DETA_PRETRAINED_MODEL_ARCHIVE_LIST",
"DetaForObjectDetection",
"DetaModel",
"DetaPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_deta import DETA_PRETRAINED_CONFIG_ARCHIVE_MAP, DetaConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_deta import DetaImageProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_deta import (
DETA_PRETRAINED_MODEL_ARCHIVE_LIST,
DetaForObjectDetection,
DetaModel,
DetaPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
# coding=utf-8
# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" DETA model configuration"""
import copy
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
DETA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"ut/deta": "https://huggingface.co/ut/deta/resolve/main/config.json",
}
class DetaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DetaModel`]. It is used to instantiate a DETA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DETA
[SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
The configuration of the backbone model.
num_queries (`int`, *optional*, defaults to 900):
Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can
detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead.
d_model (`int`, *optional*, defaults to 256):
Dimension of the layers.
encoder_layers (`int`, *optional*, defaults to 6):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 6):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 2048):
Dimension of the "intermediate" (often named feed-forward) layer in decoder.
activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
init_xavier_std (`float`, *optional*, defaults to 1):
The scaling factor used for the Xavier initialization gain in the HM Attention map module.
encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
auxiliary_loss (`bool`, *optional*, defaults to `False`):
Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
position_embedding_type (`str`, *optional*, defaults to `"sine"`):
Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
class_cost (`float`, *optional*, defaults to 1):
Relative weight of the classification error in the Hungarian matching cost.
bbox_cost (`float`, *optional*, defaults to 5):
Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
giou_cost (`float`, *optional*, defaults to 2):
Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
mask_loss_coefficient (`float`, *optional*, defaults to 1):
Relative weight of the Focal loss in the panoptic segmentation loss.
dice_loss_coefficient (`float`, *optional*, defaults to 1):
Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
bbox_loss_coefficient (`float`, *optional*, defaults to 5):
Relative weight of the L1 bounding box loss in the object detection loss.
giou_loss_coefficient (`float`, *optional*, defaults to 2):
Relative weight of the generalized IoU loss in the object detection loss.
eos_coefficient (`float`, *optional*, defaults to 0.1):
Relative classification weight of the 'no-object' class in the object detection loss.
num_feature_levels (`int`, *optional*, defaults to 5):
The number of input feature levels.
encoder_n_points (`int`, *optional*, defaults to 4):
The number of sampled keys in each feature level for each attention head in the encoder.
decoder_n_points (`int`, *optional*, defaults to 4):
The number of sampled keys in each feature level for each attention head in the decoder.
two_stage (`bool`, *optional*, defaults to `True`):
Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of
DETA, which are further fed into the decoder for iterative bounding box refinement.
two_stage_num_proposals (`int`, *optional*, defaults to 300):
The number of region proposals to be generated, in case `two_stage` is set to `True`.
with_box_refine (`bool`, *optional*, defaults to `True`):
Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes
based on the predictions from the previous layer.
focal_alpha (`float`, *optional*, defaults to 0.25):
Alpha parameter in the focal loss.
Examples:
```python
>>> from transformers import DetaConfig, DetaModel
>>> # Initializing a DETA SenseTime/deformable-detr style configuration
>>> configuration = DetaConfig()
>>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration
>>> model = DetaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "deta"
attribute_map = {
"hidden_size": "d_model",
"num_attention_heads": "encoder_attention_heads",
}
def __init__(
self,
backbone_config=None,
num_queries=900,
max_position_embeddings=2048,
encoder_layers=6,
encoder_ffn_dim=2048,
encoder_attention_heads=8,
decoder_layers=6,
decoder_ffn_dim=1024,
decoder_attention_heads=8,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
is_encoder_decoder=True,
activation_function="relu",
d_model=256,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
init_xavier_std=1.0,
return_intermediate=True,
auxiliary_loss=False,
position_embedding_type="sine",
num_feature_levels=5,
encoder_n_points=4,
decoder_n_points=4,
two_stage=True,
two_stage_num_proposals=300,
with_box_refine=True,
assign_first_stage=True,
class_cost=1,
bbox_cost=5,
giou_cost=2,
mask_loss_coefficient=1,
dice_loss_coefficient=1,
bbox_loss_coefficient=5,
giou_loss_coefficient=2,
eos_coefficient=0.1,
focal_alpha=0.25,
**kwargs
):
if backbone_config is None:
logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"])
else:
if isinstance(backbone_config, dict):
backbone_model_type = backbone_config.pop("model_type")
config_class = CONFIG_MAPPING[backbone_model_type]
backbone_config = config_class.from_dict(backbone_config)
self.backbone_config = backbone_config
self.num_queries = num_queries
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.init_xavier_std = init_xavier_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.auxiliary_loss = auxiliary_loss
self.position_embedding_type = position_embedding_type
# deformable attributes
self.num_feature_levels = num_feature_levels
self.encoder_n_points = encoder_n_points
self.decoder_n_points = decoder_n_points
self.two_stage = two_stage
self.two_stage_num_proposals = two_stage_num_proposals
self.with_box_refine = with_box_refine
self.assign_first_stage = assign_first_stage
if two_stage is True and with_box_refine is False:
raise ValueError("If two_stage is True, with_box_refine must be True.")
# Hungarian matcher
self.class_cost = class_cost
self.bbox_cost = bbox_cost
self.giou_cost = giou_cost
# Loss coefficients
self.mask_loss_coefficient = mask_loss_coefficient
self.dice_loss_coefficient = dice_loss_coefficient
self.bbox_loss_coefficient = bbox_loss_coefficient
self.giou_loss_coefficient = giou_loss_coefficient
self.eos_coefficient = eos_coefficient
self.focal_alpha = focal_alpha
super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads
@property
def hidden_size(self) -> int:
return self.d_model
def to_dict(self):
"""
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns:
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
"""
output = copy.deepcopy(self.__dict__)
output["backbone_config"] = self.backbone_config.to_dict()
output["model_type"] = self.__class__.model_type
return output
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert DETA checkpoints from the original repository.
URL: https://github.com/jozhang97/DETA/tree/master"""
import argparse
import json
from pathlib import Path
import torch
from PIL import Image
import requests
from huggingface_hub import cached_download, hf_hub_download, hf_hub_url
from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_deta_config():
config = DetaConfig(
num_queries=900,
encoder_ffn_dim=2048,
decoder_ffn_dim=2048,
num_feature_levels=5,
assign_first_stage=True,
with_box_refine=True,
two_stage=True,
)
# set labels
config.num_labels = 91
repo_id = "huggingface/label-files"
filename = "coco-detection-id2label.json"
id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
# here we list all keys to be renamed (original name on the left, our name on the right)
def create_rename_keys(config):
rename_keys = []
# stem
# fmt: off
rename_keys.append(("backbone.0.body.conv1.weight", "model.backbone.model.embedder.embedder.convolution.weight"))
rename_keys.append(("backbone.0.body.bn1.weight", "model.backbone.model.embedder.embedder.normalization.weight"))
rename_keys.append(("backbone.0.body.bn1.bias", "model.backbone.model.embedder.embedder.normalization.bias"))
rename_keys.append(("backbone.0.body.bn1.running_mean", "model.backbone.model.embedder.embedder.normalization.running_mean"))
rename_keys.append(("backbone.0.body.bn1.running_var", "model.backbone.model.embedder.embedder.normalization.running_var"))
# stages
for stage_idx in range(len(config.backbone_config.depths)):
for layer_idx in range(config.backbone_config.depths[stage_idx]):
# shortcut
if layer_idx == 0:
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var",
)
)
# 3 convs
for i in range(3):
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean",
)
)
rename_keys.append(
(
f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var",
f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var",
)
)
# transformer encoder
for i in range(config.encoder_layers):
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias"))
# transformer decoder
for i in range(config.decoder_layers):
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias"))
# fmt: on
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
def read_in_decoder_q_k_v(state_dict, config):
# transformer decoder self-attention layers
hidden_size = config.d_model
for i in range(config.decoder_layers):
# read in weights + bias of input projection layer of self-attention
in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias")
# next, add query, keys and values (in that order) to the state dict
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :]
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[
hidden_size : hidden_size * 2, :
]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:]
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
"""
Copy/paste/tweak model's weights to our DETA structure.
"""
# load config
config = get_deta_config()
# load original state dict
if model_name == "deta-resnet-50":
filename = "adet_checkpoint0011.pth"
elif model_name == "deta-resnet-50-24-epochs":
filename = "adet_2x_checkpoint0023.pth"
else:
raise ValueError(f"Model name {model_name} not supported")
checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename=filename)
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
# rename keys
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_decoder_q_k_v(state_dict, config)
# fix some prefixes
for key in state_dict.copy().keys():
if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer.decoder", "model.decoder")] = val
if "input_proj" in key:
val = state_dict.pop(key)
state_dict["model." + key] = val
if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer", "model")] = val
# finally, create HuggingFace model and load state dict
model = DetaForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# load image processor
processor = DetaImageProcessor(format="coco_detection")
# verify our conversion on image
img = prepare_img()
encoding = processor(images=img, return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values.to(device))
# verify logits
if model_name == "deta-resnet-50":
expected_logits = torch.tensor(
[[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]]
)
expected_boxes = torch.tensor([[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]])
elif model_name == "deta-resnet-50-24-epochs":
expected_logits = torch.tensor(
[[-7.1688, -2.4857, -4.8669], [-7.8630, -3.8154, -4.2674], [-7.2730, -4.1865, -5.5323]]
)
expected_boxes = torch.tensor([[0.5021, 0.4971, 0.9994], [0.2546, 0.5486, 0.4731], [0.1686, 0.1986, 0.2142]])
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
print("Everything ok!")
if pytorch_dump_folder_path:
# Save model and processor
logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
# Push to hub
if push_to_hub:
print("Pushing model and processor to hub...")
model.push_to_hub(f"jozhang97/{model_name}")
processor.push_to_hub(f"jozhang97/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
type=str,
default="deta-resnet-50",
choices=["deta-resnet-50", "deta-resnet-50-24-epochs"],
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert DETA checkpoints from the original repository.
URL: https://github.com/jozhang97/DETA/tree/master"""
import argparse
import json
from pathlib import Path
import torch
from PIL import Image
import requests
from huggingface_hub import cached_download, hf_hub_download, hf_hub_url
from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor, SwinConfig
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def get_deta_config(model_name):
backbone_config = SwinConfig(
embed_dim=192,
depths=(2, 2, 18, 2),
num_heads=(6, 12, 24, 48),
window_size=12,
out_features=["stage2", "stage3", "stage4"],
)
config = DetaConfig(
backbone_config=backbone_config,
num_queries=900,
encoder_ffn_dim=2048,
decoder_ffn_dim=2048,
num_feature_levels=5,
assign_first_stage=True,
with_box_refine=True,
two_stage=True,
)
# set labels
repo_id = "huggingface/label-files"
if "o365" in model_name:
num_labels = 366
filename = "object365-id2label.json"
else:
num_labels = 91
filename = "coco-detection-id2label.json"
config.num_labels = num_labels
id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
return config
# here we list all keys to be renamed (original name on the left, our name on the right)
def create_rename_keys(config):
rename_keys = []
# stem
# fmt: off
rename_keys.append(("backbone.0.body.patch_embed.proj.weight", "model.backbone.model.embeddings.patch_embeddings.projection.weight"))
rename_keys.append(("backbone.0.body.patch_embed.proj.bias", "model.backbone.model.embeddings.patch_embeddings.projection.bias"))
rename_keys.append(("backbone.0.body.patch_embed.norm.weight", "model.backbone.model.embeddings.norm.weight"))
rename_keys.append(("backbone.0.body.patch_embed.norm.bias", "model.backbone.model.embeddings.norm.bias"))
# stages
for i in range(len(config.backbone_config.depths)):
for j in range(config.backbone_config.depths[i]):
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_bias_table", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_index", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.bias"))
if i < 3:
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.reduction.weight", f"model.backbone.model.encoder.layers.{i}.downsample.reduction.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.weight", f"model.backbone.model.encoder.layers.{i}.downsample.norm.weight"))
rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.bias", f"model.backbone.model.encoder.layers.{i}.downsample.norm.bias"))
rename_keys.append(("backbone.0.body.norm1.weight", "model.backbone.model.hidden_states_norms.stage2.weight"))
rename_keys.append(("backbone.0.body.norm1.bias", "model.backbone.model.hidden_states_norms.stage2.bias"))
rename_keys.append(("backbone.0.body.norm2.weight", "model.backbone.model.hidden_states_norms.stage3.weight"))
rename_keys.append(("backbone.0.body.norm2.bias", "model.backbone.model.hidden_states_norms.stage3.bias"))
rename_keys.append(("backbone.0.body.norm3.weight", "model.backbone.model.hidden_states_norms.stage4.weight"))
rename_keys.append(("backbone.0.body.norm3.bias", "model.backbone.model.hidden_states_norms.stage4.bias"))
# transformer encoder
for i in range(config.encoder_layers):
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias"))
# transformer decoder
for i in range(config.decoder_layers):
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight"))
rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias"))
# fmt: on
return rename_keys
def rename_key(dct, old, new):
val = dct.pop(old)
dct[new] = val
# we split up the matrix of each encoder layer into queries, keys and values
def read_in_swin_q_k_v(state_dict, backbone_config):
num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))]
for i in range(len(backbone_config.depths)):
dim = num_features[i]
for j in range(backbone_config.depths[i]):
# fmt: off
# read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias)
in_proj_weight = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.weight")
in_proj_bias = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.bias")
# next, add query, keys and values (in that order) to the state dict
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[: dim]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[
dim : dim * 2, :
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[
dim : dim * 2
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[
-dim :, :
]
state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim :]
# fmt: on
def read_in_decoder_q_k_v(state_dict, config):
# transformer decoder self-attention layers
hidden_size = config.d_model
for i in range(config.decoder_layers):
# read in weights + bias of input projection layer of self-attention
in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight")
in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias")
# next, add query, keys and values (in that order) to the state dict
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :]
state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[
hidden_size : hidden_size * 2, :
]
state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :]
state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:]
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
"""
Copy/paste/tweak model's weights to our DETA structure.
"""
# load config
config = get_deta_config(model_name)
# load original state dict
if model_name == "deta-swin-large":
checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename="adet_swin_ft.pth")
elif model_name == "deta-swin-large-o365":
checkpoint_path = hf_hub_download(repo_id="jozhang97/deta-swin-l-o365", filename="deta_swin_pt_o365.pth")
else:
raise ValueError(f"Model name {model_name} not supported")
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
# original state dict
for name, param in state_dict.items():
print(name, param.shape)
# rename keys
rename_keys = create_rename_keys(config)
for src, dest in rename_keys:
rename_key(state_dict, src, dest)
read_in_swin_q_k_v(state_dict, config.backbone_config)
read_in_decoder_q_k_v(state_dict, config)
# fix some prefixes
for key in state_dict.copy().keys():
if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer.decoder", "model.decoder")] = val
if "input_proj" in key:
val = state_dict.pop(key)
state_dict["model." + key] = val
if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key:
val = state_dict.pop(key)
state_dict[key.replace("transformer", "model")] = val
# finally, create HuggingFace model and load state dict
model = DetaForObjectDetection(config)
model.load_state_dict(state_dict)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# load image processor
processor = DetaImageProcessor(format="coco_detection")
# verify our conversion on image
img = prepare_img()
encoding = processor(images=img, return_tensors="pt")
pixel_values = encoding["pixel_values"]
outputs = model(pixel_values.to(device))
# verify logits
print("Logits:", outputs.logits[0, :3, :3])
print("Boxes:", outputs.pred_boxes[0, :3, :3])
if model_name == "deta-swin-large":
expected_logits = torch.tensor(
[[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]]
)
expected_boxes = torch.tensor([[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]])
elif model_name == "deta-swin-large-o365":
expected_logits = torch.tensor(
[[-8.0122, -3.5720, -4.9717], [-8.1547, -3.6886, -4.6389], [-7.6610, -3.6194, -5.0134]]
)
expected_boxes = torch.tensor([[0.2523, 0.5549, 0.4881], [0.7715, 0.4149, 0.4601], [0.5503, 0.2753, 0.0575]])
assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4)
assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4)
print("Everything ok!")
if pytorch_dump_folder_path:
# Save model and processor
logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...")
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
# Push to hub
if push_to_hub:
print("Pushing model and processor to hub...")
model.push_to_hub(f"jozhang97/{model_name}")
processor.push_to_hub(f"jozhang97/{model_name}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_name",
type=str,
default="deta-swin-large",
choices=["deta-swin-large", "deta-swin-large-o365"],
help="Name of the model you'd like to convert.",
)
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
help="Path to the folder to output PyTorch model.",
)
parser.add_argument(
"--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
)
args = parser.parse_args()
convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
This diff is collapsed.
This diff is collapsed.
......@@ -805,36 +805,42 @@ class Mask2FormerLoss(nn.Module):
return num_masks_pt
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
def multi_scale_deformable_attention(
value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
):
batch_size, _, num_attn_head, hidden_dim = value.shape
_, num_queries, num_attn_head, num_levels, num_points, _ = sampling_locations.shape
) -> Tensor:
batch_size, _, num_heads, hidden_dim = value.shape
_, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1)
sampling_grids = 2 * sampling_locations - 1
sampling_value_list = []
for idx, (height, width) in enumerate(value_spatial_shapes):
# batch_size, height*width, num_attn_head, hidden_dim -> batch_size*num_attn_head, hidden_dim, height, width
for level_id, (height, width) in enumerate(value_spatial_shapes):
# batch_size, height*width, num_heads, hidden_dim
# -> batch_size, height*width, num_heads*hidden_dim
# -> batch_size, num_heads*hidden_dim, height*width
# -> batch_size*num_heads, hidden_dim, height, width
value_l_ = (
value_list[idx].flatten(2).transpose(1, 2).reshape(batch_size * num_attn_head, hidden_dim, height, width)
value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width)
)
# (batch_size, num_queries, num_attn_head, num_points) -> (batch_size * num_attn_head, num_queries, num_points, 2)
sampling_grid_l_ = sampling_grids[:, :, :, idx].transpose(1, 2).flatten(0, 1)
# batch_size*num_attn_head, D_, num_queries, num_points
sampling_value_l_ = torch.nn.functional.grid_sample(
# batch_size, num_queries, num_heads, num_points, 2
# -> batch_size, num_heads, num_queries, num_points, 2
# -> batch_size*num_heads, num_queries, num_points, 2
sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
# batch_size*num_heads, hidden_dim, num_queries, num_points
sampling_value_l_ = nn.functional.grid_sample(
value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
)
sampling_value_list.append(sampling_value_l_)
# (batch_size, num_queries, num_attn_head, num_levels, num_points) -> (batch_size, num_attn_head, 1, num_queries, num_levels*num_points)
# (batch_size, num_queries, num_heads, num_levels, num_points)
# -> (batch_size, num_heads, num_queries, num_levels, num_points)
# -> (batch_size, num_heads, 1, num_queries, num_levels*num_points)
attention_weights = attention_weights.transpose(1, 2).reshape(
batch_size * num_attn_head, 1, num_queries, num_levels * num_points
batch_size * num_heads, 1, num_queries, num_levels * num_points
)
output = (
(torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
.sum(-1)
.view(batch_size, num_attn_head * hidden_dim, num_queries)
.view(batch_size, num_heads * hidden_dim, num_queries)
)
return output.transpose(1, 2).contiguous()
......
......@@ -61,7 +61,8 @@ def _get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
def multiscale_deform_attn_core_pytorch(
# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
def multi_scale_deformable_attention(
value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
) -> Tensor:
batch_size, _, num_heads, hidden_dim = value.shape
......@@ -1044,8 +1045,8 @@ class OneFormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
)
else:
raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
# CPU
output = multiscale_deform_attn_core_pytorch(value, spatial_shapes, sampling_locations, attention_weights)
# PyTorch implementation
output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
output = self.output_proj(output)
return output, attention_weights
......
......@@ -90,6 +90,7 @@ from .utils import (
is_torch_tpu_available,
is_torchaudio_available,
is_torchdynamo_available,
is_torchvision_available,
is_vision_available,
)
......@@ -307,6 +308,16 @@ def require_torch(test_case):
return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case)
def require_torchvision(test_case):
"""
Decorator marking a test that requires Torchvision.
These tests are skipped when Torchvision isn't installed.
"""
return unittest.skipUnless(is_torchvision_available(), "test requires Torchvision")(test_case)
def require_torch_or_tf(test_case):
"""
Decorator marking a test that requires PyTorch or TensorFlow.
......
......@@ -162,6 +162,7 @@ from .import_utils import (
is_torchaudio_available,
is_torchdistx_available,
is_torchdynamo_available,
is_torchvision_available,
is_training_run_on_sagemaker,
is_vision_available,
requires_backends,
......
......@@ -1991,6 +1991,30 @@ class DeiTPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
DETA_PRETRAINED_MODEL_ARCHIVE_LIST = None
class DetaForObjectDetection(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class DetaModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class DetaPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
DINAT_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
......@@ -136,6 +136,13 @@ class DeiTImageProcessor(metaclass=DummyObject):
requires_backends(self, ["vision"])
class DetaImageProcessor(metaclass=DummyObject):
_backends = ["vision"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
class DetrFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
......
......@@ -288,6 +288,10 @@ def is_torch_available():
return _torch_available
def is_torchvision_available():
return importlib.util.find_spec("torchvision") is not None
def is_pyctcdecode_available():
return _pyctcdecode_available
......@@ -803,6 +807,14 @@ installation page: https://pytorch.org/get-started/locally/ and follow the ones
Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
TORCHVISION_IMPORT_ERROR = """
{0} requires the Torchvision library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
PYTORCH_IMPORT_ERROR_WITH_TF = """
{0} requires the PyTorch library but it was not found in your environment.
......@@ -1009,6 +1021,7 @@ BACKENDS_MAPPING = OrderedDict(
("natten", (is_natten_available, NATTEN_IMPORT_ERROR)),
("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
("torchvision", (is_torchvision_available, TORCHVISION_IMPORT_ERROR)),
("vision", (is_vision_available, VISION_IMPORT_ERROR)),
("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
("accelerate", (is_accelerate_available, ACCELERATE_IMPORT_ERROR)),
......
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import pathlib
import unittest
import numpy as np
from transformers.testing_utils import require_torch, require_vision, slow
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingSavingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import DetaImageProcessor
class DetaImageProcessingTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
min_resolution=30,
max_resolution=400,
do_resize=True,
size=None,
do_normalize=True,
image_mean=[0.5, 0.5, 0.5],
image_std=[0.5, 0.5, 0.5],
do_rescale=True,
rescale_factor=1 / 255,
do_pad=True,
):
# by setting size["longest_edge"] > max_resolution we're effectively not testing this :p
size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333}
self.parent = parent
self.batch_size = batch_size
self.num_channels = num_channels
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_pad = do_pad
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"size": self.size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_rescale": self.do_rescale,
"rescale_factor": self.rescale_factor,
"do_pad": self.do_pad,
}
def get_expected_values(self, image_inputs, batched=False):
"""
This function computes the expected height and width when providing images to DetaImageProcessor,
assuming do_resize is set to True with a scalar size.
"""
if not batched:
image = image_inputs[0]
if isinstance(image, Image.Image):
w, h = image.size
else:
h, w = image.shape[1], image.shape[2]
if w < h:
expected_height = int(self.size["shortest_edge"] * h / w)
expected_width = self.size["shortest_edge"]
elif w > h:
expected_height = self.size["shortest_edge"]
expected_width = int(self.size["shortest_edge"] * w / h)
else:
expected_height = self.size["shortest_edge"]
expected_width = self.size["shortest_edge"]
else:
expected_values = []
for image in image_inputs:
expected_height, expected_width = self.get_expected_values([image])
expected_values.append((expected_height, expected_width))
expected_height = max(expected_values, key=lambda item: item[0])[0]
expected_width = max(expected_values, key=lambda item: item[1])[1]
return expected_height, expected_width
@require_torch
@require_vision
class DetaImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase):
image_processing_class = DetaImageProcessor if is_vision_available() else None
def setUp(self):
self.image_processor_tester = DetaImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
image_processing = self.image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "do_rescale"))
self.assertTrue(hasattr(image_processing, "do_pad"))
self.assertTrue(hasattr(image_processing, "size"))
def test_image_processor_from_dict_with_kwargs(self):
image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
self.assertEqual(image_processor.size, {"shortest_edge": 18, "longest_edge": 1333})
self.assertEqual(image_processor.do_pad, True)
def test_batch_feature(self):
pass
def test_call_pil(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False)
for image in image_inputs:
self.assertIsInstance(image, Image.Image)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs)
self.assertEqual(
encoded_images.shape,
(1, self.image_processor_tester.num_channels, expected_height, expected_width),
)
# Test batched
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs, batched=True)
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
self.assertEqual(
encoded_images.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
def test_call_numpy(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
for image in image_inputs:
self.assertIsInstance(image, np.ndarray)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs)
self.assertEqual(
encoded_images.shape,
(1, self.image_processor_tester.num_channels, expected_height, expected_width),
)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs, batched=True)
self.assertEqual(
encoded_images.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
def test_call_pytorch(self):
# Initialize image_processing
image_processing = self.image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
# Test not batched input
encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs)
self.assertEqual(
encoded_images.shape,
(1, self.image_processor_tester.num_channels, expected_height, expected_width),
)
# Test batched
encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values
expected_height, expected_width = self.image_processor_tester.get_expected_values(image_inputs, batched=True)
self.assertEqual(
encoded_images.shape,
(
self.image_processor_tester.batch_size,
self.image_processor_tester.num_channels,
expected_height,
expected_width,
),
)
def test_equivalence_pad_and_create_pixel_mask(self):
# Initialize image_processings
image_processing_1 = self.image_processing_class(**self.image_processor_dict)
image_processing_2 = self.image_processing_class(do_resize=False, do_normalize=False, do_rescale=False)
# create random PyTorch tensors
image_inputs = prepare_image_inputs(self.image_processor_tester, equal_resolution=False, torchify=True)
for image in image_inputs:
self.assertIsInstance(image, torch.Tensor)
# Test whether the method "pad_and_return_pixel_mask" and calling the image processor return the same tensors
encoded_images_with_method = image_processing_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt")
encoded_images = image_processing_2(image_inputs, return_tensors="pt")
self.assertTrue(
torch.allclose(encoded_images_with_method["pixel_values"], encoded_images["pixel_values"], atol=1e-4)
)
self.assertTrue(
torch.allclose(encoded_images_with_method["pixel_mask"], encoded_images["pixel_mask"], atol=1e-4)
)
@slow
def test_call_pytorch_with_coco_detection_annotations(self):
# prepare image and target
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
target = json.loads(f.read())
target = {"image_id": 39769, "annotations": target}
# encode them
image_processing = DetaImageProcessor()
encoding = image_processing(images=image, annotations=target, return_tensors="pt")
# verify pixel values
expected_shape = torch.Size([1, 3, 800, 1066])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))
# verify area
expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438])
self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
# verify boxes
expected_boxes_shape = torch.Size([6, 4])
self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
# verify image_id
expected_image_id = torch.tensor([39769])
self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
# verify is_crowd
expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
# verify class_labels
expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
# verify orig_size
expected_orig_size = torch.tensor([480, 640])
self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
# verify size
expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
@slow
def test_call_pytorch_with_coco_panoptic_annotations(self):
# prepare image, target and masks_path
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
target = json.loads(f.read())
target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}
masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")
# encode them
image_processing = DetaImageProcessor(format="coco_panoptic")
encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")
# verify pixel values
expected_shape = torch.Size([1, 3, 800, 1066])
self.assertEqual(encoding["pixel_values"].shape, expected_shape)
expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4))
# verify area
expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147])
self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area))
# verify boxes
expected_boxes_shape = torch.Size([6, 4])
self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3))
# verify image_id
expected_image_id = torch.tensor([39769])
self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id))
# verify is_crowd
expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd))
# verify class_labels
expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels))
# verify masks
expected_masks_sum = 822873
self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)
# verify orig_size
expected_orig_size = torch.tensor([480, 640])
self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size))
# verify size
expected_size = torch.tensor([800, 1066])
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size))
This diff is collapsed.
......@@ -53,6 +53,8 @@ PRIVATE_MODELS = [
# Being in this list is an exception and should **not** be the rule.
IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
# models to ignore for not tested
"DetaEncoder", # Building part of bigger (tested) model.
"DetaDecoder", # Building part of bigger (tested) model.
"GraphormerEncoder", # Building part of bigger (tested) model.
"GraphormerDecoderHead", # Building part of bigger (tested) model.
"CLIPSegDecoder", # Building part of bigger (tested) model.
......
......@@ -67,6 +67,8 @@ src/transformers/models/deformable_detr/modeling_deformable_detr.py
src/transformers/models/deit/configuration_deit.py
src/transformers/models/deit/modeling_deit.py
src/transformers/models/deit/modeling_tf_deit.py
src/transformers/models/deta/configuration_deta.py
src/transformers/models/deta/modeling_deta.py
src/transformers/models/detr/configuration_detr.py
src/transformers/models/detr/modeling_detr.py
src/transformers/models/dinat/configuration_dinat.py
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment