"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "b8935980a206f15c4db5590d5132ae477571d610"
Unverified Commit fc4a993e authored by Ali Hassani's avatar Ali Hassani Committed by GitHub
Browse files

Add Neighborhood Attention Transformer (NAT) and Dilated NAT (DiNAT) models (#20219)

* Add DiNAT

* Adds DiNAT + tests

* Minor fixes

* Added HF model

* Add natten to dependencies.

* Cleanup

* Minor fixup

* Reformat

* Optional NATTEN import.

* Reformat & add doc to _toctree

* Reformat (finally)

* Dummy objects for DiNAT

* Add NAT + minor changes

Adds NAT as its own independent model + docs, tests
Adds NATTEN to ext deps to ensure ci picks it up.

* Remove natten from `all` and `dev-torch` deps, add manual pip install to ci tests

* Minor fixes.

* Fix READMEs.

* Requested changes to docs + minor fixes.

* Requested changes.

* Add NAT/DiNAT tests to layoutlm_job

* Correction to Dinat doc.

* Requested changes.
parent 8d6de0b9
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Dilated Neighborhood Attention Transformer model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"shi-labs/dinat-mini-in1k-224": "https://huggingface.co/shi-labs/dinat-mini-in1k-224/resolve/main/config.json",
# See all Dinat models at https://huggingface.co/models?filter=dinat
}
class DinatConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DinatModel`]. It is used to instantiate a Dinat
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Dinat
[shi-labs/dinat-mini-in1k-224](https://huggingface.co/shi-labs/dinat-mini-in1k-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
patch_size (`int`, *optional*, defaults to 4):
The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
embed_dim (`int`, *optional*, defaults to 64):
Dimensionality of patch embedding.
depths (`List[int]`, *optional*, defaults to `[2, 2, 6, 2]`):
Number of layers in each level of the encoder.
num_heads (`List[int]`, *optional*, defaults to `[3, 6, 12, 24]`):
Number of attention heads in each layer of the Transformer encoder.
kernel_size (`int`, *optional*, defaults to 7):
Neighborhood Attention kernel size.
dilations (`List[List[int]]`, *optional*, defaults to `[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]]`):
Dilation value of each NA layer in the Transformer encoder.
mlp_ratio (`float`, *optional*, defaults to 3.0):
Ratio of MLP hidden dimensionality to embedding dimensionality.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether or not a learnable bias should be added to the queries, keys and values.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings and encoder.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
drop_path_rate (`float`, *optional*, defaults to 0.1):
Stochastic depth rate.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
`"selu"` and `"gelu_new"` are supported.
patch_norm (`bool`, *optional*, defaults to `True`):
Whether or not to add layer normalization after patch embedding.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example:
```python
>>> from transformers import DinatConfig, DinatModel
>>> # Initializing a Dinat shi-labs/dinat-mini-in1k-224 style configuration
>>> configuration = DinatConfig()
>>> # Initializing a model (with random weights) from the shi-labs/dinat-mini-in1k-224 style configuration
>>> model = DinatModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "dinat"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
patch_size=4,
num_channels=3,
embed_dim=64,
depths=[3, 4, 6, 5],
num_heads=[2, 4, 8, 16],
kernel_size=7,
dilations=[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]],
mlp_ratio=3.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
patch_norm=True,
initializer_range=0.02,
layer_norm_eps=1e-5,
**kwargs
):
super().__init__(**kwargs)
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.kernel_size = kernel_size
self.dilations = dilations
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.path_norm = patch_norm
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
# we set the hidden_size attribute in order to make Dinat work with VisionEncoderDecoderModel
# this indicates the channel dimension after the last stage of the model
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
# coding=utf-8
# Copyright 2022 SHI Labs and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Dilated Neighborhood Attention Transformer model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
OptionalDependencyNotAvailable,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_natten_available,
logging,
requires_backends,
)
from .configuration_dinat import DinatConfig
if is_natten_available():
from natten.functional import natten2dav, natten2dqkrpb
else:
def natten2dqkrpb(*args, **kwargs):
raise OptionalDependencyNotAvailable()
def natten2dav(*args, **kwargs):
raise OptionalDependencyNotAvailable()
logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "DinatConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "shi-labs/dinat-mini-in1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 7, 7, 512]
# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "shi-labs/dinat-mini-in1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
DINAT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"shi-labs/dinat-mini-in1k-224",
# See all Dinat models at https://huggingface.co/models?filter=dinat
]
# drop_path and DinatDropPath are from the timm library.
@dataclass
# Copied from transformers.models.nat.modeling_nat.NatEncoderOutput with Nat->Dinat
class DinatEncoderOutput(ModelOutput):
"""
Dinat encoder's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
# Copied from transformers.models.nat.modeling_nat.NatModelOutput with Nat->Dinat
class DinatModelOutput(ModelOutput):
"""
Dinat model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
last_hidden_state: torch.FloatTensor = None
pooler_output: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
# Copied from transformers.models.nat.modeling_nat.NatImageClassifierOutput with Nat->Dinat
class DinatImageClassifierOutput(ModelOutput):
"""
Dinat outputs for image classification.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# Copied from transformers.models.nat.modeling_nat.NatEmbeddings with Nat->Dinat
class DinatEmbeddings(nn.Module):
"""
Construct the patch and position embeddings.
"""
def __init__(self, config):
super().__init__()
self.patch_embeddings = DinatPatchEmbeddings(config)
self.norm = nn.LayerNorm(config.embed_dim)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor]:
embeddings = self.patch_embeddings(pixel_values)
embeddings = self.norm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
# Copied from transformers.models.nat.modeling_nat.NatPatchEmbeddings with Nat->Dinat
class DinatPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
patch_size = config.patch_size
num_channels, hidden_size = config.num_channels, config.embed_dim
self.num_channels = num_channels
if patch_size == 4:
pass
else:
# TODO: Support arbitrary patch sizes.
raise ValueError("Dinat only supports patch size of 4 at the moment.")
self.projection = nn.Sequential(
nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
)
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
_, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embeddings = self.projection(pixel_values)
embeddings = embeddings.permute(0, 2, 3, 1)
return embeddings
# Copied from transformers.models.nat.modeling_nat.NatDownsampler with Nat->Dinat
class DinatDownsampler(nn.Module):
"""
Convolutional Downsampling Layer.
Args:
dim (`int`):
Number of input channels.
norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
Normalization layer class.
"""
def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
super().__init__()
self.dim = dim
self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
self.norm = norm_layer(2 * dim)
def forward(self, input_feature: torch.Tensor) -> torch.Tensor:
input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
input_feature = self.norm(input_feature)
return input_feature
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input, drop_prob=0.0, training=False, scale_by_keep=True):
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_() # binarize
output = input.div(keep_prob) * random_tensor
return output
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Dinat
class DinatDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, x: torch.Tensor) -> torch.Tensor:
return drop_path(x, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class NeighborhoodAttention(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size, dilation):
super().__init__()
if dim % num_heads != 0:
raise ValueError(
f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
)
self.num_attention_heads = num_heads
self.attention_head_size = int(dim / num_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.kernel_size = kernel_size
self.dilation = dilation
# rpb is learnable relative positional biases; same concept is used Swin.
self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))
self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttention.transpose_for_scores with Nat->Dinat
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 3, 1, 2, 4)
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
query_layer = self.transpose_for_scores(self.query(hidden_states))
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
# Apply the scale factor before computing attention weights. It's usually more efficient because
# attention weights are typically a bigger tensor compared to query.
# It gives identical results because scalars are commutable in matrix multiplication.
query_layer = query_layer / math.sqrt(self.attention_head_size)
# Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.dilation)
# Normalize the attention scores to probabilities.
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
context_layer = natten2dav(attention_probs, value_layer, self.dilation)
context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionOutput
class NeighborhoodAttentionOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
self.dense = nn.Linear(dim, dim)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class NeighborhoodAttentionModule(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size, dilation):
super().__init__()
self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size, dilation)
self.output = NeighborhoodAttentionOutput(config, dim)
self.pruned_heads = set()
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionModule.prune_heads
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update hyper params and store pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
# Copied from transformers.models.nat.modeling_nat.NeighborhoodAttentionModule.forward
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(hidden_states, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
# Copied from transformers.models.nat.modeling_nat.NatIntermediate with Nat->Dinat
class DinatIntermediate(nn.Module):
def __init__(self, config, dim):
super().__init__()
self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
# Copied from transformers.models.nat.modeling_nat.NatOutput with Nat->Dinat
class DinatOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class DinatLayer(nn.Module):
def __init__(self, config, dim, num_heads, dilation, drop_path_rate=0.0):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.kernel_size = config.kernel_size
self.dilation = dilation
self.window_size = self.kernel_size * self.dilation
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.attention = NeighborhoodAttentionModule(
config, dim, num_heads, kernel_size=self.kernel_size, dilation=self.dilation
)
self.drop_path = DinatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.intermediate = DinatIntermediate(config, dim)
self.output = DinatOutput(config, dim)
def maybe_pad(self, hidden_states, height, width):
window_size = self.window_size
pad_values = (0, 0, 0, 0, 0, 0)
if height < window_size or width < window_size:
pad_l = pad_t = 0
pad_r = max(0, window_size - width)
pad_b = max(0, window_size - height)
pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b)
hidden_states = nn.functional.pad(hidden_states, pad_values)
return hidden_states, pad_values
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
batch_size, height, width, channels = hidden_states.size()
shortcut = hidden_states
hidden_states = self.layernorm_before(hidden_states)
# pad hidden_states if they are smaller than kernel size x dilation
hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
_, height_pad, width_pad, _ = hidden_states.shape
attention_outputs = self.attention(hidden_states, output_attentions=output_attentions)
attention_output = attention_outputs[0]
was_padded = pad_values[3] > 0 or pad_values[5] > 0
if was_padded:
attention_output = attention_output[:, :height, :width, :].contiguous()
hidden_states = shortcut + self.drop_path(attention_output)
layer_output = self.layernorm_after(hidden_states)
layer_output = self.intermediate(layer_output)
layer_output = hidden_states + self.output(layer_output)
layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
return layer_outputs
class DinatStage(nn.Module):
def __init__(self, config, dim, depth, num_heads, dilations, drop_path_rate, downsample):
super().__init__()
self.config = config
self.dim = dim
self.layers = nn.ModuleList(
[
DinatLayer(
config=config,
dim=dim,
num_heads=num_heads,
dilation=dilations[i],
drop_path_rate=drop_path_rate[i],
)
for i in range(depth)
]
)
# patch merging layer
if downsample is not None:
self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm)
else:
self.downsample = None
self.pointing = False
# Copied from transformers.models.nat.modeling_nat.NatStage.forward
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
_, height, width, _ = hidden_states.size()
for i, layer_module in enumerate(self.layers):
layer_outputs = layer_module(hidden_states, output_attentions)
hidden_states = layer_outputs[0]
if self.downsample is not None:
height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
output_dimensions = (height, width, height_downsampled, width_downsampled)
hidden_states = self.downsample(layer_outputs[0])
else:
output_dimensions = (height, width, height, width)
stage_outputs = (hidden_states, output_dimensions)
if output_attentions:
stage_outputs += layer_outputs[1:]
return stage_outputs
class DinatEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.num_levels = len(config.depths)
self.config = config
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
self.levels = nn.ModuleList(
[
DinatStage(
config=config,
dim=int(config.embed_dim * 2**i_layer),
depth=config.depths[i_layer],
num_heads=config.num_heads[i_layer],
dilations=config.dilations[i_layer],
drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
downsample=DinatDownsampler if (i_layer < self.num_levels - 1) else None,
)
for i_layer in range(self.num_levels)
]
)
# Copied from transformers.models.nat.modeling_nat.NatEncoder.forward with Nat->Dinat
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, DinatEncoderOutput]:
all_hidden_states = () if output_hidden_states else None
all_reshaped_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if output_hidden_states:
# rearrange b h w c -> b c h w
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
all_hidden_states += (hidden_states,)
all_reshaped_hidden_states += (reshaped_hidden_state,)
for i, layer_module in enumerate(self.levels):
layer_outputs = layer_module(hidden_states, output_attentions)
hidden_states = layer_outputs[0]
if output_hidden_states:
# rearrange b h w c -> b c h w
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
all_hidden_states += (hidden_states,)
all_reshaped_hidden_states += (reshaped_hidden_state,)
if output_attentions:
all_self_attentions += layer_outputs[2:]
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return DinatEncoderOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
reshaped_hidden_states=all_reshaped_hidden_states,
)
class DinatPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = DinatConfig
base_model_prefix = "dinat"
main_input_name = "pixel_values"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def _set_gradient_checkpointing(self, module: DinatEncoder, value: bool = False) -> None:
pass
DINAT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`DinatConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DINAT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Dinat Model transformer outputting raw hidden-states without any specific head on top.",
DINAT_START_DOCSTRING,
)
# Copied from transformers.models.nat.modeling_nat.NatModel with Nat->Dinat, NAT->DINAT
class DinatModel(DinatPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
requires_backends(self, ["natten"])
self.config = config
self.num_levels = len(config.depths)
self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1))
self.embeddings = DinatEmbeddings(config)
self.encoder = DinatEncoder(config)
self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=DinatModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, DinatModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
embedding_output = self.embeddings(pixel_values)
encoder_outputs = self.encoder(
embedding_output,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
pooled_output = None
if self.pooler is not None:
pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2))
pooled_output = torch.flatten(pooled_output, 1)
if not return_dict:
output = (sequence_output, pooled_output) + encoder_outputs[1:]
return output
return DinatModelOutput(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
@add_start_docstrings(
"""
Dinat Model transformer with an image classification head on top (a linear layer on top of the final hidden state
of the [CLS] token) e.g. for ImageNet.
""",
DINAT_START_DOCSTRING,
)
class DinatForImageClassification(DinatPreTrainedModel):
def __init__(self, config):
super().__init__(config)
requires_backends(self, ["natten"])
self.num_labels = config.num_labels
self.dinat = DinatModel(config)
# Classifier head
self.classifier = (
nn.Linear(self.dinat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(DINAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=DinatImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, DinatImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.dinat(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return DinatImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
# rely on isort to merge the imports
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_nat": ["NAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "NatConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_nat"] = [
"NAT_PRETRAINED_MODEL_ARCHIVE_LIST",
"NatForImageClassification",
"NatModel",
"NatPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_nat import NAT_PRETRAINED_CONFIG_ARCHIVE_MAP, NatConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_nat import (
NAT_PRETRAINED_MODEL_ARCHIVE_LIST,
NatForImageClassification,
NatModel,
NatPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Neighborhood Attention Transformer model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
NAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"shi-labs/nat-mini-in1k-224": "https://huggingface.co/shi-labs/nat-mini-in1k-224/resolve/main/config.json",
# See all Nat models at https://huggingface.co/models?filter=nat
}
class NatConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`NatModel`]. It is used to instantiate a Nat model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Nat
[shi-labs/nat-mini-in1k-224](https://huggingface.co/shi-labs/nat-mini-in1k-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
patch_size (`int`, *optional*, defaults to 4):
The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
embed_dim (`int`, *optional*, defaults to 64):
Dimensionality of patch embedding.
depths (`List[int]`, *optional*, defaults to `[2, 2, 6, 2]`):
Number of layers in each level of the encoder.
num_heads (`List[int]`, *optional*, defaults to `[3, 6, 12, 24]`):
Number of attention heads in each layer of the Transformer encoder.
kernel_size (`int`, *optional*, defaults to 7):
Neighborhood Attention kernel size.
mlp_ratio (`float`, *optional*, defaults to 3.0):
Ratio of MLP hidden dimensionality to embedding dimensionality.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether or not a learnable bias should be added to the queries, keys and values.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings and encoder.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
drop_path_rate (`float`, *optional*, defaults to 0.1):
Stochastic depth rate.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
`"selu"` and `"gelu_new"` are supported.
patch_norm (`bool`, *optional*, defaults to `True`):
Whether or not to add layer normalization after patch embedding.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example:
```python
>>> from transformers import NatConfig, NatModel
>>> # Initializing a Nat shi-labs/nat-mini-in1k-224 style configuration
>>> configuration = NatConfig()
>>> # Initializing a model (with random weights) from the shi-labs/nat-mini-in1k-224 style configuration
>>> model = NatModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "nat"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
patch_size=4,
num_channels=3,
embed_dim=64,
depths=[3, 4, 6, 5],
num_heads=[2, 4, 8, 16],
kernel_size=7,
mlp_ratio=3.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
patch_norm=True,
initializer_range=0.02,
layer_norm_eps=1e-5,
**kwargs
):
super().__init__(**kwargs)
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.kernel_size = kernel_size
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.path_norm = patch_norm
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
# we set the hidden_size attribute in order to make Nat work with VisionEncoderDecoderModel
# this indicates the channel dimension after the last stage of the model
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
# coding=utf-8
# Copyright 2022 SHI Labs and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Neighborhood Attention Transformer model."""
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
OptionalDependencyNotAvailable,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_natten_available,
logging,
requires_backends,
)
from .configuration_nat import NatConfig
if is_natten_available():
from natten.functional import natten2dav, natten2dqkrpb
else:
def natten2dqkrpb(*args, **kwargs):
raise OptionalDependencyNotAvailable()
def natten2dav(*args, **kwargs):
raise OptionalDependencyNotAvailable()
logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "NatConfig"
_FEAT_EXTRACTOR_FOR_DOC = "AutoImageProcessor"
# Base docstring
_CHECKPOINT_FOR_DOC = "shi-labs/nat-mini-in1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 7, 7, 512]
# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "shi-labs/nat-mini-in1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat"
NAT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"shi-labs/nat-mini-in1k-224",
# See all Nat models at https://huggingface.co/models?filter=nat
]
# drop_path and NatDropPath are from the timm library.
@dataclass
class NatEncoderOutput(ModelOutput):
"""
Nat encoder's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
last_hidden_state: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class NatModelOutput(ModelOutput):
"""
Nat model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
last_hidden_state: torch.FloatTensor = None
pooler_output: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class NatImageClassifierOutput(ModelOutput):
"""
Nat outputs for image classification.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
class NatEmbeddings(nn.Module):
"""
Construct the patch and position embeddings.
"""
def __init__(self, config):
super().__init__()
self.patch_embeddings = NatPatchEmbeddings(config)
self.norm = nn.LayerNorm(config.embed_dim)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor]:
embeddings = self.patch_embeddings(pixel_values)
embeddings = self.norm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class NatPatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
patch_size = config.patch_size
num_channels, hidden_size = config.num_channels, config.embed_dim
self.num_channels = num_channels
if patch_size == 4:
pass
else:
# TODO: Support arbitrary patch sizes.
raise ValueError("Dinat only supports patch size of 4 at the moment.")
self.projection = nn.Sequential(
nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
)
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
_, num_channels, height, width = pixel_values.shape
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
embeddings = self.projection(pixel_values)
embeddings = embeddings.permute(0, 2, 3, 1)
return embeddings
class NatDownsampler(nn.Module):
"""
Convolutional Downsampling Layer.
Args:
dim (`int`):
Number of input channels.
norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
Normalization layer class.
"""
def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
super().__init__()
self.dim = dim
self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
self.norm = norm_layer(2 * dim)
def forward(self, input_feature: torch.Tensor) -> torch.Tensor:
input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
input_feature = self.norm(input_feature)
return input_feature
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input, drop_prob=0.0, training=False, scale_by_keep=True):
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
if drop_prob == 0.0 or not training:
return input
keep_prob = 1 - drop_prob
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_() # binarize
output = input.div(keep_prob) * random_tensor
return output
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Nat
class NatDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, x: torch.Tensor) -> torch.Tensor:
return drop_path(x, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class NeighborhoodAttention(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size):
super().__init__()
if dim % num_heads != 0:
raise ValueError(
f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
)
self.num_attention_heads = num_heads
self.attention_head_size = int(dim / num_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.kernel_size = kernel_size
# rpb is learnable relative positional biases; same concept is used Swin.
self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))
self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 3, 1, 2, 4)
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
query_layer = self.transpose_for_scores(self.query(hidden_states))
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
# Apply the scale factor before computing attention weights. It's usually more efficient because
# attention weights are typically a bigger tensor compared to query.
# It gives identical results because scalars are commutable in matrix multiplication.
query_layer = query_layer / math.sqrt(self.attention_head_size)
# Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, 1)
# Normalize the attention scores to probabilities.
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
context_layer = natten2dav(attention_probs, value_layer, 1)
context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class NeighborhoodAttentionOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
self.dense = nn.Linear(dim, dim)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class NeighborhoodAttentionModule(nn.Module):
def __init__(self, config, dim, num_heads, kernel_size):
super().__init__()
self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size)
self.output = NeighborhoodAttentionOutput(config, dim)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update hyper params and store pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
self_outputs = self.self(hidden_states, output_attentions)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
class NatIntermediate(nn.Module):
def __init__(self, config, dim):
super().__init__()
self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class NatOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
return hidden_states
class NatLayer(nn.Module):
def __init__(self, config, dim, num_heads, drop_path_rate=0.0):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.kernel_size = config.kernel_size
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.attention = NeighborhoodAttentionModule(config, dim, num_heads, kernel_size=self.kernel_size)
self.drop_path = NatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
self.intermediate = NatIntermediate(config, dim)
self.output = NatOutput(config, dim)
def maybe_pad(self, hidden_states, height, width):
window_size = self.kernel_size
pad_values = (0, 0, 0, 0, 0, 0)
if height < window_size or width < window_size:
pad_l = pad_t = 0
pad_r = max(0, window_size - width)
pad_b = max(0, window_size - height)
pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b)
hidden_states = nn.functional.pad(hidden_states, pad_values)
return hidden_states, pad_values
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
batch_size, height, width, channels = hidden_states.size()
shortcut = hidden_states
hidden_states = self.layernorm_before(hidden_states)
# pad hidden_states if they are smaller than kernel size
hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
_, height_pad, width_pad, _ = hidden_states.shape
attention_outputs = self.attention(hidden_states, output_attentions=output_attentions)
attention_output = attention_outputs[0]
was_padded = pad_values[3] > 0 or pad_values[5] > 0
if was_padded:
attention_output = attention_output[:, :height, :width, :].contiguous()
hidden_states = shortcut + self.drop_path(attention_output)
layer_output = self.layernorm_after(hidden_states)
layer_output = self.intermediate(layer_output)
layer_output = hidden_states + self.output(layer_output)
layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
return layer_outputs
class NatStage(nn.Module):
def __init__(self, config, dim, depth, num_heads, drop_path_rate, downsample):
super().__init__()
self.config = config
self.dim = dim
self.layers = nn.ModuleList(
[
NatLayer(
config=config,
dim=dim,
num_heads=num_heads,
drop_path_rate=drop_path_rate[i],
)
for i in range(depth)
]
)
# patch merging layer
if downsample is not None:
self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm)
else:
self.downsample = None
self.pointing = False
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
_, height, width, _ = hidden_states.size()
for i, layer_module in enumerate(self.layers):
layer_outputs = layer_module(hidden_states, output_attentions)
hidden_states = layer_outputs[0]
if self.downsample is not None:
height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
output_dimensions = (height, width, height_downsampled, width_downsampled)
hidden_states = self.downsample(layer_outputs[0])
else:
output_dimensions = (height, width, height, width)
stage_outputs = (hidden_states, output_dimensions)
if output_attentions:
stage_outputs += layer_outputs[1:]
return stage_outputs
class NatEncoder(nn.Module):
def __init__(self, config):
super().__init__()
self.num_levels = len(config.depths)
self.config = config
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
self.levels = nn.ModuleList(
[
NatStage(
config=config,
dim=int(config.embed_dim * 2**i_layer),
depth=config.depths[i_layer],
num_heads=config.num_heads[i_layer],
drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
downsample=NatDownsampler if (i_layer < self.num_levels - 1) else None,
)
for i_layer in range(self.num_levels)
]
)
def forward(
self,
hidden_states: torch.Tensor,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, NatEncoderOutput]:
all_hidden_states = () if output_hidden_states else None
all_reshaped_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
if output_hidden_states:
# rearrange b h w c -> b c h w
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
all_hidden_states += (hidden_states,)
all_reshaped_hidden_states += (reshaped_hidden_state,)
for i, layer_module in enumerate(self.levels):
layer_outputs = layer_module(hidden_states, output_attentions)
hidden_states = layer_outputs[0]
if output_hidden_states:
# rearrange b h w c -> b c h w
reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
all_hidden_states += (hidden_states,)
all_reshaped_hidden_states += (reshaped_hidden_state,)
if output_attentions:
all_self_attentions += layer_outputs[2:]
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
return NatEncoderOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
reshaped_hidden_states=all_reshaped_hidden_states,
)
class NatPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = NatConfig
base_model_prefix = "nat"
main_input_name = "pixel_values"
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, (nn.Linear, nn.Conv2d)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
def _set_gradient_checkpointing(self, module: NatEncoder, value: bool = False) -> None:
pass
NAT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`NatConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
NAT_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`AutoImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Nat Model transformer outputting raw hidden-states without any specific head on top.",
NAT_START_DOCSTRING,
)
class NatModel(NatPreTrainedModel):
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
requires_backends(self, ["natten"])
self.config = config
self.num_levels = len(config.depths)
self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1))
self.embeddings = NatEmbeddings(config)
self.encoder = NatEncoder(config)
self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=NatModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, NatModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
embedding_output = self.embeddings(pixel_values)
encoder_outputs = self.encoder(
embedding_output,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0]
sequence_output = self.layernorm(sequence_output)
pooled_output = None
if self.pooler is not None:
pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2))
pooled_output = torch.flatten(pooled_output, 1)
if not return_dict:
output = (sequence_output, pooled_output) + encoder_outputs[1:]
return output
return NatModelOutput(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
@add_start_docstrings(
"""
Nat Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
""",
NAT_START_DOCSTRING,
)
class NatForImageClassification(NatPreTrainedModel):
def __init__(self, config):
super().__init__(config)
requires_backends(self, ["natten"])
self.num_labels = config.num_labels
self.nat = NatModel(config)
# Classifier head
self.classifier = (
nn.Linear(self.nat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
processor_class=_FEAT_EXTRACTOR_FOR_DOC,
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=NatImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, NatImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.nat(
pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = outputs[1]
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return NatImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
...@@ -58,6 +58,7 @@ from .utils import ( ...@@ -58,6 +58,7 @@ from .utils import (
is_ipex_available, is_ipex_available,
is_jumanpp_available, is_jumanpp_available,
is_librosa_available, is_librosa_available,
is_natten_available,
is_onnx_available, is_onnx_available,
is_pandas_available, is_pandas_available,
is_phonemizer_available, is_phonemizer_available,
...@@ -282,6 +283,16 @@ def require_timm(test_case): ...@@ -282,6 +283,16 @@ def require_timm(test_case):
return unittest.skipUnless(is_timm_available(), "test requires Timm")(test_case) return unittest.skipUnless(is_timm_available(), "test requires Timm")(test_case)
def require_natten(test_case):
"""
Decorator marking a test that requires NATTEN.
These tests are skipped when NATTEN isn't installed.
"""
return unittest.skipUnless(is_natten_available(), "test requires natten")(test_case)
def require_torch(test_case): def require_torch(test_case):
""" """
Decorator marking a test that requires PyTorch. Decorator marking a test that requires PyTorch.
......
...@@ -113,6 +113,7 @@ from .import_utils import ( ...@@ -113,6 +113,7 @@ from .import_utils import (
is_kenlm_available, is_kenlm_available,
is_librosa_available, is_librosa_available,
is_more_itertools_available, is_more_itertools_available,
is_natten_available,
is_ninja_available, is_ninja_available,
is_onnx_available, is_onnx_available,
is_pandas_available, is_pandas_available,
......
...@@ -1740,6 +1740,30 @@ class DeiTPreTrainedModel(metaclass=DummyObject): ...@@ -1740,6 +1740,30 @@ class DeiTPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"]) requires_backends(self, ["torch"])
DINAT_PRETRAINED_MODEL_ARCHIVE_LIST = None
class DinatForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class DinatModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class DinatPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
...@@ -3738,6 +3762,30 @@ class MvpPreTrainedModel(metaclass=DummyObject): ...@@ -3738,6 +3762,30 @@ class MvpPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"]) requires_backends(self, ["torch"])
NAT_PRETRAINED_MODEL_ARCHIVE_LIST = None
class NatForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class NatModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class NatPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = None NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
...@@ -218,6 +218,14 @@ except importlib_metadata.PackageNotFoundError: ...@@ -218,6 +218,14 @@ except importlib_metadata.PackageNotFoundError:
_timm_available = False _timm_available = False
_natten_available = importlib.util.find_spec("natten") is not None
try:
_natten_version = importlib_metadata.version("natten")
logger.debug(f"Successfully imported natten version {_natten_version}")
except importlib_metadata.PackageNotFoundError:
_natten_available = False
_torchaudio_available = importlib.util.find_spec("torchaudio") is not None _torchaudio_available = importlib.util.find_spec("torchaudio") is not None
try: try:
_torchaudio_version = importlib_metadata.version("torchaudio") _torchaudio_version = importlib_metadata.version("torchaudio")
...@@ -644,6 +652,10 @@ def is_timm_available(): ...@@ -644,6 +652,10 @@ def is_timm_available():
return _timm_available return _timm_available
def is_natten_available():
return _natten_available
def is_torchaudio_available(): def is_torchaudio_available():
return _torchaudio_available return _torchaudio_available
...@@ -882,6 +894,13 @@ TIMM_IMPORT_ERROR = """ ...@@ -882,6 +894,13 @@ TIMM_IMPORT_ERROR = """
`pip install timm`. Please note that you may need to restart your runtime after installation. `pip install timm`. Please note that you may need to restart your runtime after installation.
""" """
# docstyle-ignore
NATTEN_IMPORT_ERROR = """
{0} requires the natten library but it was not found in your environment. You can install it by referring to:
shi-labs.com/natten . You can also install it with pip (may take longer to build):
`pip install natten`. Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore # docstyle-ignore
VISION_IMPORT_ERROR = """ VISION_IMPORT_ERROR = """
{0} requires the PIL library but it was not found in your environment. You can install it with pip: {0} requires the PIL library but it was not found in your environment. You can install it with pip:
...@@ -936,6 +955,7 @@ BACKENDS_MAPPING = OrderedDict( ...@@ -936,6 +955,7 @@ BACKENDS_MAPPING = OrderedDict(
("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
("tensorflow_text", (is_tensorflow_text_available, TENSORFLOW_TEXT_IMPORT_ERROR)), ("tensorflow_text", (is_tensorflow_text_available, TENSORFLOW_TEXT_IMPORT_ERROR)),
("timm", (is_timm_available, TIMM_IMPORT_ERROR)), ("timm", (is_timm_available, TIMM_IMPORT_ERROR)),
("natten", (is_natten_available, NATTEN_IMPORT_ERROR)),
("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)), ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
("vision", (is_vision_available, VISION_IMPORT_ERROR)), ("vision", (is_vision_available, VISION_IMPORT_ERROR)),
......
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch Dinat model. """
import collections
import inspect
import unittest
from transformers import DinatConfig
from transformers.testing_utils import require_natten, require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
if is_torch_available():
import torch
from torch import nn
from transformers import DinatForImageClassification, DinatModel
from transformers.models.dinat.modeling_dinat import DINAT_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from PIL import Image
from transformers import AutoImageProcessor
class DinatModelTester:
def __init__(
self,
parent,
batch_size=13,
image_size=64,
patch_size=4,
num_channels=3,
embed_dim=16,
depths=[1, 2, 1],
num_heads=[2, 4, 8],
kernel_size=3,
dilations=[[3], [1, 2], [1]],
mlp_ratio=2.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
patch_norm=True,
initializer_range=0.02,
layer_norm_eps=1e-5,
is_training=True,
scope=None,
use_labels=True,
type_sequence_label_size=10,
encoder_stride=8,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_heads = num_heads
self.kernel_size = kernel_size
self.dilations = dilations
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.patch_norm = patch_norm
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.is_training = is_training
self.scope = scope
self.use_labels = use_labels
self.type_sequence_label_size = type_sequence_label_size
self.encoder_stride = encoder_stride
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return DinatConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
embed_dim=self.embed_dim,
depths=self.depths,
num_heads=self.num_heads,
kernel_size=self.kernel_size,
dilations=self.dilations,
mlp_ratio=self.mlp_ratio,
qkv_bias=self.qkv_bias,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
drop_path_rate=self.drop_path_rate,
hidden_act=self.hidden_act,
patch_norm=self.patch_norm,
layer_norm_eps=self.layer_norm_eps,
initializer_range=self.initializer_range,
encoder_stride=self.encoder_stride,
)
def create_and_check_model(self, config, pixel_values, labels):
model = DinatModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
expected_height = expected_width = (config.image_size // config.patch_size) // (2 ** (len(config.depths) - 1))
expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1))
self.parent.assertEqual(
result.last_hidden_state.shape, (self.batch_size, expected_height, expected_width, expected_dim)
)
def create_and_check_for_image_classification(self, config, pixel_values, labels):
config.num_labels = self.type_sequence_label_size
model = DinatForImageClassification(config)
model.to(torch_device)
model.eval()
result = model(pixel_values, labels=labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
# test greyscale images
config.num_channels = 1
model = DinatForImageClassification(config)
model.to(torch_device)
model.eval()
pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
result = model(pixel_values)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values, labels = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_natten
@require_torch
class DinatModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (DinatModel, DinatForImageClassification) if is_torch_available() else ()
fx_compatible = False
test_torchscript = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
def setUp(self):
self.model_tester = DinatModelTester(self)
self.config_tester = ConfigTester(self, config_class=DinatConfig, embed_dim=37)
def test_config(self):
self.create_and_test_config_common_properties()
self.config_tester.create_and_test_config_to_json_string()
self.config_tester.create_and_test_config_to_json_file()
self.config_tester.create_and_test_config_from_and_save_pretrained()
self.config_tester.create_and_test_config_with_num_labels()
self.config_tester.check_config_can_be_init_without_params()
self.config_tester.check_config_arguments_init()
def create_and_test_config_common_properties(self):
return
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_image_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
def test_inputs_embeds(self):
# Dinat does not use inputs_embeds
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_attention_outputs(self):
self.skipTest("Dinat's attention operation is handled entirely by NATTEN.")
def check_hidden_states_output(self, inputs_dict, config, model_class, image_size):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.hidden_states
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
# Dinat has a different seq_length
patch_size = (
config.patch_size
if isinstance(config.patch_size, collections.abc.Iterable)
else (config.patch_size, config.patch_size)
)
height = image_size[0] // patch_size[0]
width = image_size[1] // patch_size[1]
self.assertListEqual(
list(hidden_states[0].shape[-3:]),
[height, width, self.model_tester.embed_dim],
)
reshaped_hidden_states = outputs.reshaped_hidden_states
self.assertEqual(len(reshaped_hidden_states), expected_num_layers)
batch_size, num_channels, height, width = reshaped_hidden_states[0].shape
reshaped_hidden_states = (
reshaped_hidden_states[0].view(batch_size, num_channels, height, width).permute(0, 2, 3, 1)
)
self.assertListEqual(
list(reshaped_hidden_states.shape[-3:]),
[height, width, self.model_tester.embed_dim],
)
def test_hidden_states_output(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
image_size = (
self.model_tester.image_size
if isinstance(self.model_tester.image_size, collections.abc.Iterable)
else (self.model_tester.image_size, self.model_tester.image_size)
)
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
@slow
def test_model_from_pretrained(self):
for model_name in DINAT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = DinatModel.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if "embeddings" not in name and param.requires_grad:
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
@require_natten
@require_vision
@require_torch
class DinatModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return AutoImageProcessor.from_pretrained("shi-labs/dinat-mini-in1k-224") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = DinatForImageClassification.from_pretrained("shi-labs/dinat-mini-in1k-224").to(torch_device)
feature_extractor = self.default_feature_extractor
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 1000))
self.assertEqual(outputs.logits.shape, expected_shape)
expected_slice = torch.tensor([-0.1545, -0.7667, 0.4642]).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch Nat model. """
import collections
import inspect
import unittest
from transformers import NatConfig
from transformers.testing_utils import require_natten, require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
if is_torch_available():
import torch
from torch import nn
from transformers import NatForImageClassification, NatModel
from transformers.models.nat.modeling_nat import NAT_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from PIL import Image
from transformers import AutoImageProcessor
class NatModelTester:
def __init__(
self,
parent,
batch_size=13,
image_size=64,
patch_size=4,
num_channels=3,
embed_dim=16,
depths=[1, 2, 1],
num_heads=[2, 4, 8],
kernel_size=3,
mlp_ratio=2.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
patch_norm=True,
initializer_range=0.02,
layer_norm_eps=1e-5,
is_training=True,
scope=None,
use_labels=True,
type_sequence_label_size=10,
encoder_stride=8,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_heads = num_heads
self.kernel_size = kernel_size
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.patch_norm = patch_norm
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.is_training = is_training
self.scope = scope
self.use_labels = use_labels
self.type_sequence_label_size = type_sequence_label_size
self.encoder_stride = encoder_stride
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return NatConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
embed_dim=self.embed_dim,
depths=self.depths,
num_heads=self.num_heads,
kernel_size=self.kernel_size,
mlp_ratio=self.mlp_ratio,
qkv_bias=self.qkv_bias,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
drop_path_rate=self.drop_path_rate,
hidden_act=self.hidden_act,
patch_norm=self.patch_norm,
layer_norm_eps=self.layer_norm_eps,
initializer_range=self.initializer_range,
encoder_stride=self.encoder_stride,
)
def create_and_check_model(self, config, pixel_values, labels):
model = NatModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
expected_height = expected_width = (config.image_size // config.patch_size) // (2 ** (len(config.depths) - 1))
expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1))
self.parent.assertEqual(
result.last_hidden_state.shape, (self.batch_size, expected_height, expected_width, expected_dim)
)
def create_and_check_for_image_classification(self, config, pixel_values, labels):
config.num_labels = self.type_sequence_label_size
model = NatForImageClassification(config)
model.to(torch_device)
model.eval()
result = model(pixel_values, labels=labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
# test greyscale images
config.num_channels = 1
model = NatForImageClassification(config)
model.to(torch_device)
model.eval()
pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
result = model(pixel_values)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values, labels = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_natten
@require_torch
class NatModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (NatModel, NatForImageClassification) if is_torch_available() else ()
fx_compatible = False
test_torchscript = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
def setUp(self):
self.model_tester = NatModelTester(self)
self.config_tester = ConfigTester(self, config_class=NatConfig, embed_dim=37)
def test_config(self):
self.create_and_test_config_common_properties()
self.config_tester.create_and_test_config_to_json_string()
self.config_tester.create_and_test_config_to_json_file()
self.config_tester.create_and_test_config_from_and_save_pretrained()
self.config_tester.create_and_test_config_with_num_labels()
self.config_tester.check_config_can_be_init_without_params()
self.config_tester.check_config_arguments_init()
def create_and_test_config_common_properties(self):
return
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_image_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
def test_inputs_embeds(self):
# Nat does not use inputs_embeds
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_attention_outputs(self):
self.skipTest("Nat's attention operation is handled entirely by NATTEN.")
def check_hidden_states_output(self, inputs_dict, config, model_class, image_size):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.hidden_states
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
# Nat has a different seq_length
patch_size = (
config.patch_size
if isinstance(config.patch_size, collections.abc.Iterable)
else (config.patch_size, config.patch_size)
)
height = image_size[0] // patch_size[0]
width = image_size[1] // patch_size[1]
self.assertListEqual(
list(hidden_states[0].shape[-3:]),
[height, width, self.model_tester.embed_dim],
)
reshaped_hidden_states = outputs.reshaped_hidden_states
self.assertEqual(len(reshaped_hidden_states), expected_num_layers)
batch_size, num_channels, height, width = reshaped_hidden_states[0].shape
reshaped_hidden_states = (
reshaped_hidden_states[0].view(batch_size, num_channels, height, width).permute(0, 2, 3, 1)
)
self.assertListEqual(
list(reshaped_hidden_states.shape[-3:]),
[height, width, self.model_tester.embed_dim],
)
def test_hidden_states_output(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
image_size = (
self.model_tester.image_size
if isinstance(self.model_tester.image_size, collections.abc.Iterable)
else (self.model_tester.image_size, self.model_tester.image_size)
)
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
@slow
def test_model_from_pretrained(self):
for model_name in NAT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = NatModel.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if "embeddings" not in name and param.requires_grad:
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
@require_natten
@require_vision
@require_torch
class NatModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = NatForImageClassification.from_pretrained("shi-labs/nat-mini-in1k-224").to(torch_device)
feature_extractor = self.default_feature_extractor
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 1000))
self.assertEqual(outputs.logits.shape, expected_shape)
expected_slice = torch.tensor([0.3805, -0.8676, -0.3912]).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
...@@ -66,6 +66,8 @@ src/transformers/models/deit/modeling_deit.py ...@@ -66,6 +66,8 @@ src/transformers/models/deit/modeling_deit.py
src/transformers/models/deit/modeling_tf_deit.py src/transformers/models/deit/modeling_tf_deit.py
src/transformers/models/detr/configuration_detr.py src/transformers/models/detr/configuration_detr.py
src/transformers/models/detr/modeling_detr.py src/transformers/models/detr/modeling_detr.py
src/transformers/models/dinat/configuration_dinat.py
src/transformers/models/dinat/modeling_dinat.py
src/transformers/models/distilbert/configuration_distilbert.py src/transformers/models/distilbert/configuration_distilbert.py
src/transformers/models/dpr/configuration_dpr.py src/transformers/models/dpr/configuration_dpr.py
src/transformers/models/dpt/modeling_dpt.py src/transformers/models/dpt/modeling_dpt.py
...@@ -113,6 +115,8 @@ src/transformers/models/mobilebert/modeling_tf_mobilebert.py ...@@ -113,6 +115,8 @@ src/transformers/models/mobilebert/modeling_tf_mobilebert.py
src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py
src/transformers/models/mobilevit/modeling_mobilevit.py src/transformers/models/mobilevit/modeling_mobilevit.py
src/transformers/models/mobilevit/modeling_tf_mobilevit.py src/transformers/models/mobilevit/modeling_tf_mobilevit.py
src/transformers/models/nat/configuration_nat.py
src/transformers/models/nat/modeling_nat.py
src/transformers/models/nezha/configuration_nezha.py src/transformers/models/nezha/configuration_nezha.py
src/transformers/models/openai/configuration_openai.py src/transformers/models/openai/configuration_openai.py
src/transformers/models/opt/configuration_opt.py src/transformers/models/opt/configuration_opt.py
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment