"docs/vscode:/vscode.git/clone" did not exist on "1e4cf8bb446fcbfcd428b15377bd12e254ad8baa"
Unverified commit fc4a993e authored by Ali Hassani, committed by GitHub

Add Neighborhood Attention Transformer (NAT) and Dilated NAT (DiNAT) models (#20219)

* Add DiNAT

* Adds DiNAT + tests

* Minor fixes

* Added HF model

* Add natten to dependencies.

* Cleanup

* Minor fixup

* Reformat

* Optional NATTEN import.

* Reformat & add doc to _toctree

* Reformat (finally)

* Dummy objects for DiNAT

* Add NAT + minor changes

Adds NAT as its own independent model + docs, tests
Adds NATTEN to ext deps to ensure ci picks it up.

* Remove natten from `all` and `dev-torch` deps, add manual pip install to ci tests

* Minor fixes.

* Fix READMEs.

* Requested changes to docs + minor fixes.

* Requested changes.

* Add NAT/DiNAT tests to layoutlm_job

* Correction to Dinat doc.

* Requested changes.
parent 8d6de0b9
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Dilated Neighborhood Attention Transformer model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"shi-labs/dinat-mini-in1k-224": "https://huggingface.co/shi-labs/dinat-mini-in1k-224/resolve/main/config.json",
# See all Dinat models at https://huggingface.co/models?filter=dinat
}
class DinatConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DinatModel`]. It is used to instantiate a Dinat
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Dinat
[shi-labs/dinat-mini-in1k-224](https://huggingface.co/shi-labs/dinat-mini-in1k-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
patch_size (`int`, *optional*, defaults to 4):
The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
embed_dim (`int`, *optional*, defaults to 64):
Dimensionality of patch embedding.
depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 5]`):
Number of layers in each level of the encoder.
num_heads (`List[int]`, *optional*, defaults to `[2, 4, 8, 16]`):
Number of attention heads in each layer of the Transformer encoder.
kernel_size (`int`, *optional*, defaults to 7):
Neighborhood Attention kernel size.
dilations (`List[List[int]]`, *optional*, defaults to `[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]]`):
Dilation value of each NA layer in the Transformer encoder.
mlp_ratio (`float`, *optional*, defaults to 3.0):
Ratio of MLP hidden dimensionality to embedding dimensionality.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether or not a learnable bias should be added to the queries, keys and values.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings and encoder.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
drop_path_rate (`float`, *optional*, defaults to 0.1):
Stochastic depth rate.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
`"selu"` and `"gelu_new"` are supported.
patch_norm (`bool`, *optional*, defaults to `True`):
Whether or not to add layer normalization after patch embedding.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
Example:
```python
>>> from transformers import DinatConfig, DinatModel
>>> # Initializing a Dinat shi-labs/dinat-mini-in1k-224 style configuration
>>> configuration = DinatConfig()
>>> # Initializing a model (with random weights) from the shi-labs/dinat-mini-in1k-224 style configuration
>>> model = DinatModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "dinat"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
patch_size=4,
num_channels=3,
embed_dim=64,
depths=[3, 4, 6, 5],
num_heads=[2, 4, 8, 16],
kernel_size=7,
dilations=[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]],
mlp_ratio=3.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
patch_norm=True,
initializer_range=0.02,
layer_norm_eps=1e-5,
**kwargs
):
super().__init__(**kwargs)
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.kernel_size = kernel_size
self.dilations = dilations
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.patch_norm = patch_norm
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
# we set the hidden_size attribute in order to make Dinat work with VisionEncoderDecoderModel
# this indicates the channel dimension after the last stage of the model
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
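Because the channel dimension doubles after every stage, the derived `hidden_size` is fully determined by `embed_dim` and the number of stages. A quick sanity check of that relationship (a minimal sketch, assuming a transformers installation that ships DiNAT):

```python
from transformers import DinatConfig

config = DinatConfig()  # defaults: embed_dim=64, depths=[3, 4, 6, 5] (four stages)

# The channel dimension doubles three times across the four stages:
# 64 * 2 ** 3 == 512, which is what downstream wrappers read as hidden_size.
assert config.hidden_size == int(config.embed_dim * 2 ** (len(config.depths) - 1))
print(config.hidden_size)  # 512
```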
This diff is collapsed.
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
# rely on isort to merge the imports
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {"configuration_nat": ["NAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "NatConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_nat"] = [
"NAT_PRETRAINED_MODEL_ARCHIVE_LIST",
"NatForImageClassification",
"NatModel",
"NatPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_nat import NAT_PRETRAINED_CONFIG_ARCHIVE_MAP, NatConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_nat import (
NAT_PRETRAINED_MODEL_ARCHIVE_LIST,
NatForImageClassification,
NatModel,
NatPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
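The `_LazyModule` registered above defers every submodule import until an attribute is first requested, so a torch-free environment can still import the configuration. A minimal sketch of the resulting behavior (assuming a transformers build that includes NAT):

```python
# Resolving NatConfig only imports the lightweight configuration_nat
# submodule; nothing from modeling_nat (and hence torch) is loaded yet.
from transformers.models.nat import NatConfig

config = NatConfig(embed_dim=96)

# Only this lookup triggers the lazy import of modeling_nat, which is
# the first point at which torch actually gets imported.
from transformers.models.nat import NatModel
```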
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Neighborhood Attention Transformer model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
NAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"shi-labs/nat-mini-in1k-224": "https://huggingface.co/shi-labs/nat-mini-in1k-224/resolve/main/config.json",
# See all Nat models at https://huggingface.co/models?filter=nat
}
class NatConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`NatModel`]. It is used to instantiate a Nat model
according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Nat
[shi-labs/nat-mini-in1k-224](https://huggingface.co/shi-labs/nat-mini-in1k-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
patch_size (`int`, *optional*, defaults to 4):
The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
embed_dim (`int`, *optional*, defaults to 64):
Dimensionality of patch embedding.
depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 5]`):
Number of layers in each level of the encoder.
num_heads (`List[int]`, *optional*, defaults to `[2, 4, 8, 16]`):
Number of attention heads in each layer of the Transformer encoder.
kernel_size (`int`, *optional*, defaults to 7):
Neighborhood Attention kernel size.
mlp_ratio (`float`, *optional*, defaults to 3.0):
Ratio of MLP hidden dimensionality to embedding dimensionality.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether or not a learnable bias should be added to the queries, keys and values.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout probability for all fully connected layers in the embeddings and encoder.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
drop_path_rate (`float`, *optional*, defaults to 0.1):
Stochastic depth rate.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
`"selu"` and `"gelu_new"` are supported.
patch_norm (`bool`, *optional*, defaults to `True`):
Whether or not to add layer normalization after patch embedding.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the layer normalization layers.
Example:
```python
>>> from transformers import NatConfig, NatModel
>>> # Initializing a Nat shi-labs/nat-mini-in1k-224 style configuration
>>> configuration = NatConfig()
>>> # Initializing a model (with random weights) from the shi-labs/nat-mini-in1k-224 style configuration
>>> model = NatModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "nat"
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
def __init__(
self,
patch_size=4,
num_channels=3,
embed_dim=64,
depths=[3, 4, 6, 5],
num_heads=[2, 4, 8, 16],
kernel_size=7,
mlp_ratio=3.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
patch_norm=True,
initializer_range=0.02,
layer_norm_eps=1e-5,
**kwargs
):
super().__init__(**kwargs)
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths)
self.num_heads = num_heads
self.kernel_size = kernel_size
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.patch_norm = patch_norm
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
# we set the hidden_size attribute in order to make Nat work with VisionEncoderDecoderModel
# this indicates the channel dimension after the last stage of the model
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
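The `attribute_map` shared by both configs lets generic code ask for the canonical `num_attention_heads` / `num_hidden_layers` names and transparently read the NAT-specific attributes. A small illustration (a sketch against the defaults above):

```python
from transformers import NatConfig

config = NatConfig()

# attribute_map redirects the canonical names to num_layers / num_heads.
assert config.num_hidden_layers == len(config.depths)   # 4 stages
assert config.num_attention_heads == config.num_heads   # [2, 4, 8, 16]
```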
This diff is collapsed.
@@ -58,6 +58,7 @@ from .utils import (
is_ipex_available,
is_jumanpp_available,
is_librosa_available,
is_natten_available,
is_onnx_available,
is_pandas_available,
is_phonemizer_available,
@@ -282,6 +283,16 @@ def require_timm(test_case):
return unittest.skipUnless(is_timm_available(), "test requires Timm")(test_case)
def require_natten(test_case):
"""
Decorator marking a test that requires NATTEN.
These tests are skipped when NATTEN isn't installed.
"""
return unittest.skipUnless(is_natten_available(), "test requires natten")(test_case)
def require_torch(test_case):
"""
Decorator marking a test that requires PyTorch.
...
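Like the other capability decorators in `testing_utils`, `require_natten` stacks cleanly with `require_torch`. A hedged sketch of how a test opts in (the test class below is hypothetical):

```python
import unittest

from transformers.testing_utils import require_natten, require_torch


@require_natten
@require_torch
class HypotheticalNeighborhoodAttentionTest(unittest.TestCase):
    def test_runs_only_with_natten(self):
        # Executed only when both torch and natten are importable;
        # otherwise unittest reports the test as skipped.
        self.assertTrue(True)
```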
@@ -113,6 +113,7 @@ from .import_utils import (
is_kenlm_available,
is_librosa_available,
is_more_itertools_available,
is_natten_available,
is_ninja_available,
is_onnx_available,
is_pandas_available,
...
@@ -1740,6 +1740,30 @@ class DeiTPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
DINAT_PRETRAINED_MODEL_ARCHIVE_LIST = None
class DinatForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class DinatModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class DinatPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
@@ -3738,6 +3762,30 @@ class MvpPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
NAT_PRETRAINED_MODEL_ARCHIVE_LIST = None
class NatForImageClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class NatModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class NatPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
NEZHA_PRETRAINED_MODEL_ARCHIVE_LIST = None
...
@@ -218,6 +218,14 @@ except importlib_metadata.PackageNotFoundError:
_timm_available = False
_natten_available = importlib.util.find_spec("natten") is not None
try:
_natten_version = importlib_metadata.version("natten")
logger.debug(f"Successfully imported natten version {_natten_version}")
except importlib_metadata.PackageNotFoundError:
_natten_available = False
_torchaudio_available = importlib.util.find_spec("torchaudio") is not None
try:
_torchaudio_version = importlib_metadata.version("torchaudio")
@@ -644,6 +652,10 @@ def is_timm_available():
return _timm_available
def is_natten_available():
return _natten_available
def is_torchaudio_available():
return _torchaudio_available
@@ -882,6 +894,13 @@ TIMM_IMPORT_ERROR = """
`pip install timm`. Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
NATTEN_IMPORT_ERROR = """
{0} requires the natten library but it was not found in your environment. You can install it by referring to:
shi-labs.com/natten. You can also install it with pip (may take longer to build):
`pip install natten`. Please note that you may need to restart your runtime after installation.
"""
# docstyle-ignore
VISION_IMPORT_ERROR = """
{0} requires the PIL library but it was not found in your environment. You can install it with pip:
@@ -936,6 +955,7 @@ BACKENDS_MAPPING = OrderedDict(
("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
("tensorflow_text", (is_tensorflow_text_available, TENSORFLOW_TEXT_IMPORT_ERROR)),
("timm", (is_timm_available, TIMM_IMPORT_ERROR)),
("natten", (is_natten_available, NATTEN_IMPORT_ERROR)),
("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
("vision", (is_vision_available, VISION_IMPORT_ERROR)),
...
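Registering `natten` in `BACKENDS_MAPPING` is what lets `requires_backends` raise the `NATTEN_IMPORT_ERROR` message with the caller's name substituted into the `{0}` slot. A minimal sketch of the pattern (the consumer class is made up for illustration):

```python
from transformers.utils import requires_backends


class HypotheticalNattenConsumer:
    def __init__(self):
        # If natten is missing, this raises an ImportError whose message is
        # NATTEN_IMPORT_ERROR formatted with "HypotheticalNattenConsumer".
        requires_backends(self, ["natten"])
```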
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch Dinat model. """
import collections
import inspect
import unittest
from transformers import DinatConfig
from transformers.testing_utils import require_natten, require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
if is_torch_available():
import torch
from torch import nn
from transformers import DinatForImageClassification, DinatModel
from transformers.models.dinat.modeling_dinat import DINAT_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from PIL import Image
from transformers import AutoImageProcessor
class DinatModelTester:
def __init__(
self,
parent,
batch_size=13,
image_size=64,
patch_size=4,
num_channels=3,
embed_dim=16,
depths=[1, 2, 1],
num_heads=[2, 4, 8],
kernel_size=3,
dilations=[[3], [1, 2], [1]],
mlp_ratio=2.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
patch_norm=True,
initializer_range=0.02,
layer_norm_eps=1e-5,
is_training=True,
scope=None,
use_labels=True,
type_sequence_label_size=10,
encoder_stride=8,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_heads = num_heads
self.kernel_size = kernel_size
self.dilations = dilations
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.patch_norm = patch_norm
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.is_training = is_training
self.scope = scope
self.use_labels = use_labels
self.type_sequence_label_size = type_sequence_label_size
self.encoder_stride = encoder_stride
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return DinatConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
embed_dim=self.embed_dim,
depths=self.depths,
num_heads=self.num_heads,
kernel_size=self.kernel_size,
dilations=self.dilations,
mlp_ratio=self.mlp_ratio,
qkv_bias=self.qkv_bias,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
drop_path_rate=self.drop_path_rate,
hidden_act=self.hidden_act,
patch_norm=self.patch_norm,
layer_norm_eps=self.layer_norm_eps,
initializer_range=self.initializer_range,
encoder_stride=self.encoder_stride,
)
def create_and_check_model(self, config, pixel_values, labels):
model = DinatModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
expected_height = expected_width = (config.image_size // config.patch_size) // (2 ** (len(config.depths) - 1))
expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1))
self.parent.assertEqual(
result.last_hidden_state.shape, (self.batch_size, expected_height, expected_width, expected_dim)
)
def create_and_check_for_image_classification(self, config, pixel_values, labels):
config.num_labels = self.type_sequence_label_size
model = DinatForImageClassification(config)
model.to(torch_device)
model.eval()
result = model(pixel_values, labels=labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
# test greyscale images
config.num_channels = 1
model = DinatForImageClassification(config)
model.to(torch_device)
model.eval()
pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
result = model(pixel_values)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values, labels = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_natten
@require_torch
class DinatModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (DinatModel, DinatForImageClassification) if is_torch_available() else ()
fx_compatible = False
test_torchscript = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
def setUp(self):
self.model_tester = DinatModelTester(self)
self.config_tester = ConfigTester(self, config_class=DinatConfig, embed_dim=37)
def test_config(self):
self.create_and_test_config_common_properties()
self.config_tester.create_and_test_config_to_json_string()
self.config_tester.create_and_test_config_to_json_file()
self.config_tester.create_and_test_config_from_and_save_pretrained()
self.config_tester.create_and_test_config_with_num_labels()
self.config_tester.check_config_can_be_init_without_params()
self.config_tester.check_config_arguments_init()
def create_and_test_config_common_properties(self):
return
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_image_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
def test_inputs_embeds(self):
# Dinat does not use inputs_embeds
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_attention_outputs(self):
self.skipTest("Dinat's attention operation is handled entirely by NATTEN.")
def check_hidden_states_output(self, inputs_dict, config, model_class, image_size):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.hidden_states
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
# Dinat has a different seq_length
patch_size = (
config.patch_size
if isinstance(config.patch_size, collections.abc.Iterable)
else (config.patch_size, config.patch_size)
)
height = image_size[0] // patch_size[0]
width = image_size[1] // patch_size[1]
self.assertListEqual(
list(hidden_states[0].shape[-3:]),
[height, width, self.model_tester.embed_dim],
)
reshaped_hidden_states = outputs.reshaped_hidden_states
self.assertEqual(len(reshaped_hidden_states), expected_num_layers)
batch_size, num_channels, height, width = reshaped_hidden_states[0].shape
reshaped_hidden_states = (
reshaped_hidden_states[0].view(batch_size, num_channels, height, width).permute(0, 2, 3, 1)
)
self.assertListEqual(
list(reshaped_hidden_states.shape[-3:]),
[height, width, self.model_tester.embed_dim],
)
def test_hidden_states_output(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
image_size = (
self.model_tester.image_size
if isinstance(self.model_tester.image_size, collections.abc.Iterable)
else (self.model_tester.image_size, self.model_tester.image_size)
)
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
@slow
def test_model_from_pretrained(self):
for model_name in DINAT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = DinatModel.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if "embeddings" not in name and param.requires_grad:
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
@require_natten
@require_vision
@require_torch
class DinatModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return AutoImageProcessor.from_pretrained("shi-labs/dinat-mini-in1k-224") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = DinatForImageClassification.from_pretrained("shi-labs/dinat-mini-in1k-224").to(torch_device)
feature_extractor = self.default_feature_extractor
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 1000))
self.assertEqual(outputs.logits.shape, expected_shape)
expected_slice = torch.tensor([-0.1545, -0.7667, 0.4642]).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch Nat model. """
import collections
import inspect
import unittest
from transformers import NatConfig
from transformers.testing_utils import require_natten, require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
if is_torch_available():
import torch
from torch import nn
from transformers import NatForImageClassification, NatModel
from transformers.models.nat.modeling_nat import NAT_PRETRAINED_MODEL_ARCHIVE_LIST
if is_vision_available():
from PIL import Image
from transformers import AutoImageProcessor
class NatModelTester:
def __init__(
self,
parent,
batch_size=13,
image_size=64,
patch_size=4,
num_channels=3,
embed_dim=16,
depths=[1, 2, 1],
num_heads=[2, 4, 8],
kernel_size=3,
mlp_ratio=2.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
patch_norm=True,
initializer_range=0.02,
layer_norm_eps=1e-5,
is_training=True,
scope=None,
use_labels=True,
type_sequence_label_size=10,
encoder_stride=8,
):
self.parent = parent
self.batch_size = batch_size
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_heads = num_heads
self.kernel_size = kernel_size
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.patch_norm = patch_norm
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.is_training = is_training
self.scope = scope
self.use_labels = use_labels
self.type_sequence_label_size = type_sequence_label_size
self.encoder_stride = encoder_stride
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
config = self.get_config()
return config, pixel_values, labels
def get_config(self):
return NatConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
embed_dim=self.embed_dim,
depths=self.depths,
num_heads=self.num_heads,
kernel_size=self.kernel_size,
mlp_ratio=self.mlp_ratio,
qkv_bias=self.qkv_bias,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
drop_path_rate=self.drop_path_rate,
hidden_act=self.hidden_act,
patch_norm=self.patch_norm,
layer_norm_eps=self.layer_norm_eps,
initializer_range=self.initializer_range,
encoder_stride=self.encoder_stride,
)
def create_and_check_model(self, config, pixel_values, labels):
model = NatModel(config=config)
model.to(torch_device)
model.eval()
result = model(pixel_values)
expected_height = expected_width = (config.image_size // config.patch_size) // (2 ** (len(config.depths) - 1))
expected_dim = int(config.embed_dim * 2 ** (len(config.depths) - 1))
self.parent.assertEqual(
result.last_hidden_state.shape, (self.batch_size, expected_height, expected_width, expected_dim)
)
def create_and_check_for_image_classification(self, config, pixel_values, labels):
config.num_labels = self.type_sequence_label_size
model = NatForImageClassification(config)
model.to(torch_device)
model.eval()
result = model(pixel_values, labels=labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
# test greyscale images
config.num_channels = 1
model = NatForImageClassification(config)
model.to(torch_device)
model.eval()
pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
result = model(pixel_values)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values, labels = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_natten
@require_torch
class NatModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (NatModel, NatForImageClassification) if is_torch_available() else ()
fx_compatible = False
test_torchscript = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
def setUp(self):
self.model_tester = NatModelTester(self)
self.config_tester = ConfigTester(self, config_class=NatConfig, embed_dim=37)
def test_config(self):
self.create_and_test_config_common_properties()
self.config_tester.create_and_test_config_to_json_string()
self.config_tester.create_and_test_config_to_json_file()
self.config_tester.create_and_test_config_from_and_save_pretrained()
self.config_tester.create_and_test_config_with_num_labels()
self.config_tester.check_config_can_be_init_without_params()
self.config_tester.check_config_arguments_init()
def create_and_test_config_common_properties(self):
return
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_image_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
def test_inputs_embeds(self):
# Nat does not use inputs_embeds
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_attention_outputs(self):
self.skipTest("Nat's attention operation is handled entirely by NATTEN.")
def check_hidden_states_output(self, inputs_dict, config, model_class, image_size):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.hidden_states
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", len(self.model_tester.depths) + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
# Nat has a different seq_length
patch_size = (
config.patch_size
if isinstance(config.patch_size, collections.abc.Iterable)
else (config.patch_size, config.patch_size)
)
height = image_size[0] // patch_size[0]
width = image_size[1] // patch_size[1]
self.assertListEqual(
list(hidden_states[0].shape[-3:]),
[height, width, self.model_tester.embed_dim],
)
reshaped_hidden_states = outputs.reshaped_hidden_states
self.assertEqual(len(reshaped_hidden_states), expected_num_layers)
batch_size, num_channels, height, width = reshaped_hidden_states[0].shape
reshaped_hidden_states = (
reshaped_hidden_states[0].view(batch_size, num_channels, height, width).permute(0, 2, 3, 1)
)
self.assertListEqual(
list(reshaped_hidden_states.shape[-3:]),
[height, width, self.model_tester.embed_dim],
)
def test_hidden_states_output(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
image_size = (
self.model_tester.image_size
if isinstance(self.model_tester.image_size, collections.abc.Iterable)
else (self.model_tester.image_size, self.model_tester.image_size)
)
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
self.check_hidden_states_output(inputs_dict, config, model_class, image_size)
@slow
def test_model_from_pretrained(self):
for model_name in NAT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = NatModel.from_pretrained(model_name)
self.assertIsNotNone(model)
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if "embeddings" not in name and param.requires_grad:
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
@require_natten
@require_vision
@require_torch
class NatModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224") if is_vision_available() else None
@slow
def test_inference_image_classification_head(self):
model = NatForImageClassification.from_pretrained("shi-labs/nat-mini-in1k-224").to(torch_device)
feature_extractor = self.default_feature_extractor
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 1000))
self.assertEqual(outputs.logits.shape, expected_shape)
expected_slice = torch.tensor([0.3805, -0.8676, -0.3912]).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
@@ -66,6 +66,8 @@ src/transformers/models/deit/modeling_deit.py
src/transformers/models/deit/modeling_tf_deit.py
src/transformers/models/detr/configuration_detr.py
src/transformers/models/detr/modeling_detr.py
src/transformers/models/dinat/configuration_dinat.py
src/transformers/models/dinat/modeling_dinat.py
src/transformers/models/distilbert/configuration_distilbert.py
src/transformers/models/dpr/configuration_dpr.py
src/transformers/models/dpt/modeling_dpt.py
@@ -113,6 +115,8 @@ src/transformers/models/mobilebert/modeling_tf_mobilebert.py
src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py
src/transformers/models/mobilevit/modeling_mobilevit.py
src/transformers/models/mobilevit/modeling_tf_mobilevit.py
src/transformers/models/nat/configuration_nat.py
src/transformers/models/nat/modeling_nat.py
src/transformers/models/nezha/configuration_nezha.py
src/transformers/models/openai/configuration_openai.py
src/transformers/models/opt/configuration_opt.py
...