configuration_beit.py 9.42 KB
Newer Older
NielsRogge's avatar
NielsRogge committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# coding=utf-8
# Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Sylvain Gugger's avatar
Sylvain Gugger committed
15
""" BEiT model configuration"""
Jim Rohrer's avatar
Jim Rohrer committed
16
17
18
19
from collections import OrderedDict
from typing import Mapping

from packaging import version
NielsRogge's avatar
NielsRogge committed
20
21

from ...configuration_utils import PretrainedConfig
Jim Rohrer's avatar
Jim Rohrer committed
22
from ...onnx import OnnxConfig
NielsRogge's avatar
NielsRogge committed
23
24
25
26
27
28
from ...utils import logging


# Module-level logger named after this module, obtained through the package's own `logging` utility.
logger = logging.get_logger(__name__)

# Maps a pretrained checkpoint identifier to the URL of its hosted `config.json`.
BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/beit-base-patch16-224-pt22k": "https://huggingface.co/microsoft/beit-base-patch16-224-pt22k/resolve/main/config.json",
    # See all BEiT models at https://huggingface.co/models?filter=beit
}


class BeitConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`BeitModel`]. It is used to instantiate a BEiT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the BEiT
    [microsoft/beit-base-patch16-224-pt22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k) architecture.

    Args:
        vocab_size (`int`, *optional*, defaults to 8192):
            Vocabulary size of the BEiT model. Defines the number of different image tokens that can be used during
            pre-training.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        use_absolute_position_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to use BERT-style absolute position embeddings.
        use_relative_position_bias (`bool`, *optional*, defaults to `False`):
            Whether to use T5-style relative position embeddings in the self-attention layers.
        use_shared_relative_position_bias (`bool`, *optional*, defaults to `False`):
            Whether to use the same relative position embeddings across all self-attention layers of the Transformer.
        layer_scale_init_value (`float`, *optional*, defaults to 0.1):
            Scale to use in the self-attention layers. 0.1 for base, 1e-5 for large. Set 0 to disable layer scale.
        drop_path_rate (`float`, *optional*, defaults to 0.1):
            Stochastic depth rate per sample (when applied in the main path of residual layers).
        use_mean_pooling (`bool`, *optional*, defaults to `True`):
            Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the
            CLS token, before applying the classification head.
        out_indices (`List[int]`, *optional*, defaults to `[3, 5, 7, 11]`):
            Indices of the feature maps to use for semantic segmentation. When left unset (or `None`), a fresh
            `[3, 5, 7, 11]` list is created for this configuration.
        pool_scales (`Tuple[int]`, *optional*, defaults to `[1, 2, 3, 6]`):
            Pooling scales used in Pooling Pyramid Module applied on the last feature map. When left unset (or
            `None`), a fresh `[1, 2, 3, 6]` list is created for this configuration.
        use_auxiliary_head (`bool`, *optional*, defaults to `True`):
            Whether to use an auxiliary head during training.
        auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
            Weight of the cross-entropy loss of the auxiliary head.
        auxiliary_channels (`int`, *optional*, defaults to 256):
            Number of channels to use in the auxiliary head.
        auxiliary_num_convs (`int`, *optional*, defaults to 1):
            Number of convolutional layers to use in the auxiliary head.
        auxiliary_concat_input (`bool`, *optional*, defaults to `False`):
            Whether to concatenate the output of the auxiliary head with the input before the classification layer.
        semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
            The index that is ignored by the loss function of the semantic segmentation model.

    Example:

    ```python
    >>> from transformers import BeitConfig, BeitModel

    >>> # Initializing a BEiT beit-base-patch16-224-pt22k style configuration
    >>> configuration = BeitConfig()

    >>> # Initializing a model (with random weights) from the beit-base-patch16-224-pt22k style configuration
    >>> model = BeitModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "beit"

    def __init__(
        self,
        vocab_size=8192,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        image_size=224,
        patch_size=16,
        num_channels=3,
        use_mask_token=False,
        use_absolute_position_embeddings=False,
        use_relative_position_bias=False,
        use_shared_relative_position_bias=False,
        layer_scale_init_value=0.1,
        drop_path_rate=0.1,
        use_mean_pooling=True,
        out_indices=None,
        pool_scales=None,
        use_auxiliary_head=True,
        auxiliary_loss_weight=0.4,
        auxiliary_channels=256,
        auxiliary_num_convs=1,
        auxiliary_concat_input=False,
        semantic_loss_ignore_index=255,
        **kwargs,
    ):
        # Any keyword arguments not named above are forwarded to the base configuration class.
        super().__init__(**kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.use_mask_token = use_mask_token
        self.use_absolute_position_embeddings = use_absolute_position_embeddings
        self.use_relative_position_bias = use_relative_position_bias
        self.use_shared_relative_position_bias = use_shared_relative_position_bias
        self.layer_scale_init_value = layer_scale_init_value
        self.drop_path_rate = drop_path_rate
        self.use_mean_pooling = use_mean_pooling
        # decode head attributes (semantic segmentation)
        # The list defaults are built per instance: a list literal in the signature would be a single
        # object shared by every default-constructed config, so mutating one would alter all the others.
        self.out_indices = [3, 5, 7, 11] if out_indices is None else out_indices
        self.pool_scales = [1, 2, 3, 6] if pool_scales is None else pool_scales
        # auxiliary head attributes (semantic segmentation)
        self.use_auxiliary_head = use_auxiliary_head
        self.auxiliary_loss_weight = auxiliary_loss_weight
        self.auxiliary_channels = auxiliary_channels
        self.auxiliary_num_convs = auxiliary_num_convs
        self.auxiliary_concat_input = auxiliary_concat_input
        self.semantic_loss_ignore_index = semantic_loss_ignore_index
Jim Rohrer's avatar
Jim Rohrer committed
185
186
187
188
189
190
191
192
193
194


# Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig
class BeitOnnxConfig(OnnxConfig):
    """ONNX configuration for BEiT: declares the model input axes and a validation tolerance."""

    # Parsed version object for "1.11"; judging by the attribute name this is the minimum torch version
    # for ONNX export -- how it is enforced is up to `OnnxConfig`, which is not visible here.
    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return an ordered mapping from input name to its `{axis index: axis name}` description."""
        pixel_values_axes = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
        return OrderedDict(pixel_values=pixel_values_axes)

    @property
    def atol_for_validation(self) -> float:
        """Return the absolute tolerance value (1e-4) exposed as `atol_for_validation`."""
        return 1e-4