configuration_bert.py 10 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT model configuration """
17
18
from collections import OrderedDict
from typing import Mapping
19

Sylvain Gugger's avatar
Sylvain Gugger committed
20
from ...configuration_utils import PretrainedConfig
21
from ...onnx import OnnxConfig
Sylvain Gugger's avatar
Sylvain Gugger committed
22
from ...utils import logging
23

Aymeric Augustin's avatar
Aymeric Augustin committed
24

Lysandre Debut's avatar
Lysandre Debut committed
25
# Module-level logger, named after this module and obtained through the package's own
# `logging` helper (imported above from `...utils`), not the stdlib `logging` module.
logger = logging.get_logger(__name__)
26
27

# Maps each known BERT checkpoint identifier to the URL of its `config.json`.
# Every URL has the form https://huggingface.co/<identifier>/resolve/main/config.json.
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json",
    "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json",
    "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json",
    "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json",
    "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json",
    "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json",
    "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json",
    "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json",
    "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json",
    "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json",
    "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json",
    "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json",
    "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json",
    "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json",
    "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese-char": "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json",
    "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json",
    "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json",
    "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json",
    "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json",
    # See all BERT models at https://huggingface.co/models?filter=bert
}


class BertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.BertModel` or a
    :class:`~transformers.TFBertModel`. It is used to instantiate a BERT model according to the specified arguments,
    defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
    to that of the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.


    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
            :obj:`input_ids` passed when calling :class:`~transformers.BertModel` or
            :class:`~transformers.TFBertModel`.
        hidden_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
            :class:`~transformers.TFBertModel`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        pad_token_id (:obj:`int`, `optional`, defaults to 0):
            Forwarded unchanged to the :class:`~transformers.PretrainedConfig` constructor as ``pad_token_id``; it is
            not stored by this class directly.
        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
            <https://arxiv.org/abs/2009.13658>`__.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if ``config.is_decoder=True``.
        classifier_dropout (:obj:`float`, `optional`):
            The dropout ratio for the classification head.
        kwargs:
            Any remaining keyword arguments are forwarded unchanged to the :class:`~transformers.PretrainedConfig`
            constructor.

    Examples::

        >>> from transformers import BertModel, BertConfig

        >>> # Initializing a BERT bert-base-uncased style configuration
        >>> configuration = BertConfig()

        >>> # Initializing a model from the bert-base-uncased style configuration
        >>> model = BertModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
    model_type = "bert"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        use_cache=True,
        classifier_dropout=None,
        **kwargs
    ):
        # The base class receives pad_token_id and every extra keyword argument;
        # everything else is stored as a plain attribute on this instance below.
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175


class BertOnnxConfig(OnnxConfig):
    """
    :class:`OnnxConfig` subclass for BERT that names the axes of the model's inputs and outputs.

    Both properties return a new ordered mapping on every access, keyed by tensor name, whose values map an axis
    index to an axis name.
    """

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """
        Ordered mapping of the three BERT input names to their axis names.

        Returns:
            :obj:`Mapping[str, Mapping[int, str]]`: ``input_ids``, ``attention_mask`` and ``token_type_ids`` (in that
            order), each mapped to ``{0: "batch", 1: "sequence"}``.
        """
        axes_by_input = OrderedDict()
        for input_name in ("input_ids", "attention_mask", "token_type_ids"):
            # A fresh dict per input so the entries never share one object.
            axes_by_input[input_name] = {0: "batch", 1: "sequence"}
        return axes_by_input

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """
        Ordered mapping of the two BERT output names to their axis names.

        Returns:
            :obj:`Mapping[str, Mapping[int, str]]`: ``last_hidden_state`` mapped to ``{0: "batch", 1: "sequence"}``
            followed by ``pooler_output`` mapped to ``{0: "batch"}``.
        """
        axes_by_output = OrderedDict()
        axes_by_output["last_hidden_state"] = {0: "batch", 1: "sequence"}
        axes_by_output["pooler_output"] = {0: "batch"}
        return axes_by_output