# coding=utf-8
# Copyright 2010, XXX authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" XXX model configuration """

from __future__ import absolute_import, division, print_function, unicode_literals

import json
import logging
import six
from io import open

from .configuration_utils import PretrainedConfig

logger = logging.getLogger(__name__)

XXX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json",
    'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json",
}


class XxxConfig(PretrainedConfig):
    r"""
        :class:`~transformers.XxxConfig` is the configuration class to store the configuration of a
        `XxxModel`.


        Arguments:
            vocab_size: Vocabulary size of `input_ids` in `XxxModel`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `XxxModel`.
            initializer_range: The standard deviation of the truncated_normal_initializer for
                initializing all weight matrices.
            layer_norm_eps: The epsilon used by LayerNorm.
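
        Example (a minimal usage sketch; `./saved_config` is a hypothetical
        directory and the keyword names follow the `__init__` signature below)::

            # Build a configuration with the default hyper-parameters
            config = XxxConfig()

            # Override selected values at construction time
            config = XxxConfig(vocab_size=32000, n_embd=1024, n_layer=24, n_head=16)

            # Saving and re-loading are inherited from `PretrainedConfig`
            config.save_pretrained('./saved_config')    # writes config.json
            config = XxxConfig.from_pretrained('./saved_config')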
    """
    pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self,
                 vocab_size=50257,
                 n_positions=1024,
                 n_ctx=1024,
                 n_embd=768,
                 n_layer=12,
                 n_head=12,
                 resid_pdrop=0.1,
                 embd_pdrop=0.1,
                 attn_pdrop=0.1,
                 layer_norm_epsilon=1e-5,
                 initializer_range=0.02,

                 num_labels=1,
                 summary_type='cls_index',
                 summary_use_proj=True,
                 summary_activation=None,
                 summary_proj_to_labels=True,
                 summary_first_dropout=0.1,
                 **kwargs):
        super(XxxConfig, self).__init__(**kwargs)
        self.vocab_size = vocab_size if isinstance(vocab_size, int) else -1
        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range

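        # The arguments below configure the optional sequence-summary head
        # (e.g. the `SequenceSummary` module used by classification heads).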
        self.num_labels = num_labels
        self.summary_type = summary_type
        self.summary_use_proj = summary_use_proj
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
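
        # Backwards-compatible loading path: when `vocab_size` is a string it is
        # treated as the path to a JSON config file whose key/value pairs are
        # copied onto this instance, overriding the defaults assigned above.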
        if isinstance(vocab_size, six.string_types):
            with open(vocab_size, "r", encoding="utf-8") as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif not isinstance(vocab_size, int):
            raise ValueError(
                "First argument must be either a vocabulary size (int) "
                "or the path to a pretrained model config file (str)"
            )

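    # Read-only aliases so code written against the generic configuration
    # attribute names (hidden_size, num_attention_heads, ...) also works with
    # the GPT-2 style names stored above.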
    @property
    def max_position_embeddings(self):
        return self.n_positions

    @property
    def hidden_size(self):
        return self.n_embd

    @property
    def num_attention_heads(self):
        return self.n_head

    @property
    def num_hidden_layers(self):
        return self.n_layer