"official/legacy/bert/README.md" did not exist on "e2293a971ff750b0a67dd7df6dcfff92dc341ad5"
bert_hubconf.py 16.9 KB
Newer Older
thomwolf's avatar
thomwolf committed
1
2
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.modeling_bert import (
        BertModel,
        BertForNextSentencePrediction,
        BertForMaskedLM,
        BertForMultipleChoice,
        BertForPreTraining,
        BertForQuestionAnswering,
        BertForSequenceClassification,
        BertForTokenClassification,
        )

# A lot of models share the same param doc. Use a decorator
# to save typing
bert_docstring = """
    Params:
        pretrained_model_name_or_path: either:
            - a str with the name of a pre-trained model to load
                . `bert-base-uncased`
                . `bert-large-uncased`
                . `bert-base-cased`
                . `bert-large-cased`
                . `bert-base-multilingual-uncased`
                . `bert-base-multilingual-cased`
                . `bert-base-chinese`
                . `bert-base-german-cased`
                . `bert-large-uncased-whole-word-masking`
                . `bert-large-cased-whole-word-masking`
            - a path or url to a pretrained model archive containing:
                . `bert_config.json` a configuration file for the model
                . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
                  instance
            - a path or url to a pretrained model archive containing:
                . `bert_config.json` a configuration file for the model
                . `model.chkpt` a TensorFlow checkpoint
        from_tf: should we load the weights from a locally saved TensorFlow
                 checkpoint
        cache_dir: an optional path to a folder in which the pre-trained models
                   will be cached.
        state_dict: an optional state dictionary
                    (collections.OrderedDict object) to use instead of Google
                    pre-trained models
        *inputs, **kwargs: additional input for the specific Bert class
            (ex: num_labels for BertForSequenceClassification)
"""


def _append_from_pretrained_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator
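
# The model entry points below are thin wrappers around the corresponding
# class's `from_pretrained`; the decorator appends the shared `bert_docstring`
# so each wrapper documents the full set of loading parameters.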


def bertTokenizer(*args, **kwargs):
    """
    Instantiate a BertTokenizer from a pre-trained/customized vocab file
    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * bert-base-uncased
                                       * bert-large-uncased
                                       * bert-base-cased
                                       * bert-large-cased
                                       * bert-base-multilingual-uncased
                                       * bert-base-multilingual-cased
                                       * bert-base-chinese
    Keyword args:
    cache_dir: an optional path to a specific directory to download and cache
               the pre-trained model weights.
               Default: None
    do_lower_case: Whether to lower case the input.
                   Only has an effect when do_basic_tokenize=True
                   Default: True
    do_basic_tokenize: Whether to do basic tokenization before wordpiece.
                       Default: True
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying BERT model's
             sequence length.
             Default: None
    never_split: List of tokens which will never be split during tokenization.
                 Only has an effect when do_basic_tokenize=True
                 Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    Example:
        >>> import torch
        >>> sentence = 'Hello, World!'
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        >>> toks = tokenizer.tokenize(sentence)
        ['Hello', '##,', 'World', '##!']
        >>> ids = tokenizer.convert_tokens_to_ids(toks)
        [8667, 28136, 1291, 28125]
    """
    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
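
# A hedged sketch of forwarding the optional keyword arguments documented above
# through the hub entry point; the values shown are illustrative only:
#     >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer',
#     ...                            'bert-base-uncased', do_lower_case=True, max_len=128)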


@_append_from_pretrained_docstring(bert_docstring)
def bertModel(*args, **kwargs):
    """
    BertModel is the basic BERT Transformer model with a layer of summed token,
    position and segment (token-type) embeddings followed by a series of
    identical self-attention blocks (12 for BERT-base, 24 for BERT-large).

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertModel
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
        >>> model.eval()
        # Predict hidden states features for each layer
        >>> with torch.no_grad():
                encoded_layers, _ = model(tokens_tensor, segments_tensors)
    """
    model = BertModel.from_pretrained(*args, **kwargs)
    return model


@_append_from_pretrained_docstring(bert_docstring)
def bertForNextSentencePrediction(*args, **kwargs):
    """
    BERT model with next sentence prediction head.
    This module comprises the BERT model followed by the next sentence
    classification head.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForNextSentencePrediction
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
        >>> model.eval()
        # Predict the next sentence classification logits
        >>> with torch.no_grad():
                next_sent_classif_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
    return model
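
# Interpreting the scores from the example above: the next-sentence
# classification head produces two logits per input; following BERT's
# pre-training setup, index 0 means "sentence B follows sentence A" and
# index 1 means "sentence B is random".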


@_append_from_pretrained_docstring(bert_docstring)
def bertForPreTraining(*args, **kwargs):
    """
    BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads
        - the masked language modeling head, and
        - the next sentence classification head.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForPreTraining
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
        >>> model.eval()
        # Predict the masked LM logits and the next-sentence relationship logits
        >>> with torch.no_grad():
                masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
    return model
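
# A hedged sketch of obtaining the combined pre-training loss instead of the
# raw logits, assuming the forward pass accepts the `masked_lm_labels` and
# `next_sentence_label` keyword arguments of the underlying BertForPreTraining
# class (the label values below are illustrative only):
#     >>> masked_lm_labels = torch.tensor([indexed_tokens])
#     >>> next_sentence_label = torch.tensor([0])
#     >>> loss = model(tokens_tensor, segments_tensors,
#     ...              masked_lm_labels=masked_lm_labels,
#     ...              next_sentence_label=next_sentence_label)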


@_append_from_pretrained_docstring(bert_docstring)
def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> masked_index = 8
        >>> tokenized_text[masked_index] = '[MASK]'
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForMaskedLM
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
        >>> model.eval()
        # Predict all tokens
        >>> with torch.no_grad():
                predictions = model(tokens_tensor, segments_tensors)
        >>> predicted_index = torch.argmax(predictions[0, masked_index]).item()
        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        'henson'
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model


@_append_from_pretrained_docstring(bert_docstring)
def bertForSequenceClassification(*args, **kwargs):
    """
    BertForSequenceClassification is a fine-tuning model that includes
    BertModel and a sequence-level (sequence or pair of sequences) classifier
    on top of the BertModel. Note that the classification head is only initialized
    and has to be trained.

    The sequence-level classifier is a linear layer that takes as input the
    last hidden state of the first token ([CLS]) in the input sequence
    (see Figures 3a and 3b in the BERT paper).

    Args:
    num_labels: the number (>=2) of classes for the classifier.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForSequenceClassification
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
        >>> model.eval()
        # Predict the sequence classification logits
        >>> with torch.no_grad():
                seq_classif_logits = model(tokens_tensor, segments_tensors)
        # Or get the sequence classification loss
        >>> labels = torch.tensor([1])
        >>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
    """
    model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
    return model


@_append_from_pretrained_docstring(bert_docstring)
def bertForMultipleChoice(*args, **kwargs):
    """
    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
    linear layer on top of the BertModel. Note that the multiple choice head is
    only initialized and has to be trained.

    Args:
    num_choices: the number (>=2) of choices scored by the classifier.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
        >>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
        # Load bertForMultipleChoice
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
        >>> model.eval()
        # Predict the multiple choice logits
        >>> with torch.no_grad():
                multiple_choice_logits = model(tokens_tensor, segments_tensors)
        # Or get the multiple choice loss
        >>> labels = torch.tensor([1])
        >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
    """
    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
    return model


@_append_from_pretrained_docstring(bert_docstring)
def bertForQuestionAnswering(*args, **kwargs):
    """
    BertForQuestionAnswering is a fine-tuning model that includes BertModel
    with token-level classifiers on top of the full sequence of last hidden
    states. Note that the classification head is only initialized
    and has to be trained.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForQuestionAnswering
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
        >>> model.eval()
        # Predict the start and end positions logits
        >>> with torch.no_grad():
                start_logits, end_logits = model(tokens_tensor, segments_tensors)
        # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions
        >>> start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
        # set model.train() before if training this loss
        >>> total_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
    """
    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
    return model


@_append_from_pretrained_docstring(bert_docstring)
def bertForTokenClassification(*args, **kwargs):
    """
    BertForTokenClassification is a fine-tuning model that includes BertModel
    and a token-level classifier on top of the BertModel. Note that the classification
    head is only initialized and has to be trained.

    The token-level classifier is a linear layer that takes as input the last
    hidden state of the sequence.

    Args:
    num_labels: the number (>=2) of classes for the classifier.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForTokenClassification
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
        >>> model.eval()
        # Predict the token classification logits
        >>> with torch.no_grad():
                classif_logits = model(tokens_tensor, segments_tensors)
        # Or get the token classification loss
        >>> labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
        >>> classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
    """
    model = BertForTokenClassification.from_pretrained(*args, **kwargs)
    return model