# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch OpenAI GPT-2 model."""


import logging
import os
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from .activations import ACT2FN
from .configuration_gpt2 import GPT2Config
from .file_utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_callable,
    replace_return_docstrings,
)
from .modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from .modeling_utils import (
    Conv1D,
    PreTrainedModel,
    SequenceSummary,
    find_pruneable_heads_and_indices,
    prune_conv1d_layer,
)


logger = logging.getLogger(__name__)

_CONFIG_FOR_DOC = "GPT2Config"
_TOKENIZER_FOR_DOC = "GPT2Tokenizer"

GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "gpt2",
    "gpt2-medium",
    "gpt2-large",
    "gpt2-xl",
    "distilgpt2",
    # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
]


def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
    """ Load tf checkpoints in a pytorch model
    """
    try:
        import re
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(gpt2_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array.squeeze())

    for name, array in zip(names, arrays):
        name = name[6:]  # skip "model/"
        name = name.split("/")
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
                scope_names = re.split(r"(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "w" or scope_names[0] == "g":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "b":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
                pointer = getattr(pointer, scope_names[0])
                pointer = getattr(pointer, "weight")
            else:
                pointer = getattr(pointer, scope_names[0])
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        try:
            assert (
                pointer.shape == array.shape
            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model
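

# A minimal usage sketch for the loader above: converting an original OpenAI TensorFlow GPT-2
# checkpoint into the PyTorch model defined below. The helper name and both paths are
# hypothetical placeholders; a GPT2Config matching the checkpoint is assumed.
def _example_convert_tf_checkpoint(tf_ckpt_path="./gpt2-tf/model.ckpt", save_dir="./gpt2-pt"):
    config = GPT2Config()  # default (124M) GPT-2 hyper-parameters; adjust to the checkpoint
    model = GPT2Model(config)
    load_tf_weights_in_gpt2(model, config, tf_ckpt_path)  # copies the TF variables in place
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)  # writes pytorch_model.bin and config.json
    return model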


class Attention(nn.Module):
    def __init__(self, nx, n_ctx, config, scale=False, is_cross_attention=False):
        super().__init__()

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % config.n_head == 0
        self.register_buffer(
            "bias", torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(1, 1, n_ctx, n_ctx)
        )
        self.register_buffer("masked_bias", torch.tensor(-1e4))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
        self.is_cross_attention = is_cross_attention
        if self.is_cross_attention:
            self.c_attn = Conv1D(2 * n_state, nx)
            self.q_attn = Conv1D(n_state, nx)
        else:
            self.c_attn = Conv1D(3 * n_state, nx)
        self.c_proj = Conv1D(n_state, nx)
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_head, self.split_size // self.n_head, self.pruned_heads
        )
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

        # Prune conv1d layers
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

        # Update hyper params
        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
        self.n_head = self.n_head - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)

    def _attn(self, q, k, v, attention_mask=None, head_mask=None, output_attentions=False):
        w = torch.matmul(q, k)
        if self.scale:
            w = w / (float(v.size(-1)) ** 0.5)
        nd, ns = w.size(-2), w.size(-1)

        if not self.is_cross_attention:
            # only the "normal" self-attention layer applies the causal mask (cross-attention does not)
            mask = self.bias[:, :, ns - nd : ns, :ns]
            w = torch.where(mask.bool(), w, self.masked_bias.to(w.dtype))

        if attention_mask is not None:
            # Apply the attention mask
            w = w + attention_mask

        w = nn.Softmax(dim=-1)(w)
        w = self.attn_dropout(w)

        # Mask heads if we want to
        if head_mask is not None:
            w = w * head_mask

        outputs = [torch.matmul(w, v)]
        if output_attentions:
            outputs.append(w)
        return outputs

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states

    def split_heads(self, x, k=False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
        if k:
            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
        else:
            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        if encoder_hidden_states is not None:
            assert hasattr(
                self, "q_attn"
            ), "If class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `Attention(..., is_cross_attention=True)`."
            query = self.q_attn(hidden_states)
            key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
            attention_mask = encoder_attention_mask
        else:
            query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)

        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        if layer_past is not None:
            past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]  # transpose back cf below
            key = torch.cat((past_key, key), dim=-1)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
        else:
            present = (None,)

        attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions)
        a = attn_outputs[0]

        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)

        outputs = [a, present] + attn_outputs[1:]
        return outputs  # a, present, (attentions)
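

# A minimal shape sketch for the Attention block above, assuming the default GPT2Config
# (n_embd=768, n_head=12, n_ctx=1024) and random inputs; the helper name is a placeholder.
def _example_attention_shapes():
    config = GPT2Config()
    attn = Attention(config.n_embd, config.n_ctx, config, scale=True)
    hidden = torch.randn(2, 5, config.n_embd)  # (batch, seq_len, n_embd)
    a, present = attn(hidden, use_cache=True)[:2]
    # a is merged back to (2, 5, 768); present stacks the (re-transposed) key and value so it
    # can be fed back as `layer_past`: (2, batch, n_head, seq_len, head_dim) = (2, 2, 12, 5, 64)
    return a.shape, present.shape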


class MLP(nn.Module):
    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
        super().__init__()
        nx = config.n_embd
        self.c_fc = Conv1D(n_state, nx)
        self.c_proj = Conv1D(nx, n_state)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, x):
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
        return self.dropout(h2)


class Block(nn.Module):
    def __init__(self, n_ctx, config, scale=False):
        super().__init__()
        hidden_size = config.n_embd
        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = Attention(hidden_size, n_ctx, config, scale)
        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        if config.add_cross_attention:
            self.crossattention = Attention(hidden_size, n_ctx, config, scale, is_cross_attention=True)
            self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.mlp = MLP(inner_dim, config)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        attn_outputs = self.attn(
            self.ln_1(hidden_states),
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + hidden_states

        if encoder_hidden_states is not None:
            # add one self-attention block for cross-attention
            assert hasattr(
                self, "crossattention"
            ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
            cross_attn_outputs = self.crossattention(
                self.ln_cross_attn(hidden_states),
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
            attn_output = cross_attn_outputs[0]
            # residual connection
            hidden_states = hidden_states + attn_output
            outputs = outputs + cross_attn_outputs[1:]  # add cross attentions if we output attention weights

        feed_forward_hidden_states = self.mlp(self.ln_2(hidden_states))
        # residual connection
        hidden_states = hidden_states + feed_forward_hidden_states

        outputs = [hidden_states] + outputs
        return outputs  # hidden_states, present, (cross_attentions, attentions)
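

# A minimal sketch of a full Block pass, assuming the default GPT2Config and random inputs:
# layer norm -> attention -> residual, then layer norm -> MLP -> residual, so the hidden
# size is preserved. The helper name is a placeholder.
def _example_block_shapes():
    config = GPT2Config()
    block = Block(config.n_ctx, config, scale=True)
    hidden = torch.randn(1, 7, config.n_embd)
    hidden_out, present = block(hidden, use_cache=True)[:2]
    return hidden_out.shape  # torch.Size([1, 7, 768])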


class GPT2PreTrainedModel(PreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """

    config_class = GPT2Config
    load_tf_weights = load_tf_weights_in_gpt2
    base_model_prefix = "transformer"

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@dataclass
class GPT2DoubleHeadsModelOutput(ModelOutput):
    """
    Base class for outputs of the GPT-2 model with a language modeling head and a multiple-choice classification head.

    Args:
        lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
            Language modeling loss.
        mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
            Multiple choice classification loss.
        lm_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
        past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
            List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`,  with each tensor of shape
            :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            ``past_key_values`` input) to speed up sequential decoding.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    lm_loss: Optional[torch.FloatTensor] = None
    mc_loss: Optional[torch.FloatTensor] = None
    lm_logits: torch.FloatTensor = None
    mc_logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


GPT2_START_DOCSTRING = r"""

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
    usage and behavior.

    Parameters:
        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""

GPT2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
            :obj:`input_ids_length` = ``sequence_length`` if ``past_key_values`` is ``None`` else
            ``past_key_values[0].shape[-2]`` (``sequence_length`` of input past key value states).
            Indices of input sequence tokens in the vocabulary.

            If ``past_key_values`` is used, only ``input_ids`` that do not have their past calculated should be passed
            as ``input_ids``.

            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.__call__` for details.

            `What are input IDs? <../glossary.html#input-ids>`__

        past_key_values (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see ``past_key_values`` output below). Can be used to speed up sequential decoding.
            The ``input_ids`` which have their past given to this model should not be passed as ``input_ids`` as they have already been computed.
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`, defaults to :obj:`None`):
            ``input_ids_length`` = ``sequence_length`` if ``past_key_values`` is ``None`` else ``1``
            Segment token indices to indicate first and second portions of the inputs.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token
            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
            If ``past_key_values`` is used, optionally only the last `inputs_embeds` have to be input (see ``past_key_values``).
        use_cache (:obj:`bool`):
            If `use_cache` is True, ``past_key_values`` key value states are returned and can be used to speed up decoding (see ``past_key_values``). Defaults to `True`.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
            plain tuple.
"""


@add_start_docstrings(
    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
    GPT2_START_DOCSTRING,
)
class GPT2Model(GPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        self.init_weights()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.h[layer].attn.prune_heads(heads)

    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="gpt2",
        output_type=BaseModelOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])

        if past_key_values is None:
            past_length = 0
            past_key_values = [None] * len(self.h)
        else:
            past_length = past_key_values[0][0].size(-2)
        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

        # Attention mask.
        if attention_mask is not None:
            assert batch_size > 0, "batch_size has to be defined and > 0"
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is simpler than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.add_cross_attention and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # head_mask has shape n_layer x batch x n_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
        else:
            token_type_embeds = 0
        hidden_states = inputs_embeds + position_embeds + token_type_embeds
        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        presents = () if use_cache else None
        all_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)

            outputs = block(
                hidden_states,
                layer_past=layer_past,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )

            hidden_states, present = outputs[:2]
            if use_cache is True:
                presents = presents + (present,)

            if output_attentions:
                all_attentions = all_attentions + (outputs[2],)

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(*output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )
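

# A minimal caching sketch for the model above, assuming the public "gpt2" checkpoint: the
# second call re-uses `past_key_values`, so only the newly appended token is processed.
# The helper name is a placeholder.
def _example_past_key_values():
    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2Model.from_pretrained("gpt2")
    input_ids = tokenizer("The quick brown fox", return_tensors="pt")["input_ids"]
    first = model(input_ids=input_ids, use_cache=True, return_dict=True)
    next_token = input_ids[:, -1:]  # stand-in for a newly generated token
    second = model(input_ids=next_token, past_key_values=first.past_key_values, return_dict=True)
    return second.last_hidden_state.shape  # (1, 1, 768): only the new position is computed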


@add_start_docstrings(
    """The GPT2 Model transformer with a language modeling head on top
    (linear layer with weights tied to the input embeddings). """,
    GPT2_START_DOCSTRING,
)
class GPT2LMHeadModel(GPT2PreTrainedModel):
    authorized_missing_keys = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        # only keep the last token of input_ids if past is defined in kwargs
        if past:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "use_cache": kwargs.get("use_cache"),
        }

    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="gpt2",
        output_type=CausalLMOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``
        """
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
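

# A minimal loss sketch for the model above, assuming the public "gpt2" checkpoint: passing
# `labels=input_ids` is enough for causal LM training because the one-position shift between
# logits and labels happens inside `forward`. The helper name is a placeholder.
def _example_lm_loss():
    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
    out = model(input_ids=input_ids, labels=input_ids, return_dict=True)
    return out.loss, out.logits.shape  # scalar loss, (1, seq_len, vocab_size)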


@add_start_docstrings(
    """The GPT2 Model transformer with a language modeling and a multiple-choice classification
    head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
    The language modeling head has its weights tied to the input embeddings,
    the classification head takes as input the hidden state of a specified classification token in the input sequence.
""",
    GPT2_START_DOCSTRING,
)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        config.num_labels = 1
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head

    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        mc_token_ids=None,
        labels=None,
        mc_labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
            Index of the classification token in each input sequence.
            Selected in the range ``[0, input_ids.size(-1) - 1]``.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``
        mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

    Return:

    Examples::

        >>> import torch
        >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel

        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2', return_dict=True)

        >>> # Add a [CLS] to the vocabulary (we should train it also!)
        >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})

        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size

        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

        >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
        >>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1

        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
        >>> lm_logits = outputs.lm_logits
        >>> mc_logits = outputs.mc_logits

        """
        if "lm_labels" in kwargs:
            warnings.warn(
                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("lm_labels")
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)
        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)

        mc_loss = None
        if mc_labels is not None:
            loss_fct = CrossEntropyLoss()
            mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
        lm_loss = None
        if labels is not None:
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits, mc_logits) + transformer_outputs[1:]
            if mc_loss is not None:
                output = (mc_loss,) + output
            return ((lm_loss,) + output) if lm_loss is not None else output

        return GPT2DoubleHeadsModelOutput(
            lm_loss=lm_loss,
            mc_loss=mc_loss,
            lm_logits=lm_logits,
            mc_logits=mc_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
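

# A minimal double-heads training sketch with a tiny randomly initialised configuration (the
# hyper-parameters and helper name are placeholders chosen only to keep the example small):
# two candidate endings are scored and the multiple-choice head is trained with `mc_labels`.
def _example_double_heads_loss():
    config = GPT2Config(n_embd=128, n_layer=2, n_head=2)
    model = GPT2DoubleHeadsModel(config)
    input_ids = torch.randint(0, config.vocab_size, (1, 2, 10))  # (batch, num_choices, seq_len)
    mc_token_ids = torch.tensor([[9, 9]])  # classify on the last position of each choice
    mc_labels = torch.tensor([0])  # the first choice is treated as the correct one
    out = model(input_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, return_dict=True)
    return out.mc_loss, out.mc_logits.shape  # scalar loss, (batch, num_choices) == (1, 2)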