# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch OpenAI GPT-2 model."""

import os
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss

from .activations import ACT2FN
from .configuration_gpt2 import GPT2Config
from .file_utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from .modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithPastAndCrossAttentions,
    SequenceClassifierOutputWithPast,
)
from .modeling_utils import (
    Conv1D,
    PreTrainedModel,
    SequenceSummary,
    find_pruneable_heads_and_indices,
    prune_conv1d_layer,
)
from .utils import logging


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "GPT2Config"
_TOKENIZER_FOR_DOC = "GPT2Tokenizer"

GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "gpt2",
    "gpt2-medium",
    "gpt2-large",
    "gpt2-xl",
    "distilgpt2",
    # See all GPT-2 models at https://huggingface.co/models?filter=gpt2
]


def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
    """Load tf checkpoints in a pytorch model"""
    try:
        import re

        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(gpt2_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array.squeeze())

    for name, array in zip(names, arrays):
        name = name[6:]  # skip "model/"
        name = name.split("/")
        pointer = model
        for m_name in name:
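            # Map TF variable scope names onto PyTorch module attributes:
            # "w"/"g" -> weight, "b" -> bias, "wte"/"wpe" -> the corresponding embedding's weight.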
            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
                scope_names = re.split(r"(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "w" or scope_names[0] == "g":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "b":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
                pointer = getattr(pointer, scope_names[0])
                pointer = getattr(pointer, "weight")
            else:
                pointer = getattr(pointer, scope_names[0])
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        try:
            assert (
                pointer.shape == array.shape
            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model


class Attention(nn.Module):
    def __init__(self, nx, n_ctx, config, scale=False, is_cross_attention=False):
        super().__init__()

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % config.n_head == 0
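        # "bias" is a fixed lower-triangular (1, 1, n_ctx, n_ctx) causal mask: 1 where a query
        # may attend, 0 for future positions. "masked_bias" is the large negative value written
        # into masked attention scores before the softmax.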
        self.register_buffer(
            "bias", torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(1, 1, n_ctx, n_ctx)
        )
        self.register_buffer("masked_bias", torch.tensor(-1e4))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
        self.is_cross_attention = is_cross_attention
        if self.is_cross_attention:
            self.c_attn = Conv1D(2 * n_state, nx)
            self.q_attn = Conv1D(n_state, nx)
        else:
            self.c_attn = Conv1D(3 * n_state, nx)
        self.c_proj = Conv1D(n_state, nx)
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_head, self.split_size // self.n_head, self.pruned_heads
        )
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

        # Prune conv1d layers
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

        # Update hyper params
        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
        self.n_head = self.n_head - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)

    def _attn(self, q, k, v, attention_mask=None, head_mask=None, output_attentions=False):
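        # Dot-product attention, optionally scaled by sqrt(head_dim). q is
        # (batch, head, seq_q, head_dim) and k is already transposed to
        # (batch, head, head_dim, seq_k), so the matmul yields scores of shape
        # (batch, head, seq_q, seq_k).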
        w = torch.matmul(q, k)
        if self.scale:
            w = w / (float(v.size(-1)) ** 0.5)
        nd, ns = w.size(-2), w.size(-1)

        if not self.is_cross_attention:
            # if only "normal" attention layer implements causal mask
            mask = self.bias[:, :, ns - nd : ns, :ns]
            w = torch.where(mask.bool(), w, self.masked_bias.to(w.dtype))

        if attention_mask is not None:
            # Apply the attention mask
            w = w + attention_mask

        w = nn.Softmax(dim=-1)(w)
        w = self.attn_dropout(w)

        # Mask heads if we want to
        if head_mask is not None:
            w = w * head_mask

        outputs = [torch.matmul(w, v)]
        if output_attentions:
            outputs.append(w)
        return outputs

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states

    def split_heads(self, x, k=False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
        if k:
            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
        else:
            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
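        # Self-attention computes Q, K, V from hidden_states with a single fused Conv1D (c_attn);
        # cross-attention takes Q from hidden_states and K, V from the encoder hidden states instead.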
        if encoder_hidden_states is not None:
            assert hasattr(
                self, "q_attn"
            ), "If class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `Attention(..., is_cross_attention=True)`."
            query = self.q_attn(hidden_states)
            key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
            attention_mask = encoder_attention_mask
        else:
            query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)

        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        if layer_past is not None:
            past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]  # transpose back cf below
            key = torch.cat((past_key, key), dim=-1)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
        else:
            present = (None,)

        attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions)
        a = attn_outputs[0]

        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)

        outputs = [a, present] + attn_outputs[1:]
        return outputs  # a, present, (attentions)


class MLP(nn.Module):
    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
        super().__init__()
        nx = config.n_embd
        self.c_fc = Conv1D(n_state, nx)
        self.c_proj = Conv1D(nx, n_state)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, x):
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
        return self.dropout(h2)


class Block(nn.Module):
    def __init__(self, n_ctx, config, scale=False):
        super().__init__()
        hidden_size = config.n_embd
        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = Attention(hidden_size, n_ctx, config, scale)
        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
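        # Optional cross-attention sub-block (with its own LayerNorm), used when GPT-2 is wired
        # as a decoder on top of an encoder (config.add_cross_attention=True).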
        if config.add_cross_attention:
            self.crossattention = Attention(hidden_size, n_ctx, config, scale, is_cross_attention=True)
            self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.mlp = MLP(inner_dim, config)

    def forward(
        self,
        hidden_states,
        layer_past=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=False,
        output_attentions=False,
    ):
        attn_outputs = self.attn(
            self.ln_1(hidden_states),
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + hidden_states

        if encoder_hidden_states is not None:
            # add one self-attention block for cross-attention
            assert hasattr(
                self, "crossattention"
            ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
            cross_attn_outputs = self.crossattention(
                self.ln_cross_attn(hidden_states),
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
            attn_output = cross_attn_outputs[0]
            # residual connection
            hidden_states = hidden_states + attn_output
            outputs = outputs + cross_attn_outputs[2:]  # add cross attentions if we output attention weights

        feed_forward_hidden_states = self.mlp(self.ln_2(hidden_states))
        # residual connection
        hidden_states = hidden_states + feed_forward_hidden_states

        outputs = [hidden_states] + outputs
        return outputs  # hidden_states, present, (attentions, cross_attentions)


class GPT2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = GPT2Config
    load_tf_weights = load_tf_weights_in_gpt2
    base_model_prefix = "transformer"

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@dataclass
class GPT2DoubleHeadsModelOutput(ModelOutput):
    """
    Base class for outputs of :class:`~transformers.GPT2DoubleHeadsModel`: a language modeling head and a
    multiple-choice classification head on top of the transformer.

    Args:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
            Language modeling loss.
        mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
            Multiple choice classification loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
        past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
            List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
            batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            :obj:`past_key_values` input) to speed up sequential decoding.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    mc_loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    mc_logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


GPT2_START_DOCSTRING = r"""

    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
    general usage and behavior.

    Parameters:
        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
            weights.
"""

GPT2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`):
            :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else
            ``past_key_values[0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be
            passed as ``input_ids``.

            Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            details.

            `What are input IDs? <../glossary.html#input-ids>`__
        past_key_values (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
            Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
            :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which
            have their past given to this model should not be passed as ``input_ids`` as they have already been
            computed.
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
            1]``:

            - 0 corresponds to a `sentence A` token,
            - 1 corresponds to a `sentence B` token.

            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
            config.max_position_embeddings - 1]``.

            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
            vectors than the model's internal embedding lookup matrix.

            If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see
            :obj:`past_key_values`).
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        output_attentions (:obj:`bool`, `optional`):
            Whether or not to return the attention tensors of all attention layers. See ``attentions`` under returned
            tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`):
            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
            more detail.
        return_dict (:obj:`bool`, `optional`):
            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
    GPT2_START_DOCSTRING,
)
class GPT2Model(GPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        self.init_weights()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.h[layer].attn.prune_heads(heads)

    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="gpt2",
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])

        if past_key_values is None:
            past_length = 0
            past_key_values = [None] * len(self.h)
        else:
            past_length = past_key_values[0][0].size(-2)
        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

        # Attention mask.
        if attention_mask is not None:
            assert batch_size > 0, "batch_size has to be defined and > 0"
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.add_cross_attention and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # head_mask has shape n_layer x batch x n_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
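
            # Gradient checkpointing (config.gradient_checkpointing) trades compute for memory:
            # the block's activations are recomputed during the backward pass instead of stored.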
            if getattr(self.config, "gradient_checkpointing", False):

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        # checkpointing only works with tuple returns, not with lists
                        return tuple(output for output in module(*inputs, use_cache, output_attentions))

                    return custom_forward

                outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    layer_past,
                    attention_mask,
                    head_mask[i],
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                outputs = block(
                    hidden_states,
                    layer_past=layer_past,
                    attention_mask=attention_mask,
                    head_mask=head_mask[i],
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states, present = outputs[:2]
            if use_cache is True:
                presents = presents + (present,)

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (outputs[3],)

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(*output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


@add_start_docstrings(
    """
    The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    GPT2_START_DOCSTRING,
)
class GPT2LMHeadModel(GPT2PreTrainedModel):
    authorized_missing_keys = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
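        # The LM head shares its weight matrix with the input embeddings (wte); the tying
        # happens in init_weights() via tie_weights()/get_output_embeddings().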

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        token_type_ids = kwargs.get("token_type_ids", None)
        # only use the last token of input_ids if past is defined in kwargs
        if past:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
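            # cumsum over the attention mask gives 0-based positions for real tokens;
            # padded positions get a dummy value of 1 (they are masked out anyway)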
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None
        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }

    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="gpt2",
        output_type=CausalLMOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            ``labels = input_ids``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to
            ``-100`` are ignored (masked); the loss is only computed for labels in ``[0, ..., config.vocab_size]``.
        """
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPastAndCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
            cross_attentions=transformer_outputs.cross_attentions,
        )


@add_start_docstrings(
    """
The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
input embeddings, the classification head takes as input the input of a specified classification token index in the
input sequence).
""",
    GPT2_START_DOCSTRING,
)
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        config.num_labels = 1
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        token_type_ids = kwargs.get("token_type_ids", None)
        # only use the last token of input_ids if past is defined in kwargs
        if past:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None

        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }

    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        mc_token_ids=None,
        labels=None,
        mc_labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input):
            Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) -
            1]``.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            ``labels = input_ids``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to
            ``-100`` are ignored (masked); the loss is only computed for labels in ``[0, ..., config.vocab_size]``.
        mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`):
            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
            num_choices - 1]`` where `num_choices` is the size of the second dimension of the input tensors. (see
            `input_ids` above)
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Return:

        Example::

            >>> import torch
            >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel

            >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
            >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')

            >>> # Add a [CLS] to the vocabulary (we should train it also!)
            >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})

            >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size

            >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
            >>> encoded_choices = [tokenizer.encode(s) for s in choices]
            >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

            >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
            >>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1

            >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
            >>> lm_logits = outputs.logits
            >>> mc_logits = outputs.mc_logits

        """
        if "lm_labels" in kwargs:
            warnings.warn(
                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("lm_labels")
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)
        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)

        mc_loss = None
        if mc_labels is not None:
            loss_fct = CrossEntropyLoss()
            mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
        lm_loss = None
        if labels is not None:
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits, mc_logits) + transformer_outputs[1:]
            if mc_loss is not None:
                output = (mc_loss,) + output
            return ((lm_loss,) + output) if lm_loss is not None else output

        return GPT2DoubleHeadsModelOutput(
            loss=lm_loss,
            mc_loss=mc_loss,
            logits=lm_logits,
            mc_logits=mc_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    The GPT2 Model transformer with a sequence classification head on top (linear layer).

    :class:`~transformers.GPT2ForSequenceClassification` uses the last token in order to do the classification, as
    other causal models (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each
    row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take
    the last value in each row of the batch).
    """,
    GPT2_START_DOCSTRING,
)
class GPT2ForSequenceClassification(GPT2PreTrainedModel):
    authorized_missing_keys = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = GPT2Model(config)
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

        self.init_weights()

    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        tokenizer_class=_TOKENIZER_FOR_DOC,
        checkpoint="microsoft/dialogrpt",
        output_type=SequenceClassifierOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        assert (
            self.config.pad_token_id is not None or batch_size == 1
        ), "Cannot handle batch sizes > 1 if no padding token is defined."
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
            else:
                sequence_lengths = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[range(batch_size), sequence_lengths]
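        # pooled_logits picks, for every sequence in the batch, the classification logits at the
        # last non-padding token (or simply the last position when no pad_token_id is configured).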

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )