# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Optional, Tuple

import torch

from .utils import ModelOutput


@dataclass
class BaseModelOutput(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
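

# Illustrative usage sketch (not part of the upstream file): `ModelOutput`
# subclasses like `BaseModelOutput` can be read by attribute, by string key, or
# by integer index, and `to_tuple()` drops any field left at `None`. The helper
# name and tensor sizes below are made up for the example.
def _example_base_model_output():
    hidden = torch.zeros(2, 5, 8)  # (batch_size, sequence_length, hidden_size)
    out = BaseModelOutput(last_hidden_state=hidden)
    assert out.last_hidden_state is out["last_hidden_state"] is out[0]
    assert out.to_tuple() == (hidden,)  # None-valued fields are skipped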


@dataclass
class BaseModelOutputWithNoAttention(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class BaseModelOutputWithPooling(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) after further processing
            through the layers used for the auxiliary pretraining task. E.g. for the BERT family of models, this
            returns the classification token after processing through a linear layer and a tanh activation function.
            The linear layer weights are trained from the next sentence prediction (classification) objective during
            pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: torch.FloatTensor = None
    pooler_output: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
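

# Illustrative usage sketch (not part of the upstream file): `pooler_output` is
# one summary vector per sequence, so a classification head can consume it
# directly. The helper name, the binary head, and all shapes are assumptions.
def _example_pooler_head(encoder_out: BaseModelOutputWithPooling):
    hidden_size = encoder_out.pooler_output.shape[-1]
    classifier = torch.nn.Linear(hidden_size, 2)  # hypothetical binary head
    return classifier(encoder_out.pooler_output)  # (batch_size, 2)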


@dataclass
class BaseModelOutputWithPoolingAndNoAttention(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state after a pooling operation on the spatial dimensions.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    last_hidden_state: torch.FloatTensor = None
    pooler_output: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class BaseModelOutputWithPast(ModelOutput):
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used, only the last hidden-state of the sequences, of shape `(batch_size, 1,
            hidden_size)`, is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally, if
            `config.is_encoder_decoder=True`, 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally, if
            `config.is_encoder_decoder=True`, in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
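

# Illustrative sketch (not part of the upstream file) of the structure of a
# `past_key_values` cache: one (key, value) pair per layer. Reusing it lets the
# next forward pass feed only the newest token. All sizes are invented.
def _example_past_key_values():
    n_layers, batch, heads, seq_len, head_dim = 2, 1, 4, 6, 16
    past = tuple(
        (torch.zeros(batch, heads, seq_len, head_dim),  # cached keys
         torch.zeros(batch, heads, seq_len, head_dim))  # cached values
        for _ in range(n_layers)
    )
    out = BaseModelOutputWithPast(
        last_hidden_state=torch.zeros(batch, 1, 64),  # only the newest position
        past_key_values=past,
    )
    assert len(out.past_key_values) == n_layers
    assert out.past_key_values[0][0].shape == (batch, heads, seq_len, head_dim)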


@dataclass
class BaseModelOutputWithCrossAttentions(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` are passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    """

    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) after further processing
            through the layers used for the auxiliary pretraining task. E.g. for the BERT family of models, this
            returns the classification token after processing through a linear layer and a tanh activation function.
            The linear layer weights are trained from the next sentence prediction (classification) objective during
            pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` are passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally, if
            `config.is_encoder_decoder=True`, 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally, if
            `config.is_encoder_decoder=True`, in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
    """

    last_hidden_state: torch.FloatTensor = None
    pooler_output: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used, only the last hidden-state of the sequences, of shape `(batch_size, 1,
            hidden_size)`, is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally, if
            `config.is_encoder_decoder=True`, 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally, if
            `config.is_encoder_decoder=True`, in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` are passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    """

    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class Seq2SeqModelOutput(ModelOutput):
    """
    Base class for model encoder's outputs that also contains: pre-computed hidden states that can speed up sequential
    decoding.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used, only the last hidden-state of the sequences, of shape `(batch_size, 1,
            hidden_size)`, is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
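

# Illustrative sketch (not part of the upstream file): a seq2seq forward pass
# fills decoder-side and encoder-side fields; the encoder states are computed
# once and reused at every decoding step. All sizes here are invented.
def _example_seq2seq_output():
    batch, src_len, tgt_len, hidden = 1, 7, 3, 16
    out = Seq2SeqModelOutput(
        last_hidden_state=torch.zeros(batch, tgt_len, hidden),  # decoder side
        encoder_last_hidden_state=torch.zeros(batch, src_len, hidden),
    )
    # The decoder attends over the fixed-length source, so cross-attention
    # weights (when requested) have shape (batch, num_heads, tgt_len, src_len).
    return out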


@dataclass
class CausalLMOutput(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class CausalLMOutputWithPast(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
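

# Illustrative sketch (not part of the upstream file) of how `logits` and
# `past_key_values` are consumed together in greedy decoding. `model` is a
# hypothetical callable that returns a `CausalLMOutputWithPast`.
def _example_greedy_decoding(model, input_ids, max_new_tokens=10):
    past = None
    for _ in range(max_new_tokens):
        # With a cache, only the newest token needs to be fed to the model.
        step_ids = input_ids if past is None else input_ids[:, -1:]
        out = model(input_ids=step_ids, past_key_values=past, use_cache=True)
        next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, next_token], dim=-1)
        past = out.past_key_values  # reuse the cache on the next step
    return input_ids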


@dataclass
class CausalLMOutputWithCrossAttentions(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Cross-attention weights after the attention softmax, used to compute the weighted average in the
            cross-attention heads.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
            value states of the self-attention and the cross-attention layers if the model is used in an
            encoder-decoder setting. Only relevant if `config.is_decoder = True`.

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class SequenceClassifierOutputWithPast(ModelOutput):
    """
    Base class for outputs of sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class MaskedLMOutput(ModelOutput):
    """
    Base class for masked language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
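

# Illustrative sketch (not part of the upstream file): reading the top
# prediction at a masked position from `logits`. `mask_index` and the helper
# name are assumptions for the example.
def _example_masked_lm(output: MaskedLMOutput, mask_index: int):
    # logits: (batch_size, sequence_length, vocab_size)
    mask_logits = output.logits[0, mask_index]  # scores at the masked position
    return int(mask_logits.argmax())  # id of the most likely vocabulary token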


@dataclass
class Seq2SeqLMOutput(ModelOutput):
    """
    Base class for sequence-to-sequence language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class NextSentencePredictorOutput(ModelOutput):
    """
    Base class for outputs of models predicting if two sentences are consecutive or not.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `next_sentence_label` is provided):
            Next sequence prediction (classification) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class SequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
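

# Illustrative sketch (not part of the upstream file): turning classification
# `logits` into probabilities and a predicted label id; with
# `config.num_labels == 1` the same tensor is read as a regression value.
def _example_sequence_classification(output: SequenceClassifierOutput):
    probs = output.logits.softmax(dim=-1)     # (batch_size, num_labels)
    predicted = output.logits.argmax(dim=-1)  # (batch_size,)
    return probs, predicted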


@dataclass
class Seq2SeqSequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class MultipleChoiceModelOutput(ModelOutput):
    """
    Base class for outputs of multiple choice models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
            *num_choices* is the second dimension of the input tensors (see *input_ids* above).

            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
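

# Illustrative sketch (not part of the upstream file): `logits` already come
# back as (batch_size, num_choices), so picking an answer is an argmax over
# the last dimension.
def _example_multiple_choice(output: MultipleChoiceModelOutput):
    return output.logits.argmax(dim=-1)  # (batch_size,) index of the best choice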


@dataclass
class TokenClassifierOutput(ModelOutput):
    """
    Base class for outputs of token classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
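
# Illustrative usage (a minimal sketch, not part of this module; the NER checkpoint is
# only an example):
#
#     from transformers import AutoTokenizer, AutoModelForTokenClassification
#
#     tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
#     model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
#     inputs = tokenizer("HuggingFace is based in New York", return_tensors="pt")
#     outputs = model(**inputs)  # a TokenClassifierOutput
#     # one predicted label id per token; map back with model.config.id2label
#     predictions = outputs.logits.argmax(dim=-1)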


@dataclass
class QuestionAnsweringModelOutput(ModelOutput):
    """
    Base class for outputs of question answering models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a cross-entropy loss for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    start_logits: torch.FloatTensor = None
    end_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
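
# Illustrative usage (a minimal sketch, not part of this module; the SQuAD checkpoint is
# only an example):
#
#     from transformers import AutoTokenizer, AutoModelForQuestionAnswering
#
#     name = "distilbert-base-cased-distilled-squad"
#     tokenizer = AutoTokenizer.from_pretrained(name)
#     model = AutoModelForQuestionAnswering.from_pretrained(name)
#     inputs = tokenizer("Who wrote it?", "It was written by Jane.", return_tensors="pt")
#     outputs = model(**inputs)  # a QuestionAnsweringModelOutput
#     start = outputs.start_logits.argmax()  # most likely span start
#     end = outputs.end_logits.argmax()  # most likely span end
#     tokenizer.decode(inputs["input_ids"][0, start : end + 1])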


@dataclass
class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence question answering models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a cross-entropy loss for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    start_logits: torch.FloatTensor = None
    end_logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
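
# Illustrative usage (a minimal sketch, not part of this module; an encoder-decoder QA
# head such as BartForQuestionAnswering returns this output class, and the checkpoint is
# only an example):
#
#     from transformers import AutoTokenizer, BartForQuestionAnswering
#
#     tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
#     model = BartForQuestionAnswering.from_pretrained("facebook/bart-base")
#     inputs = tokenizer("Who wrote it?", "It was written by Jane.", return_tensors="pt")
#     outputs = model(**inputs)
#     outputs.start_logits.shape  # (batch_size, sequence_length)
#     outputs.encoder_last_hidden_state.shape  # (batch_size, sequence_length, hidden_size)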


@dataclass
class SemanticSegmenterOutput(ModelOutput):
    """
    Base class for outputs of semantic segmentation models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and losing some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>

        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
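
# Illustrative usage, including the logits resizing mentioned in the Tip above (a minimal
# sketch, not part of this module; the SegFormer checkpoint is only an example, `image` is
# assumed to be a PIL image, and on older versions AutoFeatureExtractor replaces
# AutoImageProcessor):
#
#     import torch
#     from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
#
#     name = "nvidia/segformer-b0-finetuned-ade-512-512"
#     processor = AutoImageProcessor.from_pretrained(name)
#     model = SegformerForSemanticSegmentation.from_pretrained(name)
#     outputs = model(**processor(images=image, return_tensors="pt"))
#     # upsample the logits to the original (height, width) before taking the argmax
#     upsampled = torch.nn.functional.interpolate(
#         outputs.logits, size=image.size[::-1], mode="bilinear", align_corners=False
#     )
#     segmentation_map = upsampled.argmax(dim=1)  # (batch_size, height, width)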


@dataclass
class ImageClassifierOutput(ModelOutput):
    """
    Base class for outputs of image classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
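
# Illustrative usage (a minimal sketch, not part of this module; the ViT checkpoint is
# only an example and `image` is assumed to be a PIL image):
#
#     from transformers import AutoFeatureExtractor, ViTForImageClassification
#
#     extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
#     model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
#     outputs = model(**extractor(images=image, return_tensors="pt"))
#     label_id = outputs.logits.argmax(dim=-1).item()
#     model.config.id2label[label_id]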


@dataclass
class ImageClassifierOutputWithNoAttention(ModelOutput):
    """
    Base class for outputs of image classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also
            called feature maps) of the model at the output of each stage.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
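
# Illustrative usage (a minimal sketch, not part of this module; convolutional backbones
# such as ResNet return this attention-free variant, the checkpoint is only an example,
# and `image` is assumed to be a PIL image):
#
#     from transformers import AutoFeatureExtractor, ResNetForImageClassification
#
#     extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-50")
#     model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")
#     outputs = model(**extractor(images=image, return_tensors="pt"))
#     # hidden states here are 4D feature maps: (batch_size, num_channels, height, width)
#     outputs.logits.argmax(dim=-1)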


@dataclass
class DepthEstimatorOutput(ModelOutput):
    """
    Base class for outputs of depth estimation models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Depth estimation loss.
        predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Predicted depth for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    predicted_depth: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
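
# Illustrative usage (a minimal sketch, not part of this module; the DPT checkpoint is
# only an example and `image` is assumed to be a PIL image):
#
#     from transformers import AutoFeatureExtractor, DPTForDepthEstimation
#
#     extractor = AutoFeatureExtractor.from_pretrained("Intel/dpt-large")
#     model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
#     outputs = model(**extractor(images=image, return_tensors="pt"))
#     outputs.predicted_depth.shape  # (batch_size, height, width)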