# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import List, Optional, Tuple

import tensorflow as tf

from .utils import ModelOutput


@dataclass
class TFBaseModelOutput(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: tf.Tensor = None
    # Variadic tuple annotations (`Tuple[tf.Tensor, ...]`) for consistency with the
    # newer output classes in this file — these tuples have one entry per layer.
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    attentions: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFBaseModelOutputWithNoAttention(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    last_hidden_state: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFBaseModelOutputWithPooling(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) further processed by a
            Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
            prediction (classification) objective during pretraining.

            This output is usually *not* a good summary of the semantic content of the input, you're often better with
            averaging or pooling the sequence of hidden-states for the whole input sequence.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: tf.Tensor = None
    pooler_output: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    attentions: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFBaseModelOutputWithPoolingAndNoAttention(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Hidden-states produced by the last layer of the model.
        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
            The last layer's hidden-state after a pooling operation over the spatial dimensions.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            One `tf.Tensor` per layer (plus one for the embedding output, if the model has an embedding layer), each
            of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    last_hidden_state: tf.Tensor = None
    pooler_output: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) further processed by a
            Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence
            prediction (classification) objective during pretraining.

            This output is usually *not* a good summary of the semantic content of the input, you're often better with
            averaging or pooling the sequence of hidden-states for the whole input sequence.
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    """

    last_hidden_state: tf.Tensor = None
    pooler_output: tf.Tensor = None
    past_key_values: Optional[List[tf.Tensor]] = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    attentions: Optional[Tuple[tf.Tensor, ...]] = None
    cross_attentions: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFBaseModelOutputWithPast(ModelOutput):
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: tf.Tensor = None
    past_key_values: Optional[List[tf.Tensor]] = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    attentions: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFBaseModelOutputWithCrossAttentions(ModelOutput):
    """
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    """

    last_hidden_state: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    attentions: Optional[Tuple[tf.Tensor, ...]] = None
    cross_attentions: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFBaseModelOutputWithPastAndCrossAttentions(ModelOutput):
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
    """

    last_hidden_state: tf.Tensor = None
    past_key_values: Optional[List[tf.Tensor]] = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    attentions: Optional[Tuple[tf.Tensor, ...]] = None
    cross_attentions: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFSeq2SeqModelOutput(ModelOutput):
    """
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    last_hidden_state: tf.Tensor = None
    past_key_values: Optional[List[tf.Tensor]] = None
    decoder_hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    decoder_attentions: Optional[Tuple[tf.Tensor, ...]] = None
    cross_attentions: Optional[Tuple[tf.Tensor, ...]] = None
    encoder_last_hidden_state: Optional[tf.Tensor] = None
    encoder_hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    encoder_attentions: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFCausalLMOutput(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    attentions: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFCausalLMOutputWithPast(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    past_key_values: Optional[List[tf.Tensor]] = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    attentions: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFCausalLMOutputWithCrossAttentions(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    past_key_values: Optional[List[tf.Tensor]] = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    attentions: Optional[Tuple[tf.Tensor, ...]] = None
    cross_attentions: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFMaskedLMOutput(ModelOutput):
    """
    Base class for masked language models outputs.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None
    attentions: Optional[Tuple[tf.Tensor, ...]] = None
@dataclass
class TFSeq2SeqLMOutput(ModelOutput):
    """
    Base class for sequence-to-sequence language models outputs.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided):
            Language modeling loss.
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    past_key_values: Optional[List[tf.Tensor]] = None
    decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    decoder_attentions: Optional[Tuple[tf.Tensor]] = None
    cross_attentions: Optional[Tuple[tf.Tensor]] = None
    encoder_last_hidden_state: Optional[tf.Tensor] = None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None


@dataclass
class TFNextSentencePredictorOutput(ModelOutput):
    """
    Base class for outputs of models predicting if two sentences are consecutive or not.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `next_sentence_label` is provided):
            Next sentence prediction loss.
        logits (`tf.Tensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[tf.Tensor]] = None


@dataclass
class TFSequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sentence classification models.

    Args:
        loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[tf.Tensor]] = None


@dataclass
class TFSeq2SeqSequenceClassifierOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence sentence classification models.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    past_key_values: Optional[List[tf.Tensor]] = None
    decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    decoder_attentions: Optional[Tuple[tf.Tensor]] = None
    encoder_last_hidden_state: Optional[tf.Tensor] = None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None


@dataclass
class TFSemanticSegmenterOutput(ModelOutput):
    """
    Output type of semantic segmentation models.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>

        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Only `logits` is always populated; the rest depend on the flags above.
    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[tf.Tensor]] = None


@dataclass
class TFMultipleChoiceModelOutput(ModelOutput):
    """
    Base class for outputs of multiple choice models.

    Args:
        loss (`tf.Tensor` of shape *(batch_size, )*, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
            *num_choices* is the second dimension of the input tensors. (see *input_ids* above).

            Classification scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[tf.Tensor]] = None


@dataclass
class TFTokenClassifierOutput(ModelOutput):
    """
    Base class for outputs of token classification models.

    Args:
        loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of unmasked labels, returned when `labels` is provided) :
            Classification loss.
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[tf.Tensor]] = None


@dataclass
class TFQuestionAnsweringModelOutput(ModelOutput):
    """
    Base class for outputs of question answering models.

    Args:
        loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `start_positions` and `end_positions` are provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[tf.Tensor] = None
    start_logits: tf.Tensor = None
    end_logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[tf.Tensor]] = None


@dataclass
class TFSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
    """
    Base class for outputs of sequence-to-sequence question answering models.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[tf.Tensor] = None
    start_logits: tf.Tensor = None
    end_logits: tf.Tensor = None
    past_key_values: Optional[List[tf.Tensor]] = None
    decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    decoder_attentions: Optional[Tuple[tf.Tensor]] = None
    encoder_last_hidden_state: Optional[tf.Tensor] = None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None


@dataclass
class TFSequenceClassifierOutputWithPast(ModelOutput):
    """
    Base class for outputs of sentence classification models.

    Args:
        loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    past_key_values: Optional[List[tf.Tensor]] = None
    hidden_states: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[tf.Tensor]] = None


@dataclass
class TFImageClassifierOutputWithNoAttention(ModelOutput):
    """
    Base class for outputs of image classification models.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also called
            feature maps) of the model at the output of each stage.
    """

    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None
    hidden_states: Optional[Tuple[tf.Tensor, ...]] = None