Output hidden states (#4978)

* Configure all models to use output_hidden_states as argument passed to foward() * Pass all tests * Remove cast_bool_to_primitive in TF Flaubert model * correct tf xlnet * add pytorch test * add tf test * Fix broken tests * Configure all models to use output_hidden_states as argument passed to foward() * Pass all tests * Remove cast_bool_to_primitive in TF Flaubert model * correct tf xlnet * add pytorch test * add tf test * Fix broken tests * Refactor output_hidden_states for mobilebert * Reset and remerge to master Co-authored-by: Joseph Liu <joseph.liu@coinflex.com> Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>

Output hidden states (#4978)
* Configure all models to use output_hidden_states as argument passed to foward() * Pass all tests * Remove cast_bool_to_primitive in TF Flaubert model * correct tf xlnet * add pytorch test * add tf test * Fix broken tests * Configure all models to use output_hidden_states as argument passed to foward() * Pass all tests * Remove cast_bool_to_primitive in TF Flaubert model * correct tf xlnet * add pytorch test * add tf test * Fix broken tests * Refactor output_hidden_states for mobilebert * Reset and remerge to master Co-authored-by: Joseph Liu <joseph.liu@coinflex.com> Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
f4e1f022 · Joseph Liu · GitHub · 866a8cca · f4e1f022 · f4e1f022
Unverified Commit f4e1f022 authored Jun 22, 2020 by Joseph Liu Committed by GitHub Jun 22, 2020
14 changed files
--- a/src/transformers/modeling_tf_flaubert.py
+++ b/src/transformers/modeling_tf_flaubert.py
@@ -137,6 +137,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
        inputs_embeds=None,
        training=False,
        output_attentions=False,
+        output_hidden_states=False,
    ):
        # removed: src_enc=None, src_len=None
        if isinstance(inputs, (tuple, list)):
@@ -251,14 +252,13 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
            if training and (dropout_probability < self.layerdrop):
                continue
-            if self.output_hidden_states:
+            if output_hidden_states:
                hidden_states = hidden_states + (tensor,)
            # self attention
            if not self.pre_norm:
                attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training)
                attn = attn_outputs[0]
-                if output_attentions:
                attentions = attentions + (attn_outputs[1],)
                attn = self.dropout(attn, training=training)
                tensor = tensor + attn
@@ -292,7 +292,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
            tensor = tensor * mask[..., tf.newaxis]
        # Add last hidden state
-        if self.output_hidden_states:
+        if output_hidden_states:
            hidden_states = hidden_states + (tensor,)
        # update cache length
@@ -303,7 +303,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
        # tensor = tensor.transpose(0, 1)
        outputs = (tensor,)
-        if self.output_hidden_states:
+        if output_hidden_states:
            outputs = outputs + (hidden_states,)
        if output_attentions:
            outputs = outputs + (attentions,)

--- a/src/transformers/modeling_tf_gpt2.py
+++ b/src/transformers/modeling_tf_gpt2.py
@@ -257,6 +257,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
        use_cache=True,
        training=False,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
@@ -268,7 +269,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
            inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
            use_cache = inputs[7] if len(inputs) > 7 else use_cache
            output_attentions = inputs[8] if len(inputs) > 7 else output_attentions
-            assert len(inputs) <= 9, "Too many inputs."
+            output_hidden_states = inputs[9] if len(inputs) > 8 else output_hidden_states
+            assert len(inputs) <= 10, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            past = inputs.get("past", past)
@@ -279,11 +281,13 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            use_cache = inputs.get("use_cache", use_cache)
            output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 9, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            assert len(inputs) <= 10, "Too many inputs."
        else:
            input_ids = inputs
        output_attentions = output_attentions if output_attentions is not None else self.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -352,7 +356,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
        all_attentions = []
        all_hidden_states = ()
        for i, (block, layer_past) in enumerate(zip(self.h, past)):
-            if self.output_hidden_states:
+            if cast_bool_to_primitive(output_hidden_states) is True:
                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
            outputs = block(
@@ -370,14 +374,14 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
        hidden_states = tf.reshape(hidden_states, output_shape)
        # Add last hidden state
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            all_hidden_states = all_hidden_states + (hidden_states,)
        outputs = (hidden_states,)
        if use_cache is True:
            outputs = outputs + (presents,)
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            outputs = outputs + (all_hidden_states,)
        if cast_bool_to_primitive(output_attentions) is True:
            # let the number of heads free (-1) so we can extract attention even after head pruning
@@ -493,7 +497,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -552,7 +556,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -620,6 +624,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
        mc_token_ids=None,
        use_cache=True,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -637,7 +642,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as `input_ids` as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -726,6 +731,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
            inputs_embeds,
            use_cache,
            output_attentions,
+            output_hidden_states,
        ]
        transformer_outputs = self.transformer(flat_inputs, training=training)

--- a/src/transformers/modeling_tf_mobilebert.py
+++ b/src/transformers/modeling_tf_mobilebert.py
@@ -508,16 +508,15 @@ class TFMobileBertLayer(tf.keras.layers.Layer):
 class TFMobileBertEncoder(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
-        self.output_hidden_states = config.output_hidden_states
        self.layer = [TFMobileBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
    def call(self, inputs, training=False):
-        hidden_states, attention_mask, head_mask, output_attentions = inputs
+        hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states = inputs
        all_hidden_states = ()
        all_attentions = ()
        for i, layer_module in enumerate(self.layer):
-            if self.output_hidden_states:
+            if cast_bool_to_primitive(output_hidden_states) is True:
                all_hidden_states = all_hidden_states + (hidden_states,)
            layer_outputs = layer_module(
@@ -529,11 +528,11 @@ class TFMobileBertEncoder(tf.keras.layers.Layer):
                all_attentions = all_attentions + (layer_outputs[1],)
        # Add last layer
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            all_hidden_states = all_hidden_states + (hidden_states,)
        outputs = (hidden_states,)
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            outputs = outputs + (all_hidden_states,)
        if cast_bool_to_primitive(output_attentions) is True:
            outputs = outputs + (all_attentions,)
@@ -643,6 +642,7 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
        super().__init__(**kwargs)
        self.num_hidden_layers = config.num_hidden_layers
        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
        self.embeddings = TFMobileBertEmbeddings(config, name="embeddings")
        self.encoder = TFMobileBertEncoder(config, name="encoder")
@@ -670,6 +670,7 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        if isinstance(inputs, (tuple, list)):
@@ -680,7 +681,8 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
-            assert len(inputs) <= 7, "Too many inputs."
+            output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
+            assert len(inputs) <= 8, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
@@ -689,11 +691,13 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 7, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            assert len(inputs) <= 8, "Too many inputs."
        else:
            input_ids = inputs
        output_attentions = output_attentions if output_attentions is not None else self.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -738,7 +742,8 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
        embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
        encoder_outputs = self.encoder(
-            [embedding_output, extended_attention_mask, head_mask, output_attentions], training=training
+            [embedding_output, extended_attention_mask, head_mask, output_attentions, output_hidden_states],
+            training=training,
        )
        sequence_output = encoder_outputs[0]
@@ -1079,6 +1084,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -1092,7 +1098,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
        logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1125,6 +1131,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            training=training,
        )
@@ -1172,6 +1179,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
        p_mask=None,
        is_impossible=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -1190,7 +1198,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
            Span-start scores (before SoftMax).
        end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1225,6 +1233,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            training=training,
        )
@@ -1281,6 +1290,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -1295,7 +1305,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
            `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1330,7 +1340,8 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
-            assert len(inputs) <= 7, "Too many inputs."
+            output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
+            assert len(inputs) <= 8, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
@@ -1339,7 +1350,8 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 7, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            assert len(inputs) <= 8, "Too many inputs."
        else:
            input_ids = inputs
@@ -1368,6 +1380,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
            head_mask,
            flat_inputs_embeds,
            output_attentions,
+            output_hidden_states,
        ]
        outputs = self.mobilebert(flat_inputs, training=training)
@@ -1414,6 +1427,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -1425,7 +1439,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` or ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1457,6 +1471,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            training=training,
        )

--- a/src/transformers/modeling_tf_openai.py
+++ b/src/transformers/modeling_tf_openai.py
@@ -246,6 +246,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        if isinstance(inputs, (tuple, list)):
@@ -256,7 +257,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
-            assert len(inputs) <= 7, "Too many inputs."
+            output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
+            assert len(inputs) <= 8, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
@@ -265,11 +267,13 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 7, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            assert len(inputs) <= 8, "Too many inputs."
        else:
            input_ids = inputs
        output_attentions = output_attentions if output_attentions is not None else self.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -332,7 +336,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
        all_attentions = []
        all_hidden_states = ()
        for i, block in enumerate(self.h):
-            if self.output_hidden_states:
+            if cast_bool_to_primitive(output_hidden_states) is True:
                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
            outputs = block([hidden_states, attention_mask, head_mask[i], output_attentions], training=training)
@@ -342,11 +346,11 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
        hidden_states = tf.reshape(hidden_states, output_shape)
        # Add last hidden state
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            all_hidden_states = all_hidden_states + (hidden_states,)
        outputs = (hidden_states,)
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            outputs = outputs + (all_hidden_states,)
        if cast_bool_to_primitive(output_attentions) is True:
            # let the number of heads free (-1) so we can extract attention even after head pruning
@@ -451,7 +455,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
-        hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -499,7 +503,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -564,6 +568,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
        inputs_embeds=None,
        mc_token_ids=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -581,7 +586,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -661,6 +666,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
            head_mask,
            inputs_embeds,
            output_attentions,
+            output_hidden_states,
        ]
        transformer_outputs = self.transformer(flat_inputs, training=training)

--- a/src/transformers/modeling_tf_roberta.py
+++ b/src/transformers/modeling_tf_roberta.py
@@ -207,7 +207,8 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
+        ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -283,7 +284,8 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
+        ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -365,6 +367,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -372,7 +375,8 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
+        ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -404,6 +408,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            training=training,
        )
@@ -454,6 +459,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -468,7 +474,8 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
            `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
+        ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -533,6 +540,8 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
            flat_position_ids,
            head_mask,
            inputs_embeds,
+            output_attentions,
+            output_hidden_states,
        ]
        outputs = self.roberta(flat_inputs, training=training)
@@ -579,6 +588,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -590,7 +600,8 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
+        ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -622,6 +633,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            training=training,
        )
@@ -668,6 +680,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
        p_mask=None,
        is_impossible=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -686,7 +699,8 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
            Span-start scores (before SoftMax).
        end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
+        ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -723,6 +737,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            training=training,
        )

--- a/src/transformers/modeling_tf_t5.py
+++ b/src/transformers/modeling_tf_t5.py
@@ -558,6 +558,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
        past_key_value_states=None,
        use_cache=False,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        if isinstance(inputs, (tuple, list)):
@@ -584,6 +585,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
            input_ids = inputs
        output_attentions = output_attentions if output_attentions is not None else self.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both inputs and inputs_embeds at the same time")
@@ -696,7 +698,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
        hidden_states = self.dropout(inputs_embeds, training=training)
        for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)):
-            if self.output_hidden_states:
+            if cast_bool_to_primitive(output_hidden_states) is True:
                all_hidden_states = all_hidden_states + (hidden_states,)
            layer_outputs = layer_module(
@@ -731,14 +733,14 @@ class TFT5MainLayer(tf.keras.layers.Layer):
        hidden_states = self.dropout(hidden_states, training=training)
        # Add last layer
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            all_hidden_states = all_hidden_states + (hidden_states,)
        outputs = (hidden_states,)
        if use_cache is True:
            assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self)
            outputs = outputs + (present_key_value_states,)
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            outputs = outputs + (all_hidden_states,)
        if cast_bool_to_primitive(output_attentions) is True:
            outputs = outputs + (all_attentions,)
@@ -912,7 +914,7 @@ class TFT5Model(TFT5PreTrainedModel):
            Contains pre-computed key and value hidden-states of the attention blocks.
            Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
            Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -953,6 +955,7 @@ class TFT5Model(TFT5PreTrainedModel):
        use_cache = kwargs.get("use_cache", True)
        head_mask = kwargs.get("head_mask", None)
        output_attentions = kwargs.get("output_attentions", None)
+        output_hidden_states = kwargs.get("output_hidden_states", None)
        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
@@ -962,6 +965,7 @@ class TFT5Model(TFT5PreTrainedModel):
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
            )
        hidden_states = encoder_outputs[0]
@@ -985,6 +989,7 @@ class TFT5Model(TFT5PreTrainedModel):
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        if use_cache is True:
@@ -1049,7 +1054,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
            Contains pre-computed key and value hidden-states of the attention blocks.
            Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
            Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1094,6 +1099,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
        decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None)
        head_mask = kwargs.get("head_mask", None)
        output_attentions = kwargs.get("output_attentions", None)
+        output_hidden_states = kwargs.get("output_hidden_states", None)
        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
@@ -1104,6 +1110,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
            )
        hidden_states = encoder_outputs[0]
@@ -1127,6 +1134,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        # insert decoder past at right place

--- a/src/transformers/modeling_tf_transfo_xl.py
+++ b/src/transformers/modeling_tf_transfo_xl.py
@@ -520,25 +520,37 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
        return new_mems
-    def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, output_attentions=None, training=False):
+    def call(
+        self,
+        inputs,
+        mems=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        training=False,
+    ):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mems = inputs[1] if len(inputs) > 1 else mems
            head_mask = inputs[2] if len(inputs) > 2 else head_mask
            inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
            output_attentions = inputs[4] if len(inputs) > 4 else output_attentions
-            assert len(inputs) <= 5, "Too many inputs."
+            output_hidden_states = inputs[5] if len(inputs) > 4 else output_hidden_states
+            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            mems = inputs.get("mems", mems)
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 5, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs
        output_attentions = output_attentions if output_attentions is not None else self.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
        # so we transpose here from shape [bsz, len] to shape [len, bsz]
@@ -625,7 +637,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
        # We transpose back here to shape [bsz, len, hidden_dim]
        outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems]
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states):
            # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
            hids.append(core_out)
            hids = list(tf.transpose(t, perm=(1, 0, 2)) for t in hids)
@@ -720,7 +732,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -807,6 +819,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -818,7 +831,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -867,7 +880,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
            bsz, tgt_len = shape_list(inputs_embeds)[:2]
        transformer_outputs = self.transformer(
-            [input_ids, mems, head_mask, inputs_embeds, output_attentions], training=training
+            [input_ids, mems, head_mask, inputs_embeds, output_attentions, output_hidden_states], training=training
        )
        last_hidden = transformer_outputs[0]

--- a/src/transformers/modeling_tf_xlm.py
+++ b/src/transformers/modeling_tf_xlm.py
@@ -332,6 +332,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):  # removed: src_enc=None, src_len=None
        if isinstance(inputs, (tuple, list)):
@@ -345,7 +346,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            head_mask = inputs[7] if len(inputs) > 7 else head_mask
            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
            output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
-            assert len(inputs) <= 10, "Too many inputs."
+            output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
+            assert len(inputs) <= 11, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
@@ -357,11 +359,13 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            head_mask = inputs.get("head_mask", head_mask)
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 10, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            assert len(inputs) <= 11, "Too many inputs."
        else:
            input_ids = inputs
        output_attentions = output_attentions if output_attentions is not None else self.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -445,7 +449,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
        hidden_states = ()
        attentions = ()
        for i in range(self.n_layers):
-            if self.output_hidden_states:
+            if cast_bool_to_primitive(output_hidden_states) is True:
                hidden_states = hidden_states + (tensor,)
            # self attention
@@ -472,7 +476,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            tensor = tensor * mask[..., tf.newaxis]
        # Add last hidden state
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            hidden_states = hidden_states + (tensor,)
        # update cache length
@@ -483,7 +487,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
        # tensor = tensor.transpose(0, 1)
        outputs = (tensor,)
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            outputs = outputs + (hidden_states,)
        if cast_bool_to_primitive(output_attentions) is True:
            outputs = outputs + (attentions,)
@@ -610,7 +614,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -706,7 +710,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -766,6 +770,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -779,7 +784,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -815,6 +820,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            training=training,
        )
        output = transformer_outputs[0]
@@ -865,6 +871,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -879,7 +886,8 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
            `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
+        ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -956,6 +964,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
            head_mask,
            inputs_embeds,
            output_attentions,
+            output_hidden_states,
        ]
        transformer_outputs = self.transformer(flat_inputs, training=training)
@@ -1002,6 +1011,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -1013,7 +1023,8 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
+        ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1045,6 +1056,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            training=training,
        )
@@ -1093,6 +1105,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
        p_mask=None,
        is_impossible=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -1111,7 +1124,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
            Span-start scores (before SoftMax).
        end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1150,6 +1163,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            training=training,
        )

--- a/src/transformers/modeling_tf_xlnet.py
+++ b/src/transformers/modeling_tf_xlnet.py
@@ -517,6 +517,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        inputs_embeds=None,
        use_cache=True,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        if isinstance(inputs, (tuple, list)):
@@ -530,8 +531,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
            head_mask = inputs[7] if len(inputs) > 7 else head_mask
            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
            use_cache = inputs[9] if len(inputs) > 9 else use_cache
-            output_attentions = inputs[-9] if len(inputs) > 10 else output_attentions
+            output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
-            assert len(inputs) <= 11, "Too many inputs."
+            output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
+            assert len(inputs) <= 12, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
@@ -544,11 +546,13 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            use_cache = inputs.get("use_cache", use_cache)
            output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 11, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            assert len(inputs) <= 12, "Too many inputs."
        else:
            input_ids = inputs
        output_attentions = output_attentions if output_attentions is not None else self.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
        # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
        # but we want a unified interface in the library with the batch size on the first dimension
@@ -677,7 +681,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
            # cache new mems
            if self.mem_len is not None and self.mem_len > 0 and use_cache is True:
                new_mems = new_mems + (self.cache_mem(output_h, mems[i]),)
-            if self.output_hidden_states:
+            if cast_bool_to_primitive(output_hidden_states) is True:
                hidden_states.append((output_h, output_g) if output_g is not None else output_h)
            outputs = layer_module(
@@ -700,7 +704,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
                attentions.append(outputs[2])
        # Add last hidden state
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            hidden_states.append((output_h, output_g) if output_g is not None else output_h)
        output = self.dropout(output_g if output_g is not None else output_h, training=training)
@@ -711,7 +715,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        if self.mem_len is not None and self.mem_len > 0 and use_cache is True:
            outputs = outputs + (new_mems,)
-        if self.output_hidden_states:
+        if cast_bool_to_primitive(output_hidden_states) is True:
            if output_g is not None:
                hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs)
            else:
@@ -838,7 +842,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -922,7 +926,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -996,6 +1000,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
        use_cache=True,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -1013,7 +1018,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1050,6 +1055,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        output = transformer_outputs[0]
@@ -1106,6 +1112,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
        use_cache=True,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -1120,7 +1127,8 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
            `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
+        ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1158,8 +1166,9 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
            head_mask = inputs[7] if len(inputs) > 7 else head_mask
            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
            use_cache = inputs[9] if len(inputs) > 9 else use_cache
-            output_attentions = inputs[-9] if len(inputs) > 10 else output_attentions
+            output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
-            assert len(inputs) <= 11, "Too many inputs."
+            output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
+            assert len(inputs) <= 12, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
@@ -1172,7 +1181,8 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            use_cache = inputs.get("use_cache", use_cache)
            output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 11, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_attentions)
+            assert len(inputs) <= 12, "Too many inputs."
        else:
            input_ids = inputs
@@ -1200,6 +1210,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
            inputs_embeds,
            use_cache,
            output_attentions,
+            output_hidden_states,
        ]
        transformer_outputs = self.transformer(flat_inputs, training=training)
@@ -1246,6 +1257,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
        use_cache=True,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -1261,7 +1273,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1298,6 +1310,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            training=training,
        )
        output = transformer_outputs[0]
@@ -1345,6 +1358,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
        p_mask=None,
        is_impossible=None,
        output_attentions=None,
+        output_hidden_states=None,
        training=False,
    ):
        r"""
@@ -1369,7 +1383,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1408,6 +1422,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            training=training,
        )
@@ -1457,7 +1472,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
 #             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
 #             if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
 #             See details in the docstring of the `mems` input above.
-#         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+#         **hidden_states**: (`optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``)
 #             list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
 #             of shape ``(batch_size, sequence_length, hidden_size)``:
 #             Hidden-states of the model at the output of each layer plus the initial embedding outputs.

--- a/src/transformers/modeling_transfo_xl.py
+++ b/src/transformers/modeling_transfo_xl.py
@@ -634,7 +634,6 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
 class TransfoXLModel(TransfoXLPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
-        self.output_hidden_states = config.output_hidden_states
        self.n_token = config.vocab_size
@@ -750,7 +749,15 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        return new_mems
    @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
-    def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None, output_attentions=None):
+    def forward(
+        self,
+        input_ids=None,
+        mems=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+    ):
        r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
@@ -760,7 +767,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -785,6 +792,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
        # so we transpose here from shape [bsz, len] to shape [len, bsz]
@@ -873,7 +883,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        # We transpose back here to shape [bsz, len, hidden_dim]
        outputs = [core_out.transpose(0, 1).contiguous(), new_mems]
-        if self.output_hidden_states:
+        if output_hidden_states:
            # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
            hids.append(core_out)
            hids = list(t.transpose(0, 1).contiguous() for t in hids)
@@ -936,7 +946,14 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
    @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
    def forward(
-        self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None
+        self,
+        input_ids=None,
+        mems=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -956,7 +973,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -988,7 +1005,12 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        transformer_outputs = self.transformer(
-            input_ids, mems=mems, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions
+            input_ids,
+            mems=mems,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        last_hidden = transformer_outputs[0]

--- a/src/transformers/modeling_xlm.py
+++ b/src/transformers/modeling_xlm.py
@@ -314,7 +314,6 @@ XLM_INPUTS_DOCSTRING = r"""
 class XLMModel(XLMPreTrainedModel):
    def __init__(self, config):  # , dico, is_encoder, with_output):
        super().__init__(config)
-        self.output_hidden_states = config.output_hidden_states
        # encoder / decoder, output layer
        self.is_encoder = config.is_encoder
@@ -408,13 +407,14 @@ class XLMModel(XLMPreTrainedModel):
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -439,6 +439,9 @@ class XLMModel(XLMPreTrainedModel):
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
        if input_ids is not None:
            bs, slen = input_ids.size()
@@ -511,7 +514,7 @@ class XLMModel(XLMPreTrainedModel):
        hidden_states = ()
        attentions = ()
        for i in range(self.n_layers):
-            if self.output_hidden_states:
+            if output_hidden_states:
                hidden_states = hidden_states + (tensor,)
            # self attention
@@ -538,7 +541,7 @@ class XLMModel(XLMPreTrainedModel):
            tensor *= mask.unsqueeze(-1).to(tensor.dtype)
        # Add last hidden state
-        if self.output_hidden_states:
+        if output_hidden_states:
            hidden_states = hidden_states + (tensor,)
        # update cache length
@@ -549,7 +552,7 @@ class XLMModel(XLMPreTrainedModel):
        # tensor = tensor.transpose(0, 1)
        outputs = (tensor,)
-        if self.output_hidden_states:
+        if output_hidden_states:
            outputs = outputs + (hidden_states,)
        if output_attentions:
            outputs = outputs + (attentions,)
@@ -642,6 +645,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -657,7 +661,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
            Language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -692,6 +696,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        output = transformer_outputs[0]
@@ -730,6 +735,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -744,7 +750,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -780,6 +786,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        output = transformer_outputs[0]
@@ -829,6 +836,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
        start_positions=None,
        end_positions=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -848,7 +856,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
            Span-start scores (before SoftMax).
        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -885,6 +893,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        sequence_output = transformer_outputs[0]
@@ -952,6 +961,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
        cls_index=None,
        p_mask=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -984,7 +994,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
        cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the ``is_impossible`` label of the answers.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1021,6 +1031,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        output = transformer_outputs[0]
@@ -1066,6 +1077,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
        head_mask=None,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -1078,7 +1090,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
            Classification loss.
        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1111,6 +1123,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
            position_ids=position_ids,
            head_mask=head_mask,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        sequence_output = outputs[0]

--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -630,7 +630,6 @@ XLNET_INPUTS_DOCSTRING = r"""
 class XLNetModel(XLNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
-        self.output_hidden_states = config.output_hidden_states
        self.mem_len = config.mem_len
        self.reuse_len = config.reuse_len
@@ -763,6 +762,7 @@ class XLNetModel(XLNetPreTrainedModel):
        inputs_embeds=None,
        use_cache=True,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
    Return:
@@ -774,7 +774,7 @@ class XLNetModel(XLNetPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -801,6 +801,9 @@ class XLNetModel(XLNetPreTrainedModel):
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
        # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
        # but we want a unified interface in the library with the batch size on the first dimension
@@ -934,7 +937,7 @@ class XLNetModel(XLNetPreTrainedModel):
            if self.mem_len is not None and self.mem_len > 0 and use_cache is True:
                # cache new mems
                new_mems = new_mems + (self.cache_mem(output_h, mems[i]),)
-            if self.output_hidden_states:
+            if output_hidden_states:
                hidden_states.append((output_h, output_g) if output_g is not None else output_h)
            outputs = layer_module(
@@ -954,7 +957,7 @@ class XLNetModel(XLNetPreTrainedModel):
                attentions.append(outputs[2])
        # Add last hidden state
-        if self.output_hidden_states:
+        if output_hidden_states:
            hidden_states.append((output_h, output_g) if output_g is not None else output_h)
        output = self.dropout(output_g if output_g is not None else output_h)
@@ -965,7 +968,7 @@ class XLNetModel(XLNetPreTrainedModel):
        if self.mem_len is not None and self.mem_len > 0 and use_cache is True:
            outputs = outputs + (new_mems,)
-        if self.output_hidden_states:
+        if output_hidden_states:
            if output_g is not None:
                hidden_states = tuple(h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs)
            else:
@@ -1051,6 +1054,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
        use_cache=True,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
@@ -1072,7 +1076,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1127,6 +1131,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        logits = self.lm_loss(transformer_outputs[0])
@@ -1173,6 +1178,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
        use_cache=True,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`)
@@ -1191,7 +1197,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1229,6 +1235,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        output = transformer_outputs[0]
@@ -1280,6 +1287,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
        use_cache=True,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1297,7 +1305,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1337,6 +1345,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        sequence_output = outputs[0]
@@ -1391,6 +1400,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
        use_cache=True,
        labels=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1410,7 +1420,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1462,6 +1472,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
            inputs_embeds=flat_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        output = transformer_outputs[0]
@@ -1512,6 +1523,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
        start_positions=None,
        end_positions=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1535,7 +1547,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1576,6 +1588,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        sequence_output = outputs[0]
@@ -1643,6 +1656,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
        cls_index=None,
        p_mask=None,
        output_attentions=None,
+        output_hidden_states=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1679,7 +1693,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1718,6 +1732,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        hidden_states = transformer_outputs[0]
        start_logits = self.start_logits(hidden_states, p_mask=p_mask)

--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -143,14 +143,13 @@ class ModelTesterMixin:
        for model_class in self.all_model_classes:
            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = False
+            inputs_dict["output_hidden_states"] = False
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs[-1]
-            self.assertEqual(model.config.output_hidden_states, False)
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
            # check that output_attentions also work using config
@@ -162,7 +161,6 @@ class ModelTesterMixin:
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs[-1]
-            self.assertEqual(model.config.output_hidden_states, False)
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
            if chunk_length is not None:
@@ -201,14 +199,13 @@ class ModelTesterMixin:
            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
-            config.output_hidden_states = True
+            inputs_dict["output_hidden_states"] = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
-            self.assertEqual(model.config.output_hidden_states, True)
            self_attentions = outputs[-1]
            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
@@ -493,19 +490,16 @@ class ModelTesterMixin:
            self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
    def test_hidden_states_output(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        def check_hidden_states_output(inputs_dict, config, model_class):
-        for model_class in self.all_model_classes:
-            config.output_hidden_states = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            hidden_states = outputs[-1]
-            self.assertEqual(model.config.output_hidden_states, True)
-            self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
+            self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
            if hasattr(self.model_tester, "encoder_seq_length"):
                seq_length = self.model_tester.encoder_seq_length
                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
@@ -517,6 +511,18 @@ class ModelTesterMixin:
                list(hidden_states[0].shape[-2:]), [seq_length, self.model_tester.hidden_size],
            )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(inputs_dict, config, model_class)
+            # check that output_hidden_states also work using config
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+            check_hidden_states_output(inputs_dict, config, model_class)
    def test_resize_tokens_embeddings(self):
        (original_config, inputs_dict,) = self.model_tester.prepare_config_and_inputs_for_common()
        if not self.test_resize_embeddings:

--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -392,17 +392,23 @@ class TFModelTesterMixin:
    def test_hidden_states_output(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
+        def check_hidden_states_output(config, inputs_dict, model_class):
-            config.output_hidden_states = True
            model = model_class(config)
            outputs = model(self._prepare_for_class(inputs_dict, model_class))
            hidden_states = [t.numpy() for t in outputs[-1]]
-            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
            self.assertListEqual(
                list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size],
            )
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(config, inputs_dict, model_class)
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+            check_hidden_states_output(config, inputs_dict, model_class)
    def test_model_common_attributes(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()