Unverified Commit 364a5ae1 authored by Lysandre Debut, committed by GitHub

Refactor Code samples; Test code samples (#5036)



* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rest of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 315f464b
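
Throughout the hunks below, hand-written `Examples::` blocks are either converted to doctest-style (`>>>`) samples or dropped in favour of an `add_code_sample_docstrings(tokenizer_class=..., checkpoint=...)` decorator imported from `.file_utils`. The decorator's implementation is not part of this diff; as a rough, hypothetical sketch only (the real helper in `file_utils.py` may differ), such a decorator could render a shared doctest template and append it to the wrapped method's docstring:

    # Hypothetical sketch, NOT the actual transformers implementation: builds a
    # doctest-style usage sample from the tokenizer class and checkpoint name
    # and appends it to the decorated forward/call method's docstring.
    _SAMPLE_TEMPLATE = """
        Example::

            >>> from transformers import {tokenizer_class}, {model_class}
            >>> import torch

            >>> tokenizer = {tokenizer_class}.from_pretrained("{checkpoint}")
            >>> model = {model_class}.from_pretrained("{checkpoint}")

            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)
    """


    def add_code_sample_docstrings(tokenizer_class=None, checkpoint=None):
        def decorator(fn):
            # fn.__qualname__ is e.g. "MobileBertModel.forward"; the class name
            # is enough to fill in the import line of the sample.
            model_class = fn.__qualname__.split(".")[0]
            sample = _SAMPLE_TEMPLATE.format(
                tokenizer_class=tokenizer_class,
                checkpoint=checkpoint,
                model_class=model_class,
            )
            fn.__doc__ = (fn.__doc__ or "") + sample
            return fn

        return decorator

Because the injected samples use the `>>>` prompt, they can presumably be collected and executed by a doctest-style runner, which is what the "Test docstrings" / "Test the remainder of the code samples" steps in the commit message refer to.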
......@@ -34,11 +34,14 @@ from transformers.modeling_bert import BertIntermediate
from .activations import gelu, gelu_new, swish
from .configuration_mobilebert import MobileBertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["google/mobilebert-uncased"]
......@@ -747,6 +750,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -785,20 +789,6 @@ class MobileBertModel(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertModel, MobileBertTokenizer
import torch
tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
model = MobileBertModel.from_pretrained(model_name_or_path)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
......@@ -951,13 +941,17 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForPreTraining
import torch
tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
model = MobileBertForPreTraining.from_pretrained(model_name_or_path)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
prediction_scores, seq_relationship_scores = outputs[:2]
>>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
>>> import torch
>>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
>>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores, seq_relationship_scores = outputs[:2]
"""
outputs = self.mobilebert(
......@@ -1022,6 +1016,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -1063,20 +1058,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForMaskedLM
import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
......@@ -1174,18 +1155,17 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
Examples::
from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
import torch
>>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
>>> import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
assert logits[0, 0] < logits[0, 1] # next sentence was random
>>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
"""
outputs = self.mobilebert(
......@@ -1228,6 +1208,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -1263,20 +1244,6 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
outputs = self.mobilebert(
......@@ -1321,6 +1288,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -1363,25 +1331,6 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForQuestionAnswering
import torch
tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
model = MobileBertForQuestionAnswering.from_pretrained(model_name_or_path)
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text)
input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
assert answer == "a nice puppet"
"""
outputs = self.mobilebert(
......@@ -1439,6 +1388,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -1476,25 +1426,6 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForMultipleChoice
import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......@@ -1552,6 +1483,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -1586,21 +1518,6 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForTokenClassification
import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.mobilebert(
......
......@@ -28,7 +28,7 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu_new, swish
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import (
Conv1D,
PreTrainedModel,
......@@ -40,6 +40,8 @@ from .modeling_utils import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-gpt",
# See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
......@@ -356,6 +358,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
self.h[layer].attn.prune_heads(heads)
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def forward(
self,
input_ids=None,
......@@ -383,18 +386,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
import torch
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTModel.from_pretrained('openai-gpt')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -490,6 +481,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
return self.lm_head
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def forward(
self,
input_ids=None,
......@@ -531,18 +523,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
import torch
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
......
......@@ -29,12 +29,20 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu, gelu_fast, gelu_new, swish
from .configuration_reformer import ReformerConfig
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
DUMMY_INPUTS,
DUMMY_MASK,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_utils import PreTrainedModel, apply_chunking_to_forward
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ReformerTokenizer"
REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/reformer-crime-and-punishment",
"google/reformer-enwik8",
......@@ -1543,6 +1551,7 @@ class ReformerModel(ReformerPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
def forward(
self,
input_ids=None,
......@@ -1570,19 +1579,6 @@ class ReformerModel(ReformerPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ReformerModel, ReformerTokenizer
import torch
tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
......@@ -1738,6 +1734,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
pass
@add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
def forward(
self,
input_ids=None,
......@@ -1774,19 +1771,6 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ReformerModelWithLMHead, ReformerTokenizer
import torch
tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
model = ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
reformer_outputs = self.reformer(
......
......@@ -24,12 +24,14 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from .configuration_roberta import RobertaConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "RobertaTokenizer"
ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"roberta-base",
"roberta-large",
......@@ -177,6 +179,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
return self.lm_head.decoder
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
......@@ -216,18 +219,6 @@ class RobertaForMaskedLM(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForMaskedLM
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
......@@ -304,6 +295,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
......@@ -340,19 +332,6 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
outputs = self.roberta(
input_ids,
......@@ -400,6 +379,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
......@@ -437,20 +417,6 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForMultipleChoice
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMultipleChoice.from_pretrained('roberta-base')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......@@ -510,6 +476,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
......@@ -544,19 +511,6 @@ class RobertaForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForTokenClassification
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForTokenClassification.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.roberta(
......@@ -632,6 +586,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
......@@ -674,25 +629,6 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint roberta-large is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_ids = tokenizer.encode(question, text)
start_scores, end_scores = model(torch.tensor([input_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
"""
outputs = self.roberta(
......
......@@ -33,6 +33,8 @@ from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, p
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "T5Tokenizer"
####################################################
# This dict contrains shortcut names and associated url
# for the pretrained weights provided with the models
......@@ -924,16 +926,17 @@ class T5Model(T5PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Example::
>>> from transformers import T5Tokenizer, T5Model
from transformers import T5Tokenizer, T5Model
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5Model.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5Model.from_pretrained('t5-small')
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
use_cache = use_cache if use_cache is not None else self.config.use_cache
......@@ -1068,18 +1071,18 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
Examples::
from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
>>> loss, prediction_scores = outputs[:2]
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
outputs = model.generate(input_ids)
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
>>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model.generate(input_ids)
"""
if "lm_labels" in kwargs:
......
......@@ -21,7 +21,12 @@ import logging
import tensorflow as tf
from .configuration_albert import AlbertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
......@@ -39,6 +44,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "AlbertTokenizer"
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"albert-base-v1",
"albert-large-v1",
......@@ -713,6 +720,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
self.albert = TFAlbertMainLayer(config, name="albert")
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -737,18 +745,6 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertModel
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertModel.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.albert(inputs, **kwargs)
return outputs
......@@ -837,6 +833,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
return self.albert.embeddings
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -854,18 +851,6 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForMaskedLM
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.albert(inputs, **kwargs)
......@@ -895,6 +880,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
......@@ -930,19 +916,6 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -994,6 +967,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
......@@ -1027,19 +1001,6 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForTokenClassification
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForTokenClassification.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -1089,6 +1050,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
......@@ -1130,24 +1092,6 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForQuestionAnswering
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
......@@ -1213,6 +1157,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs,
......@@ -1249,22 +1194,6 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForMultipleChoice
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tokenizer(choices, add_special_tokens=True, return_tensors='tf', truncation=True, padding=True)[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......
......@@ -22,7 +22,12 @@ import numpy as np
import tensorflow as tf
from .configuration_bert import BertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
......@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BertTokenizer"
TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bert-base-uncased",
......@@ -704,6 +710,7 @@ class TFBertModel(TFBertPreTrainedModel):
self.bert = TFBertMainLayer(config, name="bert")
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -728,18 +735,6 @@ class TFBertModel(TFBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.bert(inputs, **kwargs)
return outputs
......@@ -819,6 +814,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
return self.bert.embeddings
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -836,18 +832,6 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.bert(inputs, **kwargs)
......@@ -930,6 +914,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs=None,
......@@ -965,19 +950,6 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -1037,6 +1009,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs,
......@@ -1073,22 +1046,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForMultipleChoice
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -1177,6 +1134,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs=None,
......@@ -1210,19 +1168,6 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForTokenClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -1273,6 +1218,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs=None,
......@@ -1314,22 +1260,6 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForQuestionAnswering
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
assert answer == "a nice puppet"
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
......
......@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf
from .configuration_ctrl import CTRLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFPreTrainedModel,
TFSharedEmbeddings,
......@@ -35,6 +35,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CtrlTokenizer"
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ctrl"
# See all CTRL models at https://huggingface.co/models?filter=ctrl
......@@ -489,6 +491,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
self.transformer = TFCTRLMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -510,18 +513,6 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import CTRLTokenizer, TFCTRLModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = TFCTRLModel.from_pretrained('ctrl')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -569,6 +560,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -590,19 +582,6 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import CTRLTokenizer, TFCTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = TFCTRLLMHeadModel.from_pretrained('ctrl')
input_ids = tf.constant([tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)])
outputs = model(input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0]
......
......@@ -23,7 +23,12 @@ import numpy as np
import tensorflow as tf
from .configuration_distilbert import DistilBertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
......@@ -41,6 +46,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "DistilBertTokenizer"
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"distilbert-base-uncased",
......@@ -575,6 +581,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -592,17 +599,6 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertModel.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.distilbert(inputs, **kwargs)
return outputs
......@@ -647,6 +643,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
return self.vocab_projector.input_embeddings
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(self, inputs, **kwargs):
r"""
......@@ -665,18 +662,6 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
distilbert_output = self.distilbert(inputs, **kwargs)
......@@ -713,6 +698,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs=None,
......@@ -746,19 +732,6 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels
......@@ -809,6 +782,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs=None,
......@@ -840,19 +814,6 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels
......@@ -916,6 +877,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs,
......@@ -950,22 +912,6 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForMultipleChoice
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -1046,6 +992,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs=None,
......@@ -1085,21 +1032,6 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[6] if len(inputs) > 6 else start_positions
......
......@@ -4,7 +4,7 @@ import tensorflow as tf
from transformers import ElectraConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
from .modeling_tf_utils import (
TFQuestionAnsweringLoss,
......@@ -18,6 +18,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ElectraTokenizer"
TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator",
......@@ -383,6 +384,7 @@ class TFElectraModel(TFElectraPreTrainedModel):
self.electra = TFElectraMainLayer(config, name="electra")
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -400,17 +402,6 @@ class TFElectraModel(TFElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraModel
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraModel.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.electra(inputs, **kwargs)
return outputs
......@@ -532,6 +523,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
return self.generator_lm_head
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
def call(
self,
input_ids=None,
......@@ -560,18 +552,6 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForMaskedLM
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
generator_hidden_states = self.electra(
......@@ -611,6 +591,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call(
self,
inputs=None,
......@@ -644,19 +625,6 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForTokenClassification
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -705,6 +673,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call(
self,
inputs=None,
......@@ -746,22 +715,6 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForQuestionAnswering
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = TFElectraForQuestionAnswering.from_pretrained('google/electra-small-generator')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
......
......@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf
from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFConv1D,
TFPreTrainedModel,
......@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"gpt2",
"gpt2-medium",
......@@ -490,6 +492,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
self.transformer = TFGPT2MainLayer(config, name="transformer")
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -511,18 +514,6 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2Model.from_pretrained('gpt2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -549,6 +540,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -570,19 +562,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0]
......@@ -659,29 +638,26 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
Examples::
# For example purposes. Not runnable.
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
>>> import tensorflow as tf
>>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
# Add a [CLS] to the vocabulary (we should train it also!)
# This option is currently not implemented in TF 2.0
raise NotImplementedError
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoded_choices = [tokenizer.encode(s) for s in choices]
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
>>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
>>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
>>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
......
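The removed GPT-2 samples only retrieved logits; a minimal generation sketch for `TFGPT2LMHeadModel`, assuming the default (greedy) settings of `generate`:

>>> from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = TFGPT2LMHeadModel.from_pretrained('gpt2')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors='tf')  # Batch size 1
>>> generated = model.generate(input_ids, max_length=20)  # greedy decoding by default
>>> print(tokenizer.decode(generated[0], skip_special_tokens=True))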
......@@ -21,7 +21,12 @@ import logging
import tensorflow as tf
from . import MobileBertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
......@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"mobilebert-uncased",
......@@ -621,19 +627,6 @@ class TFMobileBertMLMHead(tf.keras.layers.Layer):
return prediction_scores
class TFMobileBertPreTrainingHeads(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.predictions = TFMobileBertLMPredictionHead(config, name="predictions")
self.seq_relationship = tf.keras.layers.Dense(2, name="seq_relationship")
def call(self, inputs):
sequence_output, pooled_output = inputs
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
@keras_serializable
class TFMobileBertMainLayer(tf.keras.layers.Layer):
config_class = MobileBertConfig
......@@ -845,6 +838,7 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -869,18 +863,6 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertModel
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertModel.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.mobilebert(inputs, **kwargs)
return outputs
......@@ -895,7 +877,8 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
self.cls = TFMobileBertPreTrainingHeads(config, name="cls")
self.predictions = TFMobileBertMLMHead(config, name="predictions___cls")
self.seq_relationship = TFMobileBertOnlyNSPHead(2, name="seq_relationship___cls")
def get_output_embeddings(self):
return self.mobilebert.embeddings
......@@ -923,20 +906,21 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
>>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForPreTraining.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores, seq_relationship_scores = outputs[:2]
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores, seq_relationship_scores = outputs[:2]
"""
outputs = self.mobilebert(inputs, **kwargs)
sequence_output, pooled_output = outputs[:2]
prediction_scores, seq_relationship_score = self.cls([sequence_output, pooled_output])
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
outputs = (prediction_scores, seq_relationship_score,) + outputs[
2:
] # add hidden states and attention if they are here
......@@ -956,6 +940,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
return self.mobilebert.embeddings
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -973,18 +958,6 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForMaskedLM
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.mobilebert(inputs, **kwargs)
......@@ -1015,7 +988,7 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
super().__init__(config, *inputs, **kwargs)
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
self.cls = TFMobileBertOnlyNSPHead(config, name="cls")
self.cls = TFMobileBertOnlyNSPHead(config, name="seq_relationship___cls")
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
def call(self, inputs, **kwargs):
......@@ -1038,18 +1011,17 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
>>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='tf')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
assert logits[0][0] < logits[0][1] # the next sentence was random
>>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
"""
outputs = self.mobilebert(inputs, **kwargs)
......@@ -1078,6 +1050,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs=None,
......@@ -1113,19 +1086,6 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFBMobileBertForSequenceClassification
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForSequenceClassification.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -1176,6 +1136,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs=None,
......@@ -1217,22 +1178,6 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForQuestionAnswering
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForQuestionAnswering.from_pretrained('mobilebert-uncased') # Not a fine-tuned model! Load a fine-tuned model to obtain coherent results.
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
assert answer == "a nice puppet"
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
......@@ -1298,6 +1243,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs,
......@@ -1334,22 +1280,6 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForMultipleChoice
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -1438,6 +1368,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs=None,
......@@ -1471,19 +1402,6 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForTokenClassification
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......
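The removed MobileBERT question-answering sample is not reproduced verbatim by the decorator; a sketch of the same span-extraction flow with the callable tokenizer API used elsewhere in this change (note that `google/mobilebert-uncased` is not fine-tuned for QA, so the extracted span is not meaningful):

>>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForQuestionAnswering
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForQuestionAnswering.from_pretrained('google/mobilebert-uncased')  # not fine-tuned for QA
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> inputs = tokenizer(question, text, return_tensors='tf')
>>> start_scores, end_scores = model(inputs)
>>> all_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].numpy()[0])
>>> answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0] + 1])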
......@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFConv1D,
TFPreTrainedModel,
......@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-gpt",
# See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
......@@ -449,6 +451,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -466,18 +469,6 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -497,6 +488,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
return self.transformer.tokens_embed
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -514,18 +506,6 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0]
......@@ -601,26 +581,23 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
Examples::
# For example purposes. Not runnable.
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
>>> import tensorflow as tf
>>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
>>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
>>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
# Add a [CLS] to the vocabulary (we should train it also!)
# This option is currently not implemented in TF 2.0
raise NotImplementedError
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :] # Batch size 1, 2 choices
mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :] # Batch size 1
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
>>> model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
>>> print(tokenizer.cls_token_id, len(tokenizer)) # The new token is the last token of the vocabulary

>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoding = tokenizer(choices, return_tensors="tf")
>>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
>>> inputs["mc_token_ids"] = tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :]  # Batch size 1
>>> outputs = model(inputs)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
......@@ -633,7 +610,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
output_attentions = inputs[7] if len(inputs) > 7 else output_attentions
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, dict):
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
......
......@@ -21,7 +21,12 @@ import logging
import tensorflow as tf
from .configuration_roberta import RobertaConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
......@@ -38,6 +43,8 @@ from .tokenization_utils_base import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "RobertaTokenizer"
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"roberta-base",
"roberta-large",
......@@ -195,6 +202,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
self.roberta = TFRobertaMainLayer(config, name="roberta")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -219,18 +227,6 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaModel.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.roberta(inputs, **kwargs)
return outputs
......@@ -279,6 +275,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
return self.lm_head.decoder
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -296,18 +293,6 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMaskedLM
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.roberta(inputs, **kwargs)
......@@ -358,6 +343,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
self.classifier = TFRobertaClassificationHead(config, name="classifier")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs=None,
......@@ -387,19 +373,6 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -441,7 +414,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta = TFBertMainLayer(config, name="roberta")
self.roberta = TFRobertaMainLayer(config, name="roberta")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
......@@ -457,6 +430,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs,
......@@ -493,22 +467,6 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMultipleChoice
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMultipleChoice.from_pretrained('roberta-base')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -592,6 +550,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
)
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs=None,
......@@ -625,19 +584,6 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForTokenClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -687,6 +633,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
)
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs=None,
......@@ -728,24 +675,6 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint roberta-base is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForQuestionAnswering
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
......
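A sketch of a masked-token prediction with `TFRobertaForMaskedLM` and the callable tokenizer API used elsewhere in this change (the `<mask>` handling follows the RoBERTa tokenizer defaults):

>>> import tensorflow as tf
>>> from transformers import RobertaTokenizer, TFRobertaForMaskedLM
>>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
>>> model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
>>> inputs = tokenizer("The capital of France is <mask>.", return_tensors='tf')
>>> prediction_scores = model(inputs)[0]  # shape (batch_size, sequence_length, vocab_size)
>>> mask_index = int(tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0])
>>> predicted_id = int(tf.math.argmax(prediction_scores[0, mask_index]))
>>> print(tokenizer.decode([predicted_id]))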
......@@ -37,6 +37,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "T5Tokenizer"
TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
"t5-small",
"t5-base",
......@@ -931,13 +933,13 @@ class TFT5Model(TFT5PreTrainedModel):
Examples::
from transformers import T5Tokenizer, TFT5Model
>>> from transformers import T5Tokenizer, TFT5Model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5Model.from_pretrained('t5-small')
inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
outputs = model(inputs, decoder_input_ids=inputs)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5Model.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> outputs = model(inputs, decoder_input_ids=inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
......@@ -1074,18 +1076,18 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
Examples::
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
>>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
outputs = model(inputs, decoder_input_ids=inputs)
prediction_scores = outputs[0]
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> outputs = model(inputs, decoder_input_ids=inputs)
>>> prediction_scores = outputs[0]
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1
model.generate(inputs)
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> result = model.generate(inputs)
"""
......
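A sketch combining the two T5 snippets above into an end-to-end summarization call, assuming the default generation settings:

>>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf")  # Batch size 1
>>> summary_ids = model.generate(inputs, max_length=20)
>>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))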
......@@ -22,7 +22,7 @@ import logging
import tensorflow as tf
from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
from .modeling_tf_utils import (
TFPreTrainedModel,
......@@ -36,6 +36,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl-wt103",
# See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
......@@ -722,6 +724,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -743,18 +746,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLModel
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states, mems = outputs[:2]
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -811,6 +802,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
return self.transformer.init_mems(bsz)
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def call(
self,
inputs,
......@@ -842,18 +834,6 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores, mems = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -863,7 +843,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
labels = inputs[4] if len(inputs) > 4 else labels
output_attentions = inputs[5] if len(inputs) > 5 else output_attentions
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
elif isinstance(inputs, (BatchEncoding, dict)):
input_ids = inputs.get("input_ids")
mems = inputs.get("mems", mems)
head_mask = inputs.get("head_mask", head_mask)
......
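The removed Transformer-XL samples did not show memory reuse; a sketch of passing `mems` across segments with `TFTransfoXLLMHeadModel` (the argument name is taken from the input handling above):

>>> import tensorflow as tf
>>> from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
>>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
>>> model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
>>> prediction_scores, mems = model(input_ids)[:2]
>>> next_ids = tf.constant(tokenizer.encode("He likes to play fetch"))[None, :]
>>> prediction_scores, mems = model(next_ids, mems=mems)[:2]  # reuse the cached memories from the previous segment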
......@@ -24,7 +24,12 @@ import numpy as np
import tensorflow as tf
from .configuration_xlm import XLMConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
......@@ -43,6 +48,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLMTokenizer"
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlm-mlm-en-2048",
"xlm-mlm-ende-1024",
......@@ -608,6 +615,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
self.transformer = TFXLMMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -625,18 +633,6 @@ class TFXLMModel(TFXLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -704,6 +700,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
return {"inputs": inputs, "langs": langs}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -721,18 +718,6 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMWithLMHeadModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
transformer_outputs = self.transformer(inputs, **kwargs)
......@@ -757,6 +742,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
inputs=None,
......@@ -795,19 +781,6 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForSequenceClassification
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels
......@@ -865,6 +838,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
inputs,
......@@ -876,9 +850,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
cache=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r"""
......@@ -904,22 +878,6 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForMultipleChoice
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForMultipleChoice.from_pretrained('xlm-mlm-en-2048')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -932,7 +890,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
head_mask = inputs[7] if len(inputs) > 7 else head_mask
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
assert len(inputs) <= 10, "Too many inputs."
output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
labels = inputs[11] if len(inputs) > 11 else labels
assert len(inputs) <= 11, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
......@@ -944,7 +904,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 10, "Too many inputs."
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
labels = inputs.get("labels", labels)
assert len(inputs) <= 12, "Too many inputs."
else:
input_ids = inputs
......@@ -1001,13 +963,14 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
self.transformer = TFXLMMainLayer(config, name="transformer")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
input_ids=None,
inputs=None,
attention_mask=None,
langs=None,
token_type_ids=None,
......@@ -1016,9 +979,9 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
cache=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r"""
......@@ -1041,25 +1004,22 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForTokenClassification
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForTokenClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels
if len(inputs) > 11:
inputs = inputs[:11]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
transformer_outputs = self.transformer(
input_ids,
inputs,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
......@@ -1072,7 +1032,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
outputs = (logits,) + transformer_outputs[2:] # add hidden states and attention if they are here
outputs = (logits,) + transformer_outputs[1:] # add hidden states and attention if they are here
if labels is not None:
loss = self.compute_loss(labels, logits)
......@@ -1095,6 +1055,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
inputs=None,
......@@ -1139,21 +1100,6 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[11] if len(inputs) > 11 else start_positions
......
......@@ -23,7 +23,12 @@ import numpy as np
import tensorflow as tf
from .configuration_xlnet import XLNetConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
......@@ -42,6 +47,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLNetTokenizer"
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlnet-base-cased",
"xlnet-large-cased",
......@@ -832,6 +839,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
self.transformer = TFXLNetMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -853,18 +861,6 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetModel
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetModel.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -949,10 +945,13 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :] # We will predict the masked token
perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
......@@ -986,6 +985,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
......@@ -1029,19 +1029,6 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels
......@@ -1105,6 +1092,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
......@@ -1145,22 +1133,6 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForMultipleChoice
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -1257,6 +1229,8 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
......@@ -1298,19 +1272,6 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForTokenClassification
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels
......@@ -1361,6 +1322,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
......@@ -1412,21 +1374,6 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[12] if len(inputs) > 12 else start_positions
......
......@@ -27,13 +27,15 @@ import torch.nn as nn
import torch.nn.functional as F
from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
from .modeling_utils import PreTrainedModel
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl-wt103",
# See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
......@@ -749,6 +751,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return new_mems
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def forward(
self,
input_ids=None,
......@@ -778,18 +781,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import TransfoXLTokenizer, TransfoXLModel
import torch
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states, mems = outputs[:2]
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -945,6 +936,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
return self.transformer.init_mems(bsz)
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def forward(
self,
input_ids=None,
......@@ -984,18 +976,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
import torch
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
prediction_scores, mems = outputs[:2]
"""
if input_ids is not None:
bsz, tgt_len = input_ids.size(0), input_ids.size(1)
......
......@@ -978,13 +978,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
Examples::
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
outputs = model.generate(max_length=40) # do greedy decoding
print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache.
input_context = 'The dog'
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
......@@ -992,22 +994,22 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
input_context = 'The dog'
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling
outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True)  # generate 3 sequences by sampling
for i in range(3): # 3 output sequences were generated
print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('ctrl') # Download model and configuration from S3 and cache.
input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences
print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('gpt2') # Download model and configuration from S3 and cache.
input_context = 'My cute dog'  # context to generate from
bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
......
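The last `generate` example is cut off by the diff view; a sketch of how `bad_words_ids` is presumably passed, using only the variables defined in the lines above:

outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids)  # generate sequences that avoid the banned words
print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))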