Unverified commit 364a5ae1, authored by Lysandre Debut and committed by GitHub

Refactor Code samples; Test code samples (#5036)



* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rest of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 315f464b
modeling_mobilebert.py

@@ -34,11 +34,14 @@ from transformers.modeling_bert import BertIntermediate
 from .activations import gelu, gelu_new, swish
 from .configuration_mobilebert import MobileBertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
+
 MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["google/mobilebert-uncased"]
@@ -747,6 +750,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
             self.encoder.layer[layer].attention.prune_heads(heads)

     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -785,20 +789,6 @@ class MobileBertModel(MobileBertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import MobileBertModel, MobileBertTokenizer
-            import torch
-
-            tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
-            model = MobileBertModel.from_pretrained(model_name_or_path)
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids)
-
-            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
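For reference: add_code_sample_docstrings builds a runnable usage sample from the tokenizer class and checkpoint it is given and appends it to the method's docstring, which is why the inline example above can be dropped. Roughly what that sample amounts to for MobileBertModel, as a sketch based on the removed snippet with the placeholder checkpoint filled in (the exact generated text may differ):

    >>> from transformers import MobileBertTokenizer, MobileBertModel
    >>> import torch

    >>> # assumption: the sample mirrors the removed snippet, with the decorator's checkpoint
    >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
    >>> model = MobileBertModel.from_pretrained("google/mobilebert-uncased")

    >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids)
    >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple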
@@ -951,13 +941,17 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
             heads.

         Examples::

-            from transformers import MobileBertTokenizer, MobileBertForPreTraining
-            import torch
-
-            tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
-            model = MobileBertForPreTraining.from_pretrained(model_name_or_path)
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids)
-
-            prediction_scores, seq_relationship_scores = outputs[:2]
+            >>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
+            >>> import torch
+
+            >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
+            >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
+
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> outputs = model(input_ids)
+            >>> prediction_scores, seq_relationship_scores = outputs[:2]

         """
         outputs = self.mobilebert(
@@ -1022,6 +1016,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
         self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())

     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1063,20 +1058,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import MobileBertTokenizer, MobileBertForMaskedLM
-            import torch
-
-            tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-            model = MobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
-
         """
         if "masked_lm_labels" in kwargs:
             warnings.warn(
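The same applies to MobileBertForMaskedLM; a sketch of the sample now attached by the decorator (an approximation, using the valid google/mobilebert-uncased identifier rather than the old 'mobilebert-uncased' shortcut from the removed snippet):

    >>> from transformers import MobileBertTokenizer, MobileBertForMaskedLM
    >>> import torch

    >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
    >>> model = MobileBertForMaskedLM.from_pretrained("google/mobilebert-uncased")

    >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids, labels=input_ids)
    >>> loss, prediction_scores = outputs[:2]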
@@ -1174,18 +1155,17 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
         Examples::

-            from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
-            import torch
-
-            tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-            model = MobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
-
-            prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-            next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-            encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt')
-
-            loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
-            assert logits[0, 0] < logits[0, 1] # next sentence was random
+            >>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
+            >>> import torch
+
+            >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+            >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
+
+            >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+            >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+            >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+
+            >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))

         """
         outputs = self.mobilebert(
@@ -1228,6 +1208,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1263,20 +1244,6 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import BertTokenizer, BertForSequenceClassification
-            import torch
-
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-
-            loss, logits = outputs[:2]
-
         """
         outputs = self.mobilebert(
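The removed snippet had been copied from BERT (BertTokenizer / BertForSequenceClassification); a sketch of an equivalent sample using the MobileBERT classes and the decorator's checkpoint (approximate, not the generated text verbatim):

    >>> from transformers import MobileBertTokenizer, MobileBertForSequenceClassification
    >>> import torch

    >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
    >>> model = MobileBertForSequenceClassification.from_pretrained("google/mobilebert-uncased")

    >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids, labels=labels)
    >>> loss, logits = outputs[:2]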
@@ -1321,6 +1288,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1363,25 +1331,6 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import MobileBertTokenizer, MobileBertForQuestionAnswering
-            import torch
-
-            tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
-            model = MobileBertForQuestionAnswering.from_pretrained(model_name_or_path)
-
-            question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-            encoding = tokenizer.encode_plus(question, text)
-            input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
-            start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-
-            all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-            answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
-            assert answer == "a nice puppet"
-
         """
         outputs = self.mobilebert(
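The removed snippet likewise instantiated BertTokenizer and a placeholder model_name_or_path; a sketch with the MobileBERT classes (approximate; the hard-coded answer assertion is dropped, since the base checkpoint has no fine-tuned QA head):

    >>> from transformers import MobileBertTokenizer, MobileBertForQuestionAnswering
    >>> import torch

    >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
    >>> model = MobileBertForQuestionAnswering.from_pretrained("google/mobilebert-uncased")

    >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
    >>> encoding = tokenizer.encode_plus(question, text)
    >>> input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
    >>> start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))

    >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    >>> answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores) + 1])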
@@ -1439,6 +1388,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1476,25 +1426,6 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import MobileBertTokenizer, MobileBertForMultipleChoice
-            import torch
-
-            tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-            model = MobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
-
-            prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-            choice0 = "It is eaten with a fork and a knife."
-            choice1 = "It is eaten while held in the hand."
-            labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
-
-            encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-            outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels)  # batch size is 1
-
-            # the linear classifier still needs to be trained
-            loss, logits = outputs[:2]
-
         """
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
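A sketch of the equivalent sample for MobileBertForMultipleChoice, mirroring the removed snippet with the valid checkpoint substituted (approximation):

    >>> from transformers import MobileBertTokenizer, MobileBertForMultipleChoice
    >>> import torch

    >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
    >>> model = MobileBertForMultipleChoice.from_pretrained("google/mobilebert-uncased")

    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
    >>> choice0 = "It is eaten with a fork and a knife."
    >>> choice1 = "It is eaten while held in the hand."
    >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct, batch size 1

    >>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
    >>> outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels)  # batch size is 1
    >>> loss, logits = outputs[:2]  # the linear classifier still needs to be trained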
@@ -1552,6 +1483,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1586,21 +1518,6 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import MobileBertTokenizer, MobileBertForTokenClassification
-            import torch
-
-            tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-            model = MobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-
-            loss, scores = outputs[:2]
-
         """
         outputs = self.mobilebert(
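And for MobileBertForTokenClassification, again with the valid checkpoint in place of the old shortcut (sketch):

    >>> from transformers import MobileBertTokenizer, MobileBertForTokenClassification
    >>> import torch

    >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
    >>> model = MobileBertForTokenClassification.from_pretrained("google/mobilebert-uncased")

    >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    >>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids, labels=labels)
    >>> loss, scores = outputs[:2]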
modeling_openai.py

@@ -28,7 +28,7 @@ from torch.nn import CrossEntropyLoss
 from .activations import gelu_new, swish
 from .configuration_openai import OpenAIGPTConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import (
     Conv1D,
     PreTrainedModel,
@@ -40,6 +40,8 @@ from .modeling_utils import (
 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
+
 OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "openai-gpt",
     # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
@@ -356,6 +358,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             self.h[layer].attn.prune_heads(heads)

     @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
     def forward(
         self,
         input_ids=None,
@@ -383,18 +386,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
-            import torch
-
-            tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-            model = OpenAIGPTModel.from_pretrained('openai-gpt')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids)
-
-            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
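The generated sample for OpenAIGPTModel should be essentially the removed snippet in doctest form (sketch, not verbatim):

    >>> from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
    >>> import torch

    >>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    >>> model = OpenAIGPTModel.from_pretrained("openai-gpt")

    >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids)
    >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple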
@@ -490,6 +481,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         return self.lm_head

     @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
     def forward(
         self,
         input_ids=None,
@@ -531,18 +523,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
-            import torch
-
-            tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-            model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, logits = outputs[:2]
-
         """
         transformer_outputs = self.transformer(
             input_ids,
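Likewise for OpenAIGPTLMHeadModel (sketch of the doctest-style sample):

    >>> from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
    >>> import torch

    >>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    >>> model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt")

    >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids, labels=input_ids)
    >>> loss, logits = outputs[:2]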
modeling_reformer.py

@@ -29,12 +29,20 @@ from torch.nn import CrossEntropyLoss
 from .activations import gelu, gelu_fast, gelu_new, swish
 from .configuration_reformer import ReformerConfig
-from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    DUMMY_INPUTS,
+    DUMMY_MASK,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_utils import PreTrainedModel, apply_chunking_to_forward

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "ReformerTokenizer"
+
 REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "google/reformer-crime-and-punishment",
     "google/reformer-enwik8",
@@ -1543,6 +1551,7 @@ class ReformerModel(ReformerPreTrainedModel):
             self.encoder.layer[layer].attention.prune_heads(heads)

     @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
     def forward(
         self,
         input_ids=None,
@@ -1570,19 +1579,6 @@ class ReformerModel(ReformerPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import ReformerModel, ReformerTokenizer
-            import torch
-
-            tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
-            model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids)
-
-            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
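For ReformerModel the generated sample should match the removed snippet, which already used the real checkpoint (sketch):

    >>> from transformers import ReformerTokenizer, ReformerModel
    >>> import torch

    >>> tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
    >>> model = ReformerModel.from_pretrained("google/reformer-crime-and-punishment")

    >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids)
    >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple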
@@ -1738,6 +1734,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
         pass

     @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
     def forward(
         self,
         input_ids=None,
@@ -1774,19 +1771,6 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import ReformerModelWithLMHead, ReformerTokenizer
-            import torch
-
-            tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
-            model = ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
-
         """
         reformer_outputs = self.reformer(
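And for ReformerModelWithLMHead, a sketch mirroring the removed snippet (the exact generated text may differ):

    >>> from transformers import ReformerTokenizer, ReformerModelWithLMHead
    >>> import torch

    >>> tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
    >>> model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment")

    >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids, labels=input_ids)
    >>> loss, prediction_scores = outputs[:2]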
modeling_roberta.py

@@ -24,12 +24,14 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss

 from .configuration_roberta import RobertaConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu

 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "RobertaTokenizer"
+
 ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "roberta-base",
     "roberta-large",
@@ -177,6 +179,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         return self.lm_head.decoder

     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def forward(
         self,
         input_ids=None,
@@ -216,18 +219,6 @@ class RobertaForMaskedLM(BertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import RobertaTokenizer, RobertaForMaskedLM
-            import torch
-
-            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-            model = RobertaForMaskedLM.from_pretrained('roberta-base')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
-
         """
         if "masked_lm_labels" in kwargs:
             warnings.warn(
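For the RoBERTa heads the decorator checkpoint is roberta-base, so the generated samples should track the removed snippets closely. A sketch for RobertaForMaskedLM:

    >>> from transformers import RobertaTokenizer, RobertaForMaskedLM
    >>> import torch

    >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    >>> model = RobertaForMaskedLM.from_pretrained("roberta-base")

    >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids, labels=input_ids)
    >>> loss, prediction_scores = outputs[:2]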
@@ -304,6 +295,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def forward(
         self,
         input_ids=None,
@@ -340,19 +332,6 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import RobertaTokenizer, RobertaForSequenceClassification
-            import torch
-
-            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-            model = RobertaForSequenceClassification.from_pretrained('roberta-base')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-
-            loss, logits = outputs[:2]
-
         """
         outputs = self.roberta(
             input_ids,
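A sketch of the equivalent doctest-style sample for RobertaForSequenceClassification (approximate):

    >>> from transformers import RobertaTokenizer, RobertaForSequenceClassification
    >>> import torch

    >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    >>> model = RobertaForSequenceClassification.from_pretrained("roberta-base")

    >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids, labels=labels)
    >>> loss, logits = outputs[:2]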
@@ -400,6 +379,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def forward(
         self,
         input_ids=None,
@@ -437,20 +417,6 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import RobertaTokenizer, RobertaForMultipleChoice
-            import torch
-
-            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-            model = RobertaForMultipleChoice.from_pretrained('roberta-base')
-
-            choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-            input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-            labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-
-            loss, classification_scores = outputs[:2]
-
         """
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
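A sketch for RobertaForMultipleChoice, mirroring the removed snippet (approximate):

    >>> from transformers import RobertaTokenizer, RobertaForMultipleChoice
    >>> import torch

    >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    >>> model = RobertaForMultipleChoice.from_pretrained("roberta-base")

    >>> choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
    >>> input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
    >>> labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids, labels=labels)
    >>> loss, classification_scores = outputs[:2]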
@@ -510,6 +476,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def forward(
         self,
         input_ids=None,
@@ -544,19 +511,6 @@ class RobertaForTokenClassification(BertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import RobertaTokenizer, RobertaForTokenClassification
-            import torch
-
-            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-            model = RobertaForTokenClassification.from_pretrained('roberta-base')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-
-            loss, scores = outputs[:2]
-
         """
         outputs = self.roberta(
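A sketch for RobertaForTokenClassification (approximate):

    >>> from transformers import RobertaTokenizer, RobertaForTokenClassification
    >>> import torch

    >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    >>> model = RobertaForTokenClassification.from_pretrained("roberta-base")

    >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    >>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
    >>> outputs = model(input_ids, labels=labels)
    >>> loss, scores = outputs[:2]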
@@ -632,6 +586,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def forward(
         self,
         input_ids=None,
@@ -674,25 +629,6 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            # The checkpoint roberta-large is not fine-tuned for question answering. Please see the
-            # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-            from transformers import RobertaTokenizer, RobertaForQuestionAnswering
-            import torch
-
-            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-            model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
-
-            question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-            input_ids = tokenizer.encode(question, text)
-            start_scores, end_scores = model(torch.tensor([input_ids]))
-
-            all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-            answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
         """
         outputs = self.roberta(
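A sketch for RobertaForQuestionAnswering; as the removed comment noted, the base checkpoint has no fine-tuned QA head, so the extracted span is not meaningful without fine-tuning (see examples/question-answering/run_squad.py):

    >>> from transformers import RobertaTokenizer, RobertaForQuestionAnswering
    >>> import torch

    >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    >>> model = RobertaForQuestionAnswering.from_pretrained("roberta-base")

    >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
    >>> input_ids = tokenizer.encode(question, text)
    >>> start_scores, end_scores = model(torch.tensor([input_ids]))

    >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    >>> answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores) + 1])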
modeling_t5.py

@@ -33,6 +33,8 @@ from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, p
 logger = logging.getLogger(__name__)

+_TOKENIZER_FOR_DOC = "T5Tokenizer"
+
 ####################################################
 # This dict contrains shortcut names and associated url
 # for the pretrained weights provided with the models
@@ -924,16 +926,17 @@ class T5Model(T5PreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.

-        Examples::
-
-            from transformers import T5Tokenizer, T5Model
-
-            tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            model = T5Model.from_pretrained('t5-small')
-            input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-            outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
-
-            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        Example::
+
+            >>> from transformers import T5Tokenizer, T5Model
+
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = T5Model.from_pretrained('t5-small')
+            >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+            >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
+
+            >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

         """
         use_cache = use_cache if use_cache is not None else self.config.use_cache
@@ -1068,18 +1071,18 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
         Examples::

-            from transformers import T5Tokenizer, T5ForConditionalGeneration
-
-            tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            model = T5ForConditionalGeneration.from_pretrained('t5-small')
-            input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-            outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
-            loss, prediction_scores = outputs[:2]
-
-            tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            model = T5ForConditionalGeneration.from_pretrained('t5-small')
-            input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-            outputs = model.generate(input_ids)
+            >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+            >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+            >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
+            >>> loss, prediction_scores = outputs[:2]
+
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+            >>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+            >>> outputs = model.generate(input_ids)

         """
         if "lm_labels" in kwargs: