Unverified Commit 364a5ae1 authored by Lysandre Debut, committed by GitHub

Refactor Code samples; Test code samples (#5036)



* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rest of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 315f464b
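The commit message above describes rewriting the library's code samples as doctest-style `>>>` snippets and running them as tests. As a hedged aside, samples in that format can in principle be collected and executed with Python's built-in doctest module; the target module below is only an illustration, and the commit's actual test harness is not shown here.

import doctest

import transformers.configuration_bert as target_module  # illustrative target; any refactored module would do

# Collect and run every `>>>`-style example found in the module's docstrings.
# ELLIPSIS lets an expected output elide volatile details such as tensor values.
results = doctest.testmod(target_module, verbose=False, optionflags=doctest.ELLIPSIS)
print(results)  # e.g. TestResults(failed=0, attempted=...)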
@@ -125,16 +125,16 @@ class ReformerConfig(PretrainedConfig):
Example::
>>> from transformers import ReformerModel, ReformerConfig
>>> # Initializing a Reformer configuration
>>> configuration = ReformerConfig()
>>> # Initializing a Reformer model
>>> model = ReformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "reformer"
...
@@ -49,16 +49,16 @@ class RobertaConfig(BertConfig):
Example::
>>> from transformers import RobertaConfig, RobertaModel
>>> # Initializing a RoBERTa configuration
>>> configuration = RobertaConfig()
>>> # Initializing a model from the configuration
>>> model = RobertaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "roberta"
...
@@ -100,16 +100,16 @@ class TransfoXLConfig(PretrainedConfig):
Example::
>>> from transformers import TransfoXLConfig, TransfoXLModel
>>> # Initializing a Transformer XL configuration
>>> configuration = TransfoXLConfig()
>>> # Initializing a model from the configuration
>>> model = TransfoXLModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "transfo-xl"
...
@@ -142,16 +142,16 @@ class XLMConfig(PretrainedConfig):
Example::
>>> from transformers import XLMConfig, XLMModel
>>> # Initializing a XLM configuration
>>> configuration = XLMConfig()
>>> # Initializing a model from the configuration
>>> model = XLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlm"
...
@@ -113,16 +113,16 @@ class XLNetConfig(PretrainedConfig):
Example::
>>> from transformers import XLNetConfig, XLNetModel
>>> # Initializing a XLNet configuration
>>> configuration = XLNetConfig()
>>> # Initializing a model from the configuration
>>> model = XLNetModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlnet"
...
@@ -488,11 +488,11 @@ class SquadProcessor(DataProcessor):
Examples::
>>> import tensorflow_datasets as tfds
>>> dataset = tfds.load("squad")
>>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
>>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
"""
if evaluate:
...
@@ -186,6 +186,263 @@ def add_end_docstrings(*docstr):
return docstring_decorator
PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels)
>>> loss, scores = outputs[:2]
"""
PT_QUESTION_ANSWERING_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
>>> loss, start_scores, end_scores = outputs[:3]
"""
PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels)
>>> loss, logits = outputs[:2]
"""
PT_MASKED_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
>>> outputs = model(input_ids, labels=input_ids)
>>> loss, prediction_scores = outputs[:2]
"""
PT_BASE_MODEL_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
PT_MULTIPLE_CHOICE_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', pad_to_max_length=True)
>>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> loss, logits = outputs[:2]
"""
PT_CAUSAL_LM_SAMPLE = r"""
Example::
>>> import torch
>>> from transformers import {tokenizer_class}, {model_class}
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs, labels=inputs["input_ids"])
>>> loss, logits = outputs[:2]
"""
TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> input_ids = inputs["input_ids"]
>>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
>>> outputs = model(inputs)
>>> loss, scores = outputs[:2]
"""
TF_QUESTION_ANSWERING_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> input_dict = tokenizer(question, text, return_tensors='tf')
>>> start_scores, end_scores = model(input_dict)
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
>>> answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
>>> outputs = model(inputs)
>>> loss, logits = outputs[:2]
"""
TF_MASKED_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores = outputs[0]
"""
TF_BASE_MODEL_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
TF_MULTIPLE_CHOICE_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', pad_to_max_length=True)
>>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
>>> outputs = model(inputs) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> logits = outputs[0]
"""
TF_CAUSAL_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> logits = outputs[0]
"""
def add_code_sample_docstrings(*docstr, tokenizer_class=None, checkpoint=None):
def docstring_decorator(fn):
model_class = fn.__qualname__.split(".")[0]
is_tf_class = model_class[:2] == "TF"
if "SequenceClassification" in model_class:
code_sample = TF_SEQUENCE_CLASSIFICATION_SAMPLE if is_tf_class else PT_SEQUENCE_CLASSIFICATION_SAMPLE
elif "QuestionAnswering" in model_class:
code_sample = TF_QUESTION_ANSWERING_SAMPLE if is_tf_class else PT_QUESTION_ANSWERING_SAMPLE
elif "TokenClassification" in model_class:
code_sample = TF_TOKEN_CLASSIFICATION_SAMPLE if is_tf_class else PT_TOKEN_CLASSIFICATION_SAMPLE
elif "MultipleChoice" in model_class:
code_sample = TF_MULTIPLE_CHOICE_SAMPLE if is_tf_class else PT_MULTIPLE_CHOICE_SAMPLE
elif "MaskedLM" in model_class:
code_sample = TF_MASKED_LM_SAMPLE if is_tf_class else PT_MASKED_LM_SAMPLE
elif "LMHead" in model_class:
code_sample = TF_CAUSAL_LM_SAMPLE if is_tf_class else PT_CAUSAL_LM_SAMPLE
elif "Model" in model_class:
code_sample = TF_BASE_MODEL_SAMPLE if is_tf_class else PT_BASE_MODEL_SAMPLE
else:
raise ValueError(f"Docstring can't be built for model {model_class}")
built_doc = code_sample.format(model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint)
fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + built_doc
return fn
return docstring_decorator
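As a reading aid for the decorator above: it picks one of the PT_*/TF_* templates based on the decorated class's name, then fills the `{model_class}`, `{tokenizer_class}` and `{checkpoint}` placeholders via `str.format` (which is also why literal dict braces in the multiple-choice samples are escaped as `{{...}}`). A minimal sketch of that rendering step, with a class and checkpoint picked purely for illustration and assuming the constants above are in scope:

# Illustration only: render the PyTorch base-model template by hand.
rendered = PT_BASE_MODEL_SAMPLE.format(
    model_class="AlbertModel",          # normally derived from fn.__qualname__
    tokenizer_class="AlbertTokenizer",  # passed to add_code_sample_docstrings
    checkpoint="albert-base-v2",        # passed to add_code_sample_docstrings
)
print(rendered)  # a ready-to-run, doctest-style code sample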
def is_remote_url(url_or_filename):
parsed = urlparse(url_or_filename)
return parsed.scheme in ("http", "https")
...
@@ -24,13 +24,15 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from .configuration_albert import AlbertConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "AlbertTokenizer"
ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"albert-base-v1",
@@ -485,6 +487,7 @@ class AlbertModel(AlbertPreTrainedModel):
self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -521,18 +524,6 @@ class AlbertModel(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example::
from transformers import AlbertModel, AlbertTokenizer
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
...@@ -657,16 +648,16 @@ class AlbertForPreTraining(AlbertPreTrainedModel): ...@@ -657,16 +648,16 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
Examples:: Examples::
from transformers import AlbertTokenizer, AlbertForPreTraining >>> from transformers import AlbertTokenizer, AlbertForPreTraining
import torch >>> import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForPreTraining.from_pretrained('albert-base-v2') >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids) >>> outputs = model(input_ids)
prediction_scores, sop_scores = outputs[:2] >>> prediction_scores, sop_scores = outputs[:2]
""" """
...@@ -763,6 +754,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): ...@@ -763,6 +754,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
return self.predictions.decoder return self.predictions.decoder
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -802,18 +794,6 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example::
from transformers import AlbertTokenizer, AlbertForMaskedLM
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
warnings.warn( warnings.warn(
...@@ -863,6 +843,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): ...@@ -863,6 +843,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -899,19 +880,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
outputs = self.albert( outputs = self.albert(
...@@ -962,6 +930,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel): ...@@ -962,6 +930,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -996,21 +965,6 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForTokenClassification
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForTokenClassification.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
outputs = self.albert( outputs = self.albert(
...@@ -1062,6 +1016,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): ...@@ -1062,6 +1016,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -1104,21 +1059,6 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='pt')
start_scores, end_scores = model(**input_dict)
""" """
outputs = self.albert( outputs = self.albert(
...@@ -1176,6 +1116,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel): ...@@ -1176,6 +1116,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -1213,25 +1154,6 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForMultipleChoice
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForMultipleChoice.from_pretrained('albert-base-v2')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
""" """
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
@@ -392,8 +392,8 @@ class AutoModel:
Examples::
>>> config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
>>> model = AutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
"""
for config_class, model_class in MODEL_MAPPING.items():
if isinstance(config, config_class):
@@ -480,8 +480,7 @@ class AutoModel:
Examples::
model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
assert model.config.output_attentions == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
@@ -547,8 +546,8 @@ class AutoModelForPreTraining:
Examples::
>>> config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
>>> model = AutoModelForPreTraining.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
"""
for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items():
if isinstance(config, config_class):
...
@@ -27,12 +27,19 @@ from torch.nn import CrossEntropyLoss
from .activations import ACT2FN
from .configuration_bart import BartConfig
from .file_utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_utils import PreTrainedModel
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BartTokenizer"
BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/bart-large",
@@ -56,14 +63,17 @@ BART_START_DOCSTRING = r"""
"""
BART_GENERATION_EXAMPLE = r"""
Summarization example::
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
# see ``examples/summarization/bart/run_eval.py`` for a longer example
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
# Generate Summary
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
@@ -807,6 +817,7 @@ class BartModel(PretrainedBartModel):
self.init_weights()
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
@@ -883,8 +894,7 @@ class BartModel(PretrainedBartModel):
@add_start_docstrings(
"The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
)
class BartForConditionalGeneration(PretrainedBartModel):
base_model_prefix = "model"
@@ -911,6 +921,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
self.register_buffer("final_logits_bias", new_bias)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_end_docstrings(BART_GENERATION_EXAMPLE)
def forward(
self,
input_ids,
@@ -951,18 +962,21 @@ class BartForConditionalGeneration(PretrainedBartModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Conditional generation example::
# Mask filling only works for bart-large
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
TXT = "My friends are <mask> but they eat too many carbs."
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
logits = model(input_ids)[0]
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
tokenizer.decode(predictions).split()
# ['good', 'great', 'all', 'really', 'very']
"""
@@ -1068,6 +1082,7 @@ class BartForSequenceClassification(PretrainedBartModel):
self.model._init_weights(self.classification_head.out_proj)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
@@ -1092,28 +1107,15 @@ class BartForSequenceClassification(PretrainedBartModel):
Classification loss (cross entropy)
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
from transformers import BartTokenizer, BartForSequenceClassification
import torch
tokenizer = BartTokenizer.from_pretrained('bart-large')
model = BartForSequenceClassification.from_pretrained('bart-large')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute",
add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
if labels is not None: if labels is not None:
use_cache = False use_cache = False
...@@ -1161,6 +1163,7 @@ class BartForQuestionAnswering(PretrainedBartModel): ...@@ -1161,6 +1163,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
self.model._init_weights(self.qa_outputs) self.model._init_weights(self.qa_outputs)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
@@ -1200,25 +1203,6 @@ class BartForQuestionAnswering(PretrainedBartModel):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint bart-large is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
from transformers import BartTokenizer, BartForQuestionAnswering
import torch
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForQuestionAnswering.from_pretrained('facebook/bart-large')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_ids = tokenizer.encode(question, text)
start_scores, end_scores = model(torch.tensor([input_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
""" """
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
use_cache = False use_cache = False
...@@ -1259,7 +1243,7 @@ class BartForQuestionAnswering(PretrainedBartModel): ...@@ -1259,7 +1243,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
total_loss = (start_loss + end_loss) / 2 total_loss = (start_loss + end_loss) / 2
outputs = (total_loss,) + outputs outputs = (total_loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) return outputs # return outputs # (loss), start_logits, end_logits, encoder_outputs, (hidden_states), (attentions)
class SinusoidalPositionalEmbedding(nn.Embedding): class SinusoidalPositionalEmbedding(nn.Embedding):
......
@@ -28,12 +28,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
from .activations import gelu, gelu_new, swish
from .configuration_bert import BertConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BertTokenizer"
BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bert-base-uncased",
"bert-large-uncased",
@@ -664,6 +666,7 @@ class BertModel(BertPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -702,20 +705,6 @@ class BertModel(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertModel, BertTokenizer
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
...@@ -851,16 +840,16 @@ class BertForPreTraining(BertPreTrainedModel): ...@@ -851,16 +840,16 @@ class BertForPreTraining(BertPreTrainedModel):
Examples:: Examples::
from transformers import BertTokenizer, BertForPreTraining >>> from transformers import BertTokenizer, BertForPreTraining
import torch >>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased') >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(input_ids) >>> outputs = model(**inputs)
prediction_scores, seq_relationship_scores = outputs[:2] >>> prediction_scores, seq_relationship_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
@@ -958,19 +947,20 @@ class BertLMHeadModel(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example::
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
>>> import torch
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
>>> config = BertConfig.from_pretrained("bert-base-cased")
>>> config.is_decoder = True
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.bert(
@@ -1028,6 +1018,7 @@ class BertForMaskedLM(BertPreTrainedModel):
return self.cls.predictions.decoder
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -1069,20 +1060,6 @@ class BertForMaskedLM(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForMaskedLM
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
warnings.warn( warnings.warn(
...@@ -1185,18 +1162,18 @@ class BertForNextSentencePrediction(BertPreTrainedModel): ...@@ -1185,18 +1162,18 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
Examples:: Examples::
from transformers import BertTokenizer, BertForNextSentencePrediction >>> from transformers import BertTokenizer, BertForNextSentencePrediction
import torch >>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt') >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1])) >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
assert logits[0, 0] < logits[0, 1] # next sentence was random >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
""" """
outputs = self.bert(
@@ -1240,6 +1217,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -1276,21 +1254,6 @@ class BertForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
outputs = self.bert( outputs = self.bert(
...@@ -1340,6 +1303,7 @@ class BertForMultipleChoice(BertPreTrainedModel): ...@@ -1340,6 +1303,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1377,25 +1341,6 @@ class BertForMultipleChoice(BertPreTrainedModel): ...@@ -1377,25 +1341,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import BertTokenizer, BertForMultipleChoice
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
""" """
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
...@@ -1453,6 +1398,7 @@ class BertForTokenClassification(BertPreTrainedModel): ...@@ -1453,6 +1398,7 @@ class BertForTokenClassification(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1487,21 +1433,6 @@ class BertForTokenClassification(BertPreTrainedModel): ...@@ -1487,21 +1433,6 @@ class BertForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import BertTokenizer, BertForTokenClassification
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
outputs = self.bert( outputs = self.bert(
...@@ -1554,6 +1485,7 @@ class BertForQuestionAnswering(BertPreTrainedModel): ...@@ -1554,6 +1485,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1596,25 +1528,6 @@ class BertForQuestionAnswering(BertPreTrainedModel): ...@@ -1596,25 +1528,6 @@ class BertForQuestionAnswering(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import BertTokenizer, BertForQuestionAnswering
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text)
input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
assert answer == "a nice puppet"
""" """
outputs = self.bert( outputs = self.bert(
......
@@ -31,6 +31,8 @@ from .modeling_roberta import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CamembertTokenizer"
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"camembert-base",
"Musixmatch/umberto-commoncrawl-cased-v1",
...
@@ -24,12 +24,14 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss
from .configuration_ctrl import CTRLConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import Conv1D, PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CTRLTokenizer"
CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ctrl"
# See all CTRL models at https://huggingface.co/models?filter=ctrl
@@ -326,6 +328,7 @@ class CTRLModel(CTRLPreTrainedModel):
self.h[layer].multi_head_attention.prune_heads(heads)
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def forward(
self,
input_ids=None,
@@ -358,20 +361,6 @@ class CTRLModel(CTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import CTRLTokenizer, CTRLModel
import torch
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
use_cache = use_cache if use_cache is not None else self.config.use_cache use_cache = use_cache if use_cache is not None else self.config.use_cache
...@@ -510,6 +499,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): ...@@ -510,6 +499,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]} return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -552,19 +542,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): ...@@ -552,19 +542,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLLMHeadModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
""" """
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
......
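This hunk shows the pattern the commit applies to every modeling file: the hand-written usage example is removed from each ``forward`` docstring, a module-level ``_TOKENIZER_FOR_DOC`` constant is added, and the method is decorated with ``add_code_sample_docstrings``, which appends a checkpoint-specific, doctest-style sample instead. The real helper lives in ``file_utils`` and is not shown in this diff; purely as a hypothetical sketch of the idea::

    def add_code_sample_docstrings(*, tokenizer_class, checkpoint):
        # Hypothetical sketch only -- the actual transformers.file_utils helper is richer.
        sample = """
        Example::

            >>> from transformers import {tokenizer_class}, {model_class}
            >>> import torch

            >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
            >>> model = {model_class}.from_pretrained('{checkpoint}')

            >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors='pt')
            >>> outputs = model(inputs)
        """

        def decorator(fn):
            model_class = fn.__qualname__.split(".")[0]  # e.g. "CTRLModel" for CTRLModel.forward
            fn.__doc__ = (fn.__doc__ or "") + sample.format(
                tokenizer_class=tokenizer_class, model_class=model_class, checkpoint=checkpoint
            )
            return fn

        return decorator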
...@@ -30,12 +30,13 @@ from torch.nn import CrossEntropyLoss ...@@ -30,12 +30,13 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu from .activations import gelu
from .configuration_distilbert import DistilBertConfig from .configuration_distilbert import DistilBertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "DistilBertTokenizer"
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"distilbert-base-uncased", "distilbert-base-uncased",
...@@ -409,6 +410,7 @@ class DistilBertModel(DistilBertPreTrainedModel): ...@@ -409,6 +410,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
self.transformer.layer[layer].attention.prune_heads(heads) self.transformer.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -434,20 +436,6 @@ class DistilBertModel(DistilBertPreTrainedModel): ...@@ -434,20 +436,6 @@ class DistilBertModel(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertModel
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertModel.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
...@@ -506,6 +494,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): ...@@ -506,6 +494,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
return self.vocab_projector return self.vocab_projector
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -544,17 +533,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): ...@@ -544,17 +533,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
warnings.warn( warnings.warn(
...@@ -604,6 +582,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): ...@@ -604,6 +582,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -639,18 +618,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): ...@@ -639,18 +618,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
distilbert_output = self.distilbert( distilbert_output = self.distilbert(
input_ids=input_ids, input_ids=input_ids,
...@@ -697,6 +664,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): ...@@ -697,6 +664,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -737,20 +705,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): ...@@ -737,20 +705,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss, start_scores, end_scores = outputs[:3]
""" """
distilbert_output = self.distilbert( distilbert_output = self.distilbert(
input_ids=input_ids, input_ids=input_ids,
...@@ -806,6 +760,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): ...@@ -806,6 +760,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -838,19 +793,6 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): ...@@ -838,19 +793,6 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForTokenClassification
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
outputs = self.distilbert( outputs = self.distilbert(
...@@ -940,22 +882,23 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel): ...@@ -940,22 +882,23 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
Examples:: Examples::
from transformers import DistilBertTokenizer, DistilBertForMultipleChoice >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
import torch >>> import torch
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
>>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased') >>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
choice0 = "It is eaten with a fork and a knife." >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True) >>> # the linear classifier still needs to be trained
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 >>> loss, logits = outputs[:2]
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
""" """
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
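Rewriting the samples in ``>>>`` doctest syntax is what makes them mechanically checkable. The repository's actual test harness is not part of this hunk, but the standard-library ``doctest`` module is already enough to execute the examples found in an object's docstring; a hedged sketch::

    import doctest

    def run_docstring_samples(obj, globs=None):
        # Collect the >>> examples from obj's docstring and execute them,
        # returning (number of failed examples, number of examples tried).
        finder = doctest.DocTestFinder(verbose=False, recurse=False)
        runner = doctest.DocTestRunner(verbose=False)
        for test in finder.find(obj, globs=globs or {}):
            runner.run(test)
        return runner.failures, runner.tries

For instance, ``run_docstring_samples(DistilBertForMultipleChoice.forward)`` would execute the multiple-choice sample shown above (and download the checkpoint it references).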
...@@ -8,13 +8,14 @@ from torch.nn import CrossEntropyLoss, MSELoss ...@@ -8,13 +8,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
from .activations import get_activation from .activations import get_activation
from .configuration_electra import ElectraConfig from .configuration_electra import ElectraConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertEncoder, BertLayerNorm, BertPreTrainedModel from .modeling_bert import BertEmbeddings, BertEncoder, BertLayerNorm, BertPreTrainedModel
from .modeling_utils import SequenceSummary from .modeling_utils import SequenceSummary
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ElectraTokenizer"
ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator", "google/electra-small-generator",
...@@ -264,6 +265,7 @@ class ElectraModel(ElectraPreTrainedModel): ...@@ -264,6 +265,7 @@ class ElectraModel(ElectraPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads) self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -291,20 +293,6 @@ class ElectraModel(ElectraPreTrainedModel): ...@@ -291,20 +293,6 @@ class ElectraModel(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import ElectraModel, ElectraTokenizer
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraModel.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
...@@ -383,6 +371,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel): ...@@ -383,6 +371,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -419,21 +408,6 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel): ...@@ -419,21 +408,6 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = ElectraTokenizer.from_pretrained('bert-base-uncased')
model = ElectraForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
discriminator_hidden_states = self.electra( discriminator_hidden_states = self.electra(
input_ids, input_ids,
...@@ -521,16 +495,14 @@ class ElectraForPreTraining(ElectraPreTrainedModel): ...@@ -521,16 +495,14 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
Examples:: Examples::
from transformers import ElectraTokenizer, ElectraForPreTraining >>> from transformers import ElectraTokenizer, ElectraForPreTraining
import torch >>> import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator') >>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids) >>> scores = model(input_ids)[0]
prediction_scores, seq_relationship_scores = outputs[:2]
""" """
...@@ -589,6 +561,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): ...@@ -589,6 +561,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
return self.generator_lm_head return self.generator_lm_head
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -628,20 +601,6 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): ...@@ -628,20 +601,6 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import ElectraTokenizer, ElectraForMaskedLM
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
warnings.warn( warnings.warn(
...@@ -696,6 +655,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel): ...@@ -696,6 +655,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -730,21 +690,6 @@ class ElectraForTokenClassification(ElectraPreTrainedModel): ...@@ -730,21 +690,6 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import ElectraTokenizer, ElectraForTokenClassification
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
discriminator_hidden_states = self.electra( discriminator_hidden_states = self.electra(
...@@ -802,6 +747,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel): ...@@ -802,6 +747,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -844,23 +790,6 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel): ...@@ -844,23 +790,6 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import ElectraTokenizer, ElectraForQuestionAnswering
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraForQuestionAnswering.from_pretrained('google/electra-base-discriminator')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
input_ids, token_type_ids = encoding['input_ids'], encoding['token_type_ids']
start_scores, end_scores = model(input_ids, token_type_ids=token_type_ids)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0))
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
""" """
discriminator_hidden_states = self.electra( discriminator_hidden_states = self.electra(
...@@ -918,6 +847,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel): ...@@ -918,6 +847,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -954,25 +884,6 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel): ...@@ -954,25 +884,6 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import ElectraTokenizer, ElectraForMultipleChoice
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraForMultipleChoice.from_pretrained('google/electra-base-discriminator')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0) # choice0 is correct (according to Wikipedia ;))
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
""" """
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
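The updated ``ElectraForPreTraining`` sample keeps only the first output, which the discriminator head produces as one replaced-token logit per position (an assumption here: shape ``(batch_size, sequence_length)``). A hedged sketch turning those logits into per-token predictions::

    import torch
    from transformers import ElectraTokenizer, ElectraForPreTraining

    tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
    model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')

    input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    scores = model(input_ids)[0]                        # assumed: per-token logits, shape (batch_size, sequence_length)
    predictions = (torch.sigmoid(scores) > 0.5).long()  # 1 = token flagged as replaced by the discriminator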
...@@ -126,9 +126,8 @@ class EncoderDecoderModel(PreTrainedModel): ...@@ -126,9 +126,8 @@ class EncoderDecoderModel(PreTrainedModel):
Examples:: Examples::
from transformers import EncoderDecoder >>> from transformers import EncoderDecoderModel
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
model = EncoderDecoder.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
""" """
kwargs_encoder = { kwargs_encoder = {
...@@ -244,21 +243,21 @@ class EncoderDecoderModel(PreTrainedModel): ...@@ -244,21 +243,21 @@ class EncoderDecoderModel(PreTrainedModel):
Examples:: Examples::
from transformers import EncoderDecoderModel, BertTokenizer >>> from transformers import EncoderDecoderModel, BertTokenizer
import torch >>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
# forward >>> # forward
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids) >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
# training >>> # training
loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids)[:2] >>> loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)[:2]
# generation >>> # generation
generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id) >>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
""" """
......
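The generation call in the sample above returns token ids; decoding them with the same tokenizer turns them back into text. A one-line continuation of that sample (``generated`` and ``tokenizer`` come from the snippet above)::

    text = tokenizer.decode(generated[0], skip_special_tokens=True)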
...@@ -22,7 +22,7 @@ import torch ...@@ -22,7 +22,7 @@ import torch
from torch.nn import functional as F from torch.nn import functional as F
from .configuration_flaubert import FlaubertConfig from .configuration_flaubert import FlaubertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_xlm import ( from .modeling_xlm import (
XLMForQuestionAnswering, XLMForQuestionAnswering,
XLMForQuestionAnsweringSimple, XLMForQuestionAnsweringSimple,
...@@ -35,6 +35,8 @@ from .modeling_xlm import ( ...@@ -35,6 +35,8 @@ from .modeling_xlm import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "FlaubertTokenizer"
FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"flaubert/flaubert_small_cased", "flaubert/flaubert_small_cased",
"flaubert/flaubert_base_uncased", "flaubert/flaubert_base_uncased",
...@@ -119,6 +121,7 @@ class FlaubertModel(XLMModel): ...@@ -119,6 +121,7 @@ class FlaubertModel(XLMModel):
self.pre_norm = getattr(config, "pre_norm", False) self.pre_norm = getattr(config, "pre_norm", False)
@add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="flaubert/flaubert_base_cased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -149,18 +152,6 @@ class FlaubertModel(XLMModel): ...@@ -149,18 +152,6 @@ class FlaubertModel(XLMModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import FlaubertTokenizer, FlaubertModel
import torch
tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
model = FlaubertModel.from_pretrained('flaubert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
......
...@@ -26,7 +26,7 @@ from torch.nn import CrossEntropyLoss ...@@ -26,7 +26,7 @@ from torch.nn import CrossEntropyLoss
from .activations import ACT2FN from .activations import ACT2FN
from .configuration_gpt2 import GPT2Config from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import ( from .modeling_utils import (
Conv1D, Conv1D,
PreTrainedModel, PreTrainedModel,
...@@ -38,6 +38,8 @@ from .modeling_utils import ( ...@@ -38,6 +38,8 @@ from .modeling_utils import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"gpt2", "gpt2",
"gpt2-medium", "gpt2-medium",
...@@ -370,6 +372,7 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -370,6 +372,7 @@ class GPT2Model(GPT2PreTrainedModel):
self.h[layer].attn.prune_heads(heads) self.h[layer].attn.prune_heads(heads)
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -403,18 +406,6 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -403,18 +406,6 @@ class GPT2Model(GPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import GPT2Tokenizer, GPT2Model
import torch
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
...@@ -553,6 +544,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -553,6 +544,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]} return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -595,19 +587,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -595,19 +587,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
""" """
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
...@@ -721,26 +700,26 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): ...@@ -721,26 +700,26 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
Examples:: Examples::
import torch >>> import torch
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') >>> # Add a [CLS] to the vocabulary (we should train it also!)
model = GPT2DoubleHeadsModel.from_pretrained('gpt2') >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
# Add a [CLS] to the vocabulary (we should train it also!) >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices] >>> encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
outputs = model(input_ids, mc_token_ids=mc_token_ids) >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2] >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
if "lm_labels" in kwargs: if "lm_labels" in kwargs:
......
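In the ``GPT2DoubleHeadsModel`` sample, the multiple-choice head scores each candidate continuation, so the model's pick is the argmax over the choice dimension. A continuation of the sample above, assuming ``mc_prediction_scores`` has shape ``(batch_size, num_choices)``::

    predicted_choice = mc_prediction_scores.argmax(dim=-1)  # index of the preferred choice for each batch item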
...@@ -24,13 +24,15 @@ from torch.nn import CrossEntropyLoss, MSELoss ...@@ -24,13 +24,15 @@ from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn import functional as F from torch.nn import functional as F
from .configuration_longformer import LongformerConfig from .configuration_longformer import LongformerConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertPreTrainedModel from .modeling_bert import BertPreTrainedModel
from .modeling_roberta import RobertaLMHead, RobertaModel from .modeling_roberta import RobertaLMHead, RobertaModel
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "LongformerTokenizer"
LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"allenai/longformer-base-4096", "allenai/longformer-base-4096",
"allenai/longformer-large-4096", "allenai/longformer-large-4096",
...@@ -609,22 +611,22 @@ class LongformerModel(RobertaModel): ...@@ -609,22 +611,22 @@ class LongformerModel(RobertaModel):
Examples:: Examples::
import torch >>> import torch
from transformers import LongformerModel, LongformerTokenizer >>> from transformers import LongformerModel, LongformerTokenizer
model = LongformerModel.from_pretrained('allenai/longformer-base-4096') >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
# Attention mask values -- 0: no attention, 1: local attention, 2: global attention >>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example, >>> attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example,
# classification: the <s> token ... # classification: the <s> token
# QA: question tokens ... # QA: question tokens
# LM: potentially on the beginning of sentences and paragraphs ... # LM: potentially on the beginning of sentences and paragraphs
sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask) >>> sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
...@@ -743,18 +745,18 @@ class LongformerForMaskedLM(BertPreTrainedModel): ...@@ -743,18 +745,18 @@ class LongformerForMaskedLM(BertPreTrainedModel):
Examples:: Examples::
import torch >>> import torch
from transformers import LongformerForMaskedLM, LongformerTokenizer >>> from transformers import LongformerForMaskedLM, LongformerTokenizer
model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096') >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM >>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
# check ``LongformerModel.forward`` for more details how to set `attention_mask` ... # check ``LongformerModel.forward`` for more details how to set `attention_mask`
loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids) >>> loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
...@@ -807,6 +809,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel): ...@@ -807,6 +809,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -843,19 +846,6 @@ class LongformerForSequenceClassification(BertPreTrainedModel): ...@@ -843,19 +846,6 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import LongformerTokenizer, LongformerForSequenceClassification
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
if global_attention_mask is None: if global_attention_mask is None:
...@@ -973,25 +963,25 @@ class LongformerForQuestionAnswering(BertPreTrainedModel): ...@@ -973,25 +963,25 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
Examples:: Examples::
from transformers import LongformerTokenizer, LongformerForQuestionAnswering >>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
import torch >>> import torch
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors="pt") >>> encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
input_ids = encoding["input_ids"] >>> input_ids = encoding["input_ids"]
# default is local attention everywhere >>> # default is local attention everywhere
# the forward method will automatically set global attention on question tokens >>> # the forward method will automatically set global attention on question tokens
attention_mask = encoding["attention_mask"] >>> attention_mask = encoding["attention_mask"]
start_scores, end_scores = model(input_ids, attention_mask=attention_mask) >>> start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist()) >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1] >>> answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
""" """
...@@ -1060,6 +1050,7 @@ class LongformerForTokenClassification(BertPreTrainedModel): ...@@ -1060,6 +1050,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1094,19 +1085,6 @@ class LongformerForTokenClassification(BertPreTrainedModel): ...@@ -1094,19 +1085,6 @@ class LongformerForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import LongformerTokenizer, LongformerForTokenClassification
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForTokenClassification.from_pretrained('allenai/longformer-base-4096')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
outputs = self.longformer( outputs = self.longformer(
...@@ -1163,6 +1141,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel): ...@@ -1163,6 +1141,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1200,23 +1179,6 @@ class LongformerForMultipleChoice(BertPreTrainedModel): ...@@ -1200,23 +1179,6 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import LongformerTokenizer, LongformerForMultipleChoice
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForMultipleChoice.from_pretrained('allenai/longformer-base-4096')
# context = "The dog is cute" | choice = "the dog" / "the cat"
choices = [("The dog is cute", "the dog"), ("The dog is cute", "the cat")]
input_ids = torch.tensor([tokenizer.encode(s[0], s[1], add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
# global attention is automatically put on "the dog" and "the cat"
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
""" """
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
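The ``LongformerModel`` sample above documents the attention-mask convention (0: no attention, 1: local attention, 2: global attention) and notes that classification typically puts global attention on the ``<s>`` token. A compact sketch of exactly that setup, reusing the calls shown above::

    import torch
    from transformers import LongformerModel, LongformerTokenizer

    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

    input_ids = torch.tensor(tokenizer.encode("Hello world!")).unsqueeze(0)  # batch of size 1
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)           # 1 = local attention everywhere
    attention_mask[:, 0] = 2                                                 # 2 = global attention on the first (<s>) token
    sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)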
...@@ -31,18 +31,18 @@ class MarianMTModel(BartForConditionalGeneration): ...@@ -31,18 +31,18 @@ class MarianMTModel(BartForConditionalGeneration):
Examples:: Examples::
from transformers import MarianTokenizer, MarianMTModel >>> from transformers import MarianTokenizer, MarianMTModel
from typing import List >>> from typing import List
src = 'fr' # source language >>> src = 'fr' # source language
trg = 'en' # target language >>> trg = 'en' # target language
sample_text = "où est l'arrêt de bus ?" >>> sample_text = "où est l'arrêt de bus ?"
mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' >>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
model = MarianMTModel.from_pretrained(mname) >>> model = MarianMTModel.from_pretrained(mname)
tok = MarianTokenizer.from_pretrained(mname) >>> tok = MarianTokenizer.from_pretrained(mname)
batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference >>> batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference
gen = model.generate(**batch) # for forward pass: model(**batch) >>> gen = model.generate(**batch) # for forward pass: model(**batch)
words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?" >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?"
""" """
......
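The same three calls (``prepare_translation_batch``, ``generate``, ``batch_decode``) also handle several source sentences at once; a short continuation of the Marian sample above (``tok`` and ``model`` come from that snippet)::

    src_texts = ["où est l'arrêt de bus ?", "merci beaucoup !"]
    batch = tok.prepare_translation_batch(src_texts=src_texts)
    translations = tok.batch_decode(model.generate(**batch), skip_special_tokens=True)  # one English sentence per input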