Unverified Commit 364a5ae1 authored by Lysandre Debut, committed by GitHub

Refactor Code samples; Test code samples (#5036)



* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rest of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 315f464b
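For context, the core of this refactor replaces the hand-written `Examples::` blocks in each model's forward docstring with the shared `add_code_sample_docstrings` decorator from `file_utils`, and rewrites the examples that stay inline in doctest style (`>>>`) so the new test suite can execute them. A rough, hypothetical sketch of what such a decorator does — not the actual implementation in `src/transformers/file_utils.py` — is:

```python
# Hypothetical sketch only -- NOT the real add_code_sample_docstrings from
# src/transformers/file_utils.py. It illustrates the idea: append a generated,
# doctest-style usage example to forward()'s docstring, parameterized by the
# tokenizer class name and the checkpoint to load.
def add_code_sample_docstrings(*, tokenizer_class, checkpoint):
    sample = f"""
    Example::

        >>> from transformers import {tokenizer_class}
        >>> import torch

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> # ... instantiate the matching model, build inputs, run forward ...
    """

    def decorator(fn):
        # Keep whatever docstring the method already has and append the sample.
        fn.__doc__ = (fn.__doc__ or "") + sample
        return fn

    return decorator
```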
......@@ -28,7 +28,7 @@ from torch.nn import functional as F
from .activations import gelu
from .configuration_xlm import XLMConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import (
PreTrainedModel,
SequenceSummary,
......@@ -40,6 +40,8 @@ from .modeling_utils import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLMTokenizer"
XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlm-mlm-en-2048",
"xlm-mlm-ende-1024",
......@@ -395,6 +397,7 @@ class XLMModel(XLMPreTrainedModel):
self.attentions[layer].prune_heads(heads)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
......@@ -425,18 +428,6 @@ class XLMModel(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMModel
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMModel.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -632,6 +623,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
return {"input_ids": input_ids, "langs": langs}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
......@@ -672,18 +664,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMWithLMHeadModel
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
transformer_outputs = self.transformer(
input_ids,
......@@ -722,6 +702,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
......@@ -761,19 +742,6 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMForSequenceClassification
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
......@@ -822,6 +790,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
......@@ -867,20 +836,6 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMForQuestionAnsweringSimple
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
"""
transformer_outputs = self.transformer(
input_ids,
......@@ -1006,19 +961,20 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Example::
from transformers import XLMTokenizer, XLMForQuestionAnswering
import torch
>>> from transformers import XLMTokenizer, XLMForQuestionAnswering
>>> import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs[0]
"""
transformer_outputs = self.transformer(
input_ids,
......@@ -1067,6 +1023,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
......@@ -1074,6 +1031,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
langs=None,
token_type_ids=None,
position_ids=None,
lengths=None,
cache=None,
head_mask=None,
labels=None,
output_attentions=None,
......@@ -1101,19 +1060,6 @@ class XLMForTokenClassification(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMForTokenClassification
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
model = XLMForTokenClassification.from_pretrained('xlm-mlm-100-1280')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.transformer(
input_ids,
......@@ -1121,6 +1067,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
......
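The generated sample for `XLMModel.forward` is equivalent to the inline example removed above; written out as a standalone script (downloading the `xlm-mlm-en-2048` checkpoint is assumed), it amounts to:

```python
# Standalone equivalent of the removed XLMModel docstring example; requires
# downloading the xlm-mlm-en-2048 checkpoint.
import torch
from transformers import XLMModel, XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = XLMModel.from_pretrained("xlm-mlm-en-2048")

# Batch size 1
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
outputs = model(input_ids)
last_hidden_states = outputs[0]  # the last hidden state is the first element of the output tuple
```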
......@@ -26,12 +26,14 @@ from torch.nn import functional as F
from .activations import gelu_new, swish
from .configuration_xlnet import XLNetConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLNetTokenizer"
XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlnet-base-cased",
"xlnet-large-cased",
......@@ -749,6 +751,7 @@ class XLNetModel(XLNetPreTrainedModel):
return pos_emb
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
......@@ -785,20 +788,6 @@ class XLNetModel(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetModel
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetModel.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -1164,6 +1153,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
......@@ -1208,20 +1198,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForSequenceClassification
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
......@@ -1273,6 +1249,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
......@@ -1316,21 +1293,6 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForTokenClassification
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
scores = outputs[0]
"""
outputs = self.transformer(
......@@ -1386,6 +1348,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
......@@ -1431,22 +1394,6 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForMultipleChoice
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......@@ -1508,6 +1455,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
......@@ -1558,22 +1506,6 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForQuestionAnsweringSimple
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
"""
outputs = self.transformer(
......@@ -1705,20 +1637,20 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Example::
from transformers import XLNetTokenizer, XLNetForQuestionAnswering
import torch
>>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
>>> import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs[0]
"""
transformer_outputs = self.transformer(
input_ids,
......
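The classification heads follow the same pattern; the removed `XLNetForSequenceClassification` example above, rewritten as a standalone script (checkpoint download assumed), is:

```python
# Standalone version of the removed XLNetForSequenceClassification example;
# requires downloading the xlnet-large-cased checkpoint.
import torch
from transformers import XLNetForSequenceClassification, XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
model = XLNetForSequenceClassification.from_pretrained("xlnet-large-cased")

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
```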
......@@ -66,13 +66,15 @@ class MBartTokenizer(XLMRobertaTokenizer):
The tokenization method is <tokens> <eos> <language code>. There is no BOS token.
Examples::
from transformers import MBartTokenizer
tokenizer = MBartTokenizer.from_pretrained('mbart-large-en-ro')
example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
batch: dict = tokenizer.prepare_translation_batch(
example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
)
>>> from transformers import MBartTokenizer
>>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro')
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> batch: dict = tokenizer.prepare_translation_batch(
... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
... )
"""
vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
......
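A hedged follow-up to the MBart docstring above: since the layout is `<tokens> <eos> <language code>` with no BOS token, the encoded source sequence should end with `</s>` followed by the source language code. A small check — assuming the `facebook/mbart-large-en-ro` checkpoint and that `prepare_translation_batch` returns PyTorch tensors by default — might look like:

```python
# Hedged check of the "<tokens> <eos> <language code>" layout described above;
# assumes prepare_translation_batch returns PyTorch tensors by default.
from transformers import MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
batch = tokenizer.prepare_translation_batch(
    ["UN Chief Says There Is No Military Solution in Syria"], src_lang="en_XX"
)
last_tokens = tokenizer.convert_ids_to_tokens(batch["input_ids"][0].tolist())[-2:]
print(last_tokens)  # expected: ['</s>', 'en_XX']
```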
......@@ -25,13 +25,13 @@ class MarianTokenizer(PreTrainedTokenizer):
Examples::
from transformers import MarianTokenizer
tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
# keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask].
# model(**batch) should work
>>> from transformers import MarianTokenizer
>>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
>>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
>>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
>>> batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
>>> # keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask].
>>> # model(**batch) should work
"""
vocab_files_names = vocab_files_names
......
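The Marian docstring's note that `model(**batch)` should work extends naturally to generation; a hedged round-trip sketch with `MarianMTModel` (checkpoint download assumed, and assuming a batch built without `tgt_texts` carries only `input_ids` and `attention_mask`) is:

```python
# Hedged round-trip sketch for the MarianTokenizer docstring above; assumes the
# Helsinki-NLP/opus-mt-en-de checkpoint is available and that a batch without
# tgt_texts contains only input_ids and attention_mask.
from transformers import MarianMTModel, MarianTokenizer

tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")

batch = tok.prepare_translation_batch(["I am a small frog."])
generated_ids = model.generate(**batch)
print(tok.batch_decode(generated_ids, skip_special_tokens=True))  # e.g. ['Ich bin ein kleiner Frosch.']
```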
......@@ -81,6 +81,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,
......
......@@ -94,6 +94,7 @@ class T5Tokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,
......
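The `model_input_names = ["attention_mask"]` override added to both `ReformerTokenizer` and `T5Tokenizer` declares which optional inputs their models expect; the practical effect, sketched below under the assumption that encoding defaults follow `model_input_names`, is that encoded outputs carry `input_ids` and `attention_mask` but no `token_type_ids`.

```python
# Hedged illustration of the model_input_names override above: T5 (and
# Reformer) models take no token_type_ids, so the tokenizer should not emit
# them by default. Assumes encode_plus honours model_input_names when no
# explicit return_* flags are passed.
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
encoded = tokenizer.encode_plus("Hello, my dog is cute")
print(sorted(encoded.keys()))  # expected: ['attention_mask', 'input_ids']
```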
......@@ -13,52 +13,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import doctest
import logging
import os
import unittest
from pathlib import Path
from typing import List, Union
from .utils import require_tf, require_torch, slow
import transformers
def get_examples_from_file(file):
examples = []
example = []
example_mode = False
example_indentation = None
for i, line in enumerate(file):
if example_mode:
current_indentation = len(line) - len(line.strip()) - 1
# Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return.
empty_line = example_indentation == 0 and len(line) == 1
# If we're back to the example indentation or if it's the end of the docstring.
if (current_indentation == example_indentation and not empty_line) or '"""' in line:
# Exit the example mode and add the example to the examples list
example_mode = False
example_indentation = None
examples.append(example)
example = []
else:
# If line is not empty, add it to the current example
if line != "\n":
example.append(line[example_indentation + 4 : -1])
# Detect the example from '::' or 'example::'
if "example::" in line.lower():
example_mode = True
example_indentation = line.lower().find("example::")
elif "examples::" in line.lower():
example_mode = True
example_indentation = line.lower().find("examples::")
# elif "::" in line.lower() and len(line.strip()) == 2:
# example_mode = True
# example_indentation = line.lower().find("::")
from .utils import require_tf, require_torch, slow
examples = ["\n".join(example) for example in examples]
examples = [example for example in examples if "not runnable" not in example.lower()]
return examples
logger = logging.getLogger()
@require_torch
......@@ -66,68 +33,81 @@ def get_examples_from_file(file):
@slow
class TestCodeExamples(unittest.TestCase):
def analyze_directory(
self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None
self,
directory: Path,
identifier: Union[str, None] = None,
ignore_files: Union[List[str], None] = [],
n_identifier: Union[str, None] = None,
only_modules: bool = True,
):
"""
Runs through the specific directory, looking for the files identified with `identifier`. Executes
the doctests in those files
Args:
directory (:obj:`str`): Directory containing the files
identifier (:obj:`str`): Will parse files containing this
ignore_files (:obj:`List[str]`): List of files to skip
n_identifier (:obj:`str` or :obj:`List[str]`): Will not parse files containing this/these identifiers.
only_modules (:obj:`bool`): Whether to only analyze modules
"""
files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]
if identifier is not None:
files = [file for file in files if identifier in file]
if ignore_files is not None:
files = [file for file in files if file not in ignore_files]
if n_identifier is not None:
if isinstance(n_identifier, List):
for n_ in n_identifier:
files = [file for file in files if n_ not in file]
else:
files = [file for file in files if n_identifier not in file]
ignore_files.append("__init__.py")
files = [file for file in files if file not in ignore_files]
for file in files:
# Open all files
print("Testing", file, end=" ")
with open(os.path.join(directory, file)) as f:
# Retrieve examples
examples = get_examples_from_file(f)
joined_examples = []
def execute_example(code_example):
exec(code_example, {})
# Some examples are the continuation of others.
if len(examples) > 0:
joined_examples.append(examples[0])
joined_examples_index = 0
for example in examples[1:]:
# If they contain this line, then they're a continuation of the previous script
if "# Continuation of the previous script" in example:
joined_examples[joined_examples_index] += "\n" + example
# If not, create a new example and increment the index
else:
joined_examples.append(example)
joined_examples_index += 1
print(str(len(joined_examples)) + "/" + str(len(joined_examples)))
# Execute sub tests with every example.
for index, code_example in enumerate(joined_examples):
with self.subTest(msg=file + " " + str(index) + "/" + str(len(joined_examples)) + code_example):
execute_example(code_example)
def test_configuration_examples(self):
transformers_directory = "src/transformers"
configuration_files = "configuration"
ignore_files = ["configuration_auto.py", "configuration_utils.py"]
self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files)
def test_main_doc_examples(self):
doc_directory = "docs/source"
ignore_files = ["favicon.ico"]
self.analyze_directory(doc_directory, ignore_files=ignore_files)
print("Testing", file)
if only_modules:
try:
module_identifier = file.split(".")[0]
module_identifier = getattr(transformers, module_identifier)
suite = doctest.DocTestSuite(module_identifier)
result = unittest.TextTestRunner().run(suite)
self.assertIs(len(result.failures), 0)
except AttributeError:
logger.info(f"{module_identifier} is not a module.")
else:
result = doctest.testfile(str(".." / directory / file), optionflags=doctest.ELLIPSIS)
self.assertIs(result.failed, 0)
def test_modeling_examples(self):
transformers_directory = "src/transformers"
modeling_files = "modeling"
files = "modeling"
ignore_files = [
"modeling_auto.py",
"modeling_t5.py",
"modeling_tf_auto.py",
"modeling_utils.py",
"modeling_tf_t5.py",
"modeling_bart.py",
"modeling_tf_utils.py",
"modeling_ctrl.py",
"modeling_tf_ctrl.py",
]
self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)
self.analyze_directory(transformers_directory, identifier=files, ignore_files=ignore_files)
def test_tokenization_examples(self):
transformers_directory = Path("src/transformers")
files = "tokenization"
self.analyze_directory(transformers_directory, identifier=files)
def test_configuration_examples(self):
transformers_directory = Path("src/transformers")
files = "configuration"
self.analyze_directory(transformers_directory, identifier=files)
def test_remaining_examples(self):
transformers_directory = Path("src/transformers")
n_identifiers = ["configuration", "modeling", "tokenization"]
self.analyze_directory(transformers_directory, n_identifier=n_identifiers)
def test_doc_sources(self):
doc_source_directory = Path("docs/source")
ignore_files = ["favicon.ico"]
self.analyze_directory(doc_source_directory, ignore_files=ignore_files, only_modules=False)
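Outside the test harness, the `only_modules=True` branch of `analyze_directory` boils down to the standard `doctest.DocTestSuite` pattern; a minimal standalone sketch, using `tokenization_marian` as one module the test's `getattr(transformers, ...)` lookup would resolve, is:

```python
# Minimal sketch of the doctest pattern used by analyze_directory above:
# collect a module's doctests into a unittest suite, run them, and check that
# nothing failed. tokenization_marian is just one module the test would hit.
import doctest
import unittest

import transformers

suite = doctest.DocTestSuite(transformers.tokenization_marian)
result = unittest.TextTestRunner().run(suite)
assert len(result.failures) == 0
```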
......@@ -31,6 +31,7 @@ if is_tf_available():
TFXLMWithLMHeadModel,
TFXLMForSequenceClassification,
TFXLMForQuestionAnsweringSimple,
TFXLMForTokenClassification,
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
)
......@@ -219,6 +220,26 @@ class TFXLMModelTester:
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
def create_and_check_xlm_for_token_classification(
self,
config,
input_ids,
token_type_ids,
input_lengths,
sequence_labels,
token_labels,
is_impossible_labels,
input_mask,
):
config.num_labels = self.num_labels
model = TFXLMForTokenClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
......@@ -244,7 +265,14 @@ class TFXLMModelTester:
class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
all_model_classes = (
(TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple)
# TODO The multiple choice model is missing and should be added.
(
TFXLMModel,
TFXLMWithLMHeadModel,
TFXLMForSequenceClassification,
TFXLMForQuestionAnsweringSimple,
TFXLMForTokenClassification,
)
if is_tf_available()
else ()
)
......@@ -275,6 +303,10 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
def test_for_token_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
......
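The new TF test exercises `TFXLMForTokenClassification` end to end; a hedged usage sketch mirroring the test's call pattern (loading the head from the base `xlm-mlm-en-2048` checkpoint, which leaves the classifier weights randomly initialized) is:

```python
# Hedged usage sketch for TFXLMForTokenClassification, mirroring the new test:
# call the model on a dict of inputs and unpack the logits from the returned
# tuple. Loading from the base LM checkpoint initializes the classifier head
# randomly, so the logits are untrained.
import tensorflow as tf
from transformers import TFXLMForTokenClassification, XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = TFXLMForTokenClassification.from_pretrained("xlm-mlm-en-2048")

input_ids = tf.constant([tokenizer.encode("Hello, my dog is cute")])  # batch size 1
(logits,) = model({"input_ids": input_ids})
print(logits.shape)  # (1, sequence_length, num_labels)
```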