"src/nni_manager/vscode:/vscode.git/clone" did not exist on "5af01545fce906a682f44741b4f6b6bd2b1d2585"
Commit 632675ea authored by Lysandre, committed by Lysandre Debut

Can test examples spread over multiple blocks

parent eaa6b9af
@@ -24,6 +24,7 @@ The tokenizer takes care of splitting the sequence into tokens available in the

 ::

+    # Continuation of the previous script
     tokenized_sequence = tokenizer.tokenize(sequence)
     assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
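For reference (not part of the diff), the snippet this hunk annotates can be run standalone; a minimal sketch, with the input string inferred from the asserted tokens::

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    # Inferred input; the assert below only holds for this exact string.
    sequence = "A Titan RTX has 24GB of VRAM"

    tokenized_sequence = tokenizer.tokenize(sequence)
    assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']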
@@ -33,6 +34,7 @@ this, the recommended being `encode` or `encode_plus`, which leverage the Rust i

 ::

+    # Continuation of the previous script
     encoded_sequence = tokenizer.encode(sequence)
     assert encoded_sequence == [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
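As a sanity check (not part of the diff), `encode` is roughly `tokenize` followed by `convert_tokens_to_ids`, with the model's special tokens added around the result; a sketch continuing the one above::

    # Continuation of the sketch above
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequence))
    # For BERT, cls_token_id is 101 and sep_token_id is 102, matching the assert in the hunk.
    assert tokenizer.encode(sequence) == [tokenizer.cls_token_id] + ids + [tokenizer.sep_token_id]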
@@ -48,6 +50,9 @@ For example, consider these two sequences:

 ::

+    from transformers import BertTokenizer
+
+    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
     sequence_a = "This is a short sequence."
     sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
@@ -65,10 +70,11 @@ In the first case, the list of IDs will be extended by the padding indices:

 ::

+    # Continuation of the previous script
     padded_sequence_a = tokenizer.encode(sequence_a, max_length=19, pad_to_max_length=True)
-    assert padded_sequence_a = [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-    assert encoded_sequence_b = [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]
+    assert padded_sequence_a == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    assert encoded_sequence_b == [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]

 These can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
 the position of the padded indices so that the model does not attend to them. For the
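For reference (not part of the diff), a minimal sketch of the attention mask produced for the padded sequence above; 1 marks real tokens, 0 marks the padding positions the model should ignore::

    # Continuation of the previous script
    padded = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)
    # 8 real tokens ([CLS] ... [SEP]) followed by 11 padding positions.
    assert padded["attention_mask"] == [1] * 8 + [0] * 11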
@@ -79,6 +85,7 @@ The method :func:`~transformers.PreTrainedTokenizer.encode_plus` may be used to

 ::

+    # Continuation of the previous script
     sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)
     assert sequence_a_dict['input_ids'] == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
@@ -94,6 +101,9 @@ tokens. For example, the BERT model builds its two sequence input as such:

 ::

+    from transformers import BertTokenizer
+
+    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
     # [CLS] SEQ_A [SEP] SEQ_B [SEP]
     sequence_a = "HuggingFace is based in NYC"
@@ -110,10 +120,11 @@ We can leverage :func:`~transformers.PreTrainedTokenizer.encode_plus` to output

 ::

+    # Continuation of the previous script
     encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)
-    assert sequence_a_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
-    assert sequence_a_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    assert encoded_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
+    assert encoded_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]

 The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
 question has all its tokens represented by :obj:`1`. Some models, like :class:`~transformers.XLNetModel` use an
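A self-contained sketch of the segment IDs above (not part of the diff; `sequence_b` is an assumption inferred from the token IDs in the assert, since its definition sits in the collapsed lines)::

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    sequence_a = "HuggingFace is based in NYC"
    # Assumed question; the collapsed context defines the real one.
    sequence_b = "Where is HuggingFace based?"

    encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)
    # Zeros span [CLS] + sequence_a + [SEP]; ones span sequence_b + [SEP].
    assert encoded_dict['token_type_ids'] == [0] * 10 + [1] * 9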
@@ -15,6 +15,8 @@
 import os
 import unittest
+from typing import List, Union

+from .utils import require_torch
@@ -26,34 +28,84 @@ def get_examples_from_file(file):
     for i, line in enumerate(file):
         if example_mode:
             current_indentation = len(line) - len(line.strip()) - 1
-            if current_indentation == example_indentation or '"""' in line:
+
+            # Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return.
+            empty_line = example_indentation == 0 and len(line) == 1
+
+            # If we're back to the example indentation or if it's the end of the docstring.
+            if (current_indentation == example_indentation and not empty_line) or '"""' in line:
                 # Exit the example mode and add the example to the examples list
                 example_mode = False
                 example_indentation = None
                 examples.append(example)
                 example = []
             else:
                 # If line is not empty, add it to the current example
                 if line is not "\n":
                     example.append(line[example_indentation + 4 : -1])

         # Detect the example from '::' or 'example::'
         if "example::" in line.lower():
             example_mode = True
             example_indentation = line.lower().find("example::")
         elif "examples::" in line.lower():
             example_mode = True
             example_indentation = line.lower().find("examples::")
+        elif "::" in line.lower():
+            example_mode = True
+            example_indentation = line.lower().find("::")

-    return ['\n'.join(example) for example in examples]
+    return ["\n".join(example) for example in examples]
+@require_torch
 class TestCodeExamples(unittest.TestCase):
-    def test_configuration_examples(self):
-        transformers_directory = "../src/transformers"
-        configuration_files = [file for file in os.listdir(transformers_directory) if "configuration" in file]
+    def analyze_directory(
+        self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None
+    ):
+        files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]
+
+        if identifier is not None:
+            files = [file for file in files if identifier in file]

-        for configuration_file in configuration_files:
-            with open(os.path.join(transformers_directory, configuration_file)) as f:
+        if ignore_files is not None:
+            files = [file for file in files if file not in ignore_files]
+
+        for file in files:
+            # Open all files
+            with open(os.path.join(directory, file)) as f:
+                # Retrieve examples
                 examples = get_examples_from_file(f)
-                print("Testing", configuration_file, str(len(examples)) + "/" + str(len(examples)))
+                joined_examples = []

                 def execute_example(code_example):
                     exec(code_example)

-                with self.subTest(msg=configuration_file):
-                    [execute_example(code_example) for code_example in examples]
+                # Some examples are the continuation of others.
+                if len(examples) > 1:
+                    joined_examples.append(examples[0])
+                    joined_examples_index = 0
+                    for example in examples[1:]:
+                        # If they contain this line, then they're a continuation of the previous script
+                        if "# Continuation of the previous script" in example:
+                            joined_examples[joined_examples_index] += "\n" + example
+                        # If not, create a new example and increment the index
+                        else:
+                            joined_examples.append(example)
+                            joined_examples_index += 1
+
+                print("Testing", file, str(len(joined_examples)) + "/" + str(len(joined_examples)))
+
+                # Execute sub tests with every example.
+                with self.subTest(msg=file):
+                    [execute_example(code_example) for code_example in joined_examples]
+
+    def test_configuration_examples(self):
+        transformers_directory = "src/transformers"
+        configuration_files = "configuration"
+        ignore_files = ["configuration_auto.py", "configuration_utils.py"]
+        self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files)
+
+    def test_main_doc_examples(self):
+        doc_directory = "docs/source"
+        self.analyze_directory(doc_directory)
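The commit message in one sketch (not part of the diff): blocks tagged with the continuation marker are joined onto the previous example before execution, so an example spread over multiple blocks runs as a single script::

    examples = [
        "a = 1",
        "# Continuation of the previous script\nassert a == 1",
        "b = 2",
    ]

    # Same joining loop as analyze_directory above.
    joined_examples, joined_examples_index = [examples[0]], 0
    for example in examples[1:]:
        if "# Continuation of the previous script" in example:
            joined_examples[joined_examples_index] += "\n" + example
        else:
            joined_examples.append(example)
            joined_examples_index += 1

    assert len(joined_examples) == 2  # the first two blocks became one script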