Commit 632675ea authored by Lysandre, committed by Lysandre Debut

Can test examples spread over multiple blocks

parent eaa6b9af
@@ -24,6 +24,7 @@ The tokenizer takes care of splitting the sequence into tokens available in the
::

    # Continuation of the previous script
    tokenized_sequence = tokenizer.tokenize(sequence)
    assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
@@ -33,6 +34,7 @@ this, the recommended being `encode` or `encode_plus`, which leverage the Rust i
::

    # Continuation of the previous script
    encoded_sequence = tokenizer.encode(sequence)
    assert encoded_sequence == [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
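
As an aside (not part of this diff), the IDs returned by `encode` are the vocabulary indices of the tokens above, wrapped in the model's special tokens. A minimal sketch, assuming the same `tokenizer`, `sequence`, and `tokenized_sequence` as in the surrounding examples:

::

    # Illustrative sketch only, not part of the committed docs
    ids = tokenizer.convert_tokens_to_ids(tokenized_sequence)

    # `encode` additionally wraps the IDs in the special tokens ([CLS] ... [SEP] for BERT)
    assert encoded_sequence == tokenizer.build_inputs_with_special_tokens(ids)

    # `decode` maps the IDs back to a string, including the special tokens
    print(tokenizer.decode(encoded_sequence))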
@@ -48,6 +50,9 @@ For example, consider these two sequences:
::

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

    sequence_a = "This is a short sequence."
    sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
@@ -65,10 +70,11 @@ In the first case, the list of IDs will be extended by the padding indices:
::

    # Continuation of the previous script
    padded_sequence_a = tokenizer.encode(sequence_a, max_length=19, pad_to_max_length=True)
    assert padded_sequence_a == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    assert encoded_sequence_b == [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]
These can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
the position of the padded indices so that the model does not attend to them. For the
@@ -79,6 +85,7 @@ The method :func:`~transformers.PreTrainedTokenizer.encode_plus` may be used to
::

    # Continuation of the previous script
    sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)
    assert sequence_a_dict['input_ids'] == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
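
In addition to :obj:`input_ids`, the dictionary returned by :func:`encode_plus` normally also contains the attention mask discussed above. A minimal sketch (illustrative, not part of the commit), assuming the same call as in this hunk:

::

    # Illustrative sketch only: 1 marks real tokens, 0 marks the padding indices
    assert sequence_a_dict['attention_mask'] == [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]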
@@ -94,6 +101,9 @@ tokens. For example, the BERT model builds its two sequence input as such:
::

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

    # [CLS] SEQ_A [SEP] SEQ_B [SEP]
    sequence_a = "HuggingFace is based in NYC"
@@ -110,10 +120,11 @@ We can leverage :func:`~transformers.PreTrainedTokenizer.encode_plus` to output
::

    # Continuation of the previous script
    encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)
    assert encoded_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
    assert encoded_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
question has all its tokens represented by :obj:`1`. Some models, like :class:`~transformers.XLNetModel` use an
...
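
To make the role of these inputs concrete, here is a short illustrative sketch (not part of this commit) that feeds such a pair encoding to a PyTorch BERT model; it assumes :obj:`torch` is installed and uses a hypothetical second sequence:

::

    # Illustrative sketch only, assuming torch and the bert-base-cased checkpoint
    import torch
    from transformers import BertModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    model = BertModel.from_pretrained("bert-base-cased")

    sequence_a = "HuggingFace is based in NYC"
    sequence_b = "Where is HuggingFace based?"  # hypothetical question paired with the context

    encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)

    # Add a batch dimension before handing the lists to the model
    input_ids = torch.tensor([encoded_dict['input_ids']])
    token_type_ids = torch.tensor([encoded_dict['token_type_ids']])

    # The token type IDs tell the model which tokens belong to which sequence
    outputs = model(input_ids, token_type_ids=token_type_ids)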
@@ -15,6 +15,8 @@
import os
import unittest
from typing import List, Union

from .utils import require_torch
@@ -26,34 +28,84 @@ def get_examples_from_file(file):
    for i, line in enumerate(file):
        if example_mode:
            current_indentation = len(line) - len(line.strip()) - 1

            # Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return.
            empty_line = example_indentation == 0 and len(line) == 1

            # If we're back to the example indentation or if it's the end of the docstring.
            if (current_indentation == example_indentation and not empty_line) or '"""' in line:
                # Exit the example mode and add the example to the examples list
                example_mode = False
                example_indentation = None
                examples.append(example)
                example = []
            else:
                # If line is not empty, add it to the current example
                if line is not "\n":
                    example.append(line[example_indentation + 4 : -1])

        # Detect the example from '::' or 'example::'
        if "example::" in line.lower():
            example_mode = True
            example_indentation = line.lower().find("example::")
        elif "examples::" in line.lower():
            example_mode = True
            example_indentation = line.lower().find("examples::")
        elif "::" in line.lower():
            example_mode = True
            example_indentation = line.lower().find("::")

    return ["\n".join(example) for example in examples]


@require_torch
class TestCodeExamples(unittest.TestCase):
    def analyze_directory(
        self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None
    ):
        files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]

        if identifier is not None:
            files = [file for file in files if identifier in file]

        if ignore_files is not None:
            files = [file for file in files if file not in ignore_files]

        for file in files:
            # Open all files
            with open(os.path.join(directory, file)) as f:
                # Retrieve examples
                examples = get_examples_from_file(f)
                joined_examples = []

                def execute_example(code_example):
                    exec(code_example)

                # Some examples are the continuation of others.
                if len(examples) > 1:
                    joined_examples.append(examples[0])
                    joined_examples_index = 0
                    for example in examples[1:]:
                        # If they contain this line, then they're a continuation of the previous script
                        if "# Continuation of the previous script" in example:
                            joined_examples[joined_examples_index] += "\n" + example
                        # If not, create a new example and increment the index
                        else:
                            joined_examples.append(example)
                            joined_examples_index += 1

                print("Testing", file, str(len(joined_examples)) + "/" + str(len(joined_examples)))

                # Execute sub tests with every example.
                with self.subTest(msg=file):
                    [execute_example(code_example) for code_example in joined_examples]

    def test_configuration_examples(self):
        transformers_directory = "src/transformers"
        configuration_files = "configuration"
        ignore_files = ["configuration_auto.py", "configuration_utils.py"]
        self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files)

    def test_main_doc_examples(self):
        doc_directory = "docs/source"
        self.analyze_directory(doc_directory)
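
To illustrate the joining logic added in :func:`analyze_directory`, here is a small standalone sketch (not from the repository, with hypothetical example strings) of how examples carrying the continuation marker are merged into the example that precedes them:

::

    # Standalone sketch of the joining behaviour above, with hypothetical example strings
    examples = [
        "tokenizer = BertTokenizer.from_pretrained('bert-base-cased')",
        "# Continuation of the previous script\nids = tokenizer.encode('Hello')",
        "print('a new, independent example')",
    ]

    joined_examples = [examples[0]]
    joined_examples_index = 0
    for example in examples[1:]:
        if "# Continuation of the previous script" in example:
            # Continuations are appended to the example they extend
            joined_examples[joined_examples_index] += "\n" + example
        else:
            # Anything else starts a new, self-contained example
            joined_examples.append(example)
            joined_examples_index += 1

    # The three blocks collapse into two runnable scripts
    assert len(joined_examples) == 2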