Unverified Commit 364a5ae1 authored by Lysandre Debut, committed by GitHub

Refactor Code samples; Test code samples (#5036)



* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rest of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 315f464b
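For context, the core of this refactor replaces the hand-written `Examples::` blocks in each model's forward docstring with the shared `add_code_sample_docstrings` decorator from `file_utils`, and rewrites the examples that stay inline in doctest style (`>>>`) so the new test suite can execute them. A rough, hypothetical sketch of what such a decorator does — not the actual implementation in `src/transformers/file_utils.py` — is:

```python
# Hypothetical sketch only -- NOT the real add_code_sample_docstrings from
# src/transformers/file_utils.py. It illustrates the idea: append a generated,
# doctest-style usage example to forward()'s docstring, parameterized by the
# tokenizer class name and the checkpoint to load.
def add_code_sample_docstrings(*, tokenizer_class, checkpoint):
    sample = f"""
    Example::

        >>> from transformers import {tokenizer_class}
        >>> import torch

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> # ... instantiate the matching model, build inputs, run forward ...
    """

    def decorator(fn):
        # Keep whatever docstring the method already has and append the sample.
        fn.__doc__ = (fn.__doc__ or "") + sample
        return fn

    return decorator
```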
......@@ -28,7 +28,7 @@ from torch.nn import functional as F
from .activations import gelu
from .configuration_xlm import XLMConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import (
PreTrainedModel,
SequenceSummary,
......@@ -40,6 +40,8 @@ from .modeling_utils import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLMTokenizer"
XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlm-mlm-en-2048",
"xlm-mlm-ende-1024",
......@@ -395,6 +397,7 @@ class XLMModel(XLMPreTrainedModel):
self.attentions[layer].prune_heads(heads)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
......@@ -425,18 +428,6 @@ class XLMModel(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMModel
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMModel.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -632,6 +623,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
return {"input_ids": input_ids, "langs": langs}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
......@@ -672,18 +664,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMWithLMHeadModel
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
transformer_outputs = self.transformer(
input_ids,
......@@ -722,6 +702,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
......@@ -761,19 +742,6 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMForSequenceClassification
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
......@@ -822,6 +790,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
......@@ -867,20 +836,6 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMForQuestionAnsweringSimple
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
"""
transformer_outputs = self.transformer(
input_ids,
......@@ -1006,19 +961,20 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Example::
from transformers import XLMTokenizer, XLMForQuestionAnswering
import torch
>>> from transformers import XLMTokenizer, XLMForQuestionAnswering
>>> import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs[0]
"""
transformer_outputs = self.transformer(
input_ids,
......@@ -1067,6 +1023,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def forward(
self,
input_ids=None,
......@@ -1074,6 +1031,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
langs=None,
token_type_ids=None,
position_ids=None,
lengths=None,
cache=None,
head_mask=None,
labels=None,
output_attentions=None,
......@@ -1101,19 +1060,6 @@ class XLMForTokenClassification(XLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLMTokenizer, XLMForTokenClassification
import torch
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
model = XLMForTokenClassification.from_pretrained('xlm-mlm-100-1280')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.transformer(
input_ids,
......@@ -1121,6 +1067,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
......
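The generated sample for `XLMModel.forward` is equivalent to the inline example removed above; written out as a standalone script (downloading the `xlm-mlm-en-2048` checkpoint is assumed), it amounts to:

```python
# Standalone equivalent of the removed XLMModel docstring example; requires
# downloading the xlm-mlm-en-2048 checkpoint.
import torch
from transformers import XLMModel, XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = XLMModel.from_pretrained("xlm-mlm-en-2048")

# Batch size 1
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
outputs = model(input_ids)
last_hidden_states = outputs[0]  # the last hidden state is the first element of the output tuple
```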
......@@ -26,12 +26,14 @@ from torch.nn import functional as F
from .activations import gelu_new, swish
from .configuration_xlnet import XLNetConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLNetTokenizer"
XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlnet-base-cased",
"xlnet-large-cased",
......@@ -749,6 +751,7 @@ class XLNetModel(XLNetPreTrainedModel):
return pos_emb
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
......@@ -785,20 +788,6 @@ class XLNetModel(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetModel
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetModel.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -1164,6 +1153,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
......@@ -1208,20 +1198,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForSequenceClassification
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
......@@ -1273,6 +1249,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
......@@ -1316,21 +1293,6 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForTokenClassification
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
scores = outputs[0]
"""
outputs = self.transformer(
......@@ -1386,6 +1348,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
......@@ -1431,22 +1394,6 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForMultipleChoice
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......@@ -1508,6 +1455,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def forward(
self,
input_ids=None,
......@@ -1558,22 +1506,6 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import XLNetTokenizer, XLNetForQuestionAnsweringSimple
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
"""
outputs = self.transformer(
......@@ -1705,20 +1637,20 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Example::
from transformers import XLNetTokenizer, XLNetForQuestionAnswering
import torch
>>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
>>> import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs[0]
"""
transformer_outputs = self.transformer(
input_ids,
......
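The classification heads follow the same pattern; the removed `XLNetForSequenceClassification` example above, rewritten as a standalone script (checkpoint download assumed), is:

```python
# Standalone version of the removed XLNetForSequenceClassification example;
# requires downloading the xlnet-large-cased checkpoint.
import torch
from transformers import XLNetForSequenceClassification, XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
model = XLNetForSequenceClassification.from_pretrained("xlnet-large-cased")

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
```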
......@@ -66,13 +66,15 @@ class MBartTokenizer(XLMRobertaTokenizer):
The tokenization method is <tokens> <eos> <language code>. There is no BOS token.
Examples::
from transformers import MBartTokenizer
tokenizer = MBartTokenizer.from_pretrained('mbart-large-en-ro')
example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
batch: dict = tokenizer.prepare_translation_batch(
example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
)
>>> from transformers import MBartTokenizer
>>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro')
>>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> batch: dict = tokenizer.prepare_translation_batch(
... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
... )
"""
vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
......
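A hedged follow-up to the MBart docstring above: since the layout is `<tokens> <eos> <language code>` with no BOS token, the encoded source sequence should end with `</s>` followed by the source language code. A small check — assuming the `facebook/mbart-large-en-ro` checkpoint and that `prepare_translation_batch` returns PyTorch tensors by default — might look like:

```python
# Hedged check of the "<tokens> <eos> <language code>" layout described above;
# assumes prepare_translation_batch returns PyTorch tensors by default.
from transformers import MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
batch = tokenizer.prepare_translation_batch(
    ["UN Chief Says There Is No Military Solution in Syria"], src_lang="en_XX"
)
last_tokens = tokenizer.convert_ids_to_tokens(batch["input_ids"][0].tolist())[-2:]
print(last_tokens)  # expected: ['</s>', 'en_XX']
```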
......@@ -25,13 +25,13 @@ class MarianTokenizer(PreTrainedTokenizer):
Examples::
from transformers import MarianTokenizer
tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
# keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask].
# model(**batch) should work
>>> from transformers import MarianTokenizer
>>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
>>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
>>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
>>> batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
>>> # keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask].
>>> # model(**batch) should work
"""
vocab_files_names = vocab_files_names
......
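The Marian docstring's note that `model(**batch)` should work extends naturally to generation; a hedged round-trip sketch with `MarianMTModel` (checkpoint download assumed, and assuming a batch built without `tgt_texts` carries only `input_ids` and `attention_mask`) is:

```python
# Hedged round-trip sketch for the MarianTokenizer docstring above; assumes the
# Helsinki-NLP/opus-mt-en-de checkpoint is available and that a batch without
# tgt_texts contains only input_ids and attention_mask.
from transformers import MarianMTModel, MarianTokenizer

tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")

batch = tok.prepare_translation_batch(["I am a small frog."])
generated_ids = model.generate(**batch)
print(tok.batch_decode(generated_ids, skip_special_tokens=True))  # e.g. ['Ich bin ein kleiner Frosch.']
```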
......@@ -81,6 +81,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,
......
......@@ -94,6 +94,7 @@ class T5Tokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,
......
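The `model_input_names = ["attention_mask"]` override added to both `ReformerTokenizer` and `T5Tokenizer` declares which optional inputs their models expect; the practical effect, sketched below under the assumption that encoding defaults follow `model_input_names`, is that encoded outputs carry `input_ids` and `attention_mask` but no `token_type_ids`.

```python
# Hedged illustration of the model_input_names override above: T5 (and
# Reformer) models take no token_type_ids, so the tokenizer should not emit
# them by default. Assumes encode_plus honours model_input_names when no
# explicit return_* flags are passed.
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
encoded = tokenizer.encode_plus("Hello, my dog is cute")
print(sorted(encoded.keys()))  # expected: ['attention_mask', 'input_ids']
```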
......@@ -13,52 +13,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import doctest
import logging
import os
import unittest
from pathlib import Path
from typing import List, Union
from .utils import require_tf, require_torch, slow
import transformers
def get_examples_from_file(file):
examples = []
example = []
example_mode = False
example_indentation = None
for i, line in enumerate(file):
if example_mode:
current_indentation = len(line) - len(line.strip()) - 1
# Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return.
empty_line = example_indentation == 0 and len(line) == 1
# If we're back to the example indentation or if it's the end of the docstring.
if (current_indentation == example_indentation and not empty_line) or '"""' in line:
# Exit the example mode and add the example to the examples list
example_mode = False
example_indentation = None
examples.append(example)
example = []
else:
# If line is not empty, add it to the current example
if line != "\n":
example.append(line[example_indentation + 4 : -1])
# Detect the example from '::' or 'example::'
if "example::" in line.lower():
example_mode = True
example_indentation = line.lower().find("example::")
elif "examples::" in line.lower():
example_mode = True
example_indentation = line.lower().find("examples::")
# elif "::" in line.lower() and len(line.strip()) == 2:
# example_mode = True
# example_indentation = line.lower().find("::")
from .utils import require_tf, require_torch, slow
examples = ["\n".join(example) for example in examples]
examples = [example for example in examples if "not runnable" not in example.lower()]
return examples
logger = logging.getLogger()
@require_torch
......@@ -66,68 +33,81 @@ def get_examples_from_file(file):
@slow
class TestCodeExamples(unittest.TestCase):
def analyze_directory(
self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None
self,
directory: Path,
identifier: Union[str, None] = None,
ignore_files: Union[List[str], None] = [],
n_identifier: Union[str, None] = None,
only_modules: bool = True,
):
"""
Runs through the specific directory, looking for the files identified with `identifier`. Executes
the doctests in those files
Args:
directory (:obj:`str`): Directory containing the files
identifier (:obj:`str`): Will parse files containing this
ignore_files (:obj:`List[str]`): List of files to skip
n_identifier (:obj:`str` or :obj:`List[str]`): Will not parse files containing this/these identifiers.
only_modules (:obj:`bool`): Whether to only analyze modules
"""
files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]
if identifier is not None:
files = [file for file in files if identifier in file]
if ignore_files is not None:
files = [file for file in files if file not in ignore_files]
if n_identifier is not None:
if isinstance(n_identifier, List):
for n_ in n_identifier:
files = [file for file in files if n_ not in file]
else:
files = [file for file in files if n_identifier not in file]
ignore_files.append("__init__.py")
files = [file for file in files if file not in ignore_files]
for file in files:
# Open all files
print("Testing", file, end=" ")
with open(os.path.join(directory, file)) as f:
# Retrieve examples
examples = get_examples_from_file(f)
joined_examples = []
def execute_example(code_example):
exec(code_example, {})
# Some examples are the continuation of others.
if len(examples) > 0:
joined_examples.append(examples[0])
joined_examples_index = 0
for example in examples[1:]:
# If they contain this line, then they're a continuation of the previous script
if "# Continuation of the previous script" in example:
joined_examples[joined_examples_index] += "\n" + example
# If not, create a new example and increment the index
else:
joined_examples.append(example)
joined_examples_index += 1
print(str(len(joined_examples)) + "/" + str(len(joined_examples)))
# Execute sub tests with every example.
for index, code_example in enumerate(joined_examples):
with self.subTest(msg=file + " " + str(index) + "/" + str(len(joined_examples)) + code_example):
execute_example(code_example)
def test_configuration_examples(self):
transformers_directory = "src/transformers"
configuration_files = "configuration"
ignore_files = ["configuration_auto.py", "configuration_utils.py"]
self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files)
def test_main_doc_examples(self):
doc_directory = "docs/source"
ignore_files = ["favicon.ico"]
self.analyze_directory(doc_directory, ignore_files=ignore_files)
print("Testing", file)
if only_modules:
try:
module_identifier = file.split(".")[0]
module_identifier = getattr(transformers, module_identifier)
suite = doctest.DocTestSuite(module_identifier)
result = unittest.TextTestRunner().run(suite)
self.assertIs(len(result.failures), 0)
except AttributeError:
logger.info(f"{module_identifier} is not a module.")
else:
result = doctest.testfile(str(".." / directory / file), optionflags=doctest.ELLIPSIS)
self.assertIs(result.failed, 0)
def test_modeling_examples(self):
transformers_directory = "src/transformers"
modeling_files = "modeling"
files = "modeling"
ignore_files = [
"modeling_auto.py",
"modeling_t5.py",
"modeling_tf_auto.py",
"modeling_utils.py",
"modeling_tf_t5.py",
"modeling_bart.py",
"modeling_tf_utils.py",
"modeling_ctrl.py",
"modeling_tf_ctrl.py",
]
self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)
self.analyze_directory(transformers_directory, identifier=files, ignore_files=ignore_files)
def test_tokenization_examples(self):
transformers_directory = Path("src/transformers")
files = "tokenization"
self.analyze_directory(transformers_directory, identifier=files)
def test_configuration_examples(self):
transformers_directory = Path("src/transformers")
files = "configuration"
self.analyze_directory(transformers_directory, identifier=files)
def test_remaining_examples(self):
transformers_directory = Path("src/transformers")
n_identifiers = ["configuration", "modeling", "tokenization"]
self.analyze_directory(transformers_directory, n_identifier=n_identifiers)
def test_doc_sources(self):
doc_source_directory = Path("docs/source")
ignore_files = ["favicon.ico"]
self.analyze_directory(doc_source_directory, ignore_files=ignore_files, only_modules=False)
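Outside the test harness, the `only_modules=True` branch of `analyze_directory` boils down to the standard `doctest.DocTestSuite` pattern; a minimal standalone sketch, using `tokenization_marian` as one module the test's `getattr(transformers, ...)` lookup would resolve, is:

```python
# Minimal sketch of the doctest pattern used by analyze_directory above:
# collect a module's doctests into a unittest suite, run them, and check that
# nothing failed. tokenization_marian is just one module the test would hit.
import doctest
import unittest

import transformers

suite = doctest.DocTestSuite(transformers.tokenization_marian)
result = unittest.TextTestRunner().run(suite)
assert len(result.failures) == 0
```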
......@@ -31,6 +31,7 @@ if is_tf_available():
TFXLMWithLMHeadModel,
TFXLMForSequenceClassification,
TFXLMForQuestionAnsweringSimple,
TFXLMForTokenClassification,
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
)
......@@ -219,6 +220,26 @@ class TFXLMModelTester:
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
def create_and_check_xlm_for_token_classification(
self,
config,
input_ids,
token_type_ids,
input_lengths,
sequence_labels,
token_labels,
is_impossible_labels,
input_mask,
):
config.num_labels = self.num_labels
model = TFXLMForTokenClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(logits,) = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
......@@ -244,7 +265,14 @@ class TFXLMModelTester:
class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
all_model_classes = (
(TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple)
# TODO The multiple choice model is missing and should be added.
(
TFXLMModel,
TFXLMWithLMHeadModel,
TFXLMForSequenceClassification,
TFXLMForQuestionAnsweringSimple,
TFXLMForTokenClassification,
)
if is_tf_available()
else ()
)
......@@ -275,6 +303,10 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
def test_for_token_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
......
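The new TF test exercises `TFXLMForTokenClassification` end to end; a hedged usage sketch mirroring the test's call pattern (loading the head from the base `xlm-mlm-en-2048` checkpoint, which leaves the classifier weights randomly initialized) is:

```python
# Hedged usage sketch for TFXLMForTokenClassification, mirroring the new test:
# call the model on a dict of inputs and unpack the logits from the returned
# tuple. Loading from the base LM checkpoint initializes the classifier head
# randomly, so the logits are untrained.
import tensorflow as tf
from transformers import TFXLMForTokenClassification, XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = TFXLMForTokenClassification.from_pretrained("xlm-mlm-en-2048")

input_ids = tf.constant([tokenizer.encode("Hello, my dog is cute")])  # batch size 1
(logits,) = model({"input_ids": input_ids})
print(logits.shape)  # (1, sequence_length, num_labels)
```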