Unverified Commit 364a5ae1 authored by Lysandre Debut, committed by GitHub

Refactor Code samples; Test code samples (#5036)



* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rest of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 315f464b
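The diff below repeatedly replaces hand-written `Examples::` blocks with an `@add_code_sample_docstrings(tokenizer_class=..., checkpoint=...)` decorator imported from `file_utils`. The decorator's real implementation is not part of this diff; the following is only a minimal sketch of the idea, under the assumption that it formats one shared doctest-style template with the model class, tokenizer class and checkpoint name, and appends it to the decorated `forward` method's docstring:

# Hypothetical sketch, NOT the actual transformers implementation of add_code_sample_docstrings.
CODE_SAMPLE_TEMPLATE = """
    Example::

        >>> from transformers import {tokenizer_class}, {model_class}
        >>> import torch

        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
        >>> model = {model_class}.from_pretrained('{checkpoint}')

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        >>> outputs = model(input_ids)
"""


def add_code_sample_docstrings(*, tokenizer_class, checkpoint):
    """Append a templated, doctest-style code sample to the decorated method's docstring."""

    def docstring_decorator(fn):
        # fn.__qualname__ is e.g. "XLMModel.forward"; keep only the class name for the template.
        model_class = fn.__qualname__.split(".")[0]
        sample = CODE_SAMPLE_TEMPLATE.format(
            tokenizer_class=tokenizer_class, model_class=model_class, checkpoint=checkpoint
        )
        fn.__doc__ = (fn.__doc__ or "") + sample
        return fn

    return docstring_decorator

Because every generated example uses the `>>>` prompt, the samples become discoverable by doctest, which is what the new test file further down exercises.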
@@ -28,7 +28,7 @@ from torch.nn import functional as F
 from .activations import gelu
 from .configuration_xlm import XLMConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import (
     PreTrainedModel,
     SequenceSummary,
@@ -40,6 +40,8 @@ from .modeling_utils import (
 logger = logging.getLogger(__name__)
+_TOKENIZER_FOR_DOC = "XLMTokenizer"
 XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "xlm-mlm-en-2048",
     "xlm-mlm-ende-1024",
@@ -395,6 +397,7 @@ class XLMModel(XLMPreTrainedModel):
             self.attentions[layer].prune_heads(heads)
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def forward(
         self,
         input_ids=None,
@@ -425,18 +428,6 @@ class XLMModel(XLMPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLMTokenizer, XLMModel
-            import torch
-            tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-            model = XLMModel.from_pretrained('xlm-mlm-en-2048')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids)
-            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -632,6 +623,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         return {"input_ids": input_ids, "langs": langs}
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def forward(
         self,
         input_ids=None,
@@ -672,18 +664,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLMTokenizer, XLMWithLMHeadModel
-            import torch
-            tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-            model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids)
-            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
         """
         transformer_outputs = self.transformer(
             input_ids,
@@ -722,6 +702,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
         self.init_weights()
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def forward(
         self,
         input_ids=None,
@@ -761,19 +742,6 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLMTokenizer, XLMForSequenceClassification
-            import torch
-            tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-            model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-            loss, logits = outputs[:2]
         """
         transformer_outputs = self.transformer(
             input_ids,
@@ -822,6 +790,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
         self.init_weights()
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def forward(
         self,
         input_ids=None,
@@ -867,20 +836,6 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLMTokenizer, XLMForQuestionAnsweringSimple
-            import torch
-            tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-            model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            start_positions = torch.tensor([1])
-            end_positions = torch.tensor([3])
-            outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-            loss = outputs[0]
         """
         transformer_outputs = self.transformer(
             input_ids,
@@ -1006,19 +961,20 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLMTokenizer, XLMForQuestionAnswering
-            import torch
-            tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-            model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            start_positions = torch.tensor([1])
-            end_positions = torch.tensor([3])
-            outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-            loss = outputs[0]
+        Example::
+            >>> from transformers import XLMTokenizer, XLMForQuestionAnswering
+            >>> import torch
+            >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+            >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> start_positions = torch.tensor([1])
+            >>> end_positions = torch.tensor([3])
+            >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+            >>> loss = outputs[0]
         """
         transformer_outputs = self.transformer(
             input_ids,
@@ -1067,6 +1023,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
         self.init_weights()
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def forward(
         self,
         input_ids=None,
@@ -1074,6 +1031,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
         langs=None,
         token_type_ids=None,
         position_ids=None,
+        lengths=None,
+        cache=None,
         head_mask=None,
         labels=None,
         output_attentions=None,
@@ -1101,19 +1060,6 @@ class XLMForTokenClassification(XLMPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLMTokenizer, XLMForTokenClassification
-            import torch
-            tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
-            model = XLMForTokenClassification.from_pretrained('xlm-mlm-100-1280')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-            loss, scores = outputs[:2]
         """
         outputs = self.transformer(
             input_ids,
@@ -1121,6 +1067,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
             langs=langs,
             token_type_ids=token_type_ids,
             position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
             head_mask=head_mask,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
...
@@ -26,12 +26,14 @@ from torch.nn import functional as F
 from .activations import gelu_new, swish
 from .configuration_xlnet import XLNetConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary
 logger = logging.getLogger(__name__)
+_TOKENIZER_FOR_DOC = "XLNetTokenizer"
 XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "xlnet-base-cased",
     "xlnet-large-cased",
@@ -749,6 +751,7 @@ class XLNetModel(XLNetPreTrainedModel):
         return pos_emb
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def forward(
         self,
         input_ids=None,
@@ -785,20 +788,6 @@ class XLNetModel(XLNetPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLNetTokenizer, XLNetModel
-            import torch
-            tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-            model = XLNetModel.from_pretrained('xlnet-large-cased')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids)
-            last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1164,6 +1153,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.init_weights()
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def forward(
         self,
         input_ids=None,
@@ -1208,20 +1198,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLNetTokenizer, XLNetForSequenceClassification
-            import torch
-            tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-            model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-            loss, logits = outputs[:2]
         """
         transformer_outputs = self.transformer(
             input_ids,
@@ -1273,6 +1249,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
         self.init_weights()
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def forward(
         self,
         input_ids=None,
@@ -1316,21 +1293,6 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLNetTokenizer, XLNetForTokenClassification
-            import torch
-            tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-            model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-            scores = outputs[0]
         """
         outputs = self.transformer(
@@ -1386,6 +1348,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
         self.init_weights()
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def forward(
         self,
         input_ids=None,
@@ -1431,22 +1394,6 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLNetTokenizer, XLNetForMultipleChoice
-            import torch
-            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-            model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
-            choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-            input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-            labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-            loss, classification_scores = outputs[:2]
         """
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1508,6 +1455,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
         self.init_weights()
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def forward(
         self,
         input_ids=None,
@@ -1558,22 +1506,6 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLNetTokenizer, XLNetForQuestionAnsweringSimple
-            import torch
-            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-            model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            start_positions = torch.tensor([1])
-            end_positions = torch.tensor([3])
-            outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-            loss = outputs[0]
         """
         outputs = self.transformer(
@@ -1705,20 +1637,20 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-        Examples::
-            from transformers import XLNetTokenizer, XLNetForQuestionAnswering
-            import torch
-            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-            model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            start_positions = torch.tensor([1])
-            end_positions = torch.tensor([3])
-            outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-            loss = outputs[0]
+        Example::
+            >>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
+            >>> import torch
+            >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
+            >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> start_positions = torch.tensor([1])
+            >>> end_positions = torch.tensor([3])
+            >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+            >>> loss = outputs[0]
         """
         transformer_outputs = self.transformer(
             input_ids,
...
@@ -66,13 +66,15 @@ class MBartTokenizer(XLMRobertaTokenizer):
     The tokenization method is <tokens> <eos> <language code>. There is no BOS token.
     Examples::
-        from transformers import MBartTokenizer
-        tokenizer = MBartTokenizer.from_pretrained('mbart-large-en-ro')
-        example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
-        expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-        batch: dict = tokenizer.prepare_translation_batch(
-            example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
-        )
+        >>> from transformers import MBartTokenizer
+        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro')
+        >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
+        >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
+        >>> batch: dict = tokenizer.prepare_translation_batch(
+        ...     example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
+        ... )
     """
     vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
...
@@ -25,13 +25,13 @@ class MarianTokenizer(PreTrainedTokenizer):
     Examples::
-        from transformers import MarianTokenizer
-        tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
-        src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
-        tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
-        batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
-        # keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask].
-        # model(**batch) should work
+        >>> from transformers import MarianTokenizer
+        >>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
+        >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
+        >>> batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
+        >>> # keys [input_ids, attention_mask, decoder_input_ids, decoder_attention_mask].
+        >>> # model(**batch) should work
     """
     vocab_files_names = vocab_files_names
...
@@ -81,6 +81,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
     def __init__(
         self,
...
@@ -94,6 +94,7 @@ class T5Tokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
     def __init__(
         self,
...
@@ -13,52 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import doctest
+import logging
 import os
 import unittest
+from pathlib import Path
 from typing import List, Union
-from .utils import require_tf, require_torch, slow
-
-
-def get_examples_from_file(file):
-    examples = []
-    example = []
-    example_mode = False
-    example_indentation = None
-    for i, line in enumerate(file):
-        if example_mode:
-            current_indentation = len(line) - len(line.strip()) - 1
-            # Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return.
-            empty_line = example_indentation == 0 and len(line) == 1
-            # If we're back to the example indentation or if it's the end of the docstring.
-            if (current_indentation == example_indentation and not empty_line) or '"""' in line:
-                # Exit the example mode and add the example to the examples list
-                example_mode = False
-                example_indentation = None
-                examples.append(example)
-                example = []
-            else:
-                # If line is not empty, add it to the current example
-                if line != "\n":
-                    example.append(line[example_indentation + 4 : -1])
-        # Detect the example from '::' or 'example::'
-        if "example::" in line.lower():
-            example_mode = True
-            example_indentation = line.lower().find("example::")
-        elif "examples::" in line.lower():
-            example_mode = True
-            example_indentation = line.lower().find("examples::")
-        # elif "::" in line.lower() and len(line.strip()) == 2:
-        #     example_mode = True
-        #     example_indentation = line.lower().find("::")
-    examples = ["\n".join(example) for example in examples]
-    examples = [example for example in examples if "not runnable" not in example.lower()]
-    return examples
+import transformers
+
+from .utils import require_tf, require_torch, slow
+
+
+logger = logging.getLogger()
 @require_torch
@@ -66,68 +33,81 @@ def get_examples_from_file(file):
 @slow
 class TestCodeExamples(unittest.TestCase):
     def analyze_directory(
-        self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None
+        self,
+        directory: Path,
+        identifier: Union[str, None] = None,
+        ignore_files: Union[List[str], None] = [],
+        n_identifier: Union[str, None] = None,
+        only_modules: bool = True,
     ):
+        """
+        Runs through the specific directory, looking for the files identified with `identifier`. Executes
+        the doctests in those files
+
+        Args:
+            directory (:obj:`str`): Directory containing the files
+            identifier (:obj:`str`): Will parse files containing this
+            ignore_files (:obj:`List[str]`): List of files to skip
+            n_identifier (:obj:`str` or :obj:`List[str]`): Will not parse files containing this/these identifiers.
+            only_modules (:obj:`bool`): Whether to only analyze modules
+        """
         files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]
         if identifier is not None:
             files = [file for file in files if identifier in file]
-        if ignore_files is not None:
-            files = [file for file in files if file not in ignore_files]
+        if n_identifier is not None:
+            if isinstance(n_identifier, List):
+                for n_ in n_identifier:
+                    files = [file for file in files if n_ not in file]
+            else:
+                files = [file for file in files if n_identifier not in file]
+
+        ignore_files.append("__init__.py")
+        files = [file for file in files if file not in ignore_files]
         for file in files:
             # Open all files
-            print("Testing", file, end=" ")
-            with open(os.path.join(directory, file)) as f:
-                # Retrieve examples
-                examples = get_examples_from_file(f)
-                joined_examples = []
-
-                def execute_example(code_example):
-                    exec(code_example, {})
-
-                # Some examples are the continuation of others.
-                if len(examples) > 0:
-                    joined_examples.append(examples[0])
-                    joined_examples_index = 0
-                    for example in examples[1:]:
-                        # If they contain this line, then they're a continuation of the previous script
-                        if "# Continuation of the previous script" in example:
-                            joined_examples[joined_examples_index] += "\n" + example
-                        # If not, create a new example and increment the index
-                        else:
-                            joined_examples.append(example)
-                            joined_examples_index += 1
-
-                print(str(len(joined_examples)) + "/" + str(len(joined_examples)))
-
-                # Execute sub tests with every example.
-                for index, code_example in enumerate(joined_examples):
-                    with self.subTest(msg=file + " " + str(index) + "/" + str(len(joined_examples)) + code_example):
-                        execute_example(code_example)
-
-    def test_configuration_examples(self):
-        transformers_directory = "src/transformers"
-        configuration_files = "configuration"
-        ignore_files = ["configuration_auto.py", "configuration_utils.py"]
-        self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files)
-
-    def test_main_doc_examples(self):
-        doc_directory = "docs/source"
-        ignore_files = ["favicon.ico"]
-        self.analyze_directory(doc_directory, ignore_files=ignore_files)
+            print("Testing", file)
+
+            if only_modules:
+                try:
+                    module_identifier = file.split(".")[0]
+                    module_identifier = getattr(transformers, module_identifier)
+                    suite = doctest.DocTestSuite(module_identifier)
+                    result = unittest.TextTestRunner().run(suite)
+                    self.assertIs(len(result.failures), 0)
+                except AttributeError:
+                    logger.info(f"{module_identifier} is not a module.")
+            else:
+                result = doctest.testfile(str(".." / directory / file), optionflags=doctest.ELLIPSIS)
+                self.assertIs(result.failed, 0)
     def test_modeling_examples(self):
         transformers_directory = "src/transformers"
-        modeling_files = "modeling"
+        files = "modeling"
         ignore_files = [
-            "modeling_auto.py",
-            "modeling_t5.py",
-            "modeling_tf_auto.py",
-            "modeling_utils.py",
-            "modeling_tf_t5.py",
-            "modeling_bart.py",
-            "modeling_tf_utils.py",
+            "modeling_ctrl.py",
+            "modeling_tf_ctrl.py",
         ]
-        self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)
+        self.analyze_directory(transformers_directory, identifier=files, ignore_files=ignore_files)
+
+    def test_tokenization_examples(self):
+        transformers_directory = Path("src/transformers")
+        files = "tokenization"
+        self.analyze_directory(transformers_directory, identifier=files)
+
+    def test_configuration_examples(self):
+        transformers_directory = Path("src/transformers")
+        files = "configuration"
+        self.analyze_directory(transformers_directory, identifier=files)
+
+    def test_remaining_examples(self):
+        transformers_directory = Path("src/transformers")
+        n_identifiers = ["configuration", "modeling", "tokenization"]
+        self.analyze_directory(transformers_directory, n_identifier=n_identifiers)
+
+    def test_doc_sources(self):
+        doc_source_directory = Path("docs/source")
+        ignore_files = ["favicon.ico"]
+        self.analyze_directory(doc_source_directory, ignore_files=ignore_files, only_modules=False)
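With the examples converted to `>>>`/`...` doctest syntax, the rewritten test above leans on Python's standard `doctest` module instead of the old hand-rolled docstring parser: `doctest.DocTestSuite` for importable modules and `doctest.testfile` for documentation sources. The snippet below is a self-contained illustration of that mechanism only, using a throwaway toy module rather than anything from transformers:

import doctest
import types
import unittest

# Toy module whose docstring carries a doctest-style example (purely illustrative).
toy = types.ModuleType("toy")
toy.__doc__ = """
Example::

    >>> 1 + 1
    2
"""

# DocTestSuite collects every `>>>` example found in the module's docstrings and wraps
# each one as a unittest case, mirroring the `only_modules=True` branch of the test.
suite = doctest.DocTestSuite(toy)
result = unittest.TextTestRunner(verbosity=0).run(suite)
assert len(result.failures) == 0

# For .rst/.txt documentation files, doctest.testfile(path, optionflags=doctest.ELLIPSIS)
# plays the same role (the `only_modules=False` branch); ELLIPSIS lets expected output
# use `...` as a wildcard.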
@@ -31,6 +31,7 @@ if is_tf_available():
         TFXLMWithLMHeadModel,
         TFXLMForSequenceClassification,
         TFXLMForQuestionAnsweringSimple,
+        TFXLMForTokenClassification,
         TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
     )
@@ -219,6 +220,26 @@ class TFXLMModelTester:
         self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
+    def create_and_check_xlm_for_token_classification(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        input_mask,
+    ):
+        config.num_labels = self.num_labels
+        model = TFXLMForTokenClassification(config=config)
+        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+        (logits,) = model(inputs)
+        result = {
+            "logits": logits.numpy(),
+        }
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
         (
@@ -244,7 +265,14 @@ class TFXLMModelTester:
 class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
     all_model_classes = (
-        (TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple)
+        # TODO The multiple choice model is missing and should be added.
+        (
+            TFXLMModel,
+            TFXLMWithLMHeadModel,
+            TFXLMForSequenceClassification,
+            TFXLMForQuestionAnsweringSimple,
+            TFXLMForTokenClassification,
+        )
         if is_tf_available()
         else ()
     )
@@ -275,6 +303,10 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
     @slow
     def test_model_from_pretrained(self):
         for model_name in TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
...