Unverified commit ba8c4d0a authored by Thomas Wolf, committed by GitHub

[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)

* splitting fast and slow tokenizers [WIP]

* [WIP] splitting sentencepiece and tokenizers dependencies

* update dummy objects

* add name_or_path to models and tokenizers

* prefix added to file names

* prefix

* styling + quality

* splitting all the tokenizer files - sorting sentencepiece-based ones

* update tokenizer version up to 0.9.0

* remove hard dependency on sentencepiece 🎉

* and removed hard dependency on tokenizers 🎉



* update conversion script

* update missing models

* fixing tests

* move test_tokenization_fast to main tokenization tests - fix bugs

* bump up tokenizers

* fix bert_generation

* update and fix several tokenizers

* keep sentencepiece in deps for now

* fix funnel and deberta tests

* fix fsmt

* fix marian tests

* fix layoutlm

* fix squeezebert and gpt2

* fix T5 tokenization

* fix xlnet tests

* style

* fix mbart

* bump up tokenizers to 0.9.2

* fix model tests

* fix tf models

* fix seq2seq examples

* fix tests without sentencepiece

* fix slow => fast  conversion without sentencepiece

* update auto and bert generation tests

* fix mbart tests

* fix auto and common test without tokenizers

* fix tests without tokenizers

* clean up and lighten tests when tokenizers + sentencepiece are both off

* style quality and tests fixing

* add sentencepiece to doc/examples reqs

* leave sentencepiece on for now

* style, quality, split herbert and fix pegasus

* WIP Herbert fast

* add sample_text_no_unicode and fix herbert tokenization

* skip FSMT example test for now

* fix style

* fix fsmt in example tests

* update following Lysandre and Sylvain's comments

* Update src/transformers/testing_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/testing_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent c65863ce
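Note on the pattern used throughout the diff below: tests that need one of the now-optional backends are gated with the @require_tokenizers and @require_sentencepiece decorators imported from transformers.testing_utils. As a rough illustration only (a minimal sketch, not the repository's actual implementation), such skip decorators can be built from an availability check plus unittest.skipUnless:

    import importlib.util
    import unittest

    # Availability flags: True only when the optional backend can be imported.
    _sentencepiece_available = importlib.util.find_spec("sentencepiece") is not None
    _tokenizers_available = importlib.util.find_spec("tokenizers") is not None


    def require_sentencepiece(test_case):
        # Skip the decorated test (or test class) when sentencepiece is not installed.
        return unittest.skipUnless(_sentencepiece_available, "test requires SentencePiece")(test_case)


    def require_tokenizers(test_case):
        # Skip the decorated test (or test class) when the tokenizers package is not installed.
        return unittest.skipUnless(_tokenizers_available, "test requires tokenizers")(test_case)

Applied to a TestCase subclass, as in the hunks below, the whole class is skipped when the backend is absent instead of failing at import time.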
@@ -14,8 +14,7 @@
 # limitations under the License.
-from transformers.testing_utils import slow
-from transformers.tokenization_dpr import (
+from transformers import (
     DPRContextEncoderTokenizer,
     DPRContextEncoderTokenizerFast,
     DPRQuestionEncoderTokenizer,
@@ -24,11 +23,13 @@ from transformers.tokenization_dpr import (
     DPRReaderTokenizer,
     DPRReaderTokenizerFast,
 )
+from transformers.testing_utils import require_tokenizers, slow
 from transformers.tokenization_utils_base import BatchEncoding
 from .test_tokenization_bert import BertTokenizationTest
+@require_tokenizers
 class DPRContextEncoderTokenizationTest(BertTokenizationTest):
     tokenizer_class = DPRContextEncoderTokenizer
@@ -36,6 +37,7 @@ class DPRContextEncoderTokenizationTest(BertTokenizationTest):
     test_rust_tokenizer = True
+@require_tokenizers
 class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
     tokenizer_class = DPRQuestionEncoderTokenizer
@@ -43,6 +45,7 @@ class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
     test_rust_tokenizer = True
+@require_tokenizers
 class DPRReaderTokenizationTest(BertTokenizationTest):
     tokenizer_class = DPRReaderTokenizer
...
import logging
import shutil
import tempfile
import unittest
from collections import namedtuple
from itertools import takewhile
from transformers import (
AlbertTokenizer,
AlbertTokenizerFast,
BartTokenizer,
BartTokenizerFast,
BertTokenizer,
BertTokenizerFast,
CamembertTokenizer,
CamembertTokenizerFast,
DistilBertTokenizer,
DistilBertTokenizerFast,
DPRContextEncoderTokenizer,
DPRContextEncoderTokenizerFast,
DPRQuestionEncoderTokenizer,
DPRQuestionEncoderTokenizerFast,
DPRReaderTokenizer,
DPRReaderTokenizerFast,
FunnelTokenizer,
FunnelTokenizerFast,
GPT2Tokenizer,
GPT2TokenizerFast,
LxmertTokenizer,
LxmertTokenizerFast,
MBartTokenizer,
MBartTokenizerFast,
OpenAIGPTTokenizer,
OpenAIGPTTokenizerFast,
PegasusTokenizer,
PegasusTokenizerFast,
ReformerTokenizer,
ReformerTokenizerFast,
RobertaTokenizer,
RobertaTokenizerFast,
T5Tokenizer,
T5TokenizerFast,
XLMRobertaTokenizer,
XLMRobertaTokenizerFast,
XLNetTokenizer,
XLNetTokenizerFast,
is_torch_available,
)
from transformers.testing_utils import get_tests_dir
logger = logging.getLogger(__name__)
NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]
Tokenizer = namedtuple("Tokenizer", ["name", "rust_cls", "python_cls", "vocab_key", "filter", "kwargs"])
def filter_non_english(_: Tokenizer, pretrained_name: str):
""" Filter all the model for non-english language """
return not any([lang in pretrained_name for lang in NON_ENGLISH_TAGS])
def filter_roberta_detectors(_: Tokenizer, pretrained_name: str):
return "detector" not in pretrained_name
class CommonFastTokenizerTest(unittest.TestCase):
TOKENIZERS_CLASSES = frozenset([])
def setUp(self) -> None:
# Tokenizer.filter makes it possible to filter which Tokenizer to test based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name)
self.tokenizers_list = [
(tok_case, pretrained_name, dict(t for t in tok_case.kwargs) if tok_case.kwargs else {})
for tok_case in self.TOKENIZERS_CLASSES
for pretrained_name in tok_case.python_cls.pretrained_vocab_files_map[tok_case.vocab_key].keys()
if tok_case.filter is None or (tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name))
]
with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
self._data = f_data.read().replace("\n\n", "\n").strip()
self.tmpdirname = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_is_fast(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Check is_fast is set correctly
self.assertFalse(tokenizer_p.is_fast)
self.assertTrue(tokenizer_r.is_fast)
def test_fast_only_inputs(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
# Ensure None raises an error
self.assertRaises(TypeError, tokenizer_r.tokenize, None)
self.assertRaises(TypeError, tokenizer_r.encode, None)
self.assertRaises(TypeError, tokenizer_r.encode_plus, None)
self.assertRaises(TypeError, tokenizer_r.batch_encode_plus, None)
def test_alignement_methods(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
text = " ".join(words)
batch_size = 3
encoding = tokenizer_r.encode_plus(text, add_special_tokens=False)
batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False)
num_tokens = len(encoding["input_ids"])
last_word_index = len(words) - 1
last_token_index = num_tokens - 1
last_batch_index = batch_size - 1
last_char_index = len(text) - 1
# words, tokens
self.assertEqual(len(encoding.words(0)), num_tokens)
self.assertEqual(max(encoding.words(0)), last_word_index)
self.assertEqual(min(encoding.words(0)), 0)
self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens)
self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index)
self.assertEqual(min(batch_encoding.words(last_batch_index)), 0)
self.assertEqual(len(encoding.tokens(0)), num_tokens)
# Assert token_to_word
self.assertEqual(encoding.token_to_word(0), 0)
self.assertEqual(encoding.token_to_word(0, 0), 0)
self.assertEqual(encoding.token_to_word(last_token_index), last_word_index)
self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index)
self.assertEqual(batch_encoding.token_to_word(1, 0), 0)
self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index)
self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index)
# Assert word_to_tokens
self.assertEqual(encoding.word_to_tokens(0).start, 0)
self.assertEqual(encoding.word_to_tokens(0, 0).start, 0)
self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1)
self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0)
self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
self.assertEqual(
batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 1
)
# Assert token_to_chars
self.assertEqual(encoding.token_to_chars(0).start, 0)
self.assertEqual(encoding.token_to_chars(0, 0).start, 0)
self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1)
self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0)
self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
self.assertEqual(
batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1
)
# Assert char_to_token
self.assertEqual(encoding.char_to_token(0), 0)
self.assertEqual(encoding.char_to_token(0, 0), 0)
self.assertEqual(encoding.char_to_token(last_char_index), last_token_index)
self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index)
self.assertEqual(batch_encoding.char_to_token(1, 0), 0)
self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index)
self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index)
# Assert char_to_word
self.assertEqual(encoding.char_to_word(0), 0)
self.assertEqual(encoding.char_to_word(0, 0), 0)
self.assertEqual(encoding.char_to_word(last_char_index), last_word_index)
self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.char_to_word(1, 0), 0)
self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index)
# Assert word_to_chars
self.assertEqual(encoding.word_to_chars(0).start, 0)
self.assertEqual(encoding.word_to_chars(0, 0).start, 0)
self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1)
self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
self.assertEqual(batch_encoding.word_to_chars(1, 0).start, 0)
self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
self.assertEqual(
batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1
)
def test_tokenization_python_rust_equals(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Ensure basic input match
input_p = tokenizer_p.encode_plus(self._data)
input_r = tokenizer_r.encode_plus(self._data)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_p[key], input_r[key])
input_pairs_p = tokenizer_p.encode_plus(self._data, self._data)
input_pairs_r = tokenizer_r.encode_plus(self._data, self._data)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
# Ensure truncation match
input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True)
input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_p[key], input_r[key])
# Ensure truncation with stride match
input_p = tokenizer_p.encode_plus(
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
)
input_r = tokenizer_r.encode_plus(
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_p[key], input_r[key][0])
def test_num_special_tokens_to_add_equal(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Check we have the same number of added_tokens for both pair and non-pair inputs.
self.assertEqual(
tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False)
)
self.assertEqual(
tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True)
)
def test_max_length_equal(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Check we have the correct max_length for both pair and non-pair inputs.
self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)
def test_special_tokens_map_equal(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Assert the set of special tokens match.
self.assertSequenceEqual(
tokenizer_p.special_tokens_map.items(),
tokenizer_r.special_tokens_map.items(),
)
def test_add_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
vocab_size = len(tokenizer_r)
self.assertEqual(tokenizer_r.add_tokens(""), 0)
self.assertEqual(tokenizer_r.add_tokens("testoken"), 1)
self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2)
self.assertEqual(len(tokenizer_r), vocab_size + 3)
self.assertEqual(tokenizer_r.add_special_tokens({}), 0)
self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2)
self.assertRaises(
AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}
)
self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1)
self.assertEqual(
tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2
)
self.assertEqual(len(tokenizer_r), vocab_size + 8)
def test_offsets_mapping(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
text = "Wonderful no inspiration example with subtoken"
pair = "Along with an awesome pair"
# No pair
tokens_with_offsets = tokenizer_r.encode_plus(
text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
)
added_tokens = tokenizer_r.num_special_tokens_to_add(False)
offsets = tokens_with_offsets["offset_mapping"]
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
# Assert there are only added_tokens special tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
# Pairs
tokens_with_offsets = tokenizer_r.encode_plus(
text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
)
added_tokens = tokenizer_r.num_special_tokens_to_add(True)
offsets = tokens_with_offsets["offset_mapping"]
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
# Assert there are only added_tokens special tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
def test_batch_encode_dynamic_overflowing(self):
"""
When calling batch_encode with multiple sequences, it can return a different number of
overflowing encodings for each sequence:
[
Sequence 1: [Encoding 1, Encoding 2],
Sequence 2: [Encoding 1],
Sequence 3: [Encoding 1, Encoding 2, ... Encoding N]
]
This needs to be padded so that it can be represented as a tensor
"""
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
tokenizer = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
with self.subTest("{} ({}, {})".format(tok_case.name, pretrained_name, tokenizer.__class__.__name__)):
returned_tensor = "pt" if is_torch_available() else "tf"
if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
return
tokens = tokenizer.encode_plus(
"HuggingFace is solving NLP one commit at a time",
max_length=6,
padding=True,
truncation=True,
return_tensors=returned_tensor,
return_overflowing_tokens=True,
)
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
self.assertEqual(len(tokens[key].shape), 2)
# Mono sample
tokens = tokenizer.batch_encode_plus(
["HuggingFace is solving NLP one commit at a time"],
max_length=6,
padding=True,
truncation="only_first",
return_tensors=returned_tensor,
return_overflowing_tokens=True,
)
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
self.assertEqual(len(tokens[key].shape), 2)
self.assertEqual(tokens[key].shape[-1], 6)
# Multi sample
tokens = tokenizer.batch_encode_plus(
["HuggingFace is solving NLP one commit at a time", "Very tiny input"],
max_length=6,
padding=True,
truncation="only_first",
return_tensors=returned_tensor,
return_overflowing_tokens=True,
)
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
self.assertEqual(len(tokens[key].shape), 2)
self.assertEqual(tokens[key].shape[-1], 6)
def test_pretokenized_inputs(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Input string
pretokenized_input_simple = "This is a sample input".split()
pretokenized_input_pair = "This is a sample pair".split()
# Test encode for pretokenized inputs
output_r = tokenizer_r.encode(
pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
)
output_p = tokenizer_p.encode(
pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
)
self.assertEqual(output_p, output_r)
kwargs = {
"is_split_into_words": True,
# "return_token_type_ids": True, # Use the defaults for each tokenizers
# "return_attention_mask": True, # Use the defaults for each tokenizers
"return_overflowing_tokens": False,
"return_special_tokens_mask": True,
"return_offsets_mapping": False, # Not implemented in python tokenizers
# "add_special_tokens": False,
}
batch_kwargs = {
"is_split_into_words": True,
# "return_token_type_ids": True, # Use the defaults for each tokenizers
# "return_attention_mask": True, # Use the defaults for each tokenizers
"return_overflowing_tokens": False,
"return_special_tokens_mask": True,
"return_offsets_mapping": False, # Not implemented in python tokenizers
# "add_special_tokens": False,
}
# Test encode_plus for pretokenized inputs
output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs)
output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
# Test batch_encode_plus for pretokenized inputs
input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair]
output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs)
output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
# Test encode for pretokenized inputs pairs
output_r = tokenizer_r.encode(
pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
)
output_p = tokenizer_p.encode(
pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
)
self.assertEqual(output_p, output_r)
# Test encode_plus for pretokenized inputs
output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
# Test batch_encode_plus for pretokenized inputs
input_batch_pair = ([pretokenized_input_simple, pretokenized_input_pair] * 2) + [
pretokenized_input_simple + pretokenized_input_pair,
pretokenized_input_pair,
]
output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs)
output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
def test_create_token_type_ids(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
input_simple = [1, 2, 3]
input_pair = [1, 2, 3]
# Generate output
output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple)
output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple)
self.assertEqual(output_p, output_r)
# Generate pair output
output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple, input_pair)
output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple, input_pair)
self.assertEqual(output_p, output_r)
def test_build_inputs_with_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# # Input string
# input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False)
# input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False)
# # Generate output
# output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
# output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
# self.assertEqual(output_p, output_r)
# # Generate pair output
# output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
# output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
# self.assertEqual(output_p, output_r)
# Input tokens id
input_simple = tokenizer_p.encode("This is a sample input", add_special_tokens=False)
input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False)
# Generate output
output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
self.assertEqual(output_p, output_r)
# Generate pair output
output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
self.assertEqual(output_p, output_r)
def test_padding(self, max_length=50):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
def assert_padded_input_match(input_r: list, input_p: list, max_length: int):
# Ensure we match max_length
self.assertEqual(len(input_r), max_length)
self.assertEqual(len(input_p), max_length)
# Ensure the number of padded tokens is the same
padded_tokens_r = list(takewhile(lambda i: i == tokenizer_r.pad_token_id, reversed(input_r)))
padded_tokens_p = list(takewhile(lambda i: i == tokenizer_p.pad_token_id, reversed(input_p)))
self.assertSequenceEqual(padded_tokens_r, padded_tokens_p)
def assert_batch_padded_input_match(input_r: dict, input_p: dict, max_length: int):
for i_r in input_r.values():
self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual(
len(i_r[1]), max_length
)
for i_r, i_p in zip(input_r["input_ids"], input_p["input_ids"]):
assert_padded_input_match(i_r, i_p, max_length)
for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]):
self.assertSequenceEqual(i_r, i_p)
# Encode - Simple input
input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode("This is a simple input", padding="longest")
input_p = tokenizer_p.encode("This is a simple input", padding=True)
assert_padded_input_match(input_r, input_p, len(input_r))
# Encode - Pair input
input_r = tokenizer_r.encode(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
input_p = tokenizer_p.encode(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True)
input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest")
assert_padded_input_match(input_r, input_p, len(input_r))
# Encode_plus - Simple input
input_r = tokenizer_r.encode_plus(
"This is a simple input", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", max_length=max_length, pad_to_max_length=True
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
"This is a simple input", max_length=max_length, padding="max_length"
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", max_length=max_length, padding="max_length"
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest")
input_p = tokenizer_p.encode_plus("This is a simple input", padding=True)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
# Encode_plus - Pair input
input_r = tokenizer_r.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest")
input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
# Batch_encode_plus - Simple input
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
pad_to_max_length=True,
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
pad_to_max_length=True,
)
assert_batch_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding="max_length",
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding="max_length",
)
assert_batch_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding="longest",
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding=True,
)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"], padding="longest"
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"], padding=True
)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
# Batch_encode_plus - Pair input
input_r = tokenizer_r.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
max_length=max_length,
truncation=True,
padding="max_length",
)
input_p = tokenizer_p.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
max_length=max_length,
truncation=True,
padding="max_length",
)
assert_batch_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
padding=True,
)
input_p = tokenizer_p.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
padding="longest",
)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
# Using pad on single examples after tokenization
input_r = tokenizer_r.encode_plus("This is a input 1")
input_r = tokenizer_r.pad(input_r)
input_p = tokenizer_r.encode_plus("This is a input 1")
input_p = tokenizer_r.pad(input_p)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
# Using pad on single examples after tokenization
input_r = tokenizer_r.encode_plus("This is a input 1")
input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
input_p = tokenizer_r.encode_plus("This is a input 1")
input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
# Using pad after tokenization
input_r = tokenizer_r.batch_encode_plus(
["This is a input 1", "This is a much longer input whilch should be padded"]
)
input_r = tokenizer_r.pad(input_r)
input_p = tokenizer_r.batch_encode_plus(
["This is a input 1", "This is a much longer input whilch should be padded"]
)
input_p = tokenizer_r.pad(input_p)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
# Using pad after tokenization
input_r = tokenizer_r.batch_encode_plus(
["This is a input 1", "This is a much longer input whilch should be padded"]
)
input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
input_p = tokenizer_r.batch_encode_plus(
["This is a input 1", "This is a much longer input whilch should be padded"]
)
input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")
assert_batch_padded_input_match(input_r, input_p, max_length)
def test_save_pretrained(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Check it saves with the same files
self.assertSequenceEqual(
tokenizer_r.save_vocabulary(self.tmpdirname), tokenizer_p.save_vocabulary(self.tmpdirname)
)
# Checks everything loads correctly in the same way
tokenizer_rp, tokenizer_pp = tokenizer_r.from_pretrained(self.tmpdirname), tokenizer_p.from_pretrained(
self.tmpdirname
)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
def test_embeded_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(
sentence,
add_special_tokens=True,
)
tokens_p = tokenizer_p.encode_plus(
sentence,
add_special_tokens=True,
)
for key in tokens_p.keys():
self.assertEqual(tokens_r[key], tokens_p[key])
if "token_type_ids" in tokens_r:
self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
self.assertSequenceEqual(tokens_r, tokens_p)
def test_add_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
# pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True)
for text in ["", " "]:
# tokenize()
no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True)
self.assertEqual(
len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
)
# encode()
no_special_tokens = tokenizer_r.encode(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode(text, add_special_tokens=True)
self.assertEqual(
len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
)
# encode_plus()
no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True)
for key in no_special_tokens.keys():
self.assertEqual(
len(no_special_tokens[key]),
len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
)
# # batch_encode_plus
no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False)
with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True)
for key in no_special_tokens.keys():
for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
def test_prepare_for_model(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
string_sequence = "Asserting that both tokenizers are equal"
python_output = tokenizer_p.prepare_for_model(
tokenizer_p.encode(string_sequence, add_special_tokens=False)
)
rust_output = tokenizer_r.prepare_for_model(
tokenizer_r.encode(string_sequence, add_special_tokens=False)
)
for key in python_output:
self.assertEqual(python_output[key], rust_output[key])
class WordPieceFastTokenizerTest(CommonFastTokenizerTest):
"""
Override all the specific methods to test WordPiece behavior
"""
TOKENIZERS_CLASSES = frozenset(
[
Tokenizer("Bert", BertTokenizerFast, BertTokenizer, "vocab_file", filter_non_english, None),
Tokenizer(
"DistilBert", DistilBertTokenizerFast, DistilBertTokenizer, "vocab_file", filter_non_english, None
),
Tokenizer(
"DPRReaderTokenizer",
DPRReaderTokenizerFast,
DPRReaderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer(
"DPRQuestionEncoderTokenizer",
DPRQuestionEncoderTokenizerFast,
DPRQuestionEncoderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer(
"DPRContextEncoderTokenizer",
DPRContextEncoderTokenizerFast,
DPRContextEncoderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer("FunnelTokenizer", FunnelTokenizerFast, FunnelTokenizer, "vocab_file", filter_non_english, None),
Tokenizer("LxmertTokenizer", LxmertTokenizerFast, LxmertTokenizer, "vocab_file", filter_non_english, None),
]
)
def test_offsets_with_special_characters(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus(
sentence,
return_attention_mask=False,
return_token_type_ids=False,
return_offsets_mapping=True,
add_special_tokens=True,
)
do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
expected_results = (
[
((0, 0), tokenizer_r.cls_token),
((0, 1), "A"),
((1, 2), ","),
((3, 5), "na"),
((5, 6), "##ï"),
((6, 8), "##ve"),
((9, 15), tokenizer_r.mask_token),
((16, 21), "Allen"),
((21, 23), "##NL"),
((23, 24), "##P"),
((25, 33), "sentence"),
((33, 34), "."),
((0, 0), tokenizer_r.sep_token),
]
if not do_lower_case
else [
((0, 0), tokenizer_r.cls_token),
((0, 1), "a"),
((1, 2), ","),
((3, 8), "naive"),
((9, 15), tokenizer_r.mask_token),
((16, 21), "allen"),
((21, 23), "##nl"),
((23, 24), "##p"),
((25, 33), "sentence"),
((33, 34), "."),
((0, 0), tokenizer_r.sep_token),
]
)
self.assertEqual(
[e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
)
self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
class RobertaFastTokenizerTest(CommonFastTokenizerTest):
TOKENIZERS_CLASSES = frozenset(
[
Tokenizer(
"Roberta",
RobertaTokenizerFast,
RobertaTokenizer,
"vocab_file",
filter_roberta_detectors,
(("cls_token", "<s>"),),
),
Tokenizer(
"Bart",
BartTokenizerFast,
BartTokenizer,
"vocab_file",
None,
None,
),
]
)
def test_pretokenized_inputs(self):
pass
def test_embeded_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
# token_type_ids should put 0 everywhere
self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
# attention_mask should be 1 everywhere, so the sum divided by the length should be 1
self.assertEqual(
sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
)
tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
# Rust correctly handles the space before the mask while Python doesn't
self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
self.assertSequenceEqual(
tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)
self.assertSequenceEqual(
tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)
class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
TOKENIZERS_CLASSES = [
Tokenizer("OpenAI GPT", OpenAIGPTTokenizerFast, OpenAIGPTTokenizer, "vocab_file", None, None),
Tokenizer("GPT2", GPT2TokenizerFast, GPT2Tokenizer, "vocab_file", None, [("add_prefix_space", True)]),
]
def test_pretokenized_inputs(self):
pass
def test_padding(self, max_length=15):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
# Simple input
s = "This is a simple input"
s2 = ["This is a simple input 1", "This is a simple input 2"]
p = ("This is a simple input", "This is a pair")
p2 = [
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
]
# Simple input tests
self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
s2,
max_length=max_length,
padding="max_length",
)
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
p2,
max_length=max_length,
padding="max_length",
)
class SentencePieceFastTokenizerTest(CommonFastTokenizerTest):
"""
Override specific methods to test SentencePiece behavior
"""
TOKENIZERS_CLASSES = frozenset(
[
Tokenizer("Albert", AlbertTokenizerFast, AlbertTokenizer, "vocab_file", None, None),
Tokenizer("Camembert", CamembertTokenizerFast, CamembertTokenizer, "vocab_file", None, None),
Tokenizer("T5", T5TokenizerFast, T5Tokenizer, "vocab_file", None, None),
Tokenizer(
"MBart",
MBartTokenizerFast,
MBartTokenizer,
"vocab_file",
None,
None,
),
Tokenizer("Pegasus", PegasusTokenizerFast, PegasusTokenizer, "vocab_file", None, None),
Tokenizer("Reformer", ReformerTokenizerFast, ReformerTokenizer, "vocab_file", None, None),
Tokenizer("XLMRoberta", XLMRobertaTokenizerFast, XLMRobertaTokenizer, "vocab_file", None, None),
Tokenizer("XLNet", XLNetTokenizerFast, XLNetTokenizer, "vocab_file", None, None),
]
)
@@ -17,14 +17,18 @@
 import os
 import unittest
-from transformers.tokenization_funnel import VOCAB_FILES_NAMES, FunnelTokenizer, FunnelTokenizerFast
+from transformers import FunnelTokenizer, FunnelTokenizerFast
+from transformers.testing_utils import require_tokenizers
+from transformers.tokenization_funnel import VOCAB_FILES_NAMES
 from .test_tokenization_common import TokenizerTesterMixin
+@require_tokenizers
 class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = FunnelTokenizer
+    rust_tokenizer_class = FunnelTokenizerFast
     test_rust_tokenizer = True
     space_between_special_tokens = True
...
@@ -18,16 +18,20 @@ import json
 import os
 import unittest
-from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES, GPT2Tokenizer, GPT2TokenizerFast
+from transformers import GPT2Tokenizer, GPT2TokenizerFast
+from transformers.testing_utils import require_tokenizers
+from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES
 from .test_tokenization_common import TokenizerTesterMixin
+@require_tokenizers
 class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = GPT2Tokenizer
     rust_tokenizer_class = GPT2TokenizerFast
     test_rust_tokenizer = True
+    from_pretrained_kwargs = {"add_prefix_space": True}
     def setUp(self):
         super().setUp()
@@ -125,3 +129,47 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         # It's very difficult to mix/test pretokenization with byte-level
         # And get both GPT2 and Roberta to work at the same time (mostly an issue of adding a space before the string)
         pass
+    def test_padding(self, max_length=15):
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                # Simple input
+                s = "This is a simple input"
+                s2 = ["This is a simple input 1", "This is a simple input 2"]
+                p = ("This is a simple input", "This is a pair")
+                p2 = [
+                    ("This is a simple input 1", "This is a simple input 2"),
+                    ("This is a simple pair 1", "This is a simple pair 2"),
+                ]
+                # Simple input tests
+                self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
+                # Simple input
+                self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
+                # Simple input
+                self.assertRaises(
+                    ValueError,
+                    tokenizer_r.batch_encode_plus,
+                    s2,
+                    max_length=max_length,
+                    padding="max_length",
+                )
+                # Pair input
+                self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
+                # Pair input
+                self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
+                # Pair input
+                self.assertRaises(
+                    ValueError,
+                    tokenizer_r.batch_encode_plus,
+                    p2,
+                    max_length=max_length,
+                    padding="max_length",
+                )
@@ -18,12 +18,14 @@ import json
 import os
 import unittest
-from transformers.testing_utils import slow
-from transformers.tokenization_herbert import VOCAB_FILES_NAMES, HerbertTokenizer, HerbertTokenizerFast
+from transformers import HerbertTokenizer, HerbertTokenizerFast
+from transformers.testing_utils import get_tests_dir, require_tokenizers, slow
+from transformers.tokenization_herbert import VOCAB_FILES_NAMES
 from .test_tokenization_common import TokenizerTesterMixin
+@require_tokenizers
 class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = HerbertTokenizer
@@ -33,6 +35,10 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def setUp(self):
         super().setUp()
+        # Use a simpler test file without japanese/chinese characters
+        with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data:
+            self._data = f_data.read().replace("\n\n", "\n").strip()
         vocab = [
             "<s>",
             "</s>",
...
@@ -17,14 +17,20 @@
 import os
 import unittest
-from transformers.tokenization_layoutlm import VOCAB_FILES_NAMES, LayoutLMTokenizer
+from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast
+from transformers.testing_utils import require_tokenizers
+from transformers.tokenization_layoutlm import VOCAB_FILES_NAMES
 from .test_tokenization_common import TokenizerTesterMixin
+@require_tokenizers
 class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = LayoutLMTokenizer
+    rust_tokenizer_class = LayoutLMTokenizerFast
+    test_rust_tokenizer = True
+    space_between_special_tokens = True
     def setUp(self):
         super().setUp()
...
@@ -17,12 +17,14 @@
 import os
 import unittest
+from transformers import LxmertTokenizer, LxmertTokenizerFast
+from transformers.testing_utils import require_tokenizers
 from transformers.tokenization_bert import VOCAB_FILES_NAMES
-from transformers.tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
 from .test_tokenization_common import TokenizerTesterMixin
+@require_tokenizers
 class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = LxmertTokenizer
...
@@ -20,9 +20,12 @@ import unittest
 from pathlib import Path
 from shutil import copyfile
-from transformers.testing_utils import _torch_available
-from transformers.tokenization_marian import MarianTokenizer, save_json, vocab_files_names
-from transformers.tokenization_utils import BatchEncoding
+from transformers import BatchEncoding, MarianTokenizer
+from transformers.testing_utils import _sentencepiece_available, _torch_available, require_sentencepiece
+if _sentencepiece_available:
+    from transformers.tokenization_marian import save_json, vocab_files_names
 from .test_tokenization_common import TokenizerTesterMixin
@@ -35,6 +38,7 @@ ORG_NAME = "Helsinki-NLP/"
 FRAMEWORK = "pt" if _torch_available else "tf"
+@require_sentencepiece
 class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = MarianTokenizer
...
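The Marian hunk above guards SentencePiece-only helpers behind _sentencepiece_available so the test module still imports when the backend is missing. The commit message's "update dummy objects" refers to the same idea inside the library: public names stay importable and only fail when actually used. A minimal sketch of that fallback pattern, with hypothetical placeholder definitions (not the repository's real dummy module):

    class DummyObject(type):
        # Metaclass for placeholder classes: any attempt to instantiate them raises
        # an informative error instead of a bare ImportError at import time.
        def __call__(cls, *args, **kwargs):
            raise ImportError(
                f"{cls.__name__} requires the SentencePiece library. "
                "Install it with `pip install sentencepiece`."
            )


    class MarianTokenizer(metaclass=DummyObject):
        # Placeholder exported when sentencepiece is unavailable, so
        # `from transformers import MarianTokenizer` keeps working.
        pass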
 import tempfile
 import unittest
-from transformers import AutoTokenizer, BatchEncoding, MBartTokenizer, MBartTokenizerFast, is_torch_available
-from transformers.testing_utils import require_torch
+from transformers import (
+    SPIECE_UNDERLINE,
+    AutoTokenizer,
+    BatchEncoding,
+    MBartTokenizer,
+    MBartTokenizerFast,
+    is_torch_available,
+)
+from transformers.testing_utils import (
+    _sentencepiece_available,
+    require_sentencepiece,
+    require_tokenizers,
+    require_torch,
+)
 from .test_tokenization_common import TokenizerTesterMixin
-from .test_tokenization_xlm_roberta import SAMPLE_VOCAB, SPIECE_UNDERLINE
+if _sentencepiece_available:
+    from .test_tokenization_xlm_roberta import SAMPLE_VOCAB
 if is_torch_available():
@@ -15,6 +30,8 @@ EN_CODE = 250004
 RO_CODE = 250020
+@require_sentencepiece
+@require_tokenizers
 class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = MBartTokenizer
     rust_tokenizer_class = MBartTokenizerFast
@@ -105,6 +122,8 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 @require_torch
+@require_sentencepiece
+@require_tokenizers
 class MBartEnroIntegrationTest(unittest.TestCase):
     checkpoint_name = "facebook/mbart-large-en-ro"
     src_text = [
...
...@@ -18,11 +18,14 @@ import json ...@@ -18,11 +18,14 @@ import json
import os import os
import unittest import unittest
from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer, OpenAIGPTTokenizerFast from transformers import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
from transformers.testing_utils import require_tokenizers
from transformers.tokenization_openai import VOCAB_FILES_NAMES
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = OpenAIGPTTokenizer tokenizer_class = OpenAIGPTTokenizer
...@@ -80,3 +83,47 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -80,3 +83,47 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
input_tokens = tokens + ["<unk>"] input_tokens = tokens + ["<unk>"]
input_bpe_tokens = [14, 15, 20] input_bpe_tokens = [14, 15, 20]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Simple input
s = "This is a simple input"
s2 = ["This is a simple input 1", "This is a simple input 2"]
p = ("This is a simple input", "This is a pair")
p2 = [
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
]
# Simple input tests
self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
s2,
max_length=max_length,
padding="max_length",
)
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
p2,
max_length=max_length,
padding="max_length",
)
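The new test_padding asserts that the fast OpenAI GPT tokenizer, which ships without a padding token, raises ValueError for padding="max_length" on single, batched, and pair inputs rather than padding silently. A simplified sketch of the rule being exercised; the error text is illustrative, not the exact transformers message:

# Padding to a fixed length is only possible when a pad token id exists (simplified).
from typing import List, Optional

def pad_to_max_length(ids: List[int], max_length: int, pad_token_id: Optional[int]) -> List[int]:
    if pad_token_id is None:
        raise ValueError("Asking to pad, but the tokenizer does not have a padding token.")
    return ids + [pad_token_id] * (max_length - len(ids))

pad_to_max_length([14, 15, 20], max_length=5, pad_token_id=0)  # -> [14, 15, 20, 0, 0]
# pad_to_max_length([14, 15, 20], max_length=5, pad_token_id=None) would raise ValueError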
import unittest import unittest
from transformers import PegasusTokenizer, PegasusTokenizerFast
from transformers.file_utils import cached_property from transformers.file_utils import cached_property
from transformers.testing_utils import get_tests_dir, require_torch from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch
from transformers.tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
...@@ -10,6 +10,8 @@ from .test_tokenization_common import TokenizerTesterMixin ...@@ -10,6 +10,8 @@ from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model") SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")
@require_sentencepiece
@require_tokenizers
class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = PegasusTokenizer tokenizer_class = PegasusTokenizer
......
...@@ -17,9 +17,9 @@ ...@@ -17,9 +17,9 @@
import os import os
import unittest import unittest
from transformers import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast
from transformers.file_utils import cached_property from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch, slow from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow
from transformers.tokenization_reformer import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
...@@ -27,6 +27,8 @@ from .test_tokenization_common import TokenizerTesterMixin ...@@ -27,6 +27,8 @@ from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
@require_sentencepiece
@require_tokenizers
class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = ReformerTokenizer tokenizer_class = ReformerTokenizer
......
...@@ -18,16 +18,19 @@ import json ...@@ -18,16 +18,19 @@ import json
import os import os
import unittest import unittest
from transformers.testing_utils import slow from transformers import AddedToken, RobertaTokenizer, RobertaTokenizerFast
from transformers.tokenization_roberta import VOCAB_FILES_NAMES, AddedToken, RobertaTokenizer, RobertaTokenizerFast from transformers.testing_utils import require_tokenizers, slow
from transformers.tokenization_roberta import VOCAB_FILES_NAMES
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = RobertaTokenizer tokenizer_class = RobertaTokenizer
rust_tokenizer_class = RobertaTokenizerFast rust_tokenizer_class = RobertaTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
from_pretrained_kwargs = {"cls_token": "<s>"}
def setUp(self): def setUp(self):
super().setUp() super().setUp()
...@@ -158,3 +161,38 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -158,3 +161,38 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
mask_loc = encoded.index(mask_ind) mask_loc = encoded.index(mask_ind)
first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0] first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
self.assertNotEqual(first_char, space_encoding) self.assertNotEqual(first_char, space_encoding)
def test_pretokenized_inputs(self):
pass
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
# token_type_ids should put 0 everywhere
self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
# attention_mask should put 1 everywhere, so its average over the sequence length should be 1
self.assertEqual(
sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
)
tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
# The Rust tokenizer correctly handles the space before the mask, while the Python one doesn't
self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
self.assertSequenceEqual(
tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)
self.assertSequenceEqual(
tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)
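The Ġ prefix asserted in the expected token strings comes from the byte-level BPE used by GPT-2 and RoBERTa: raw bytes are remapped to printable code points so that a leading space stays visible in the token text. A simplified illustration for the space byte only (the real mapping covers all 256 byte values):

# In the byte-level vocabulary, the space byte (0x20) is shifted past 0xFF and shows up as "Ġ".
def space_marker() -> str:
    return chr(0x20 + 256)  # code point 0x120, i.e. "Ġ"

assert space_marker() == "Ġ"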
...@@ -14,15 +14,18 @@ ...@@ -14,15 +14,18 @@
# limitations under the License. # limitations under the License.
from transformers.testing_utils import slow from transformers import SqueezeBertTokenizer, SqueezeBertTokenizerFast
from transformers.tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast from transformers.testing_utils import require_tokenizers, slow
from .test_tokenization_bert import BertTokenizationTest from .test_tokenization_bert import BertTokenizationTest
@require_tokenizers
class SqueezeBertTokenizationTest(BertTokenizationTest): class SqueezeBertTokenizationTest(BertTokenizationTest):
tokenizer_class = SqueezeBertTokenizer tokenizer_class = SqueezeBertTokenizer
rust_tokenizer_class = SqueezeBertTokenizerFast
test_rust_tokenizer = True
def get_rust_tokenizer(self, **kwargs): def get_rust_tokenizer(self, **kwargs):
return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
......
...@@ -16,11 +16,9 @@ ...@@ -16,11 +16,9 @@
import unittest import unittest
from transformers import BatchEncoding from transformers import SPIECE_UNDERLINE, BatchEncoding, T5Tokenizer, T5TokenizerFast
from transformers.file_utils import cached_property from transformers.file_utils import cached_property
from transformers.testing_utils import _torch_available, get_tests_dir from transformers.testing_utils import _torch_available, get_tests_dir, require_sentencepiece, require_tokenizers
from transformers.tokenization_t5 import T5Tokenizer, T5TokenizerFast
from transformers.tokenization_xlnet import SPIECE_UNDERLINE
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
...@@ -30,6 +28,8 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") ...@@ -30,6 +28,8 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
FRAMEWORK = "pt" if _torch_available else "tf" FRAMEWORK = "pt" if _torch_available else "tf"
@require_sentencepiece
@require_tokenizers
class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = T5Tokenizer tokenizer_class = T5Tokenizer
......
...@@ -19,7 +19,7 @@ from typing import Callable, Optional ...@@ -19,7 +19,7 @@ from typing import Callable, Optional
import numpy as np import numpy as np
from transformers import BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, TensorType from transformers import BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, TensorType
from transformers.testing_utils import require_tf, require_torch, slow from transformers.testing_utils import require_tf, require_tokenizers, require_torch, slow
from transformers.tokenization_gpt2 import GPT2Tokenizer from transformers.tokenization_gpt2 import GPT2Tokenizer
...@@ -68,6 +68,7 @@ class TokenizerUtilsTest(unittest.TestCase): ...@@ -68,6 +68,7 @@ class TokenizerUtilsTest(unittest.TestCase):
self.assertEqual(TensorType("pt"), TensorType.PYTORCH) self.assertEqual(TensorType("pt"), TensorType.PYTORCH)
self.assertEqual(TensorType("np"), TensorType.NUMPY) self.assertEqual(TensorType("np"), TensorType.NUMPY)
@require_tokenizers
def test_batch_encoding_pickle(self): def test_batch_encoding_pickle(self):
import numpy as np import numpy as np
...@@ -92,6 +93,7 @@ class TokenizerUtilsTest(unittest.TestCase): ...@@ -92,6 +93,7 @@ class TokenizerUtilsTest(unittest.TestCase):
) )
@require_tf @require_tf
@require_tokenizers
def test_batch_encoding_pickle_tf(self): def test_batch_encoding_pickle_tf(self):
import tensorflow as tf import tensorflow as tf
...@@ -112,6 +114,7 @@ class TokenizerUtilsTest(unittest.TestCase): ...@@ -112,6 +114,7 @@ class TokenizerUtilsTest(unittest.TestCase):
) )
@require_torch @require_torch
@require_tokenizers
def test_batch_encoding_pickle_pt(self): def test_batch_encoding_pickle_pt(self):
import torch import torch
...@@ -128,6 +131,7 @@ class TokenizerUtilsTest(unittest.TestCase): ...@@ -128,6 +131,7 @@ class TokenizerUtilsTest(unittest.TestCase):
tokenizer_r("Small example to encode", return_tensors=TensorType.PYTORCH), torch.equal tokenizer_r("Small example to encode", return_tensors=TensorType.PYTORCH), torch.equal
) )
@require_tokenizers
def test_batch_encoding_is_fast(self): def test_batch_encoding_is_fast(self):
tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased") tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased") tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased")
......
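The pickle tests now carry @require_tokenizers because a BatchEncoding produced by a fast tokenizer can also carry Encoding objects from the Rust tokenizers library, not just plain Python lists or tensors. The shape of the check itself is a simple pickle round-trip; a stand-in sketch with a dict in place of a real BatchEncoding:

# Pickle round-trip of encoder output (a plain dict stands in for BatchEncoding here).
import pickle

def roundtrip(obj):
    return pickle.loads(pickle.dumps(obj))

batch = {"input_ids": [[101, 2023, 102]], "attention_mask": [[1, 1, 1]]}
assert roundtrip(batch) == batch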
...@@ -17,9 +17,9 @@ ...@@ -17,9 +17,9 @@
import os import os
import unittest import unittest
from transformers import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast
from transformers.file_utils import cached_property from transformers.file_utils import cached_property
from transformers.testing_utils import slow from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
from transformers.tokenization_xlm_roberta import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
...@@ -27,6 +27,8 @@ from .test_tokenization_common import TokenizerTesterMixin ...@@ -27,6 +27,8 @@ from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
@require_sentencepiece
@require_tokenizers
class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLMRobertaTokenizer tokenizer_class = XLMRobertaTokenizer
......
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
import os import os
import unittest import unittest
from transformers.testing_utils import slow from transformers import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast
from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
...@@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin ...@@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
@require_sentencepiece
@require_tokenizers
class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLNetTokenizer tokenizer_class = XLNetTokenizer
......
...@@ -23,7 +23,7 @@ import numpy as np ...@@ -23,7 +23,7 @@ import numpy as np
from transformers import AutoTokenizer, PretrainedConfig, TrainingArguments, is_torch_available from transformers import AutoTokenizer, PretrainedConfig, TrainingArguments, is_torch_available
from transformers.file_utils import WEIGHTS_NAME from transformers.file_utils import WEIGHTS_NAME
from transformers.testing_utils import get_tests_dir, require_torch, slow from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow
if is_torch_available(): if is_torch_available():
...@@ -151,6 +151,8 @@ if is_torch_available(): ...@@ -151,6 +151,8 @@ if is_torch_available():
@require_torch @require_torch
@require_sentencepiece
@require_tokenizers
class TrainerIntegrationTest(unittest.TestCase): class TrainerIntegrationTest(unittest.TestCase):
def setUp(self): def setUp(self):
args = TrainingArguments(".") args = TrainingArguments(".")
......
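TrainerIntegrationTest now stacks three requirement decorators; the class runs only when every dependency is present, and the first unmet requirement wins. A sketch of how such stacked skip decorators compose (the availability flags are hard-coded here purely for illustration):

# Composition of stacked requirement decorators (illustrative sketch).
import unittest

def require(available: bool, reason: str):
    def decorator(test_case):
        return test_case if available else unittest.skip(reason)(test_case)
    return decorator

require_torch = require(False, "test requires PyTorch")               # pretend torch is missing
require_sentencepiece = require(True, "test requires SentencePiece")  # pretend sentencepiece is installed

@require_torch
@require_sentencepiece
class TrainerLikeTest(unittest.TestCase):
    def test_noop(self):
        self.assertTrue(True)  # never runs: @require_torch marked the whole class as skipped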
...@@ -49,6 +49,7 @@ def {0}(*args, **kwargs): ...@@ -49,6 +49,7 @@ def {0}(*args, **kwargs):
requires_pytorch({0}) requires_pytorch({0})
""" """
DUMMY_TF_PRETRAINED_CLASS = """ DUMMY_TF_PRETRAINED_CLASS = """
class {0}: class {0}:
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
...@@ -71,12 +72,111 @@ def {0}(*args, **kwargs): ...@@ -71,12 +72,111 @@ def {0}(*args, **kwargs):
""" """
DUMMY_SENTENCEPIECE_PRETRAINED_CLASS = """
class {0}:
def __init__(self, *args, **kwargs):
requires_sentencepiece(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_sentencepiece(self)
"""
DUMMY_SENTENCEPIECE_CLASS = """
class {0}:
def __init__(self, *args, **kwargs):
requires_sentencepiece(self)
"""
DUMMY_SENTENCEPIECE_FUNCTION = """
def {0}(*args, **kwargs):
requires_sentencepiece({0})
"""
DUMMY_TOKENIZERS_PRETRAINED_CLASS = """
class {0}:
def __init__(self, *args, **kwargs):
requires_tokenizers(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_tokenizers(self)
"""
DUMMY_TOKENIZERS_CLASS = """
class {0}:
def __init__(self, *args, **kwargs):
requires_tokenizers(self)
"""
DUMMY_TOKENIZERS_FUNCTION = """
def {0}(*args, **kwargs):
requires_tokenizers({0})
"""
# Map each dependency type to its dummy templates
DUMMY_PRETRAINED_CLASS = {
"pt": DUMMY_PT_PRETRAINED_CLASS,
"tf": DUMMY_TF_PRETRAINED_CLASS,
"sentencepiece": DUMMY_SENTENCEPIECE_PRETRAINED_CLASS,
"tokenizers": DUMMY_TOKENIZERS_PRETRAINED_CLASS,
}
DUMMY_CLASS = {
"pt": DUMMY_PT_CLASS,
"tf": DUMMY_TF_CLASS,
"sentencepiece": DUMMY_SENTENCEPIECE_CLASS,
"tokenizers": DUMMY_TOKENIZERS_CLASS,
}
DUMMY_FUNCTION = {
"pt": DUMMY_PT_FUNCTION,
"tf": DUMMY_TF_FUNCTION,
"sentencepiece": DUMMY_SENTENCEPIECE_FUNCTION,
"tokenizers": DUMMY_TOKENIZERS_FUNCTION,
}
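Rendered with a concrete name, a dummy generated from these templates is just a placeholder that raises at construction time (or in from_pretrained) and points at the missing dependency. A hedged sketch of that behaviour with a simplified requires_sentencepiece; the real helper lives in transformers.file_utils and its error text differs:

# What a generated SentencePiece dummy boils down to (simplified, illustrative error text).
def requires_sentencepiece(obj):
    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
    raise ImportError(f"{name} requires the SentencePiece library, but it was not found in your environment.")

class AlbertTokenizer:  # example name; the real classes are generated from the main __init__
    def __init__(self, *args, **kwargs):
        requires_sentencepiece(self)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_sentencepiece(cls)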
def read_init(): def read_init():
""" Read the init and exctracts PyTorch and TensorFlow objects. """ """ Read the init and exctracts PyTorch, TensorFlow, SentencePiece and Tokenizers objects. """
with open(os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"), "r", encoding="utf-8") as f: with open(os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"), "r", encoding="utf-8") as f:
lines = f.readlines() lines = f.readlines()
line_index = 0 line_index = 0
# Find where the SentencePiece imports begin
sentencepiece_objects = []
while not lines[line_index].startswith("if is_sentencepiece_available():"):
line_index += 1
line_index += 1
# Until we unindent, add SentencePiece objects to the list
while len(lines[line_index]) <= 1 or lines[line_index].startswith(" "):
line = lines[line_index]
search = _re_single_line_import.search(line)
if search is not None:
sentencepiece_objects += search.groups()[0].split(", ")
elif line.startswith(" "):
sentencepiece_objects.append(line[8:-2])
line_index += 1
# Find where the Tokenizers imports begin
tokenizers_objects = []
while not lines[line_index].startswith("if is_tokenizers_available():"):
line_index += 1
line_index += 1
# Until we unindent, add Tokenizers objects to the list
while len(lines[line_index]) <= 1 or lines[line_index].startswith(" "):
line = lines[line_index]
search = _re_single_line_import.search(line)
if search is not None:
tokenizers_objects += search.groups()[0].split(", ")
elif line.startswith(" "):
tokenizers_objects.append(line[8:-2])
line_index += 1
# Find where the PyTorch imports begin # Find where the PyTorch imports begin
pt_objects = [] pt_objects = []
while not lines[line_index].startswith("if is_torch_available():"): while not lines[line_index].startswith("if is_torch_available():"):
...@@ -108,10 +208,10 @@ def read_init(): ...@@ -108,10 +208,10 @@ def read_init():
elif line.startswith(" "): elif line.startswith(" "):
tf_objects.append(line[8:-2]) tf_objects.append(line[8:-2])
line_index += 1 line_index += 1
return pt_objects, tf_objects return sentencepiece_objects, tokenizers_objects, pt_objects, tf_objects
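For context on what read_init() now walks over: the main __init__.py guards its SentencePiece- and Tokenizers-backed imports behind is_sentencepiece_available() / is_tokenizers_available() blocks, and the parser collects every imported name until the indented block ends. A standalone, simplified version of that extraction (regex and layout assumptions are mine, not the script's exact ones):

# Simplified extraction of names imported inside an "if is_xxx_available():" block.
import re

_re_single_line_import = re.compile(r"^\s*from\s+\S+\s+import\s+(.+)$")

def extract_guarded_objects(lines, guard):
    objects, i = [], 0
    while not lines[i].startswith(f"if {guard}():"):
        i += 1
    i += 1
    while i < len(lines) and (not lines[i].strip() or lines[i].startswith("    ")):
        match = _re_single_line_import.search(lines[i])
        if match is not None:
            objects += [name.strip() for name in match.groups()[0].split(",")]
        i += 1
    return objects

init_lines = [
    "if is_sentencepiece_available():",
    "    from .tokenization_albert import AlbertTokenizer",
    "    from .tokenization_t5 import T5Tokenizer",
    "",
    "if is_tokenizers_available():",
    "    from .tokenization_albert_fast import AlbertTokenizerFast",
]
assert extract_guarded_objects(init_lines, "is_sentencepiece_available") == ["AlbertTokenizer", "T5Tokenizer"]
assert extract_guarded_objects(init_lines, "is_tokenizers_available") == ["AlbertTokenizerFast"]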
def create_dummy_object(name, is_pytorch=True): def create_dummy_object(name, type="pt"):
""" Create the code for the dummy object corresponding to `name`.""" """ Create the code for the dummy object corresponding to `name`."""
_pretrained = [ _pretrained = [
"Config" "ForCausalLM", "Config" "ForCausalLM",
...@@ -124,10 +224,11 @@ def create_dummy_object(name, is_pytorch=True): ...@@ -124,10 +224,11 @@ def create_dummy_object(name, is_pytorch=True):
"Model", "Model",
"Tokenizer", "Tokenizer",
] ]
assert type in ["pt", "tf", "sentencepiece", "tokenizers"]
if name.isupper(): if name.isupper():
return DUMMY_CONSTANT.format(name) return DUMMY_CONSTANT.format(name)
elif name.islower(): elif name.islower():
return (DUMMY_PT_FUNCTION if is_pytorch else DUMMY_TF_FUNCTION).format(name) return (DUMMY_FUNCTION[type]).format(name)
else: else:
is_pretrained = False is_pretrained = False
for part in _pretrained: for part in _pretrained:
...@@ -135,39 +236,75 @@ def create_dummy_object(name, is_pytorch=True): ...@@ -135,39 +236,75 @@ def create_dummy_object(name, is_pytorch=True):
is_pretrained = True is_pretrained = True
break break
if is_pretrained: if is_pretrained:
template = DUMMY_PT_PRETRAINED_CLASS if is_pytorch else DUMMY_TF_PRETRAINED_CLASS template = DUMMY_PRETRAINED_CLASS[type]
else: else:
template = DUMMY_PT_CLASS if is_pytorch else DUMMY_TF_CLASS template = DUMMY_CLASS[type]
return template.format(name) return template.format(name)
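The dispatch above hinges on the casing of the exported name: all-uppercase names become dummy constants, all-lowercase names become dummy functions, and anything else becomes a dummy class (with a from_pretrained stub when the name matches one of the _pretrained suffixes). A tiny standalone check of that convention (example names are illustrative):

# Casing-based dispatch used by create_dummy_object, extracted for illustration.
def dummy_kind(name: str) -> str:
    if name.isupper():
        return "constant"
    if name.islower():
        return "function"
    return "class"

assert dummy_kind("SLOW_TO_FAST_CONVERTERS") == "constant"
assert dummy_kind("convert_slow_tokenizer") == "function"
assert dummy_kind("AlbertTokenizerFast") == "class"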
def create_dummy_files(): def create_dummy_files():
""" Create the content of the dummy files. """ """ Create the content of the dummy files. """
pt_objects, tf_objects = read_init() sentencepiece_objects, tokenizers_objects, pt_objects, tf_objects = read_init()
sentencepiece_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
sentencepiece_dummies += "from ..file_utils import requires_sentencepiece\n\n"
sentencepiece_dummies += "\n".join([create_dummy_object(o, type="sentencepiece") for o in sentencepiece_objects])
tokenizers_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
tokenizers_dummies += "from ..file_utils import requires_tokenizers\n\n"
tokenizers_dummies += "\n".join([create_dummy_object(o, type="tokenizers") for o in tokenizers_objects])
pt_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n" pt_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
pt_dummies += "from ..file_utils import requires_pytorch\n\n" pt_dummies += "from ..file_utils import requires_pytorch\n\n"
pt_dummies += "\n".join([create_dummy_object(o) for o in pt_objects]) pt_dummies += "\n".join([create_dummy_object(o, type="pt") for o in pt_objects])
tf_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n" tf_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
tf_dummies += "from ..file_utils import requires_tf\n\n" tf_dummies += "from ..file_utils import requires_tf\n\n"
tf_dummies += "\n".join([create_dummy_object(o, False) for o in tf_objects]) tf_dummies += "\n".join([create_dummy_object(o, type="tf") for o in tf_objects])
return pt_dummies, tf_dummies return sentencepiece_dummies, tokenizers_dummies, pt_dummies, tf_dummies
def check_dummies(overwrite=False): def check_dummies(overwrite=False):
""" Check if the dummy files are up to date and maybe `overwrite` with the right content. """ """ Check if the dummy files are up to date and maybe `overwrite` with the right content. """
pt_dummies, tf_dummies = create_dummy_files() sentencepiece_dummies, tokenizers_dummies, pt_dummies, tf_dummies = create_dummy_files()
path = os.path.join(PATH_TO_TRANSFORMERS, "utils") path = os.path.join(PATH_TO_TRANSFORMERS, "utils")
sentencepiece_file = os.path.join(path, "dummy_sentencepiece_objects.py")
tokenizers_file = os.path.join(path, "dummy_tokenizers_objects.py")
pt_file = os.path.join(path, "dummy_pt_objects.py") pt_file = os.path.join(path, "dummy_pt_objects.py")
tf_file = os.path.join(path, "dummy_tf_objects.py") tf_file = os.path.join(path, "dummy_tf_objects.py")
with open(sentencepiece_file, "r", encoding="utf-8") as f:
actual_sentencepiece_dummies = f.read()
with open(tokenizers_file, "r", encoding="utf-8") as f:
actual_tokenizers_dummies = f.read()
with open(pt_file, "r", encoding="utf-8") as f: with open(pt_file, "r", encoding="utf-8") as f:
actual_pt_dummies = f.read() actual_pt_dummies = f.read()
with open(tf_file, "r", encoding="utf-8") as f: with open(tf_file, "r", encoding="utf-8") as f:
actual_tf_dummies = f.read() actual_tf_dummies = f.read()
if sentencepiece_dummies != actual_sentencepiece_dummies:
if overwrite:
print("Updating transformers.utils.dummy_sentencepiece_objects.py as the main __init__ has new objects.")
with open(sentencepiece_file, "w", encoding="utf-8") as f:
f.write(sentencepiece_dummies)
else:
raise ValueError(
"The main __init__ has objects that are not present in transformers.utils.dummy_sentencepiece_objects.py.",
"Run `make fix-copies` to fix this.",
)
if tokenizers_dummies != actual_tokenizers_dummies:
if overwrite:
print("Updating transformers.utils.dummy_tokenizers_objects.py as the main __init__ has new objects.")
with open(tokenizers_file, "w", encoding="utf-8") as f:
f.write(tokenizers_dummies)
else:
raise ValueError(
"The main __init__ has objects that are not present in transformers.utils.dummy_tokenizers_objects.py.",
"Run `make fix-copies` to fix this.",
)
if pt_dummies != actual_pt_dummies: if pt_dummies != actual_pt_dummies:
if overwrite: if overwrite:
print("Updating transformers.utils.dummy_pt_objects.py as the main __init__ has new objects.") print("Updating transformers.utils.dummy_pt_objects.py as the main __init__ has new objects.")
......
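When the main __init__ gains new SentencePiece- or Tokenizers-gated objects, the four dummy files have to be regenerated; the error messages above point at make fix-copies as the entry point. In terms of this module, regeneration comes down to a call such as:

# Verify or regenerate the dummy object files derived from the main __init__.
# overwrite=False only checks and raises ValueError when a file is stale;
# overwrite=True rewrites dummy_sentencepiece_objects.py, dummy_tokenizers_objects.py,
# dummy_pt_objects.py and dummy_tf_objects.py under transformers/utils.
check_dummies(overwrite=True)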