Unverified Commit ba8c4d0a authored by Thomas Wolf, committed by GitHub

[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)

* splitting fast and slow tokenizers [WIP]

* [WIP] splitting sentencepiece and tokenizers dependencies

* update dummy objects

* add name_or_path to models and tokenizers

* prefix added to file names

* prefix

* styling + quality

* splitting all the tokenizer files - sorting sentencepiece based ones

* update tokenizer version up to 0.9.0

* remove hard dependency on sentencepiece 🎉

* and removed hard dependency on tokenizers 🎉



* update conversion script

* update missing models

* fixing tests

* move test_tokenization_fast to main tokenization tests - fix bugs

* bump up tokenizers

* fix bert_generation

* update and fix several tokenizers

* keep sentencepiece in deps for now

* fix funnel and deberta tests

* fix fsmt

* fix marian tests

* fix layoutlm

* fix squeezebert and gpt2

* fix T5 tokenization

* fix xlnet tests

* style

* fix mbart

* bump up tokenizers to 0.9.2

* fix model tests

* fix tf models

* fix seq2seq examples

* fix tests without sentencepiece

* fix slow => fast conversion without sentencepiece

* update auto and bert generation tests

* fix mbart tests

* fix auto and common test without tokenizers

* fix tests without tokenizers

* clean up tests - lighten up when tokenizers + sentencepiece are both off

* style quality and tests fixing

* add sentencepiece to doc/examples reqs

* leave sentencepiece on for now

* style quality split herbert and fix pegasus

* WIP Herbert fast

* add sample_text_no_unicode and fix herbert tokenization

* skip FSMT example test for now

* fix style

* fix fsmt in example tests

* update following Lysandre and Sylvain's comments

* Update src/transformers/testing_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/testing_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent c65863ce
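
The diff below gates every tokenizer test on the dependency it actually needs: tests for the Python (slow) tokenizers backed by SentencePiece are decorated with @require_sentencepiece, tests for the Rust (fast) tokenizers with @require_tokenizers, and tests that compare both carry both decorators. As a rough, hypothetical sketch of that pattern (not the repository's exact testing_utils code), such a guard reduces to an import probe plus a unittest skip:

# Illustrative sketch of an optional-dependency test guard (helper names are assumptions).
import importlib.util
import unittest

# Probe the optional packages once at import time.
_sentencepiece_available = importlib.util.find_spec("sentencepiece") is not None
_tokenizers_available = importlib.util.find_spec("tokenizers") is not None


def require_sentencepiece(test_case):
    # Skip the decorated test class/function when SentencePiece is not installed.
    if not _sentencepiece_available:
        return unittest.skip("test requires SentencePiece")(test_case)
    return test_case


def require_tokenizers(test_case):
    # Skip the decorated test class/function when the tokenizers library is not installed.
    if not _tokenizers_available:
        return unittest.skip("test requires tokenizers")(test_case)
    return test_case

Library code follows the same idea: when one of the two packages is missing, the corresponding tokenizer classes are replaced by dummy placeholder objects that raise an informative error at use time (see the "update dummy objects" commit above).
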
@@ -14,8 +14,7 @@
# limitations under the License.
from transformers.testing_utils import slow
from transformers.tokenization_dpr import (
from transformers import (
DPRContextEncoderTokenizer,
DPRContextEncoderTokenizerFast,
DPRQuestionEncoderTokenizer,
@@ -24,11 +23,13 @@ from transformers.tokenization_dpr import (
DPRReaderTokenizer,
DPRReaderTokenizerFast,
)
from transformers.testing_utils import require_tokenizers, slow
from transformers.tokenization_utils_base import BatchEncoding
from .test_tokenization_bert import BertTokenizationTest
@require_tokenizers
class DPRContextEncoderTokenizationTest(BertTokenizationTest):
tokenizer_class = DPRContextEncoderTokenizer
@@ -36,6 +37,7 @@ class DPRContextEncoderTokenizationTest(BertTokenizationTest):
test_rust_tokenizer = True
@require_tokenizers
class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
tokenizer_class = DPRQuestionEncoderTokenizer
@@ -43,6 +45,7 @@ class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
test_rust_tokenizer = True
@require_tokenizers
class DPRReaderTokenizationTest(BertTokenizationTest):
tokenizer_class = DPRReaderTokenizer
......
import logging
import shutil
import tempfile
import unittest
from collections import namedtuple
from itertools import takewhile
from transformers import (
AlbertTokenizer,
AlbertTokenizerFast,
BartTokenizer,
BartTokenizerFast,
BertTokenizer,
BertTokenizerFast,
CamembertTokenizer,
CamembertTokenizerFast,
DistilBertTokenizer,
DistilBertTokenizerFast,
DPRContextEncoderTokenizer,
DPRContextEncoderTokenizerFast,
DPRQuestionEncoderTokenizer,
DPRQuestionEncoderTokenizerFast,
DPRReaderTokenizer,
DPRReaderTokenizerFast,
FunnelTokenizer,
FunnelTokenizerFast,
GPT2Tokenizer,
GPT2TokenizerFast,
LxmertTokenizer,
LxmertTokenizerFast,
MBartTokenizer,
MBartTokenizerFast,
OpenAIGPTTokenizer,
OpenAIGPTTokenizerFast,
PegasusTokenizer,
PegasusTokenizerFast,
ReformerTokenizer,
ReformerTokenizerFast,
RobertaTokenizer,
RobertaTokenizerFast,
T5Tokenizer,
T5TokenizerFast,
XLMRobertaTokenizer,
XLMRobertaTokenizerFast,
XLNetTokenizer,
XLNetTokenizerFast,
is_torch_available,
)
from transformers.testing_utils import get_tests_dir
logger = logging.getLogger(__name__)
NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]
Tokenizer = namedtuple("Tokenizer", ["name", "rust_cls", "python_cls", "vocab_key", "filter", "kwargs"])
def filter_non_english(_: Tokenizer, pretrained_name: str):
""" Filter all the model for non-english language """
return not any([lang in pretrained_name for lang in NON_ENGLISH_TAGS])
def filter_roberta_detectors(_: Tokenizer, pretrained_name: str):
return "detector" not in pretrained_name
class CommonFastTokenizerTest(unittest.TestCase):
TOKENIZERS_CLASSES = frozenset([])
def setUp(self) -> None:
# Tokenizer.filter makes it possible to filter which Tokenizer to test based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name)
self.tokenizers_list = [
(tok_case, pretrained_name, dict(t for t in tok_case.kwargs) if tok_case.kwargs else {})
for tok_case in self.TOKENIZERS_CLASSES
for pretrained_name in tok_case.python_cls.pretrained_vocab_files_map[tok_case.vocab_key].keys()
if tok_case.filter is None or (tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name))
]
with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
self._data = f_data.read().replace("\n\n", "\n").strip()
self.tmpdirname = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_is_fast(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Check is_fast is set correctly
self.assertFalse(tokenizer_p.is_fast)
self.assertTrue(tokenizer_r.is_fast)
def test_fast_only_inputs(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
# Ensure None raises an error
self.assertRaises(TypeError, tokenizer_r.tokenize, None)
self.assertRaises(TypeError, tokenizer_r.encode, None)
self.assertRaises(TypeError, tokenizer_r.encode_plus, None)
self.assertRaises(TypeError, tokenizer_r.batch_encode_plus, None)
def test_alignement_methods(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
text = " ".join(words)
batch_size = 3
encoding = tokenizer_r.encode_plus(text, add_special_tokens=False)
batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False)
num_tokens = len(encoding["input_ids"])
last_word_index = len(words) - 1
last_token_index = num_tokens - 1
last_batch_index = batch_size - 1
last_char_index = len(text) - 1
# words, tokens
self.assertEqual(len(encoding.words(0)), num_tokens)
self.assertEqual(max(encoding.words(0)), last_word_index)
self.assertEqual(min(encoding.words(0)), 0)
self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens)
self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index)
self.assertEqual(min(batch_encoding.words(last_batch_index)), 0)
self.assertEqual(len(encoding.tokens(0)), num_tokens)
# Assert token_to_word
self.assertEqual(encoding.token_to_word(0), 0)
self.assertEqual(encoding.token_to_word(0, 0), 0)
self.assertEqual(encoding.token_to_word(last_token_index), last_word_index)
self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index)
self.assertEqual(batch_encoding.token_to_word(1, 0), 0)
self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index)
self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index)
# Assert word_to_tokens
self.assertEqual(encoding.word_to_tokens(0).start, 0)
self.assertEqual(encoding.word_to_tokens(0, 0).start, 0)
self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1)
self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0)
self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
self.assertEqual(
batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 1
)
# Assert token_to_chars
self.assertEqual(encoding.token_to_chars(0).start, 0)
self.assertEqual(encoding.token_to_chars(0, 0).start, 0)
self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1)
self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0)
self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
self.assertEqual(
batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1
)
# Assert char_to_token
self.assertEqual(encoding.char_to_token(0), 0)
self.assertEqual(encoding.char_to_token(0, 0), 0)
self.assertEqual(encoding.char_to_token(last_char_index), last_token_index)
self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index)
self.assertEqual(batch_encoding.char_to_token(1, 0), 0)
self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index)
self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index)
# Assert char_to_word
self.assertEqual(encoding.char_to_word(0), 0)
self.assertEqual(encoding.char_to_word(0, 0), 0)
self.assertEqual(encoding.char_to_word(last_char_index), last_word_index)
self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.char_to_word(1, 0), 0)
self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index)
# Assert word_to_chars
self.assertEqual(encoding.word_to_chars(0).start, 0)
self.assertEqual(encoding.word_to_chars(0, 0).start, 0)
self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1)
self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
self.assertEqual(batch_encoding.word_to_chars(1, 0).start, 0)
self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
self.assertEqual(
batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1
)
def test_tokenization_python_rust_equals(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Ensure basic input match
input_p = tokenizer_p.encode_plus(self._data)
input_r = tokenizer_r.encode_plus(self._data)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_p[key], input_r[key])
input_pairs_p = tokenizer_p.encode_plus(self._data, self._data)
input_pairs_r = tokenizer_r.encode_plus(self._data, self._data)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
# Ensure truncation match
input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True)
input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_p[key], input_r[key])
# Ensure truncation with stride match
input_p = tokenizer_p.encode_plus(
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
)
input_r = tokenizer_r.encode_plus(
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_p[key], input_r[key][0])
def test_num_special_tokens_to_add_equal(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Check we have the same number of added_tokens for both pair and non-pair inputs.
self.assertEqual(
tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False)
)
self.assertEqual(
tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True)
)
def test_max_length_equal(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Check we have the correct max_length for both pair and non-pair inputs.
self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)
def test_special_tokens_map_equal(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Assert the set of special tokens match.
self.assertSequenceEqual(
tokenizer_p.special_tokens_map.items(),
tokenizer_r.special_tokens_map.items(),
)
def test_add_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
vocab_size = len(tokenizer_r)
self.assertEqual(tokenizer_r.add_tokens(""), 0)
self.assertEqual(tokenizer_r.add_tokens("testoken"), 1)
self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2)
self.assertEqual(len(tokenizer_r), vocab_size + 3)
self.assertEqual(tokenizer_r.add_special_tokens({}), 0)
self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2)
self.assertRaises(
AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}
)
self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1)
self.assertEqual(
tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2
)
self.assertEqual(len(tokenizer_r), vocab_size + 8)
def test_offsets_mapping(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
text = "Wonderful no inspiration example with subtoken"
pair = "Along with an awesome pair"
# No pair
tokens_with_offsets = tokenizer_r.encode_plus(
text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
)
added_tokens = tokenizer_r.num_special_tokens_to_add(False)
offsets = tokens_with_offsets["offset_mapping"]
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
# Assert there are exactly added_tokens special tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
# Pairs
tokens_with_offsets = tokenizer_r.encode_plus(
text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
)
added_tokens = tokenizer_r.num_special_tokens_to_add(True)
offsets = tokens_with_offsets["offset_mapping"]
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
# Assert there are exactly added_tokens special tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
def test_batch_encode_dynamic_overflowing(self):
"""
When calling batch_encode with multiple sequences it can return a different number of
overflowing encodings for each sequence:
[
Sequence 1: [Encoding 1, Encoding 2],
Sequence 2: [Encoding 1],
Sequence 3: [Encoding 1, Encoding 2, ... Encoding N]
]
This needs to be padded so that it can be represented as a tensor
"""
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
tokenizer = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
with self.subTest("{} ({}, {})".format(tok_case.name, pretrained_name, tokenizer.__class__.__name__)):
returned_tensor = "pt" if is_torch_available() else "tf"
if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
return
tokens = tokenizer.encode_plus(
"HuggingFace is solving NLP one commit at a time",
max_length=6,
padding=True,
truncation=True,
return_tensors=returned_tensor,
return_overflowing_tokens=True,
)
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
self.assertEqual(len(tokens[key].shape), 2)
# Mono sample
tokens = tokenizer.batch_encode_plus(
["HuggingFace is solving NLP one commit at a time"],
max_length=6,
padding=True,
truncation="only_first",
return_tensors=returned_tensor,
return_overflowing_tokens=True,
)
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
self.assertEqual(len(tokens[key].shape), 2)
self.assertEqual(tokens[key].shape[-1], 6)
# Multi sample
tokens = tokenizer.batch_encode_plus(
["HuggingFace is solving NLP one commit at a time", "Very tiny input"],
max_length=6,
padding=True,
truncation="only_first",
return_tensors=returned_tensor,
return_overflowing_tokens=True,
)
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
self.assertEqual(len(tokens[key].shape), 2)
self.assertEqual(tokens[key].shape[-1], 6)
def test_pretokenized_inputs(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Input string
pretokenized_input_simple = "This is a sample input".split()
pretokenized_input_pair = "This is a sample pair".split()
# Test encode for pretokenized inputs
output_r = tokenizer_r.encode(
pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
)
output_p = tokenizer_p.encode(
pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
)
self.assertEqual(output_p, output_r)
kwargs = {
"is_split_into_words": True,
# "return_token_type_ids": True, # Use the defaults for each tokenizers
# "return_attention_mask": True, # Use the defaults for each tokenizers
"return_overflowing_tokens": False,
"return_special_tokens_mask": True,
"return_offsets_mapping": False, # Not implemented in python tokenizers
# "add_special_tokens": False,
}
batch_kwargs = {
"is_split_into_words": True,
# "return_token_type_ids": True, # Use the defaults for each tokenizers
# "return_attention_mask": True, # Use the defaults for each tokenizers
"return_overflowing_tokens": False,
"return_special_tokens_mask": True,
"return_offsets_mapping": False, # Not implemented in python tokenizers
# "add_special_tokens": False,
}
# Test encode_plus for pretokenized inputs
output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs)
output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
# Test batch_encode_plus for pretokenized inputs
input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair]
output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs)
output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
# Test encode for pretokenized inputs pairs
output_r = tokenizer_r.encode(
pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
)
output_p = tokenizer_p.encode(
pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
)
self.assertEqual(output_p, output_r)
# Test encode_plus for pretokenized inputs
output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
# Test batch_encode_plus for pretokenized inputs
input_batch_pair = ([pretokenized_input_simple, pretokenized_input_pair] * 2) + [
pretokenized_input_simple + pretokenized_input_pair,
pretokenized_input_pair,
]
output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs)
output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs)
for key in output_p.keys():
self.assertEqual(output_p[key], output_r[key])
def test_create_token_type_ids(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
input_simple = [1, 2, 3]
input_pair = [1, 2, 3]
# Generate output
output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple)
output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple)
self.assertEqual(output_p, output_r)
# Generate pair output
output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple, input_pair)
output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple, input_pair)
self.assertEqual(output_p, output_r)
def test_build_inputs_with_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# # Input string
# input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False)
# input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False)
# # Generate output
# output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
# output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
# self.assertEqual(output_p, output_r)
# # Generate pair output
# output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
# output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
# self.assertEqual(output_p, output_r)
# Input tokens id
input_simple = tokenizer_p.encode("This is a sample input", add_special_tokens=False)
input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False)
# Generate output
output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
self.assertEqual(output_p, output_r)
# Generate pair output
output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
self.assertEqual(output_p, output_r)
def test_padding(self, max_length=50):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
def assert_padded_input_match(input_r: list, input_p: list, max_length: int):
# Ensure we match max_length
self.assertEqual(len(input_r), max_length)
self.assertEqual(len(input_p), max_length)
# Ensure the number of padded tokens is the same
padded_tokens_r = list(takewhile(lambda i: i == tokenizer_r.pad_token_id, reversed(input_r)))
padded_tokens_p = list(takewhile(lambda i: i == tokenizer_p.pad_token_id, reversed(input_p)))
self.assertSequenceEqual(padded_tokens_r, padded_tokens_p)
def assert_batch_padded_input_match(input_r: dict, input_p: dict, max_length: int):
for i_r in input_r.values():
self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual(
len(i_r[1]), max_length
)
self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual(
len(i_r[1]), max_length
)
for i_r, i_p in zip(input_r["input_ids"], input_p["input_ids"]):
assert_padded_input_match(i_r, i_p, max_length)
for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]):
self.assertSequenceEqual(i_r, i_p)
# Encode - Simple input
input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode("This is a simple input", padding="longest")
input_p = tokenizer_p.encode("This is a simple input", padding=True)
assert_padded_input_match(input_r, input_p, len(input_r))
# Encode - Pair input
input_r = tokenizer_r.encode(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
input_p = tokenizer_p.encode(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
assert_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True)
input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest")
assert_padded_input_match(input_r, input_p, len(input_r))
# Encode_plus - Simple input
input_r = tokenizer_r.encode_plus(
"This is a simple input", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", max_length=max_length, pad_to_max_length=True
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
"This is a simple input", max_length=max_length, padding="max_length"
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", max_length=max_length, padding="max_length"
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest")
input_p = tokenizer_p.encode_plus("This is a simple input", padding=True)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
# Encode_plus - Pair input
input_r = tokenizer_r.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest")
input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
# Batch_encode_plus - Simple input
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
pad_to_max_length=True,
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
pad_to_max_length=True,
)
assert_batch_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding="max_length",
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding="max_length",
)
assert_batch_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding="longest",
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
padding=True,
)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"], padding="longest"
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"], padding=True
)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
# Batch_encode_plus - Pair input
input_r = tokenizer_r.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
max_length=max_length,
truncation=True,
padding="max_length",
)
input_p = tokenizer_p.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
max_length=max_length,
truncation=True,
padding="max_length",
)
assert_batch_padded_input_match(input_r, input_p, max_length)
input_r = tokenizer_r.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
padding=True,
)
input_p = tokenizer_p.batch_encode_plus(
[
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
],
padding="longest",
)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
# Using pad on single examples after tokenization
input_r = tokenizer_r.encode_plus("This is a input 1")
input_r = tokenizer_r.pad(input_r)
input_p = tokenizer_r.encode_plus("This is a input 1")
input_p = tokenizer_r.pad(input_p)
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
# Using pad on single examples after tokenization
input_r = tokenizer_r.encode_plus("This is a input 1")
input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
input_p = tokenizer_r.encode_plus("This is a input 1")
input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")
assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
# Using pad after tokenization
input_r = tokenizer_r.batch_encode_plus(
["This is a input 1", "This is a much longer input whilch should be padded"]
)
input_r = tokenizer_r.pad(input_r)
input_p = tokenizer_r.batch_encode_plus(
["This is a input 1", "This is a much longer input whilch should be padded"]
)
input_p = tokenizer_r.pad(input_p)
assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))
# Using pad after tokenization
input_r = tokenizer_r.batch_encode_plus(
["This is a input 1", "This is a much longer input whilch should be padded"]
)
input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
input_p = tokenizer_r.batch_encode_plus(
["This is a input 1", "This is a much longer input whilch should be padded"]
)
input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")
assert_batch_padded_input_match(input_r, input_p, max_length)
def test_save_pretrained(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Check both tokenizers save the same vocabulary files
self.assertSequenceEqual(
tokenizer_r.save_vocabulary(self.tmpdirname), tokenizer_p.save_vocabulary(self.tmpdirname)
)
# Checks everything loads correctly in the same way
tokenizer_rp, tokenizer_pp = tokenizer_r.from_pretrained(self.tmpdirname), tokenizer_p.from_pretrained(
self.tmpdirname
)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
def test_embeded_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(
sentence,
add_special_tokens=True,
)
tokens_p = tokenizer_p.encode_plus(
sentence,
add_special_tokens=True,
)
for key in tokens_p.keys():
self.assertEqual(tokens_r[key], tokens_p[key])
if "token_type_ids" in tokens_r:
self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
self.assertSequenceEqual(tokens_r, tokens_p)
def test_add_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
# pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True)
for text in ["", " "]:
# tokenize()
no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True)
self.assertEqual(
len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
)
# encode()
no_special_tokens = tokenizer_r.encode(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode(text, add_special_tokens=True)
self.assertEqual(
len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
)
# encode_plus()
no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True)
for key in no_special_tokens.keys():
self.assertEqual(
len(no_special_tokens[key]),
len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
)
# # batch_encode_plus
no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False)
with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True)
for key in no_special_tokens.keys():
for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
def test_prepare_for_model(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
string_sequence = "Asserting that both tokenizers are equal"
python_output = tokenizer_p.prepare_for_model(
tokenizer_p.encode(string_sequence, add_special_tokens=False)
)
rust_output = tokenizer_r.prepare_for_model(
tokenizer_r.encode(string_sequence, add_special_tokens=False)
)
for key in python_output:
self.assertEqual(python_output[key], rust_output[key])
class WordPieceFastTokenizerTest(CommonFastTokenizerTest):
"""
Override all the specific methods to test WordPiece behavior
"""
TOKENIZERS_CLASSES = frozenset(
[
Tokenizer("Bert", BertTokenizerFast, BertTokenizer, "vocab_file", filter_non_english, None),
Tokenizer(
"DistilBert", DistilBertTokenizerFast, DistilBertTokenizer, "vocab_file", filter_non_english, None
),
Tokenizer(
"DPRReaderTokenizer",
DPRReaderTokenizerFast,
DPRReaderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer(
"DPRQuestionEncoderTokenizer",
DPRQuestionEncoderTokenizerFast,
DPRQuestionEncoderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer(
"DPRContextEncoderTokenizer",
DPRContextEncoderTokenizerFast,
DPRContextEncoderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer("FunnelTokenizer", FunnelTokenizerFast, FunnelTokenizer, "vocab_file", filter_non_english, None),
Tokenizer("LxmertTokenizer", LxmertTokenizerFast, LxmertTokenizer, "vocab_file", filter_non_english, None),
]
)
def test_offsets_with_special_characters(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus(
sentence,
return_attention_mask=False,
return_token_type_ids=False,
return_offsets_mapping=True,
add_special_tokens=True,
)
do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
expected_results = (
[
((0, 0), tokenizer_r.cls_token),
((0, 1), "A"),
((1, 2), ","),
((3, 5), "na"),
((5, 6), "##ï"),
((6, 8), "##ve"),
((9, 15), tokenizer_r.mask_token),
((16, 21), "Allen"),
((21, 23), "##NL"),
((23, 24), "##P"),
((25, 33), "sentence"),
((33, 34), "."),
((0, 0), tokenizer_r.sep_token),
]
if not do_lower_case
else [
((0, 0), tokenizer_r.cls_token),
((0, 1), "a"),
((1, 2), ","),
((3, 8), "naive"),
((9, 15), tokenizer_r.mask_token),
((16, 21), "allen"),
((21, 23), "##nl"),
((23, 24), "##p"),
((25, 33), "sentence"),
((33, 34), "."),
((0, 0), tokenizer_r.sep_token),
]
)
self.assertEqual(
[e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
)
self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
class RobertaFastTokenizerTest(CommonFastTokenizerTest):
TOKENIZERS_CLASSES = frozenset(
[
Tokenizer(
"Roberta",
RobertaTokenizerFast,
RobertaTokenizer,
"vocab_file",
filter_roberta_detectors,
(("cls_token", "<s>"),),
),
Tokenizer(
"Bart",
BartTokenizerFast,
BartTokenizer,
"vocab_file",
None,
None,
),
]
)
def test_pretokenized_inputs(self):
pass
def test_embeded_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
# token_type_ids should put 0 everywhere
self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
# attention_mask should put 1 everywhere, so sum over length should be 1
self.assertEqual(
sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
)
tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
# Rust correctly handles the space before the mask while Python doesn't
self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
self.assertSequenceEqual(
tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)
self.assertSequenceEqual(
tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)
class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
TOKENIZERS_CLASSES = [
Tokenizer("OpenAI GPT", OpenAIGPTTokenizerFast, OpenAIGPTTokenizer, "vocab_file", None, None),
Tokenizer("GPT2", GPT2TokenizerFast, GPT2Tokenizer, "vocab_file", None, [("add_prefix_space", True)]),
]
def test_pretokenized_inputs(self):
pass
def test_padding(self, max_length=15):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
# Simple input
s = "This is a simple input"
s2 = ["This is a simple input 1", "This is a simple input 2"]
p = ("This is a simple input", "This is a pair")
p2 = [
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
]
# Simple input tests
self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
s2,
max_length=max_length,
padding="max_length",
)
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
p2,
max_length=max_length,
padding="max_length",
)
class SentencePieceFastTokenizerTest(CommonFastTokenizerTest):
"""
Override specific methods to test SentencePiece behavior
"""
TOKENIZERS_CLASSES = frozenset(
[
Tokenizer("Albert", AlbertTokenizerFast, AlbertTokenizer, "vocab_file", None, None),
Tokenizer("Camembert", CamembertTokenizerFast, CamembertTokenizer, "vocab_file", None, None),
Tokenizer("T5", T5TokenizerFast, T5Tokenizer, "vocab_file", None, None),
Tokenizer(
"MBart",
MBartTokenizerFast,
MBartTokenizer,
"vocab_file",
None,
None,
),
Tokenizer("Pegasus", PegasusTokenizerFast, PegasusTokenizer, "vocab_file", None, None),
Tokenizer("Reformer", ReformerTokenizerFast, ReformerTokenizer, "vocab_file", None, None),
Tokenizer("XLMRoberta", XLMRobertaTokenizerFast, XLMRobertaTokenizer, "vocab_file", None, None),
Tokenizer("XLNet", XLNetTokenizerFast, XLNetTokenizer, "vocab_file", None, None),
]
)
@@ -17,14 +17,18 @@
import os
import unittest
from transformers.tokenization_funnel import VOCAB_FILES_NAMES, FunnelTokenizer, FunnelTokenizerFast
from transformers import FunnelTokenizer, FunnelTokenizerFast
from transformers.testing_utils import require_tokenizers
from transformers.tokenization_funnel import VOCAB_FILES_NAMES
from .test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = FunnelTokenizer
rust_tokenizer_class = FunnelTokenizerFast
test_rust_tokenizer = True
space_between_special_tokens = True
......
@@ -18,16 +18,20 @@ import json
import os
import unittest
from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES, GPT2Tokenizer, GPT2TokenizerFast
from transformers import GPT2Tokenizer, GPT2TokenizerFast
from transformers.testing_utils import require_tokenizers
from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES
from .test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = GPT2Tokenizer
rust_tokenizer_class = GPT2TokenizerFast
test_rust_tokenizer = True
from_pretrained_kwargs = {"add_prefix_space": True}
def setUp(self):
super().setUp()
@@ -125,3 +129,47 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# It's very difficult to mix/test pretokenization with byte-level
# And get both GPT2 and Roberta to work at the same time (mostly an issue of adding a space before the string)
pass
def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Simple input
s = "This is a simple input"
s2 = ["This is a simple input 1", "This is a simple input 2"]
p = ("This is a simple input", "This is a pair")
p2 = [
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
]
# Simple input tests
self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
s2,
max_length=max_length,
padding="max_length",
)
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
p2,
max_length=max_length,
padding="max_length",
)
@@ -18,12 +18,14 @@ import json
import os
import unittest
from transformers.testing_utils import slow
from transformers.tokenization_herbert import VOCAB_FILES_NAMES, HerbertTokenizer, HerbertTokenizerFast
from transformers import HerbertTokenizer, HerbertTokenizerFast
from transformers.testing_utils import get_tests_dir, require_tokenizers, slow
from transformers.tokenization_herbert import VOCAB_FILES_NAMES
from .test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = HerbertTokenizer
@@ -33,6 +35,10 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def setUp(self):
super().setUp()
# Use a simpler test file without Japanese/Chinese characters
with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data:
self._data = f_data.read().replace("\n\n", "\n").strip()
vocab = [
"<s>",
"</s>",
......
@@ -17,14 +17,20 @@
import os
import unittest
from transformers.tokenization_layoutlm import VOCAB_FILES_NAMES, LayoutLMTokenizer
from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast
from transformers.testing_utils import require_tokenizers
from transformers.tokenization_layoutlm import VOCAB_FILES_NAMES
from .test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = LayoutLMTokenizer
rust_tokenizer_class = LayoutLMTokenizerFast
test_rust_tokenizer = True
space_between_special_tokens = True
def setUp(self):
super().setUp()
......
@@ -17,12 +17,14 @@
import os
import unittest
from transformers import LxmertTokenizer, LxmertTokenizerFast
from transformers.testing_utils import require_tokenizers
from transformers.tokenization_bert import VOCAB_FILES_NAMES
from transformers.tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = LxmertTokenizer
......
@@ -20,9 +20,12 @@ import unittest
from pathlib import Path
from shutil import copyfile
from transformers.testing_utils import _torch_available
from transformers.tokenization_marian import MarianTokenizer, save_json, vocab_files_names
from transformers.tokenization_utils import BatchEncoding
from transformers import BatchEncoding, MarianTokenizer
from transformers.testing_utils import _sentencepiece_available, _torch_available, require_sentencepiece
if _sentencepiece_available:
from transformers.tokenization_marian import save_json, vocab_files_names
from .test_tokenization_common import TokenizerTesterMixin
@@ -35,6 +38,7 @@ ORG_NAME = "Helsinki-NLP/"
FRAMEWORK = "pt" if _torch_available else "tf"
@require_sentencepiece
class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MarianTokenizer
......
import tempfile
import unittest
from transformers import AutoTokenizer, BatchEncoding, MBartTokenizer, MBartTokenizerFast, is_torch_available
from transformers.testing_utils import require_torch
from transformers import (
SPIECE_UNDERLINE,
AutoTokenizer,
BatchEncoding,
MBartTokenizer,
MBartTokenizerFast,
is_torch_available,
)
from transformers.testing_utils import (
_sentencepiece_available,
require_sentencepiece,
require_tokenizers,
require_torch,
)
from .test_tokenization_common import TokenizerTesterMixin
from .test_tokenization_xlm_roberta import SAMPLE_VOCAB, SPIECE_UNDERLINE
if _sentencepiece_available:
from .test_tokenization_xlm_roberta import SAMPLE_VOCAB
if is_torch_available():
@@ -15,6 +30,8 @@ EN_CODE = 250004
RO_CODE = 250020
@require_sentencepiece
@require_tokenizers
class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MBartTokenizer
rust_tokenizer_class = MBartTokenizerFast
@@ -105,6 +122,8 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@require_torch
@require_sentencepiece
@require_tokenizers
class MBartEnroIntegrationTest(unittest.TestCase):
checkpoint_name = "facebook/mbart-large-en-ro"
src_text = [
......
@@ -18,11 +18,14 @@ import json
import os
import unittest
from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
from transformers import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
from transformers.testing_utils import require_tokenizers
from transformers.tokenization_openai import VOCAB_FILES_NAMES
from .test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = OpenAIGPTTokenizer
@@ -80,3 +83,47 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
input_tokens = tokens + ["<unk>"]
input_bpe_tokens = [14, 15, 20]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Simple input
s = "This is a simple input"
s2 = ["This is a simple input 1", "This is a simple input 2"]
p = ("This is a simple input", "This is a pair")
p2 = [
("This is a simple input 1", "This is a simple input 2"),
("This is a simple pair 1", "This is a simple pair 2"),
]
# Simple input tests
self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
# Simple input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
s2,
max_length=max_length,
padding="max_length",
)
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
# Pair input
self.assertRaises(
ValueError,
tokenizer_r.batch_encode_plus,
p2,
max_length=max_length,
padding="max_length",
)
import unittest
from transformers import PegasusTokenizer, PegasusTokenizerFast
from transformers.file_utils import cached_property
from transformers.testing_utils import get_tests_dir, require_torch
from transformers.tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch
from .test_tokenization_common import TokenizerTesterMixin
@@ -10,6 +10,8 @@ from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")
@require_sentencepiece
@require_tokenizers
class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = PegasusTokenizer
......
@@ -17,9 +17,9 @@
import os
import unittest
from transformers import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch, slow
from transformers.tokenization_reformer import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow
from .test_tokenization_common import TokenizerTesterMixin
@@ -27,6 +27,8 @@ from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
@require_sentencepiece
@require_tokenizers
class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = ReformerTokenizer
......
@@ -18,16 +18,19 @@ import json
import os
import unittest
from transformers.testing_utils import slow
from transformers.tokenization_roberta import VOCAB_FILES_NAMES, AddedToken, RobertaTokenizer, RobertaTokenizerFast
from transformers import AddedToken, RobertaTokenizer, RobertaTokenizerFast
from transformers.testing_utils import require_tokenizers, slow
from transformers.tokenization_roberta import VOCAB_FILES_NAMES
from .test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = RobertaTokenizer
rust_tokenizer_class = RobertaTokenizerFast
test_rust_tokenizer = True
from_pretrained_kwargs = {"cls_token": "<s>"}
def setUp(self):
super().setUp()
@@ -158,3 +161,38 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
mask_loc = encoded.index(mask_ind)
first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
self.assertNotEqual(first_char, space_encoding)
def test_pretokenized_inputs(self):
pass
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
# token_type_ids should put 0 everywhere
self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
# attention_mask should put 1 everywhere, so sum over length should be 1
self.assertEqual(
sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
)
tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
# Rust correctly handles the space before the mask while Python doesn't
self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
self.assertSequenceEqual(
tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)
self.assertSequenceEqual(
tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
)
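For readers puzzled by the "Ġ" characters in the expected tokens: it is RoBERTa's byte-level BPE marker for a preceding space, which is exactly where the slow and fast tokenizers historically diverged around <mask>. A small illustration, assuming the roberta-base vocabulary can be downloaded:

```python
# Small illustration of the "Ġ" space marker in RoBERTa's byte-level BPE.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
print(tokenizer.tokenize("Allen"))   # no leading space, so no "Ġ" prefix on the first piece
print(tokenizer.tokenize(" Allen"))  # the leading space is folded into "ĠAllen"
```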
......@@ -14,15 +14,18 @@
# limitations under the License.
from transformers.testing_utils import slow
from transformers.tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast
from transformers import SqueezeBertTokenizer, SqueezeBertTokenizerFast
from transformers.testing_utils import require_tokenizers, slow
from .test_tokenization_bert import BertTokenizationTest
@require_tokenizers
class SqueezeBertTokenizationTest(BertTokenizationTest):
tokenizer_class = SqueezeBertTokenizer
rust_tokenizer_class = SqueezeBertTokenizerFast
test_rust_tokenizer = True
def get_rust_tokenizer(self, **kwargs):
return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
......
......@@ -16,11 +16,9 @@
import unittest
from transformers import BatchEncoding
from transformers import SPIECE_UNDERLINE, BatchEncoding, T5Tokenizer, T5TokenizerFast
from transformers.file_utils import cached_property
from transformers.testing_utils import _torch_available, get_tests_dir
from transformers.tokenization_t5 import T5Tokenizer, T5TokenizerFast
from transformers.tokenization_xlnet import SPIECE_UNDERLINE
from transformers.testing_utils import _torch_available, get_tests_dir, require_sentencepiece, require_tokenizers
from .test_tokenization_common import TokenizerTesterMixin
......@@ -30,6 +28,8 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
FRAMEWORK = "pt" if _torch_available else "tf"
@require_sentencepiece
@require_tokenizers
class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = T5Tokenizer
......
......@@ -19,7 +19,7 @@ from typing import Callable, Optional
import numpy as np
from transformers import BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, TensorType
from transformers.testing_utils import require_tf, require_torch, slow
from transformers.testing_utils import require_tf, require_tokenizers, require_torch, slow
from transformers.tokenization_gpt2 import GPT2Tokenizer
......@@ -68,6 +68,7 @@ class TokenizerUtilsTest(unittest.TestCase):
self.assertEqual(TensorType("pt"), TensorType.PYTORCH)
self.assertEqual(TensorType("np"), TensorType.NUMPY)
@require_tokenizers
def test_batch_encoding_pickle(self):
import numpy as np
......@@ -92,6 +93,7 @@ class TokenizerUtilsTest(unittest.TestCase):
)
@require_tf
@require_tokenizers
def test_batch_encoding_pickle_tf(self):
import tensorflow as tf
......@@ -112,6 +114,7 @@ class TokenizerUtilsTest(unittest.TestCase):
)
@require_torch
@require_tokenizers
def test_batch_encoding_pickle_pt(self):
import torch
......@@ -128,6 +131,7 @@ class TokenizerUtilsTest(unittest.TestCase):
tokenizer_r("Small example to encode", return_tensors=TensorType.PYTORCH), torch.equal
)
@require_tokenizers
def test_batch_encoding_is_fast(self):
tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased")
......
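The newly gated pickling tests check that a BatchEncoding survives a pickle round-trip in each framework. A hedged sketch of the core round-trip, using the slow BERT tokenizer so it does not depend on the tokenizers package:

```python
# Hedged sketch of a BatchEncoding pickle round-trip; the framework-specific
# tests above additionally compare the resulting tensors.
import pickle

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
encoding = tokenizer("Small example to encode")
restored = pickle.loads(pickle.dumps(encoding))
assert restored.data == encoding.data
```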
......@@ -17,9 +17,9 @@
import os
import unittest
from transformers import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast
from transformers.file_utils import cached_property
from transformers.testing_utils import slow
from transformers.tokenization_xlm_roberta import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast
from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
from .test_tokenization_common import TokenizerTesterMixin
......@@ -27,6 +27,8 @@ from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
@require_sentencepiece
@require_tokenizers
class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLMRobertaTokenizer
......
......@@ -17,8 +17,8 @@
import os
import unittest
from transformers.testing_utils import slow
from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast
from transformers import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast
from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
from .test_tokenization_common import TokenizerTesterMixin
......@@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
@require_sentencepiece
@require_tokenizers
class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLNetTokenizer
......
......@@ -23,7 +23,7 @@ import numpy as np
from transformers import AutoTokenizer, PretrainedConfig, TrainingArguments, is_torch_available
from transformers.file_utils import WEIGHTS_NAME
from transformers.testing_utils import get_tests_dir, require_torch, slow
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow
if is_torch_available():
......@@ -151,6 +151,8 @@ if is_torch_available():
@require_torch
@require_sentencepiece
@require_tokenizers
class TrainerIntegrationTest(unittest.TestCase):
def setUp(self):
args = TrainingArguments(".")
......
......@@ -49,6 +49,7 @@ def {0}(*args, **kwargs):
requires_pytorch({0})
"""
DUMMY_TF_PRETRAINED_CLASS = """
class {0}:
def __init__(self, *args, **kwargs):
......@@ -71,12 +72,111 @@ def {0}(*args, **kwargs):
"""
DUMMY_SENTENCEPIECE_PRETRAINED_CLASS = """
class {0}:
def __init__(self, *args, **kwargs):
requires_sentencepiece(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_sentencepiece(self)
"""
DUMMY_SENTENCEPIECE_CLASS = """
class {0}:
def __init__(self, *args, **kwargs):
requires_sentencepiece(self)
"""
DUMMY_SENTENCEPIECE_FUNCTION = """
def {0}(*args, **kwargs):
requires_sentencepiece({0})
"""
DUMMY_TOKENIZERS_PRETRAINED_CLASS = """
class {0}:
def __init__(self, *args, **kwargs):
requires_tokenizers(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_tokenizers(self)
"""
DUMMY_TOKENIZERS_CLASS = """
class {0}:
def __init__(self, *args, **kwargs):
requires_tokenizers(self)
"""
DUMMY_TOKENIZERS_FUNCTION = """
def {0}(*args, **kwargs):
requires_tokenizers({0})
"""
# Map all these to dummy type
DUMMY_PRETRAINED_CLASS = {
"pt": DUMMY_PT_PRETRAINED_CLASS,
"tf": DUMMY_TF_PRETRAINED_CLASS,
"sentencepiece": DUMMY_SENTENCEPIECE_PRETRAINED_CLASS,
"tokenizers": DUMMY_TOKENIZERS_PRETRAINED_CLASS,
}
DUMMY_CLASS = {
"pt": DUMMY_PT_CLASS,
"tf": DUMMY_TF_CLASS,
"sentencepiece": DUMMY_SENTENCEPIECE_CLASS,
"tokenizers": DUMMY_TOKENIZERS_CLASS,
}
DUMMY_FUNCTION = {
"pt": DUMMY_PT_FUNCTION,
"tf": DUMMY_TF_FUNCTION,
"sentencepiece": DUMMY_SENTENCEPIECE_FUNCTION,
"tokenizers": DUMMY_TOKENIZERS_FUNCTION,
}
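Each dummy template is a plain format string, and the three dispatch dictionaries simply select the right template per backend. A standalone illustration of what a rendered tokenizers dummy looks like (the class name is an arbitrary example):

```python
# Standalone illustration: the same format string as DUMMY_TOKENIZERS_CLASS
# above, rendered for an arbitrary example class name.
template = """
class {0}:
    def __init__(self, *args, **kwargs):
        requires_tokenizers(self)
"""

print(template.format("BertTokenizerFast"))
# class BertTokenizerFast:
#     def __init__(self, *args, **kwargs):
#         requires_tokenizers(self)
```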
def read_init():
""" Read the init and exctracts PyTorch and TensorFlow objects. """
""" Read the init and exctracts PyTorch, TensorFlow, SentencePiece and Tokenizers objects. """
with open(os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"), "r", encoding="utf-8") as f:
lines = f.readlines()
line_index = 0
# Find where the SentencePiece imports begin
sentencepiece_objects = []
while not lines[line_index].startswith("if is_sentencepiece_available():"):
line_index += 1
line_index += 1
# Until we unindent, add SentencePiece objects to the list
while len(lines[line_index]) <= 1 or lines[line_index].startswith(" "):
line = lines[line_index]
search = _re_single_line_import.search(line)
if search is not None:
sentencepiece_objects += search.groups()[0].split(", ")
elif line.startswith(" "):
sentencepiece_objects.append(line[8:-2])
line_index += 1
# Find where the Tokenizers imports begin
tokenizers_objects = []
while not lines[line_index].startswith("if is_tokenizers_available():"):
line_index += 1
line_index += 1
# Until we unindent, add Tokenizers objects to the list
while len(lines[line_index]) <= 1 or lines[line_index].startswith(" "):
line = lines[line_index]
search = _re_single_line_import.search(line)
if search is not None:
tokenizers_objects += search.groups()[0].split(", ")
elif line.startswith(" "):
tokenizers_objects.append(line[8:-2])
line_index += 1
# Find where the PyTorch imports begin
pt_objects = []
while not lines[line_index].startswith("if is_torch_available():"):
......@@ -108,10 +208,10 @@ def read_init():
elif line.startswith(" "):
tf_objects.append(line[8:-2])
line_index += 1
return pt_objects, tf_objects
return sentencepiece_objects, tokenizers_objects, pt_objects, tf_objects
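read_init scans the library's top-level __init__.py, collecting every name imported under each `if is_*_available():` block until the indentation ends. A self-contained sketch of that scanning idea on a toy __init__; the regex and toy file below are illustrative stand-ins, not the module's actual _re_single_line_import or the real init:

```python
# Hedged sketch of the block-scanning idea used by read_init(); the regex and
# toy __init__ here are illustrative assumptions, not the library's own code.
import re

TOY_INIT = """\
if is_tokenizers_available():
    from .tokenization_bert_fast import BertTokenizerFast
    from .tokenization_gpt2_fast import GPT2TokenizerFast

if is_torch_available():
    from .modeling_bert import BertModel
"""

_toy_import = re.compile(r"^\s+from\s+\S+\s+import\s+(.+)$")


def collect_block(lines, guard):
    # Gather imported names under `if <guard>:` until the block unindents.
    objects, i = [], 0
    while not lines[i].startswith(f"if {guard}:"):
        i += 1
    i += 1
    while i < len(lines) and (len(lines[i]) <= 1 or lines[i].startswith("    ")):
        match = _toy_import.search(lines[i])
        if match is not None:
            objects += [name.strip() for name in match.group(1).split(",")]
        i += 1
    return objects


print(collect_block(TOY_INIT.splitlines(), "is_tokenizers_available()"))
# ['BertTokenizerFast', 'GPT2TokenizerFast']
```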
def create_dummy_object(name, is_pytorch=True):
def create_dummy_object(name, type="pt"):
""" Create the code for the dummy object corresponding to `name`."""
_pretrained = [
"Config" "ForCausalLM",
......@@ -124,10 +224,11 @@ def create_dummy_object(name, is_pytorch=True):
"Model",
"Tokenizer",
]
assert type in ["pt", "tf", "sentencepiece", "tokenizers"]
if name.isupper():
return DUMMY_CONSTANT.format(name)
elif name.islower():
return (DUMMY_PT_FUNCTION if is_pytorch else DUMMY_TF_FUNCTION).format(name)
return (DUMMY_FUNCTION[type]).format(name)
else:
is_pretrained = False
for part in _pretrained:
......@@ -135,39 +236,75 @@ def create_dummy_object(name, is_pytorch=True):
is_pretrained = True
break
if is_pretrained:
template = DUMMY_PT_PRETRAINED_CLASS if is_pytorch else DUMMY_TF_PRETRAINED_CLASS
template = DUMMY_PRETRAINED_CLASS[type]
else:
template = DUMMY_PT_CLASS if is_pytorch else DUMMY_TF_CLASS
template = DUMMY_CLASS[type]
return template.format(name)
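Taken together, create_dummy_object dispatches on the shape of the name (all-caps constant, lowercase function, otherwise class, with a pretrained variant when the name contains one of the _pretrained fragments) and on the backend type. A hedged usage sketch, assuming it is run from the repository's utils/ directory so check_dummies.py is importable; the example names are arbitrary:

```python
# Hedged usage sketch; assumes check_dummies.py is importable (e.g. run from
# the repository's utils/ directory). The example names are arbitrary.
from check_dummies import create_dummy_object

print(create_dummy_object("SLOW_TO_FAST_CONVERTERS", type="tokenizers"))  # uppercase -> constant template
print(create_dummy_object("convert_slow_tokenizer", type="tokenizers"))   # lowercase -> function template
print(create_dummy_object("BertTokenizerFast", type="tokenizers"))        # contains "Tokenizer" -> pretrained-class template
```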
def create_dummy_files():
""" Create the content of the dummy files. """
pt_objects, tf_objects = read_init()
sentencepiece_objects, tokenizers_objects, pt_objects, tf_objects = read_init()
sentencepiece_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
sentencepiece_dummies += "from ..file_utils import requires_sentencepiece\n\n"
sentencepiece_dummies += "\n".join([create_dummy_object(o, type="sentencepiece") for o in sentencepiece_objects])
tokenizers_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
tokenizers_dummies += "from ..file_utils import requires_tokenizers\n\n"
tokenizers_dummies += "\n".join([create_dummy_object(o, type="tokenizers") for o in tokenizers_objects])
pt_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
pt_dummies += "from ..file_utils import requires_pytorch\n\n"
pt_dummies += "\n".join([create_dummy_object(o) for o in pt_objects])
pt_dummies += "\n".join([create_dummy_object(o, type="pt") for o in pt_objects])
tf_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
tf_dummies += "from ..file_utils import requires_tf\n\n"
tf_dummies += "\n".join([create_dummy_object(o, False) for o in tf_objects])
tf_dummies += "\n".join([create_dummy_object(o, type="tf") for o in tf_objects])
return pt_dummies, tf_dummies
return sentencepiece_dummies, tokenizers_dummies, pt_dummies, tf_dummies
def check_dummies(overwrite=False):
""" Check if the dummy files are up to date and maybe `overwrite` with the right content. """
pt_dummies, tf_dummies = create_dummy_files()
sentencepiece_dummies, tokenizers_dummies, pt_dummies, tf_dummies = create_dummy_files()
path = os.path.join(PATH_TO_TRANSFORMERS, "utils")
sentencepiece_file = os.path.join(path, "dummy_sentencepiece_objects.py")
tokenizers_file = os.path.join(path, "dummy_tokenizers_objects.py")
pt_file = os.path.join(path, "dummy_pt_objects.py")
tf_file = os.path.join(path, "dummy_tf_objects.py")
with open(sentencepiece_file, "r", encoding="utf-8") as f:
actual_sentencepiece_dummies = f.read()
with open(tokenizers_file, "r", encoding="utf-8") as f:
actual_tokenizers_dummies = f.read()
with open(pt_file, "r", encoding="utf-8") as f:
actual_pt_dummies = f.read()
with open(tf_file, "r", encoding="utf-8") as f:
actual_tf_dummies = f.read()
if sentencepiece_dummies != actual_sentencepiece_dummies:
if overwrite:
print("Updating transformers.utils.dummy_sentencepiece_objects.py as the main __init__ has new objects.")
with open(sentencepiece_file, "w", encoding="utf-8") as f:
f.write(sentencepiece_dummies)
else:
raise ValueError(
"The main __init__ has objects that are not present in transformers.utils.dummy_sentencepiece_objects.py.",
"Run `make fix-copies` to fix this.",
)
if tokenizers_dummies != actual_tokenizers_dummies:
if overwrite:
print("Updating transformers.utils.dummy_tokenizers_objects.py as the main __init__ has new objects.")
with open(tokenizers_file, "w", encoding="utf-8") as f:
f.write(tokenizers_dummies)
else:
raise ValueError(
"The main __init__ has objects that are not present in transformers.utils.dummy_tokenizers_objects.py.",
"Run `make fix-copies` to fix this.",
)
if pt_dummies != actual_pt_dummies:
if overwrite:
print("Updating transformers.utils.dummy_pt_objects.py as the main __init__ has new objects.")
......
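In CI this check runs with overwrite=False and fails on drift, while `make fix-copies` regenerates the files. A hedged sketch of the script tail that could wire that up; the flag name and exact wiring are assumptions, not necessarily the repository's actual entry point:

```python
# Hedged sketch of a possible CLI tail for this script; the flag name is an
# assumption and may not match the real file. check_dummies() is the function
# defined above in the same script.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--fix_and_overwrite", action="store_true", help="Regenerate the dummy files instead of failing on drift."
    )
    args = parser.parse_args()
    check_dummies(args.fix_and_overwrite)
```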